author    Chunseok Lee <chunseok.lee@samsung.com>  2021-04-20 18:01:41 +0900
committer Chunseok Lee <chunseok.lee@samsung.com>  2021-04-20 18:01:41 +0900
commit    589bb1db6db6784efe21b3fbbfbfdb79aaa5f14e (patch)
tree      47a2b23ce4220e3a4150c8b12ed941555272fb0c
parent    62529acabbafce7730601ed01d5709d7bc0d378a (diff)
download  nnfw-589bb1db6db6784efe21b3fbbfbfdb79aaa5f14e.tar.gz
          nnfw-589bb1db6db6784efe21b3fbbfbfdb79aaa5f14e.tar.bz2
          nnfw-589bb1db6db6784efe21b3fbbfbfdb79aaa5f14e.zip
-rw-r--r--  .ahub/tcchecker-tca/config.yaml | 4
-rw-r--r--  .clang-format | 32
-rw-r--r--  .clang-format.8 | 92
-rw-r--r--  Makefile.template | 2
-rw-r--r--  README.md | 5
-rw-r--r--  compiler/.ahub/tcchecker-tca/config.yaml | 4
-rw-r--r--  compiler/angkor/include/nncc/core/ADT/feature/Overlay.h | 2
-rw-r--r--  compiler/angkor/include/nncc/core/ADT/feature/Shape.h | 2
-rw-r--r--  compiler/angkor/include/nncc/core/ADT/kernel/Overlay.h | 2
-rw-r--r--  compiler/angkor/include/nncc/core/ADT/kernel/Shape.h | 2
-rw-r--r--  compiler/angkor/include/nncc/core/ADT/tensor/Overlay.h | 2
-rw-r--r--  compiler/angkor/include/nncc/core/ADT/tensor/View.h | 2
-rw-r--r--  compiler/angkor/src/ADT/feature/Overlay.test.cpp | 6
-rw-r--r--  compiler/angkor/src/ADT/kernel/Overlay.test.cpp | 6
-rw-r--r--  compiler/angkor/src/ADT/tensor/Overlay.test.cpp | 6
-rw-r--r--  compiler/arser/CMakeLists.txt | 6
-rw-r--r--  compiler/arser/include/arser/arser.h | 383
-rw-r--r--  compiler/arser/tests/HelpMessage.test.cpp | 75
-rw-r--r--  compiler/arser/tests/Prompt.h | 56
-rw-r--r--  compiler/arser/tests/arser.test.cpp | 215
-rw-r--r--  compiler/bino/include/bino.h | 4
-rw-r--r--  compiler/caffegen/CMakeLists.txt | 1
-rw-r--r--  compiler/caffegen/src/Driver.cpp | 4
-rw-r--r--  compiler/circle-inspect/driver/Driver.cpp | 6
-rw-r--r--  compiler/circle-part-driver/CMakeLists.txt | 17
-rw-r--r--  compiler/circle-part-driver/README.md | 3
-rw-r--r--  compiler/circle-part-driver/requires.cmake | 6
-rw-r--r--  compiler/circle-part-driver/src/Driver.cpp | 62
-rw-r--r--  compiler/circle-part-driver/src/PModelsRunner.cpp | 251
-rw-r--r--  compiler/circle-part-driver/src/PModelsRunner.h | 63
-rw-r--r--  compiler/circle-part-value-test/CMakeLists.txt | 99
-rw-r--r--  compiler/circle-part-value-test/README.md | 15
-rwxr-xr-x  compiler/circle-part-value-test/part_eval_all.sh | 68
-rwxr-xr-x  compiler/circle-part-value-test/part_eval_one.py | 118
-rw-r--r--  compiler/circle-part-value-test/parts/Net_InstanceNorm_003.001.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Net_InstanceNorm_003.002.part | 8
-rw-r--r--  compiler/circle-part-value-test/parts/Net_InstanceNorm_003.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Add_Sqrt_000.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Add_Sqrt_Rsqrt_000.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Add_Sub_000.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_000.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_001.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_002.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_003.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_000.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_001.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_002.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_003.part | 7
-rw-r--r--  compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_004.part | 6
-rw-r--r--  compiler/circle-part-value-test/requires.cmake | 3
-rw-r--r--  compiler/circle-part-value-test/test.lst | 20
-rw-r--r--  compiler/circle-partitioner/CMakeLists.txt | 17
-rw-r--r--  compiler/circle-partitioner/README.md | 3
-rw-r--r--  compiler/circle-partitioner/requires.cmake | 6
-rw-r--r--  compiler/circle-partitioner/src/CirclePartitioner.cpp | 306
-rw-r--r--  compiler/circle-partitioner/src/HelperPath.cpp | 69
-rw-r--r--  compiler/circle-partitioner/src/HelperPath.h | 43
-rw-r--r--  compiler/circle-partitioner/src/HelperStrings.cpp | 41
-rw-r--r--  compiler/circle-partitioner/src/HelperStrings.h | 32
-rw-r--r--  compiler/circle-partitioner/src/PartitionExport.cpp | 145
-rw-r--r--  compiler/circle-partitioner/src/PartitionExport.h | 41
-rw-r--r--  compiler/circle-partitioner/src/PartitionRead.cpp | 98
-rw-r--r--  compiler/circle-partitioner/src/PartitionRead.h (renamed from compiler/stdex/include/stdex/Queue.h) | 29
-rw-r--r--  compiler/circle-quantizer/CMakeLists.txt | 1
-rw-r--r--  compiler/circle-quantizer/src/CircleQuantizer.cpp | 77
-rw-r--r--  compiler/circle-tensordump/driver/Driver.cpp | 8
-rw-r--r--  compiler/circle-tensordump/src/Dump.cpp | 6
-rw-r--r--  compiler/circle2circle-dredd-recipe-test/test.lst | 13
-rw-r--r--  compiler/circle2circle/src/Circle2Circle.cpp | 356
-rw-r--r--  compiler/circle2circle/src/TestHelper.h | 4
-rw-r--r--  compiler/circlechef/circle/CMakeLists.txt | 1
-rw-r--r--  compiler/circlechef/circle/src/Convert.cpp | 3
-rw-r--r--  compiler/circlechef/core/CMakeLists.txt | 22
-rw-r--r--  compiler/circlechef/core/src/Convert.cpp | 2
-rw-r--r--  compiler/circlechef/core/src/Convert.test.cpp | 57
-rw-r--r--  compiler/circlechef/core/src/ModelChef.cpp | 13
-rw-r--r--  compiler/circlechef/core/src/Op/BCQFullyConnected.cpp | 4
-rw-r--r--  compiler/circlechef/core/src/Op/BCQGather.cpp | 2
-rw-r--r--  compiler/circlechef/core/src/Op/BatchMatMul.cpp | 4
-rw-r--r--  compiler/circlechef/proto/circlechef.proto | 1
-rw-r--r--  compiler/circlechef/tests/short_int_datatype/test.recipe | 32
-rw-r--r--  compiler/circlechef/tests/short_int_datatype/test.reverse | 0
-rw-r--r--  compiler/circlechef/tools/console/CMakeLists.txt | 9
-rw-r--r--  compiler/circlechef/tools/console/Driver.cpp | 10
-rw-r--r--  compiler/circlechef/tools/console/Driver.test.cpp | 41
-rw-r--r--  compiler/circlechef/tools/file/Driver.cpp | 4
-rw-r--r--  compiler/circlechef/tools/reverse/Driver.cpp | 4
-rw-r--r--  compiler/circledump/README.md | 1
-rw-r--r--  compiler/circledump/src/Dump.cpp | 22
-rw-r--r--  compiler/circledump/src/MetadataPrinter.cpp | 119
-rw-r--r--  compiler/circledump/src/MetadataPrinter.h | 61
-rw-r--r--  compiler/circledump/src/OpPrinter.cpp | 50
-rw-r--r--  compiler/circledump/src/Read.cpp | 1
-rw-r--r--  compiler/circledump/src/Read.h | 3
-rw-r--r--  compiler/cli/CMakeLists.txt | 1
-rw-r--r--  compiler/cli/src/App.test.cpp | 4
-rw-r--r--  compiler/coco/core/CMakeLists.txt | 2
-rw-r--r--  compiler/coco/core/include/coco/IR/FeatureShape.h | 4
-rw-r--r--  compiler/coco/core/include/coco/IR/Locatable.h | 2
-rw-r--r--  compiler/coco/core/include/coco/IR/Ops.h | 2
-rw-r--r--  compiler/coco/core/include/coco/IR/Padding2D.h | 2
-rw-r--r--  compiler/coco/core/src/ADT/PtrList.test.cpp | 2
-rw-r--r--  compiler/coco/core/src/ADT/PtrManager.test.cpp | 2
-rw-r--r--  compiler/coco/core/src/IR/BagManager.cpp | 4
-rw-r--r--  compiler/coco/core/src/IR/BlockManager.cpp | 5
-rw-r--r--  compiler/coco/core/src/IR/Conv2D.test.cpp | 4
-rw-r--r--  compiler/coco/core/src/IR/Def.test.cpp | 4
-rw-r--r--  compiler/coco/core/src/IR/InputManager.cpp | 4
-rw-r--r--  compiler/coco/core/src/IR/Module.cpp | 4
-rw-r--r--  compiler/coco/core/src/IR/ObjectManager.cpp | 5
-rw-r--r--  compiler/coco/core/src/IR/OpManager.cpp | 5
-rw-r--r--  compiler/coco/core/src/IR/Ops.test.cpp | 4
-rw-r--r--  compiler/coco/core/src/IR/OutputManager.cpp | 4
-rw-r--r--  compiler/coco/core/src/IR/Part.test.cpp | 4
-rw-r--r--  compiler/coco/core/src/IR/Use.test.cpp | 4
-rw-r--r--  compiler/coco/generic/CMakeLists.txt | 4
-rw-r--r--  compiler/coco/generic/src/IR/Data.cpp | 7
-rw-r--r--  compiler/common-artifacts/CMakeLists.txt | 89
-rw-r--r--  compiler/common-artifacts/exclude.lst | 40
-rw-r--r--  compiler/common-artifacts/src/TestDataGenerator.cpp | 36
-rw-r--r--  compiler/crew/CMakeLists.txt | 20
-rw-r--r--  compiler/crew/README.md | 13
-rw-r--r--  compiler/crew/include/crew/PConfig.h | 60
-rw-r--r--  compiler/crew/include/crew/PConfigIni.h | 68
-rw-r--r--  compiler/crew/include/crew/PConfigIniDump.h | 33
-rw-r--r--  compiler/crew/requires.cmake | 1
-rw-r--r--  compiler/crew/src/PConfig.cpp | 223
-rw-r--r--  compiler/crew/src/PConfigIni.cpp | 156
-rw-r--r--  compiler/crew/src/PConfigIni.test.cpp | 87
-rw-r--r--  compiler/crew/src/PConfigIniDump.cpp | 44
-rw-r--r--  compiler/crew/src/PConfigIniDump.test.cpp | 41
-rw-r--r--  compiler/crew/src/PConfigJson.cpp | 116
-rw-r--r--  compiler/crew/src/PConfigJson.h | 51
-rw-r--r--  compiler/crew/src/PConfigJson.test.cpp | 68
-rw-r--r--  compiler/cwrap/src/Fildes.test.cpp | 2
-rw-r--r--  compiler/enco/cli/CMakeLists.txt | 1
-rw-r--r--  compiler/enco/cli/src/Driver.cpp | 5
-rw-r--r--  compiler/enco/core/CMakeLists.txt | 1
-rw-r--r--  compiler/enco/core/src/ANN/Binder.h | 2
-rw-r--r--  compiler/enco/core/src/ANN/Context.cpp | 6
-rw-r--r--  compiler/enco/core/src/ANN/Context.test.cpp | 2
-rw-r--r--  compiler/enco/core/src/ANN/IR/OperandInventory.cpp | 4
-rw-r--r--  compiler/enco/core/src/ANN/IR/Operation.h | 2
-rw-r--r--  compiler/enco/core/src/ANN/IR/OperationInventory.cpp | 4
-rw-r--r--  compiler/enco/core/src/ANN/IR/WeightInventory.cpp | 4
-rw-r--r--  compiler/enco/core/src/AsmCode.h | 2
-rw-r--r--  compiler/enco/core/src/Backend.cpp | 7
-rw-r--r--  compiler/enco/core/src/CodeIndex.h | 2
-rw-r--r--  compiler/enco/core/src/CppGen/Host.cpp | 7
-rw-r--r--  compiler/enco/core/src/CppGen/Subnet.cpp | 11
-rw-r--r--  compiler/enco/core/src/Session.cpp | 4
-rw-r--r--  compiler/enco/core/src/Support/Debugging.cpp | 2
-rw-r--r--  compiler/enco/core/src/Transforms/FeatureUnification.cpp | 5
-rw-r--r--  compiler/enco/core/src/Transforms/GlobalDataGeneration.cpp | 5
-rw-r--r--  compiler/enco/core/src/Transforms/Split.cpp | 8
-rw-r--r--  compiler/enco/core/src/Transforms/Split.h | 2
-rw-r--r--  compiler/enco/frontend/caffe/CMakeLists.txt | 1
-rw-r--r--  compiler/enco/frontend/caffe/src/Context.h | 4
-rw-r--r--  compiler/enco/frontend/caffe/src/Entry.cpp | 5
-rw-r--r--  compiler/enco/frontend/caffe/src/GraphBuilderRegistry.cpp | 4
-rw-r--r--  compiler/enco/frontend/caffe/src/Layer/Convolution.cpp | 2
-rw-r--r--  compiler/enco/frontend/tflite/CMakeLists.txt | 1
-rw-r--r--  compiler/enco/frontend/tflite/src/Context.cpp | 2
-rw-r--r--  compiler/enco/frontend/tflite/src/Context.h | 4
-rw-r--r--  compiler/enco/frontend/tflite/src/Entry.cpp | 5
-rw-r--r--  compiler/enco/frontend/tflite/src/Frontend.test.cpp | 4
-rw-r--r--  compiler/enco/frontend/tflite/src/GraphBuilderRegistry.h | 6
-rw-r--r--  compiler/enco/frontend/tflite/src/Op/AveragePool2D.cpp | 2
-rw-r--r--  compiler/enco/frontend/tflite/src/Op/Conv2D.cpp | 2
-rw-r--r--  compiler/enco/frontend/tflite/src/Op/DepthwiseConv2D.cpp | 6
-rw-r--r--  compiler/enco/frontend/tflite/src/Op/MaxPool2D.cpp | 2
-rw-r--r--  compiler/enco/test/basic/000/CMakeLists.txt | 1
-rw-r--r--  compiler/enco/test/basic/000/enco.test.cpp | 4
-rw-r--r--  compiler/enco/test/binder.cpp | 4
-rw-r--r--  compiler/enco/test/caffe/CMakeLists.txt | 1
-rw-r--r--  compiler/enco/test/tflite/CMakeLists.txt | 1
-rw-r--r--  compiler/encodump/CMakeLists.txt | 1
-rw-r--r--  compiler/encodump/src/Driver.cpp | 5
-rw-r--r--  compiler/exo/CMakeLists.txt | 2
-rw-r--r--  compiler/exo/requires.cmake | 1
-rw-r--r--  compiler/exo/src/Circle/CircleExporter.cpp | 5
-rw-r--r--  compiler/exo/src/Circle/CircleExporterImpl.cpp | 4
-rw-r--r--  compiler/exo/src/Circle/CircleExporterUtils.cpp | 13
-rw-r--r--  compiler/exo/src/Circle/CircleOperationExporter.cpp | 78
-rw-r--r--  compiler/exo/src/Circle/CircleTypeInference.cpp | 2
-rw-r--r--  compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp | 6
-rw-r--r--  compiler/exo/src/Convert.cpp | 52
-rw-r--r--  compiler/exo/src/Dialect/IR/CircleNodes.h | 4
-rw-r--r--  compiler/exo/src/Dialect/IR/TFLNodes.h | 16
-rw-r--r--  compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.test.cpp | 8
-rw-r--r--  compiler/exo/src/Dialect/Service/TFLTypeInferenceRule.test.cpp | 2
-rw-r--r--  compiler/exo/src/ExoFormattedGraph.h | 4
-rw-r--r--  compiler/exo/src/ExoOptimize.cpp | 24
-rw-r--r--  compiler/exo/src/GraphBlock.cpp | 16
-rw-r--r--  compiler/exo/src/GraphBlock.h | 2
-rw-r--r--  compiler/exo/src/Log.cpp | 1
-rw-r--r--  compiler/exo/src/LogHelper.cpp | 2
-rw-r--r--  compiler/exo/src/LoggingContext.cpp | 9
-rw-r--r--  compiler/exo/src/Pass/FoldTransposeOfConstPass.cpp | 2
-rw-r--r--  compiler/exo/src/Pass/FuseBiasAddPass.cpp | 8
-rw-r--r--  compiler/exo/src/Pass/FuseInstanceNormPass.cpp | 4
-rw-r--r--  compiler/exo/src/Pass/FuseReluPass.test.cpp | 4
-rw-r--r--  compiler/exo/src/Pass/MergeConcatNodesPass.cpp | 4
-rw-r--r--  compiler/exo/src/Pass/ShapeInferencePass.cpp | 6
-rw-r--r--  compiler/exo/src/Pass/TypeInferencePass.cpp | 6
-rw-r--r--  compiler/exo/src/ProgressReporter.h | 2
-rw-r--r--  compiler/exo/src/TFLite/TFLExporter.cpp | 5
-rw-r--r--  compiler/exo/src/TFLite/TFLExporterImpl.cpp | 4
-rw-r--r--  compiler/exo/src/TFLite/TFLExporterImpl.test.cpp | 11
-rw-r--r--  compiler/exo/src/TFLite/TFLExporterUtils.cpp | 13
-rw-r--r--  compiler/exo/src/TFLite/TFLOperationExporter.cpp | 78
-rw-r--r--  compiler/exo/src/TFLite/TFLTensorExporter.cpp | 4
-rw-r--r--  compiler/exo/src/TFLite/TFLTypeInference.cpp | 2
-rw-r--r--  compiler/exo/src/TFLite/TFLTypeInference.test.cpp | 3
-rw-r--r--  compiler/exo/src/TestGraph.h | 4
-rw-r--r--  compiler/exo/src/TestHelper.h | 8
-rw-r--r--  compiler/foder/CMakeLists.txt | 1
-rw-r--r--  compiler/foder/include/foder/FileLoader.h | 5
-rw-r--r--  compiler/hermes-std/CMakeLists.txt | 2
-rw-r--r--  compiler/hermes-std/src/ConsoleReporter.test.cpp | 5
-rw-r--r--  compiler/hermes-std/src/EnvConfig.test.cpp | 68
-rw-r--r--  compiler/hermes/CMakeLists.txt | 2
-rw-r--r--  compiler/hermes/requires.cmake | 1
-rw-r--r--  compiler/hermes/src/core/MessageBuffer.cpp | 6
-rw-r--r--  compiler/hermes/src/core/Source.cpp | 5
-rw-r--r--  compiler/loco/CMakeLists.txt | 2
-rw-r--r--  compiler/loco/include/loco/IR/DataTypeTraits.h | 24
-rw-r--r--  compiler/loco/include/loco/IR/Nodes.h | 56
-rw-r--r--  compiler/loco/include/loco/IR/Padding2D.h | 2
-rw-r--r--  compiler/loco/requires.cmake | 1
-rw-r--r--  compiler/loco/src/ADT/AnnotatedItem.test.cpp | 5
-rw-r--r--  compiler/loco/src/IR/CanonicalDialect.cpp | 5
-rw-r--r--  compiler/loco/src/IR/Dialect.test.cpp | 4
-rw-r--r--  compiler/loco/src/IR/Graph.cpp | 12
-rw-r--r--  compiler/loco/src/IR/Graph.test.cpp | 2
-rw-r--r--  compiler/loco/src/IR/PermutingCodec.cpp | 7
-rw-r--r--  compiler/loco/src/IR/Verifier.test.cpp | 4
-rw-r--r--  compiler/loco/src/Service/CanonicalShapeInferenceRule.cpp | 2
-rw-r--r--  compiler/loco/src/Service/CanonicalShapeInferenceRule.test.cpp | 8
-rw-r--r--  compiler/loco/src/Service/GraphBuilder.h | 40
-rw-r--r--  compiler/loco/src/Service/GraphTestcase.h | 4
-rw-r--r--  compiler/loco/src/Service/MultiDialectShapeInferenceRule.test.cpp | 4
-rw-r--r--  compiler/loco/src/Service/ShapeInference.cpp | 5
-rw-r--r--  compiler/loco/src/Service/TypeInference.cpp | 5
-rw-r--r--  compiler/loco/src/Service/TypeInference.test.cpp | 4
-rw-r--r--  compiler/loco/src/tensorflow.test.cpp | 6
-rw-r--r--  compiler/locoex-customop/CMakeLists.txt | 4
-rw-r--r--  compiler/locoex-customop/requires.cmake | 1
-rw-r--r--  compiler/locoex-customop/src/COpCall.cpp | 2
-rw-r--r--  compiler/locoex-customop/src/COpCall.test.cpp | 6
-rw-r--r--  compiler/locoex-customop/src/VariadicArityNode.test.cpp | 2
-rw-r--r--  compiler/locomotiv/CMakeLists.txt | 1
-rw-r--r--  compiler/locomotiv/include/locomotiv/Session.h | 2
-rw-r--r--  compiler/locomotiv/requires.cmake | 1
-rw-r--r--  compiler/locomotiv/src/Node/AvgPool2D.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/AvgPool2D.test.cpp | 2
-rw-r--r--  compiler/locomotiv/src/Node/BiasAdd.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/Conv2D.cpp | 8
-rw-r--r--  compiler/locomotiv/src/Node/Conv2D.test.cpp | 2
-rw-r--r--  compiler/locomotiv/src/Node/DepthwiseConv2D.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/DepthwiseConv2D.test.cpp | 2
-rw-r--r--  compiler/locomotiv/src/Node/DepthwiseFilterEncode.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/DepthwiseFilterEncode.test.cpp | 2
-rw-r--r--  compiler/locomotiv/src/Node/FeatureCodec.test.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/FeatureDecode.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/FeatureEncode.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/FilterEncode.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/FilterEncode.test.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/MatrixCodec.test.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/MatrixDecode.cpp | 2
-rw-r--r--  compiler/locomotiv/src/Node/MatrixEncode.cpp | 2
-rw-r--r--  compiler/locomotiv/src/Node/MaxPool2D.cpp | 4
-rw-r--r--  compiler/locomotiv/src/Node/MaxPool2D.test.cpp | 2
-rw-r--r--  compiler/locomotiv/src/Node/TensorConcat.cpp | 2
-rw-r--r--  compiler/locomotiv/src/Node/TransposedConv2D.cpp | 10
-rw-r--r--  compiler/locomotiv/src/Node/TransposedConv2D.test.cpp | 2
-rw-r--r--  compiler/locomotiv/src/NodeDataImpl.cpp | 5
-rw-r--r--  compiler/locomotiv/src/NodeExecution.h | 2
-rw-r--r--  compiler/locomotiv/src/UserData.cpp | 5
-rw-r--r--  compiler/locop/CMakeLists.txt | 2
-rw-r--r--  compiler/locop/src/CanonicalNodeSummaryBuilder.cpp | 2
-rw-r--r--  compiler/locop/src/ExampleGraph.h | 4
-rw-r--r--  compiler/locop/src/FormattedGraph.cpp | 5
-rw-r--r--  compiler/locop/src/FormattedGraph.test.cpp | 12
-rw-r--r--  compiler/locop/src/FormattedTensorShape.cpp | 2
-rw-r--r--  compiler/locop/src/FormattedTensorShape.test.cpp | 18
-rw-r--r--  compiler/locop/src/GenericNodeSummaryBuilder.test.cpp | 7
-rw-r--r--  compiler/locop/src/NodeSummary.cpp | 7
-rw-r--r--  compiler/logo-core/src/Phase.test.cpp | 56
-rw-r--r--  compiler/logo/CMakeLists.txt | 2
-rw-r--r--  compiler/logo/requires.cmake | 1
-rw-r--r--  compiler/logo/src/Passes/ConstantFoldingPass.cpp | 28
-rw-r--r--  compiler/logo/src/Passes/ConstantFoldingPass.test.cpp | 15
-rw-r--r--  compiler/logo/src/Passes/EmptyTestGraph.h | 29
-rw-r--r--  compiler/logo/src/Passes/EmptyTestGraph.test.cpp | 50
-rw-r--r--  compiler/logo/src/Passes/RemoveDeadNodePass.test.cpp | 38
-rw-r--r--  compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.test.cpp (renamed from compiler/stdex/src/Set.test.cpp) | 25
-rw-r--r--  compiler/logo/src/Passes/RemoveForwardNodePass.test.cpp | 38
-rw-r--r--  compiler/logo/src/Passes/ReorderDecodePass.test.cpp | 55
-rw-r--r--  compiler/logo/src/Passes/ResolveDuplicateReshapePass.test.cpp | 38
-rw-r--r--  compiler/logo/src/Passes/ResolveRedundantReshapePass.test.cpp | 38
-rw-r--r--  compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp | 17
-rw-r--r--  compiler/logo/src/Passes/SimplifyDomainConversionPass.test.cpp | 22
-rw-r--r--  compiler/luci-eval-driver/CMakeLists.txt | 12
-rw-r--r--  compiler/luci-eval-driver/requires.cmake | 5
-rw-r--r--  compiler/luci-eval-driver/src/EvalDriver.cpp (renamed from compiler/luci-value-test/tester/src/EvalTester.cpp) | 35
-rw-r--r--  compiler/luci-interpreter/src/Interpreter.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/core/Kernel.h | 4
-rw-r--r--  compiler/luci-interpreter/src/core/KernelParams.h | 7
-rw-r--r--  compiler/luci-interpreter/src/core/RuntimeGraph.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/core/Tensor.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/Add.cpp | 21
-rw-r--r--  compiler/luci-interpreter/src/kernels/Add.test.cpp | 78
-rw-r--r--  compiler/luci-interpreter/src/kernels/ArgMax.cpp | 11
-rw-r--r--  compiler/luci-interpreter/src/kernels/ArgMax.test.cpp | 13
-rw-r--r--  compiler/luci-interpreter/src/kernels/AveragePool2D.cpp | 16
-rw-r--r--  compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp | 52
-rw-r--r--  compiler/luci-interpreter/src/kernels/BatchToSpaceND.cpp | 104
-rw-r--r--  compiler/luci-interpreter/src/kernels/BatchToSpaceND.h | 45
-rw-r--r--  compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp | 92
-rw-r--r--  compiler/luci-interpreter/src/kernels/BinaryOpCommon.h | 6
-rw-r--r--  compiler/luci-interpreter/src/kernels/CMakeLists.txt | 21
-rw-r--r--  compiler/luci-interpreter/src/kernels/Concatenation.cpp | 5
-rw-r--r--  compiler/luci-interpreter/src/kernels/Concatenation.test.cpp | 27
-rw-r--r--  compiler/luci-interpreter/src/kernels/Conv2D.cpp | 73
-rw-r--r--  compiler/luci-interpreter/src/kernels/Conv2D.test.cpp | 216
-rw-r--r--  compiler/luci-interpreter/src/kernels/DepthToSpace.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp | 46
-rw-r--r--  compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp | 186
-rw-r--r--  compiler/luci-interpreter/src/kernels/Div.cpp | 15
-rw-r--r--  compiler/luci-interpreter/src/kernels/Div.test.cpp | 6
-rw-r--r--  compiler/luci-interpreter/src/kernels/Elu.test.cpp | 26
-rw-r--r--  compiler/luci-interpreter/src/kernels/Equal.test.cpp | 72
-rw-r--r--  compiler/luci-interpreter/src/kernels/Floor.test.cpp | 8
-rw-r--r--  compiler/luci-interpreter/src/kernels/FloorDiv.cpp | 10
-rw-r--r--  compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp | 26
-rw-r--r--  compiler/luci-interpreter/src/kernels/FullyConnected.cpp | 16
-rw-r--r--  compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp | 67
-rw-r--r--  compiler/luci-interpreter/src/kernels/Greater.test.cpp | 80
-rw-r--r--  compiler/luci-interpreter/src/kernels/GreaterEqual.cpp | 6
-rw-r--r--  compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp | 80
-rw-r--r--  compiler/luci-interpreter/src/kernels/If.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/If.test.cpp | 12
-rw-r--r--  compiler/luci-interpreter/src/kernels/InstanceNorm.cpp | 8
-rw-r--r--  compiler/luci-interpreter/src/kernels/L2Normalize.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp | 6
-rw-r--r--  compiler/luci-interpreter/src/kernels/L2Pool2D.cpp | 8
-rw-r--r--  compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp | 36
-rw-r--r--  compiler/luci-interpreter/src/kernels/LeakyRelu.cpp | 6
-rw-r--r--  compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp | 14
-rw-r--r--  compiler/luci-interpreter/src/kernels/Less.test.cpp | 80
-rw-r--r--  compiler/luci-interpreter/src/kernels/LessEqual.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/LessEqual.test.cpp | 80
-rw-r--r--  compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp | 8
-rw-r--r--  compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp | 12
-rw-r--r--  compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp | 28
-rw-r--r--  compiler/luci-interpreter/src/kernels/LogicalAnd.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/LogicalOr.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Logistic.test.cpp | 64
-rw-r--r--  compiler/luci-interpreter/src/kernels/MaxPool2D.cpp | 16
-rw-r--r--  compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp | 28
-rw-r--r--  compiler/luci-interpreter/src/kernels/Maximum.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Mean.cpp | 55
-rw-r--r--  compiler/luci-interpreter/src/kernels/Mean.test.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/Minimum.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Mul.cpp | 8
-rw-r--r--  compiler/luci-interpreter/src/kernels/Mul.test.cpp | 50
-rw-r--r--  compiler/luci-interpreter/src/kernels/Neg.cpp | 58
-rw-r--r--  compiler/luci-interpreter/src/kernels/Neg.h | 46
-rw-r--r--  compiler/luci-interpreter/src/kernels/Neg.test.cpp | 67
-rw-r--r--  compiler/luci-interpreter/src/kernels/NotEqual.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/NotEqual.test.cpp | 72
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pack.cpp | 143
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pack.h | 46
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pack.test.cpp | 144
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pad.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pad.test.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/Pow.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/Prelu.cpp | 124
-rw-r--r--  compiler/luci-interpreter/src/kernels/Prelu.h | 9
-rw-r--r--  compiler/luci-interpreter/src/kernels/Prelu.test.cpp | 205
-rw-r--r--  compiler/luci-interpreter/src/kernels/Relu.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/Relu.test.cpp | 32
-rw-r--r--  compiler/luci-interpreter/src/kernels/Relu6.cpp | 6
-rw-r--r--  compiler/luci-interpreter/src/kernels/Relu6.test.cpp | 24
-rw-r--r--  compiler/luci-interpreter/src/kernels/Reshape.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp | 11
-rw-r--r--  compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp | 92
-rw-r--r--  compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp | 12
-rw-r--r--  compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp | 90
-rw-r--r--  compiler/luci-interpreter/src/kernels/Reverse.cpp | 6
-rw-r--r--  compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp | 22
-rw-r--r--  compiler/luci-interpreter/src/kernels/Slice.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Softmax.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Softmax.test.cpp | 26
-rw-r--r--  compiler/luci-interpreter/src/kernels/SpaceToBatchND.cpp | 103
-rw-r--r--  compiler/luci-interpreter/src/kernels/SpaceToBatchND.h | 45
-rw-r--r--  compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp | 108
-rw-r--r--  compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Split.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Split.test.cpp | 52
-rw-r--r--  compiler/luci-interpreter/src/kernels/Sqrt.test.cpp | 22
-rw-r--r--  compiler/luci-interpreter/src/kernels/SquaredDifference.cpp | 64
-rw-r--r--  compiler/luci-interpreter/src/kernels/SquaredDifference.h | 47
-rw-r--r--  compiler/luci-interpreter/src/kernels/SquaredDifference.test.cpp | 67
-rw-r--r--  compiler/luci-interpreter/src/kernels/Squeeze.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Squeeze.test.cpp | 12
-rw-r--r--  compiler/luci-interpreter/src/kernels/StridedSlice.cpp | 4
-rw-r--r--  compiler/luci-interpreter/src/kernels/Sub.cpp | 15
-rw-r--r--  compiler/luci-interpreter/src/kernels/Sub.test.cpp | 42
-rw-r--r--  compiler/luci-interpreter/src/kernels/Tanh.test.cpp | 106
-rw-r--r--  compiler/luci-interpreter/src/kernels/TestUtils.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/TestUtils.h | 10
-rw-r--r--  compiler/luci-interpreter/src/kernels/Transpose.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Transpose.test.cpp | 71
-rw-r--r--  compiler/luci-interpreter/src/kernels/TransposeConv.cpp | 34
-rw-r--r--  compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp | 108
-rw-r--r--  compiler/luci-interpreter/src/kernels/Unpack.cpp | 2
-rw-r--r--  compiler/luci-interpreter/src/kernels/Unpack.test.cpp | 10
-rw-r--r--  compiler/luci-interpreter/src/kernels/Utils.h | 7
-rw-r--r--  compiler/luci-interpreter/src/loader/CMakeLists.txt | 8
-rw-r--r--  compiler/luci-interpreter/src/loader/GraphLoader.cpp | 10
-rw-r--r--  compiler/luci-interpreter/src/loader/KernelBuilder.cpp | 106
-rw-r--r--  compiler/luci-interpreter/src/loader/KernelBuilder.h | 14
-rw-r--r--  compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp | 48
-rw-r--r--  compiler/luci-interpreter/src/loader/ModuleLoader.cpp | 4
-rw-r--r--  compiler/luci-pass-value-test/.gitignore | 1
-rw-r--r--  compiler/luci-pass-value-test/CMakeLists.txt | 44
-rw-r--r--  compiler/luci-pass-value-test/README.md | 20
-rwxr-xr-x  compiler/luci-pass-value-test/eval_driver.sh | 68
-rw-r--r--  compiler/luci-pass-value-test/eval_result_verifier.py | 108
-rw-r--r--  compiler/luci-pass-value-test/requires.cmake | 7
-rw-r--r--  compiler/luci-pass-value-test/test.lst | 30
-rw-r--r--  compiler/luci-value-test/.gitignore | 1
-rw-r--r--  compiler/luci-value-test/CMakeLists.txt | 3
-rwxr-xr-x  compiler/luci-value-test/evalverify.sh | 2
-rwxr-xr-x  compiler/luci-value-test/luci_eval_verifier.py | 2
-rw-r--r--  compiler/luci-value-test/requires.cmake | 1
-rw-r--r--  compiler/luci-value-test/test.lst | 1
-rw-r--r--  compiler/luci-value-test/tester/CMakeLists.txt | 13
-rw-r--r--  compiler/luci/CMakeLists.txt | 3
-rw-r--r--  compiler/luci/env/include/luci/UserSettings.h | 1
-rw-r--r--  compiler/luci/env/src/UserSettings.cpp | 6
-rw-r--r--  compiler/luci/env/src/UserSettings.test.cpp | 12
-rw-r--r--  compiler/luci/export/CMakeLists.txt | 1
-rw-r--r--  compiler/luci/export/include/luci/CircleFileExpContract.h | 2
-rw-r--r--  compiler/luci/export/src/CircleExportMetadata.cpp | 121
-rw-r--r--  compiler/luci/export/src/CircleExportMetadata.h | 36
-rw-r--r--  compiler/luci/export/src/CircleExporterImpl.cpp | 61
-rw-r--r--  compiler/luci/export/src/CircleExporterImpl.h | 2
-rw-r--r--  compiler/luci/export/src/CircleExporterUtils.cpp | 8
-rw-r--r--  compiler/luci/export/src/CircleOperationExporter.cpp | 199
-rw-r--r--  compiler/luci/export/src/CircleTensorExporter.cpp | 175
-rw-r--r--  compiler/luci/export/src/Optimize.cpp | 10
-rw-r--r--  compiler/luci/export/src/ProgressReporter.h | 2
-rw-r--r--  compiler/luci/export/src/SerializedData.h | 32
-rw-r--r--  compiler/luci/export/src/TypeBridge.cpp | 105
-rw-r--r--  compiler/luci/import/CMakeLists.txt | 1
-rw-r--r--  compiler/luci/import/include/luci/Import/CircleReader.h | 4
-rw-r--r--  compiler/luci/import/include/luci/Import/GraphBuilder.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/GraphBuilderBase.h | 4
-rw-r--r--  compiler/luci/import/include/luci/Import/GraphBuilderContext.h | 2
-rw-r--r--  compiler/luci/import/include/luci/Import/GraphBuilderMultiOutput.h | 67
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes.h | 2
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleBidirectionalSequenceLSTM.h | 37
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleCustom.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleFakeQuant.h | 37
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleIf.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV5.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleSplit.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleSplitV.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleTopKV2.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleUnpack.h | 8
-rw-r--r--  compiler/luci/import/include/luci/Import/Nodes/CircleWhile.h | 2
-rw-r--r--  compiler/luci/import/src/CircleImportMetadata.cpp | 185
-rw-r--r--  compiler/luci/import/src/CircleImportMetadata.h | 56
-rw-r--r--  compiler/luci/import/src/CircleReader.cpp | 16
-rw-r--r--  compiler/luci/import/src/GraphBuilder.cpp | 10
-rw-r--r--  compiler/luci/import/src/GraphBuilderMultiOutput.cpp | 97
-rw-r--r--  compiler/luci/import/src/GraphBuilderRegistry.cpp | 4
-rw-r--r--  compiler/luci/import/src/Importer.cpp | 35
-rw-r--r--  compiler/luci/import/src/Nodes/CircleAbs.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleAdd.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleArgMax.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleArgMin.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp | 14
-rw-r--r--  compiler/luci/import/src/Nodes/CircleBCQGather.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleBidirectionalSequenceLSTM.cpp | 112
-rw-r--r--  compiler/luci/import/src/Nodes/CircleCast.cpp | 7
-rw-r--r--  compiler/luci/import/src/Nodes/CircleCeil.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleConv2D.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleCos.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleCustom.cpp | 65
-rw-r--r--  compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp | 26
-rw-r--r--  compiler/luci/import/src/Nodes/CircleDequantize.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleDiv.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleElu.cpp | 9
-rw-r--r--  compiler/luci/import/src/Nodes/CircleEqual.cpp | 7
-rw-r--r--  compiler/luci/import/src/Nodes/CircleExp.cpp | 4
-rw-r--r--  compiler/luci/import/src/Nodes/CircleExpandDims.cpp | 7
-rw-r--r--  compiler/luci/import/src/Nodes/CircleFakeQuant.cpp | 49
-rw-r--r--  compiler/luci/import/src/Nodes/CircleFill.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleFloor.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleFloorDiv.cpp | 14
-rw-r--r--  compiler/luci/import/src/Nodes/CircleFloorMod.cpp | 7
-rw-r--r--  compiler/luci/import/src/Nodes/CircleFullyConnected.cpp | 14
-rw-r--r--  compiler/luci/import/src/Nodes/CircleGather.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleGatherNd.cpp | 9
-rw-r--r--  compiler/luci/import/src/Nodes/CircleGreater.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp | 14
-rw-r--r--  compiler/luci/import/src/Nodes/CircleIf.cpp | 65
-rw-r--r--  compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleL2Normalize.cpp | 15
-rw-r--r--  compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLess.cpp | 14
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLessEqual.cpp | 14
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLog.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLogicalNot.cpp | 2
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLogicalOr.cpp | 2
-rw-r--r--  compiler/luci/import/src/Nodes/CircleLogistic.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleMean.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleMirrorPad.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleMul.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleNeg.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp | 76
-rw-r--r--  compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV5.cpp | 78
-rw-r--r--  compiler/luci/import/src/Nodes/CircleNotEqual.cpp | 14
-rw-r--r--  compiler/luci/import/src/Nodes/CircleOneHot.cpp | 11
-rw-r--r--  compiler/luci/import/src/Nodes/CirclePRelu.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CirclePad.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CirclePadV2.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CirclePow.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleRange.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleRank.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleReduceAny.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleReduceProd.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleRelu.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleRelu6.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleReluN1To1.cpp | 9
-rw-r--r--  compiler/luci/import/src/Nodes/CircleReshape.cpp | 15
-rw-r--r--  compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp | 9
-rw-r--r--  compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp | 11
-rw-r--r--  compiler/luci/import/src/Nodes/CircleReverseSequence.cpp | 9
-rw-r--r--  compiler/luci/import/src/Nodes/CircleReverseV2.cpp | 9
-rw-r--r--  compiler/luci/import/src/Nodes/CircleRound.cpp | 9
-rw-r--r--  compiler/luci/import/src/Nodes/CircleRsqrt.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleScatterNd.cpp | 4
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSegmentSum.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSelect.cpp | 7
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSelectV2.cpp | 7
-rw-r--r--  compiler/luci/import/src/Nodes/CircleShape.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSin.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSlice.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSoftmax.cpp | 6
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp | 7
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSparseToDense.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSplit.cpp | 63
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSplitV.cpp | 76
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSqrt.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSquare.cpp | 4
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSqueeze.cpp | 9
-rw-r--r--  compiler/luci/import/src/Nodes/CircleStridedSlice.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSub.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleSum.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleTanh.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleTile.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleTopKV2.cpp | 69
-rw-r--r--  compiler/luci/import/src/Nodes/CircleTranspose.cpp | 8
-rw-r--r--  compiler/luci/import/src/Nodes/CircleTransposeConv.cpp | 13
-rw-r--r--  compiler/luci/import/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp | 17
-rw-r--r--  compiler/luci/import/src/Nodes/CircleUnique.cpp | 55
-rw-r--r--  compiler/luci/import/src/Nodes/CircleUnpack.cpp | 61
-rw-r--r--  compiler/luci/import/src/Nodes/CircleWhere.cpp | 10
-rw-r--r--  compiler/luci/import/src/Nodes/CircleWhile.cpp | 5
-rw-r--r--  compiler/luci/import/src/Nodes/CircleZerosLike.cpp | 8
-rw-r--r--  compiler/luci/import/src/PostImport.cpp | 47
-rw-r--r--  compiler/luci/lang/CMakeLists.txt | 1
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleNodeDecl.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleNodeImpl.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleNodeMixins.h | 107
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleNodeVisitor.h | 8
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleNodes.h | 12
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleNodes.lst | 39
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleOpcode.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/CircleShapeSignature.h | 53
-rw-r--r--  compiler/luci/lang/include/luci/IR/DeadNodeQueryService.h (renamed from compiler/luci/lang/src/DeadNodeQueryService.h) | 0
-rw-r--r--  compiler/luci/lang/include/luci/IR/LuciNodeMixins.h | 82
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleAbs.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleAdd.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleArgMax.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleArgMin.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleAveragePool2D.h | 12
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleBCQFullyConnected.h | 10
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleBCQGather.h | 6
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleBatchMatMul.h | 10
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleBatchToSpaceND.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleBidirectionalSequenceLSTM.h | 172
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleBidirectionalSequenceLSTMOut.h | 48
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleCast.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleCeil.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleConcatenation.h | 8
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleConv2D.h | 8
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleCos.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleCustom.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleCustomOut.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleDepthToSpace.h | 10
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleDepthwiseConv2D.h | 12
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleDequantize.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleDiv.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleElu.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleEqual.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleExp.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleExpandDims.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleFakeQuant.h | 60
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleFill.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleFloor.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleFloorDiv.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleFloorMod.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h | 8
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleGather.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleGatherNd.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleGreater.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleGreaterEqual.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleIf.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleIfOut.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleInput.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleInstanceNorm.h | 9
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleL2Normalize.h | 6
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleL2Pool2D.h | 10
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLeakyRelu.h | 8
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLess.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLessEqual.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLocalResponseNormalization.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLog.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLogSoftmax.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalAnd.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalNot.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalOr.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleLogistic.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixDiag.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixSetDiag.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleMaxPool2D.h | 10
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleMaximum.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleMean.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleMinimum.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleMirrorPad.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleMul.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleNeg.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5Out.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleNotEqual.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleOneHot.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleOutput.h | 10
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CirclePRelu.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CirclePad.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CirclePow.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleRange.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleRank.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleReduceAny.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMax.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMin.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleReduceProd.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleRelu.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleRelu6.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleReluN1To1.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleReshape.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleResizeBilinear.h | 12
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleResizeNearestNeighbor.h | 10
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleReverseSequence.h | 19
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleReverseV2.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleRound.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleRsqrt.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleScatterNd.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSegmentSum.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSelect.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSelectV2.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleShape.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSin.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSlice.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSoftmax.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToBatchND.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToDepth.h | 10
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSparseToDense.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSplit.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSplitOut.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSplitV.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSplitVOut.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSqrt.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSquare.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSquaredDifference.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSqueeze.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleStridedSlice.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSub.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleSum.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleTanh.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleTile.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2Out.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleTranspose.h | 8
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleTransposeConv.h | 6
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h | 14
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h | 4
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleUnpack.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleUnpackOut.h | 7
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleWhere.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleWhile.h | 2
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleWhileOut.h | 5
-rw-r--r--  compiler/luci/lang/include/luci/IR/Nodes/CircleZerosLike.h | 8
-rw-r--r--  compiler/luci/lang/include/luci/IR/SparsityParam.h | 22
-rw-r--r--  compiler/luci/lang/src/CircleDialect.cpp | 3
-rw-r--r--  compiler/luci/lang/src/CircleNodeMixins.cpp (renamed from compiler/luci/lang/src/LuciNodeMixins.cpp) | 6
-rw-r--r--  compiler/luci/lang/src/CircleNodes.cpp | 25
-rw-r--r--  compiler/luci/lang/src/DeadNodeQueryService.cpp | 3
-rw-r--r--  compiler/luci/lang/src/Nodes/CircleBatchMatMul.test.cpp | 2
-rw-r--r--  compiler/luci/lang/src/Nodes/CircleBidrectionalSequenceLSTM.test.cpp | 130
-rw-r--r--  compiler/luci/lang/src/Nodes/CircleConst.test.cpp | 53
-rw-r--r--  compiler/luci/lang/src/Nodes/CircleCustom.test.cpp | 7
-rw-r--r--  compiler/luci/lang/src/Nodes/CircleFakeQuant.test.cpp | 36
-rw-r--r--  compiler/luci/logex/src/FormattedGraph.cpp | 132
-rw-r--r--  compiler/luci/partition/CMakeLists.txt | 29
-rw-r--r--  compiler/luci/partition/README.md | 4
-rw-r--r--  compiler/luci/partition/include/luci/Partition.h | 71
-rw-r--r--  compiler/luci/partition/src/CircleOpCode.cpp | 79
-rw-r--r--  compiler/luci/partition/src/CircleOpCode.h (renamed from compiler/luci/lang/src/CircleShapeSignature.cpp) | 23
-rw-r--r--  compiler/luci/partition/src/CircleOpCode.test.cpp | 31
-rw-r--r--  compiler/luci/partition/src/ConnectNode.cpp | 38
-rw-r--r--  compiler/luci/partition/src/ConnectNode.h | 209
-rw-r--r--  compiler/luci/partition/src/ConnectNode.test.cpp (renamed from compiler/stdex/include/stdex/Memory.h) | 16
-rw-r--r--  compiler/luci/partition/src/ConnectNode.test.h | 146
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleAdd.cpp | 40
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleAdd.test.cpp | 100
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleConst.cpp (renamed from compiler/luci/service/src/Nodes/CircleInput.cpp) | 8
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleDiv.cpp | 40
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleDiv.test.cpp | 100
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleMean.cpp | 41
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleMul.cpp | 40
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleMul.test.cpp | 100
-rw-r--r--  compiler/luci/partition/src/Nodes/CirclePow.cpp | 40
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleRsqrt.cpp | 38
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleSqrt.cpp | 38
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleSquaredDifference.cpp | 40
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleSub.cpp | 40
-rw-r--r--  compiler/luci/partition/src/Nodes/CircleSub.test.cpp | 100
-rw-r--r--  compiler/luci/partition/src/Partition.cpp | 61
-rw-r--r--  compiler/luci/partition/src/Partition.test.cpp | 83
-rw-r--r--  compiler/luci/partition/src/PartitionCleanup.cpp | 139
-rw-r--r--  compiler/luci/partition/src/PartitionCleanup.h | 34
-rw-r--r--  compiler/luci/partition/src/PartitionIR.cpp | 101
-rw-r--r--  compiler/luci/partition/src/PartitionIR.h | 91
-rw-r--r--  compiler/luci/partition/src/PartitionIR.test.cpp | 75
-rw-r--r--  compiler/luci/partition/src/PartitionIRDump.cpp | 70
-rw-r--r--  compiler/luci/partition/src/PartitionIRDump.h | 35
-rw-r--r--  compiler/luci/partition/src/PartitionMerge.cpp | 207
-rw-r--r--  compiler/luci/partition/src/PartitionMerge.h | 31
-rw-r--r--  compiler/luci/partition/src/PartitionPGroups.cpp | 139
-rw-r--r--  compiler/luci/partition/src/PartitionPGroups.h | 39
-rw-r--r--  compiler/luci/partition/src/PartitionPGroups.test.cpp | 80
-rw-r--r--  compiler/luci/partition/src/PartitionPModules.cpp | 203
-rw-r--r--  compiler/luci/partition/src/PartitionPModules.h | 31
-rw-r--r--  compiler/luci/partition/src/PartitionPModules.test.cpp | 82
-rw-r--r--  compiler/luci/partition/src/PartitionPModulesDump.cpp | 47
-rw-r--r--  compiler/luci/partition/src/PartitionPModulesDump.h | 34
-rw-r--r--  compiler/luci/pass/CMakeLists.txt | 2
-rw-r--r--  compiler/luci/pass/include/luci/CircleOptimizer.h | 19
-rw-r--r--  compiler/luci/pass/include/luci/Pass/CircleShapeInferencePass.h (renamed from compiler/luci/pass/include/luci/Pass/ShapeInferencePass.h) | 12
-rw-r--r--  compiler/luci/pass/include/luci/Pass/ConvertNCHWToNHWCPass.h | 60
-rw-r--r--  compiler/luci/pass/include/luci/Pass/FoldAddV2Pass.h | 38
-rw-r--r--  compiler/luci/pass/include/luci/Pass/FoldCastPass.h | 38
-rw-r--r--  compiler/luci/pass/include/luci/Pass/FoldSparseToDensePass.h | 38
-rw-r--r--  compiler/luci/pass/include/luci/Pass/ForwardReshapeToUnaryOpPass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/FuseBatchNormWithConvPass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/FuseBatchNormWithDwConvPass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/FuseBatchNormWithTConvPass.h (renamed from compiler/luci/pass/include/luci/Pass/FuseBatchNormWithTConv.h) | 0
-rw-r--r--  compiler/luci/pass/include/luci/Pass/MigrateLegacyShapeDtypePass.h | 44
-rw-r--r--  compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h | 2
-rw-r--r--  compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h | 2
-rw-r--r--  compiler/luci/pass/include/luci/Pass/RemoveRedundantReshapePass.h | 39
-rw-r--r--  compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryReshapePass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/RemoveUnnecessarySlicePass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/RemoveUnnecessarySplitPass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryStridedSlicePass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/RequantizePass.h | 2
-rw-r--r--  compiler/luci/pass/include/luci/Pass/SparsifyTensorPass.h | 4
-rw-r--r--  compiler/luci/pass/include/luci/Pass/SubstituteSqueezeToReshapePass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/SubstituteTransposeToReshapePass.h | 37
-rw-r--r--  compiler/luci/pass/include/luci/Pass/TransformMinMaxToRelu6Pass.h | 37
-rw-r--r--  compiler/luci/pass/src/BatchNormPatternFinder.cpp | 106
-rw-r--r--  compiler/luci/pass/src/BatchNormPatternFinder.h | 43
-rw-r--r--  compiler/luci/pass/src/BatchNormPatternFinder.test.cpp | 217
-rw-r--r--  compiler/luci/pass/src/CircleOptimizer.cpp | 148
-rw-r--r--  compiler/luci/pass/src/CircleOptimizer.test.cpp | 238
-rw-r--r--  compiler/luci/pass/src/CircleOptimizerUtils.cpp | 72
-rw-r--r--  compiler/luci/pass/src/CircleOptimizerUtils.h | 15
-rw-r--r--  compiler/luci/pass/src/CircleShapeInferencePass.cpp | 91
-rw-r--r--  compiler/luci/pass/src/CircleShapeInferencePass.test.cpp | 364
-rw-r--r--  compiler/luci/pass/src/CircleTypeInferencePass.cpp | 4
-rw-r--r--  compiler/luci/pass/src/CircleTypeInferencePass.test.cpp (renamed from compiler/stdex/src/Queue.test.cpp) | 18
-rw-r--r--  compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp | 698
-rw-r--r--  compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp | 636
-rw-r--r--  compiler/luci/pass/src/FoldAddV2Pass.cpp | 122
-rw-r--r--  compiler/luci/pass/src/FoldAddV2Pass.test.cpp | 137
-rw-r--r--  compiler/luci/pass/src/FoldCastPass.cpp | 107
-rw-r--r--  compiler/luci/pass/src/FoldCastPass.test.cpp | 112
-rw-r--r--  compiler/luci/pass/src/FoldDequantizePass.cpp | 18
-rw-r--r--  compiler/luci/pass/src/FoldDequantizePass.test.cpp (renamed from compiler/luci/service/src/Nodes/CircleOutput.cpp) | 15
-rw-r--r--  compiler/luci/pass/src/FoldSparseToDensePass.cpp | 140
-rw-r--r--  compiler/luci/pass/src/FoldSparseToDensePass.test.cpp | 133
-rw-r--r--  compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.cpp | 154
-rw-r--r--  compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.test.cpp | 125
-rw-r--r--  compiler/luci/pass/src/FuseActivationFunctionPass.cpp | 10
-rw-r--r--  compiler/luci/pass/src/FuseActivationFunctionPass.test.cpp | 150
-rw-r--r--  compiler/luci/pass/src/FuseAddWithTConvPass.cpp | 27
-rw-r--r--  compiler/luci/pass/src/FuseAddWithTConvPass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/FuseBCQPass.cpp | 54
-rw-r--r--  compiler/luci/pass/src/FuseBCQPass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/FuseBatchNormWithConvPass.cpp | 232
-rw-r--r--  compiler/luci/pass/src/FuseBatchNormWithConvPass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/FuseBatchNormWithDwConvPass.cpp | 237
-rw-r--r--  compiler/luci/pass/src/FuseBatchNormWithDwConvPass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/FuseBatchNormWithTConv.cpp | 159
-rw-r--r--  compiler/luci/pass/src/FuseBatchNormWithTConvPass.cpp | 208
-rw-r--r--  compiler/luci/pass/src/FuseBatchNormWithTConvPass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/FuseInstanceNormPass.cpp | 229
-rw-r--r--  compiler/luci/pass/src/FuseInstanceNormPass.test.cpp | 9
-rw-r--r--  compiler/luci/pass/src/FusePreActivationBatchNormPass.cpp | 111
-rw-r--r--  compiler/luci/pass/src/FusePreActivationBatchNormPass.test.cpp | 25
-rw-r--r--  compiler/luci/pass/src/MakeBatchNormGammaPositivePass.cpp | 89
-rw-r--r--  compiler/luci/pass/src/MakeBatchNormGammaPositivePass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/MigrateLegacyShapeDtypePass.cpp | 112
-rw-r--r--  compiler/luci/pass/src/ModulePhase.test.cpp | 57
-rw-r--r--  compiler/luci/pass/src/PassTestGraphs.h | 142
-rw-r--r--  compiler/luci/pass/src/ProgressReporter.h | 4
-rw-r--r--  compiler/luci/pass/src/PropagateConcatenationQparam.test.cpp | 153
-rw-r--r--  compiler/luci/pass/src/PropagateQuantParamPass.cpp | 5
-rw-r--r--  compiler/luci/pass/src/PropagateQuantParamPass.test.cpp | 7
-rw-r--r--  compiler/luci/pass/src/QuantizationUtils.cpp | 15
-rw-r--r--  compiler/luci/pass/src/QuantizationUtils.h | 3
-rw-r--r--  compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp | 224
-rw-r--r--  compiler/luci/pass/src/QuantizeDequantizeWeightsPass.test.cpp | 27
-rw-r--r--  compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp | 677
-rw-r--r--  compiler/luci/pass/src/QuantizeWithMinMaxPass.test.cpp | 27
-rw-r--r--  compiler/luci/pass/src/QuantizedModelVerifier.cpp | 71
-rw-r--r--  compiler/luci/pass/src/QuantizedModelVerifier.h | 50
-rw-r--r--  compiler/luci/pass/src/QuantizedModelVerifier.test.cpp | 1668
-rw-r--r--  compiler/luci/pass/src/RemoveRedundantReshape.cpp | 72
-rw-r--r--  compiler/luci/pass/src/RemoveRedundantReshape.test.cpp | 110
-rw-r--r--  compiler/luci/pass/src/RemoveRedundantTranspose.test.cpp | 156
-rw-r--r--  compiler/luci/pass/src/RemoveRedundantTransposePass.cpp (renamed from compiler/luci/pass/src/RemoveRedundantTranspose.cpp) | 73
-rw-r--r--  compiler/luci/pass/src/RemoveRedundantTransposePass.test.cpp | 321
-rw-r--r--  compiler/luci/pass/src/RemoveUnnecessaryReshapePass.cpp | 75
-rw-r--r--  compiler/luci/pass/src/RemoveUnnecessaryReshapePass.test.cpp | 141
-rw-r--r--  compiler/luci/pass/src/RemoveUnnecessarySlicePass.cpp | 111
-rw-r--r--  compiler/luci/pass/src/RemoveUnnecessarySlicePass.test.cpp | 134
-rw-r--r--  compiler/luci/pass/src/RemoveUnnecessarySplitPass.cpp (renamed from compiler/luci/pass/src/ShapeSignatureInferencePass.cpp) | 55
-rw-r--r--  compiler/luci/pass/src/RemoveUnnecessarySplitPass.test.cpp | 149
-rw-r--r--  compiler/luci/pass/src/RemoveUnnecessaryStridedSlicePass.cpp | 124
-rw-r--r--  compiler/luci/pass/src/RemoveUnnecessaryStridedSlicePass.test.cpp | 142
-rw-r--r--  compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.cpp | 99
-rw-r--r--  compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.test.cpp | 14
-rw-r--r--  compiler/luci/pass/src/RequantizePass.cpp | 4
-rw-r--r--  compiler/luci/pass/src/RequantizePass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/ResolveCustomOpAddPass.cpp | 37
-rw-r--r--  compiler/luci/pass/src/ResolveCustomOpAddPass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp | 36
-rw-r--r--  compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.test.cpp | 169
-rw-r--r--  compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp | 50
-rw-r--r--  compiler/luci/pass/src/ResolveCustomOpMatMulPass.test.cpp | 26
-rw-r--r--  compiler/luci/pass/src/ShapeInferencePass.cpp | 57
-rw-r--r--  compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.cpp | 8
-rw-r--r--  compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.test.cpp | 143
-rw-r--r--  compiler/luci/pass/src/Sparsifier.cpp | 4
-rw-r--r--  compiler/luci/pass/src/Sparsifier.test.cpp | 4
-rw-r--r--  compiler/luci/pass/src/SparsifyTensorPass.cpp | 10
-rw-r--r--  compiler/luci/pass/src/SparsifyTensorPass.test.cpp | 30
-rw-r--r--  compiler/luci/pass/src/SubstitutePackToReshapePass.cpp | 57
-rw-r--r--  compiler/luci/pass/src/SubstitutePackToReshapePass.test.cpp | 30
-rw-r--r--  compiler/luci/pass/src/SubstituteSqueezeToReshapePass.cpp | 183
-rw-r--r--  compiler/luci/pass/src/SubstituteSqueezeToReshapePass.test.cpp | 208
-rw-r--r--  compiler/luci/pass/src/SubstituteTransposeToReshapePass.cpp | 137
-rw-r--r--  compiler/luci/pass/src/SubstituteTransposeToReshapePass.test.cpp | 120
-rw-r--r--  compiler/luci/pass/src/TransformMinMaxToRelu6Pass.cpp | 134
-rw-r--r--  compiler/luci/pass/src/TransformMinMaxToRelu6Pass.test.cpp | 151
-rw-r--r--  compiler/luci/pass/src/TypeInferencePass.cpp | 55
-rw-r--r--  compiler/luci/pass/src/VerifyQuantizedNodeChannelWiseGranularity.h | 401
-rw-r--r--  compiler/luci/pass/src/VerifyQuantizedNodeLayerWiseGranularity.h | 388
-rw-r--r--  compiler/luci/pass/src/VerifyQuantizedNodeS16Type.h | 375
-rw-r--r--  compiler/luci/pass/src/VerifyQuantizedNodeU8Type.h | 375
-rw-r--r--  compiler/luci/pass/src/helpers/InferenceCandidates.cpp | 45
-rw-r--r--  compiler/luci/pass/src/helpers/InferenceCandidates.h | 34
-rw-r--r--  compiler/luci/pass/src/helpers/InferenceCandidates.test.cpp | 122
-rw-r--r--  compiler/luci/pass/src/helpers/NodeFiller.cpp | 20
-rw-r--r--  compiler/luci/pass/src/helpers/NodeFiller.h | 104
-rw-r--r--  compiler/luci/pass/src/helpers/NodeFiller.test.cpp | 59
-rw-r--r--  compiler/luci/pass/src/helpers/Strings.cpp | 91
-rw-r--r--  compiler/luci/pass/src/helpers/Strings.h | 57
-rw-r--r--  compiler/luci/pass/src/helpers/Strings.test.cpp | 58
-rw-r--r--  compiler/luci/pass/src/helpers/TypeMapper.cpp | 20
-rw-r--r--  compiler/luci/pass/src/helpers/TypeMapper.h | 77
-rw-r--r--  compiler/luci/pass/src/helpers/TypeMapper.test.cpp | 93
-rw-r--r--  compiler/luci/pass/src/test/TestFirstNode.h | 43
-rw-r--r--  compiler/luci/pass/src/test/TestFirstNode.test.cpp | 19
-rw-r--r--  compiler/luci/pass/src/test/TestIOGraph.h | 161
-rw-r--r--  compiler/luci/pass/src/test/TestIOGraph.test.cpp | 19
-rw-r--r--  compiler/luci/pass/src/test/TestShape.h (renamed from compiler/luci/export/src/TypeBridge.h) | 30
-rw-r--r--  compiler/luci/pass/src/test/TestShape.test.cpp | 57
-rw-r--r--  compiler/luci/profile/CMakeLists.txt | 22
-rw-r--r--  compiler/luci/profile/README.md | 119
-rw-r--r--  compiler/luci/profile/include/luci/Profile/CircleNodeID.h (renamed from compiler/luci/pass/src/FuseActivationFunctionPassInternal.h) | 20
-rw-r--r--  compiler/luci/profile/include/luci/Profile/CircleNodeOrigin.h | 72
-rw-r--r--  compiler/luci/profile/src/CircleNodeID.cpp | 73
-rw-r--r--  compiler/luci/profile/src/CircleNodeID.test.cpp | 44
-rw-r--r--  compiler/luci/profile/src/CircleNodeOrigin.cpp | 168
-rw-r--r--  compiler/luci/profile/src/CircleNodeOrigin.test.cpp | 108
-rw-r--r--  compiler/luci/service/CMakeLists.txt | 1
-rw-r--r--  compiler/luci/service/include/luci/Service/CircleNodeClone.h | 40
-rw-r--r--  compiler/luci/service/include/luci/Service/CircleShapeInference.h | 36
-rw-r--r--  compiler/luci/service/include/luci/Service/CircleShapeSignatureInference.h | 179
-rw-r--r--  compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceHelper.h | 45
-rw-r--r--  compiler/luci/service/include/luci/Service/CircleTypeInference.h | 25
-rw-r--r--  compiler/luci/service/include/luci/Service/Nodes/CircleConst.h | 32
-rw-r--r--  compiler/luci/service/include/luci/Service/ShapeDescription.h | 4
-rw-r--r--  compiler/luci/service/include/luci/Service/Validate.h | 13
-rw-r--r--  compiler/luci/service/src/CircleCloneNode.h | 174
-rw-r--r--  compiler/luci/service/src/CircleNodeClone.cpp | 92
-rw-r--r--  compiler/luci/service/src/CircleNodeClone.test.cpp | 109
-rw-r--r--  compiler/luci/service/src/CircleShapeInference.cpp | 23
-rw-r--r--  compiler/luci/service/src/CircleShapeInferenceHelper.cpp | 21
-rw-r--r--  compiler/luci/service/src/CircleShapeInferenceHelper.h (renamed from compiler/luci/service/include/luci/Service/CircleShapeInferenceHelper.h) | 16
-rw-r--r--  compiler/luci/service/src/CircleShapeInferenceRule.cpp | 304
-rw-r--r--  compiler/luci/service/src/CircleShapeInferenceRule.test.cpp | 626
-rw-r--r--  compiler/luci/service/src/CircleShapeSignatureInference.cpp | 64
-rw-r--r--  compiler/luci/service/src/CircleShapeSignatureInferenceHelper.cpp | 160
-rw-r--r--  compiler/luci/service/src/CircleTypeInference.cpp | 55
-rw-r--r--  compiler/luci/service/src/CircleTypeInferenceHelper.cpp | 18
-rw-r--r--compiler/luci/service/src/CircleTypeInferenceHelper.h (renamed from compiler/luci/service/include/luci/Service/CircleTypeInferenceHelper.h)14
-rw-r--r--compiler/luci/service/src/CircleTypeInferenceRule.cpp268
-rw-r--r--compiler/luci/service/src/CircleTypeInferenceRule.test.cpp63
-rw-r--r--compiler/luci/service/src/Nodes/CircleAbs.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleAbs.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleAdd.cpp (renamed from compiler/luci/pass/include/luci/Pass/TypeInferencePass.h)30
-rw-r--r--compiler/luci/service/src/Nodes/CircleAdd.test.cpp84
-rw-r--r--compiler/luci/service/src/Nodes/CircleAddN.cpp28
-rw-r--r--compiler/luci/service/src/Nodes/CircleAddN.test.cpp34
-rw-r--r--compiler/luci/service/src/Nodes/CircleArgMax.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleArgMax.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleArgMin.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleArgMin.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleAveragePool2D.cpp42
-rw-r--r--compiler/luci/service/src/Nodes/CircleAveragePool2D.test.cpp128
-rw-r--r--compiler/luci/service/src/Nodes/CircleBCQFullyConnected.cpp (renamed from compiler/luci/pass/include/luci/Pass/ShapeSignatureInferencePass.h)34
-rw-r--r--compiler/luci/service/src/Nodes/CircleBCQFullyConnected.test.cpp48
-rw-r--r--compiler/luci/service/src/Nodes/CircleBCQGather.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleBCQGather.test.cpp37
-rw-r--r--compiler/luci/service/src/Nodes/CircleBatchMatMul.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleBatchMatMul.test.cpp37
-rw-r--r--compiler/luci/service/src/Nodes/CircleBatchToSpaceND.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleBatchToSpaceND.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleCast.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleCast.test.cpp37
-rw-r--r--compiler/luci/service/src/Nodes/CircleCeil.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleCeil.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleConcatenation.cpp36
-rw-r--r--compiler/luci/service/src/Nodes/CircleConcatenation.test.cpp49
-rw-r--r--compiler/luci/service/src/Nodes/CircleConst.cpp118
-rw-r--r--compiler/luci/service/src/Nodes/CircleConst.test.cpp177
-rw-r--r--compiler/luci/service/src/Nodes/CircleConv2D.cpp42
-rw-r--r--compiler/luci/service/src/Nodes/CircleConv2D.test.cpp61
-rw-r--r--compiler/luci/service/src/Nodes/CircleCos.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleCos.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleCustom.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleCustom.test.cpp46
-rw-r--r--compiler/luci/service/src/Nodes/CircleCustomOut.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleCustomOut.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleDepthToSpace.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleDepthToSpace.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleDepthwiseConv2D.cpp43
-rw-r--r--compiler/luci/service/src/Nodes/CircleDepthwiseConv2D.test.cpp61
-rw-r--r--compiler/luci/service/src/Nodes/CircleDequantize.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleDequantize.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleDiv.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleDiv.test.cpp46
-rw-r--r--compiler/luci/service/src/Nodes/CircleElu.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleElu.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleEqual.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleEqual.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleExp.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleExp.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleExpandDims.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleExpandDims.test.cpp66
-rw-r--r--compiler/luci/service/src/Nodes/CircleFakeQuant.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleFakeQuant.test.cpp41
-rw-r--r--compiler/luci/service/src/Nodes/CircleFill.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleFill.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleFloor.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleFloor.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleFloorDiv.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleFloorDiv.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleFloorMod.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleFloorMod.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleFullyConnected.cpp38
-rw-r--r--compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp61
-rw-r--r--compiler/luci/service/src/Nodes/CircleGather.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleGather.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleGatherNd.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleGatherNd.test.cpp113
-rw-r--r--compiler/luci/service/src/Nodes/CircleGreater.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleGreater.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleGreaterEqual.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleGreaterEqual.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleIfOut.cpp89
-rw-r--r--compiler/luci/service/src/Nodes/CircleInstanceNorm.cpp36
-rw-r--r--compiler/luci/service/src/Nodes/CircleInstanceNorm.test.cpp48
-rw-r--r--compiler/luci/service/src/Nodes/CircleL2Normalize.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleL2Normalize.test.cpp46
-rw-r--r--compiler/luci/service/src/Nodes/CircleL2Pool2D.cpp42
-rw-r--r--compiler/luci/service/src/Nodes/CircleL2Pool2D.test.cpp61
-rw-r--r--compiler/luci/service/src/Nodes/CircleLeakyRelu.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleLeakyRelu.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleLess.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleLess.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleLessEqual.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleLessEqual.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleLocalResponseNormalization.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleLocalResponseNormalization.test.cpp41
-rw-r--r--compiler/luci/service/src/Nodes/CircleLog.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleLog.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogSoftmax.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogSoftmax.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogicalAnd.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogicalAnd.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogicalNot.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogicalNot.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogicalOr.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogicalOr.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogistic.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleLogistic.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleMatrixDiag.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleMatrixDiag.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleMatrixSetDiag.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleMatrixSetDiag.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleMaxPool2D.cpp42
-rw-r--r--compiler/luci/service/src/Nodes/CircleMaxPool2D.test.cpp69
-rw-r--r--compiler/luci/service/src/Nodes/CircleMaximum.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleMaximum.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleMean.cpp14
-rw-r--r--compiler/luci/service/src/Nodes/CircleMean.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleMinimum.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleMinimum.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleMirrorPad.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleMirrorPad.test.cpp46
-rw-r--r--compiler/luci/service/src/Nodes/CircleMul.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleMul.test.cpp46
-rw-r--r--compiler/luci/service/src/Nodes/CircleNeg.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleNeg.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4Out.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5Out.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5Out.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleNotEqual.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleNotEqual.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleOneHot.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleOneHot.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleOutputDummy.cpp11
-rw-r--r--compiler/luci/service/src/Nodes/CircleOutputDummy.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleOutputExclude.cpp10
-rw-r--r--compiler/luci/service/src/Nodes/CircleOutputExclude.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CirclePRelu.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CirclePRelu.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CirclePack.cpp (renamed from runtime/onert/core/src/compiler/ParamChecker.cc)21
-rw-r--r--compiler/luci/service/src/Nodes/CirclePack.test.cpp36
-rw-r--r--compiler/luci/service/src/Nodes/CirclePad.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CirclePad.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CirclePadV2.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CirclePadV2.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CirclePow.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CirclePow.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleRange.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleRange.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleRank.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleRank.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleReduceAny.cpp14
-rw-r--r--compiler/luci/service/src/Nodes/CircleReduceAny.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleReduceMax.cpp14
-rw-r--r--compiler/luci/service/src/Nodes/CircleReduceMax.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleReduceMin.cpp14
-rw-r--r--compiler/luci/service/src/Nodes/CircleReduceMin.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleReduceProd.cpp14
-rw-r--r--compiler/luci/service/src/Nodes/CircleReduceProd.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleRelu.cpp10
-rw-r--r--compiler/luci/service/src/Nodes/CircleRelu.test.cpp74
-rw-r--r--compiler/luci/service/src/Nodes/CircleRelu6.cpp10
-rw-r--r--compiler/luci/service/src/Nodes/CircleRelu6.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleReluN1To1.cpp10
-rw-r--r--compiler/luci/service/src/Nodes/CircleReluN1To1.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleReshape.cpp37
-rw-r--r--compiler/luci/service/src/Nodes/CircleReshape.test.cpp39
-rw-r--r--compiler/luci/service/src/Nodes/CircleResizeBilinear.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleResizeBilinear.test.cpp73
-rw-r--r--compiler/luci/service/src/Nodes/CircleResizeNearestNeighbor.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleResizeNearestNeighbor.test.cpp71
-rw-r--r--compiler/luci/service/src/Nodes/CircleReverseSequence.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleReverseSequence.test.cpp37
-rw-r--r--compiler/luci/service/src/Nodes/CircleReverseV2.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleReverseV2.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleRound.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleRound.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleRsqrt.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleRsqrt.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleScatterNd.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleScatterNd.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSegmentSum.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSegmentSum.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSelect.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSelect.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSelectV2.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSelectV2.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleShape.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleShape.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleSin.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSin.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSlice.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSlice.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSoftmax.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleSoftmax.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleSpaceToBatchND.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSpaceToBatchND.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSpaceToDepth.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleSpaceToDepth.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleSparseToDense.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleSparseToDense.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleSplit.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleSplit.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleSplitOut.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleSplitOut.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleSplitV.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleSplitV.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleSplitVOut.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleSplitVOut.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleSqrt.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSqrt.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSquare.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSquare.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSquaredDifference.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleSquaredDifference.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSqueeze.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleSqueeze.test.cpp83
-rw-r--r--compiler/luci/service/src/Nodes/CircleStridedSlice.cpp36
-rw-r--r--compiler/luci/service/src/Nodes/CircleStridedSlice.test.cpp43
-rw-r--r--compiler/luci/service/src/Nodes/CircleSub.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleSub.test.cpp46
-rw-r--r--compiler/luci/service/src/Nodes/CircleSum.cpp14
-rw-r--r--compiler/luci/service/src/Nodes/CircleSum.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleTanh.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleTanh.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleTile.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleTile.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleTopKV2.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleTopKV2.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleTopKV2Out.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleTopKV2Out.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleTranspose.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleTranspose.test.cpp69
-rw-r--r--compiler/luci/service/src/Nodes/CircleTransposeConv.cpp37
-rw-r--r--compiler/luci/service/src/Nodes/CircleTransposeConv.test.cpp46
-rw-r--r--compiler/luci/service/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp39
-rw-r--r--compiler/luci/service/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp54
-rw-r--r--compiler/luci/service/src/Nodes/CircleUnique.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleUnique.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleUniqueOut.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleUniqueOut.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleUnpack.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleUnpack.test.cpp37
-rw-r--r--compiler/luci/service/src/Nodes/CircleUnpackOut.cpp30
-rw-r--r--compiler/luci/service/src/Nodes/CircleUnpackOut.test.cpp35
-rw-r--r--compiler/luci/service/src/Nodes/CircleWhere.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleWhere.test.cpp33
-rw-r--r--compiler/luci/service/src/Nodes/CircleZerosLike.cpp27
-rw-r--r--compiler/luci/service/src/Nodes/CircleZerosLike.test.cpp33
-rw-r--r--compiler/luci/service/src/ShapeDescription.cpp85
-rw-r--r--compiler/luci/service/src/ShapeDescription.test.cpp56
-rw-r--r--compiler/luci/service/src/ShapeInfer_StridedSlice.cpp4
-rw-r--r--compiler/luci/service/src/Validate.cpp123
-rw-r--r--compiler/luci/service/src/Validate.test.cpp139
-rw-r--r--compiler/luci/tester/CMakeLists.txt21
-rw-r--r--compiler/luci/tester/src/ReadModule.cpp65
-rw-r--r--compiler/luci/tester/src/ReadModule.h28
-rw-r--r--compiler/luci/tester/src/ReadTester.cpp51
-rw-r--r--compiler/luci/tester/src/ReadTester.test.cpp43
-rw-r--r--compiler/luci/tester/src/WriteTester.cpp56
-rw-r--r--compiler/luci/tester/src/WriteTester.test.cpp44
-rw-r--r--compiler/luci/testhelper/CMakeLists.txt25
-rw-r--r--compiler/luci/testhelper/README.md3
-rw-r--r--compiler/luci/testhelper/include/luci/test/TestIOGraph.h198
-rw-r--r--compiler/luci/testhelper/include/luci/test/TestShape.h40
-rw-r--r--compiler/luci/testhelper/src/TestIOGraph.test.cpp182
-rw-r--r--compiler/luci/testhelper/src/TestShape.test.cpp57
-rw-r--r--compiler/luci/tests/test.lst8
-rw-r--r--compiler/mir-interpreter/src/ops/Add.cpp8
-rw-r--r--compiler/mir-interpreter/src/ops/AvgPool2D.cpp4
-rw-r--r--compiler/mir-interpreter/src/ops/CappedReLU.cpp2
-rw-r--r--compiler/mir-interpreter/src/ops/Concat.cpp6
-rw-r--r--compiler/mir-interpreter/src/ops/Conv2D.cpp8
-rw-r--r--compiler/mir-interpreter/src/ops/DeConv2D.cpp4
-rw-r--r--compiler/mir-interpreter/src/ops/Gather.cpp2
-rw-r--r--compiler/mir-interpreter/src/ops/MaxPool2D.cpp4
-rw-r--r--compiler/mir-interpreter/src/ops/QuantizationHelpers.h2
-rw-r--r--compiler/mir-interpreter/src/ops/Softmax.cpp4
-rw-r--r--compiler/mir/include/mir/Quantization.h2
-rw-r--r--compiler/mir/include/mir/ShapeRange.h2
-rw-r--r--compiler/mir/include/mir/TensorType.h2
-rw-r--r--compiler/mir/include/mir/ops/AvgPool2DOp.h2
-rw-r--r--compiler/mir/include/mir/ops/ConcatOp.h2
-rw-r--r--compiler/mir/include/mir/ops/Conv2DOp.h4
-rw-r--r--compiler/mir/include/mir/ops/Deconv2DOp.h4
-rw-r--r--compiler/mir/include/mir/ops/DepthwiseConv2DOp.h4
-rw-r--r--compiler/mir/include/mir/ops/FullyConnectedOp.h4
-rw-r--r--compiler/mir/include/mir/ops/GatherOp.h2
-rw-r--r--compiler/mir/include/mir/ops/MaxPool2DOp.h2
-rw-r--r--compiler/mir/include/mir/ops/PadOp.h2
-rw-r--r--compiler/mir/include/mir/ops/ReduceMeanOp.h2
-rw-r--r--compiler/mir/include/mir/ops/ReduceOp.h2
-rw-r--r--compiler/mir/include/mir/ops/ResizeOp.h4
-rw-r--r--compiler/mir/include/mir/ops/SliceOp.h2
-rw-r--r--compiler/mir/include/mir/ops/SqueezeOp.h2
-rw-r--r--compiler/mir/src/Graph.cpp4
-rw-r--r--compiler/mir/src/Operation.cpp2
-rw-r--r--compiler/mir/src/Shape.cpp4
-rw-r--r--compiler/mir/src/TensorVariant.cpp8
-rw-r--r--compiler/mir/src/mir_caffe2_importer/caffe2_importer.cpp42
-rw-r--r--compiler/mir/src/mir_caffe2_importer/caffe2_op_creator.cpp21
-rw-r--r--compiler/mir/src/mir_caffe_importer/caffe_importer.cpp120
-rw-r--r--compiler/mir/src/mir_caffe_importer/caffe_op_creator.cpp6
-rw-r--r--compiler/mir/src/mir_onnx_importer/AttributeHelpers.h4
-rw-r--r--compiler/mir/src/mir_onnx_importer/ConvPoolHelpers.cpp2
-rw-r--r--compiler/mir/src/mir_onnx_importer/ONNXHelpers.cpp6
-rw-r--r--compiler/mir/src/mir_onnx_importer/ONNXImporterImpl.cpp6
-rw-r--r--compiler/mir/src/mir_onnx_importer/ONNXNodeConverterRegistry.cpp4
-rw-r--r--compiler/mir/src/mir_onnx_importer/Op/AveragePool.cpp2
-rw-r--r--compiler/mir/src/mir_onnx_importer/Op/BatchNormalization.cpp2
-rw-r--r--compiler/mir/src/mir_onnx_importer/Op/Conv.cpp2
-rw-r--r--compiler/mir/src/mir_onnx_importer/Op/ConvTranspose.cpp20
-rw-r--r--compiler/mir/src/mir_onnx_importer/Op/MaxPool.cpp2
-rw-r--r--compiler/mir/src/mir_onnx_importer/Op/ReduceMean.cpp2
-rw-r--r--compiler/mir/src/mir_onnx_importer/Op/Upsample.cpp20
-rw-r--r--compiler/mir/src/mir_tflite_importer/tflite_importer.cpp76
-rw-r--r--compiler/mir/src/mir_tflite_importer/tflite_op_creator.cpp8
-rw-r--r--compiler/mir/src/ops/AvgPool2DOp.cpp2
-rw-r--r--compiler/mir/src/ops/Conv2DOp.cpp2
-rw-r--r--compiler/mir/src/ops/DeConv2DOp.cpp8
-rw-r--r--compiler/mir/src/ops/DepthwiseConv2DOp.cpp2
-rw-r--r--compiler/mir/src/ops/MaxPool2DOp.cpp2
-rw-r--r--compiler/mir/src/ops/PadOp.cpp2
-rw-r--r--compiler/mir/src/ops/TransposeOp.cpp4
-rw-r--r--compiler/mir/unittests/ShapeInference.cpp4
-rw-r--r--compiler/mir/unittests/ShapeRange.cpp2
-rw-r--r--compiler/mir2loco/src/mir2loco.test.cpp26
-rw-r--r--compiler/moco-log/CMakeLists.txt1
-rw-r--r--compiler/moco-log/src/LoggingContext.cpp7
-rw-r--r--compiler/moco-tf/CMakeLists.txt2
-rw-r--r--compiler/moco-tf/requires.cmake1
-rw-r--r--compiler/moco-tf/src/BroadcastHelper.h2
-rw-r--r--compiler/moco-tf/src/Canonicalization/ConcatV2Canonicalizer.cpp1
-rw-r--r--compiler/moco-tf/src/Canonicalization/Conv2DBackpropInputCanonicalizer.cpp10
-rw-r--r--compiler/moco-tf/src/Canonicalization/Conv2DCanonicalizer.cpp2
-rw-r--r--compiler/moco-tf/src/Canonicalization/DepthwiseConv2dNativeCanonicalizer.cpp46
-rw-r--r--compiler/moco-tf/src/Canonicalization/PadCanonicalizer.cpp2
-rw-r--r--compiler/moco-tf/src/Canonicalization/Relu6Canonicalizer.cpp2
-rw-r--r--compiler/moco-tf/src/Canonicalization/ReluCanonicalizer.cpp2
-rw-r--r--compiler/moco-tf/src/Canonicalization/RsqrtCanonicalizer.cpp1
-rw-r--r--compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.cpp20
-rw-r--r--compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.h4
-rw-r--r--compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.cpp20
-rw-r--r--compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.h4
-rw-r--r--compiler/moco-tf/src/Canonicalization/TFPushCanonicalizer.cpp2
-rw-r--r--compiler/moco-tf/src/Canonicalization/TanhCanonicalizer.cpp2
-rw-r--r--compiler/moco-tf/src/Canonicalizer.cpp61
-rw-r--r--compiler/moco-tf/src/CodecHelper.h7
-rw-r--r--compiler/moco-tf/src/Frontend.cpp8
-rw-r--r--compiler/moco-tf/src/Knob.cpp12
-rw-r--r--compiler/moco-tf/src/LogHelper.cpp2
-rw-r--r--compiler/moco-tf/src/Op/COpCall.cpp10
-rw-r--r--compiler/moco-tf/src/Op/COpCall.h4
-rw-r--r--compiler/moco-tf/src/Op/COpCall.test.cpp5
-rw-r--r--compiler/moco-tf/src/Optimizer.cpp20
-rw-r--r--compiler/moco-tf/src/ProgressReporter.h2
-rw-r--r--compiler/moco-tf/src/TFFormattedGraph.h4
-rw-r--r--compiler/moco-tf/src/TFOptimizer.cpp22
-rw-r--r--compiler/moco-tf/src/TestHelper.test.cpp8
-rw-r--r--compiler/moco-tf/src/Transforms/ShapeInferencePass.cpp4
-rw-r--r--compiler/moco-tf/src/Transforms/TypeInferencePass.cpp4
-rw-r--r--compiler/moco/import/CMakeLists.txt1
-rw-r--r--compiler/moco/import/include/moco/Import/GraphBuilderContext.h2
-rw-r--r--compiler/moco/import/include/moco/Import/GraphBuilderRegistry.h2
-rw-r--r--compiler/moco/import/include/moco/Import/Nodes/Softmax.h4
-rw-r--r--compiler/moco/import/src/GraphBuilderRegistry.cpp66
-rw-r--r--compiler/moco/import/src/Importer.cpp8
-rw-r--r--compiler/moco/import/src/Nodes/Add.cpp5
-rw-r--r--compiler/moco/import/src/Nodes/AvgPool.cpp6
-rw-r--r--compiler/moco/import/src/Nodes/BiasAdd.cpp6
-rw-r--r--compiler/moco/import/src/Nodes/Concat.cpp6
-rw-r--r--compiler/moco/import/src/Nodes/Const.cpp2
-rw-r--r--compiler/moco/import/src/Nodes/Conv2D.cpp4
-rw-r--r--compiler/moco/import/src/Nodes/Conv2DBackpropInput.cpp7
-rw-r--r--compiler/moco/import/src/Nodes/DepthwiseConv2dNative.cpp8
-rw-r--r--compiler/moco/import/src/Nodes/FakeQuantWithMinMaxVars.cpp6
-rw-r--r--compiler/moco/import/src/Nodes/FusedBatchNorm.cpp7
-rw-r--r--compiler/moco/import/src/Nodes/Identity.cpp6
-rw-r--r--compiler/moco/import/src/Nodes/MaxPool.cpp6
-rw-r--r--compiler/moco/import/src/Nodes/Maximum.cpp5
-rw-r--r--compiler/moco/import/src/Nodes/Mean.cpp9
-rw-r--r--compiler/moco/import/src/Nodes/Mul.cpp5
-rw-r--r--compiler/moco/import/src/Nodes/Pack.cpp4
-rw-r--r--compiler/moco/import/src/Nodes/Pad.cpp5
-rw-r--r--compiler/moco/import/src/Nodes/RealDiv.cpp5
-rw-r--r--compiler/moco/import/src/Nodes/Relu.cpp4
-rw-r--r--compiler/moco/import/src/Nodes/Relu6.cpp4
-rw-r--r--compiler/moco/import/src/Nodes/Reshape.cpp4
-rw-r--r--compiler/moco/import/src/Nodes/Rsqrt.cpp6
-rw-r--r--compiler/moco/import/src/Nodes/Shape.cpp7
-rw-r--r--compiler/moco/import/src/Nodes/Softmax.cpp11
-rw-r--r--compiler/moco/import/src/Nodes/Sqrt.cpp5
-rw-r--r--compiler/moco/import/src/Nodes/SquaredDifference.cpp7
-rw-r--r--compiler/moco/import/src/Nodes/Squeeze.cpp7
-rw-r--r--compiler/moco/import/src/Nodes/StopGradient.cpp5
-rw-r--r--compiler/moco/import/src/Nodes/StridedSlice.cpp7
-rw-r--r--compiler/moco/import/src/Nodes/Sub.cpp5
-rw-r--r--compiler/moco/import/src/Nodes/Tanh.cpp5
-rw-r--r--compiler/moco/import/src/TestHelper.test.cpp9
-rw-r--r--compiler/moco/lang/CMakeLists.txt1
-rw-r--r--compiler/moco/lang/include/moco/IR/Nodes/TFConv2DBackpropInput.h2
-rw-r--r--compiler/moco/lang/include/moco/IR/Nodes/TFDepthwiseConv2dNative.h2
-rw-r--r--compiler/moco/lang/include/moco/IR/Nodes/TFFakeQuantWithMinMaxVars.h2
-rw-r--r--compiler/moco/lang/src/IR/TFDialect.cpp7
-rw-r--r--compiler/moco/lang/src/IR/TFNode.cpp6
-rw-r--r--compiler/moco/pass/CMakeLists.txt2
-rw-r--r--compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldMul.h2
-rw-r--r--compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldPack.h2
-rw-r--r--compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldStridedSlice.h2
-rw-r--r--compiler/moco/pass/include/moco/Pass/Passes/FuseBinaryIntoPreceding.h2
-rw-r--r--compiler/moco/pass/include/moco/Pass/Passes/ResolveFusedBatchNorm.h2
-rw-r--r--compiler/moco/pass/src/ConstantFoldAdd.test.cpp7
-rw-r--r--compiler/moco/pass/src/ConstantFoldHelper.cpp4
-rw-r--r--compiler/moco/pass/src/ConstantFoldMul.test.cpp7
-rw-r--r--compiler/moco/pass/src/ConstantFoldPack.test.cpp5
-rw-r--r--compiler/moco/pass/src/ConstantFoldStridedSlice.test.cpp11
-rw-r--r--compiler/moco/pass/src/Passes/FuseBinaryIntoPreceding.cpp4
-rw-r--r--compiler/moco/pass/src/Passes/ResolveSquaredDifference.cpp2
-rw-r--r--compiler/moco/requires.cmake1
-rw-r--r--compiler/moco/service/CMakeLists.txt1
-rw-r--r--compiler/moco/service/src/Service/TFShapeInferenceRule.cpp2
-rw-r--r--compiler/moco/support/include/moco/Support/TFShapeInferenceHelper.h8
-rw-r--r--compiler/nest/core/include/nest/expr/AddNode.h2
-rw-r--r--compiler/nest/core/include/nest/expr/DerefNode.h2
-rw-r--r--compiler/nest/core/include/nest/expr/MulNode.h2
-rw-r--r--compiler/nest/core/src/Block.test.cpp2
-rw-r--r--compiler/nest/core/src/Closure.test.cpp2
-rw-r--r--compiler/nest/core/src/Expr.test.cpp2
-rw-r--r--compiler/nest/core/src/Ret.test.cpp2
-rw-r--r--compiler/nest/core/src/expr/AddNode.test.cpp2
-rw-r--r--compiler/nest/core/src/expr/DerefNode.test.cpp2
-rw-r--r--compiler/nest/core/src/expr/MulNode.test.cpp2
-rw-r--r--compiler/nest/core/src/stmt/PushNode.test.cpp2
-rw-r--r--compiler/nnc/backends/acl_soft_backend/AclCppGenerator.cpp2
-rw-r--r--compiler/nnc/backends/acl_soft_backend/AclCppOpGenerator.cpp90
-rw-r--r--compiler/nnc/backends/acl_soft_backend/ArtifactModel.cpp4
-rw-r--r--compiler/nnc/backends/acl_soft_backend/ArtifactModel.h18
-rw-r--r--compiler/nnc/backends/interpreter/InterpreterBackend.cpp10
-rw-r--r--compiler/nnc/backends/soft_backend/CPPGenerator.cpp69
-rw-r--r--compiler/nnc/backends/soft_backend/ModelAnalyzer.cpp4
-rw-r--r--compiler/nnc/backends/soft_backend/ModelAnalyzer.h6
-rw-r--r--compiler/nnc/backends/soft_backend/SequencedIR.h6
-rw-r--r--compiler/nnc/driver/Options.cpp22
-rw-r--r--compiler/nnc/include/pass/PassData.h15
-rw-r--r--compiler/nnc/include/passes/optimizations/CombineTransposes.h1
-rw-r--r--compiler/nnc/include/passes/optimizations/OptimizationUtils.h10
-rw-r--r--compiler/nnc/include/support/CommandLine.h6
-rw-r--r--compiler/nnc/passes/optimizations/CombineTransposes.cpp4
-rw-r--r--compiler/nnc/passes/optimizations/DeadCodeElimination.cpp4
-rw-r--r--compiler/nnc/passes/optimizations/FuseArithmeticOps.cpp4
-rw-r--r--compiler/nnc/passes/transformations/DataFormatSwitcher.cpp10
-rw-r--r--compiler/nnc/passes/transformations/LowerConv2D.cpp4
-rw-r--r--compiler/nnc/tests/acl_soft_backend/AclCppOperations.cpp2
-rw-r--r--compiler/nnc/tests/acl_soft_backend/artifact_cmake/main.cpp26
-rw-r--r--compiler/nnc/tests/soft_backend/CompileCPP.cpp2
-rw-r--r--compiler/nnc/unittests/acl_backend/DOMToText.cpp38
-rw-r--r--compiler/nnc/unittests/acl_backend/MIRToDOM.cpp30
-rw-r--r--compiler/nnc/unittests/optimizations/SinkTest.cpp4
-rw-r--r--compiler/nnc/unittests/soft_backend/CPPOperations.cpp45
-rw-r--r--compiler/nnc/unittests/support/CommandLineTest.cpp48
-rw-r--r--compiler/nnc/unittests/transformations/Switcher.cpp4
-rw-r--r--compiler/nnkit-caffe/backend/CMakeLists.txt1
-rw-r--r--compiler/nnkit-caffe/backend/Module.cpp5
-rw-r--r--compiler/nnkit-intf/tensor/include/nnkit/TensorContext.h4
-rw-r--r--compiler/nnkit-misc/backend/CMakeLists.txt1
-rw-r--r--compiler/nnkit-misc/backend/src/BackendPlugin.cpp4
-rw-r--r--compiler/nnkit-mocotf/backend/Backend.cpp4
-rw-r--r--compiler/nnkit-mocotf/backend/CMakeLists.txt1
-rw-r--r--compiler/nnkit-mocotf/requires.cmake1
-rw-r--r--compiler/nnkit-mocotf/support/CMakeLists.txt1
-rw-r--r--compiler/nnkit-mocotf/support/src/Backend.cpp6
-rw-r--r--compiler/nnkit-mocotf/support/src/InputTensorContext.cpp2
-rw-r--r--compiler/nnkit-mocotf/support/src/InputTensorContext.h2
-rw-r--r--compiler/nnkit-mocotf/support/src/OutputTensorContext.cpp2
-rw-r--r--compiler/nnkit-mocotf/support/src/OutputTensorContext.h2
-rw-r--r--compiler/nnkit-onnxrt/backend/Backend.cpp4
-rw-r--r--compiler/nnkit-onnxrt/backend/CMakeLists.txt1
-rw-r--r--compiler/nnkit-onnxrt/requires.cmake1
-rw-r--r--compiler/nnkit-onnxrt/support/CMakeLists.txt1
-rw-r--r--compiler/nnkit-onnxrt/support/include/nnkit/support/onnx/TensorSet.h4
-rw-r--r--compiler/nnkit-onnxrt/support/src/Runner.cpp8
-rw-r--r--compiler/nnkit-tf/backend/Backend.cpp4
-rw-r--r--compiler/nnkit-tf/backend/CMakeLists.txt1
-rw-r--r--compiler/nnkit-tf/requires.cmake1
-rw-r--r--compiler/nnkit-tf/support/CMakeLists.txt2
-rw-r--r--compiler/nnkit-tf/support/include/nnkit/support/tf/TensorContext.h2
-rw-r--r--compiler/nnkit-tf/support/include/nnkit/support/tf/TensorDataMap.h4
-rw-r--r--compiler/nnkit-tf/support/src/Backend.cpp2
-rw-r--r--compiler/nnkit-tf/support/src/Runner.cpp6
-rw-r--r--compiler/nnkit-tflite/backend/Backend.cpp7
-rw-r--r--compiler/nnkit-tflite/backend/CMakeLists.txt1
-rw-r--r--compiler/nnkit-tflite/requires.cmake1
-rw-r--r--compiler/nnkit/actions/HDF5/CMakeLists.txt2
-rw-r--r--compiler/nnkit/actions/HDF5/Export.cpp9
-rw-r--r--compiler/nnkit/actions/HDF5/Import.cpp5
-rw-r--r--compiler/nnkit/actions/builtin/CMakeLists.txt2
-rw-r--r--compiler/nnkit/actions/builtin/Randomize.cpp5
-rw-r--r--compiler/nnkit/actions/builtin/Show.cpp5
-rw-r--r--compiler/nnkit/tools/benchmark/CMakeLists.txt1
-rw-r--r--compiler/nnkit/tools/benchmark/src/Benchmark.cpp5
-rw-r--r--compiler/nnkit/tools/run/CMakeLists.txt1
-rw-r--r--compiler/nnkit/tools/run/nnkit-run.cpp13
-rw-r--r--compiler/nnop/include/nnop/PadInfo.h2
-rw-r--r--compiler/nnop/include/nnop/StrideInfo.h2
-rw-r--r--compiler/nnsuite/conv/model/src/RandomModel.cpp4
-rw-r--r--compiler/nnsuite/conv/nnkit-caffe/CMakeLists.txt1
-rw-r--r--compiler/nnsuite/conv/nnkit-caffe/ConvBackend.cpp4
-rw-r--r--compiler/nnsuite/conv/nnkit-caffe/ConvBackend.test.cpp4
-rw-r--r--compiler/nnsuite/conv/nnkit-tflite/CMakeLists.txt1
-rw-r--r--compiler/nnsuite/conv/nnkit-tflite/ConvBackend.cpp10
-rw-r--r--compiler/nnsuite/conv/nnkit-tflite/ConvBackend.test.cpp4
-rw-r--r--compiler/nnsuite/conv/nnkit-tflite/Entry.cpp5
-rw-r--r--compiler/one-cmds/CMakeLists.txt3
-rw-r--r--compiler/one-cmds/dummy-driver/CMakeLists.txt21
-rw-r--r--compiler/one-cmds/dummy-driver/src/dummy-compile.cpp48
-rw-r--r--compiler/one-cmds/dummy-driver/src/help-compile.cpp42
-rw-r--r--compiler/one-cmds/how-to-prepare-virtualenv.txt3
-rw-r--r--compiler/one-cmds/how-to-use-one-commands.txt26
-rw-r--r--compiler/one-cmds/one-build11
-rw-r--r--compiler/one-cmds/one-build.template.cfg2
-rw-r--r--compiler/one-cmds/one-codegen35
-rw-r--r--compiler/one-cmds/one-import-onnx161
-rw-r--r--compiler/one-cmds/one-import-tflite2
-rw-r--r--compiler/one-cmds/one-optimize56
-rw-r--r--compiler/one-cmds/one-prepare-venv15
-rw-r--r--compiler/one-cmds/one-quantize52
-rw-r--r--compiler/one-cmds/tests/CMakeLists.txt15
-rw-r--r--compiler/one-cmds/tests/one-build_001.test2
-rw-r--r--compiler/one-cmds/tests/one-build_002.test2
-rw-r--r--compiler/one-cmds/tests/one-build_003.cfg21
-rw-r--r--compiler/one-cmds/tests/one-build_003.test42
-rw-r--r--compiler/one-cmds/tests/one-build_004.cfg20
-rw-r--r--compiler/one-cmds/tests/one-build_004.test48
-rw-r--r--compiler/one-cmds/tests/one-build_005.cfg20
-rw-r--r--compiler/one-cmds/tests/one-build_005.test48
-rw-r--r--compiler/one-cmds/tests/one-build_006.cfg29
-rw-r--r--compiler/one-cmds/tests/one-build_006.test48
-rw-r--r--compiler/one-cmds/tests/one-build_007.cfg29
-rw-r--r--compiler/one-cmds/tests/one-build_007.test42
-rw-r--r--compiler/one-cmds/tests/one-build_008.cfg23
-rw-r--r--compiler/one-cmds/tests/one-build_008.test48
-rw-r--r--compiler/one-cmds/tests/one-build_009.cfg24
-rw-r--r--compiler/one-cmds/tests/one-build_009.test48
-rw-r--r--compiler/one-cmds/tests/one-build_neg_003.test2
-rw-r--r--compiler/one-cmds/tests/one-codegen_001.test41
-rw-r--r--compiler/one-cmds/tests/one-codegen_002.test47
-rw-r--r--compiler/one-cmds/tests/one-codegen_003.test47
-rw-r--r--compiler/one-cmds/tests/one-codegen_004.test38
-rw-r--r--compiler/one-cmds/tests/one-codegen_neg_001.test39
-rw-r--r--compiler/one-cmds/tests/one-import-onnx_001.test43
-rw-r--r--compiler/one-cmds/tests/one-import_005.cfg13
-rw-r--r--compiler/one-cmds/tests/one-import_005.test40
-rw-r--r--compiler/one-cmds/tests/one-optimize_001.test2
-rw-r--r--compiler/one-cmds/tests/one-optimize_neg_001.test2
-rw-r--r--compiler/one-cmds/tests/one-optimize_neg_002.test2
-rw-r--r--compiler/one-cmds/tests/one-optimize_neg_003.test2
-rw-r--r--compiler/one-cmds/tests/one-quantize_002.test53
-rw-r--r--compiler/one-cmds/tests/prepare_test_materials.sh14
-rw-r--r--compiler/one-cmds/utils.py79
-rw-r--r--compiler/one-cmds/validate-onnx2circle/CMakeLists.txt5
-rw-r--r--compiler/one-cmds/validate-onnx2circle/README.md36
-rw-r--r--compiler/one-cmds/validate-onnx2circle/validate_onnx2circle.py156
-rw-r--r--compiler/oneco/CMakeLists.txt1
-rw-r--r--compiler/oneco/requires.cmake1
-rw-r--r--compiler/oneco/src/Frontend.cpp4
-rw-r--r--compiler/oneco/src/GraphBuilder.h6
-rw-r--r--compiler/oneco/src/GraphBuilderContext.h6
-rw-r--r--compiler/oneco/src/GraphBuilderRegistry.h28
-rw-r--r--compiler/oneco/src/Op/Constant.h16
-rw-r--r--compiler/oneco/src/Op/Identity.h8
-rw-r--r--compiler/onnx2circle/CMakeLists.txt1
-rw-r--r--compiler/onnx2circle/requires.cmake1
-rw-r--r--compiler/onnx2circle/src/onnx2circle.cpp10
-rw-r--r--compiler/onnxkit/CMakeLists.txt1
-rw-r--r--compiler/onnxkit/README.md1
-rw-r--r--compiler/onnxkit/src/Main.cpp7
-rw-r--r--compiler/onnxkit/src/Support.cpp7
-rw-r--r--compiler/oops/CMakeLists.txt3
-rw-r--r--compiler/oops/include/oops/InternalExn.h8
-rw-r--r--compiler/oops/include/oops/UserExn.h4
-rw-r--r--compiler/oops/src/oops.test.cpp (renamed from compiler/oops/test.cpp)0
-rw-r--r--compiler/pepper-str/CMakeLists.txt3
-rw-r--r--compiler/pepper-str/include/pepper/str.h2
-rw-r--r--compiler/pepper-str/src/pepper-str.test.cpp (renamed from compiler/pepper-str/test.cpp)0
-rw-r--r--compiler/plier-tf/src/TestHelper.cpp2
-rwxr-xr-xcompiler/pota-quantization-value-test/compare_tensors.py2
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ifm.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ofm1.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ofm2.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/split_dim.json5
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ifm.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ofm1.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ofm2.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ifm.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ofm1.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ofm2.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/split_dim.json5
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ifm.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ofm1.json4
-rw-r--r--compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ofm2.json4
-rw-r--r--compiler/pota-quantization-value-test/test.lst2
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/0.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/1.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/2.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/3.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/4.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/0.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/1.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/2.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/3.txt1
-rw-r--r--compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/4.txt1
-rw-r--r--compiler/pp/CMakeLists.txt1
-rwxr-xr-xcompiler/record-minmax-conversion-test/gen_h5_random_inputs.py13
-rwxr-xr-xcompiler/record-minmax-conversion-test/testall.sh10
-rw-r--r--compiler/record-minmax/CMakeLists.txt7
-rw-r--r--compiler/record-minmax/driver/Driver.cpp84
-rw-r--r--compiler/record-minmax/include/RecordFunction.h2
-rw-r--r--compiler/record-minmax/include/RecordMinMax.h3
-rw-r--r--compiler/record-minmax/requires.cmake1
-rw-r--r--compiler/record-minmax/src/HDF5Importer.cpp28
-rw-r--r--compiler/record-minmax/src/MinMaxObserver.cpp32
-rw-r--r--compiler/record-minmax/src/RecordMinMax.cpp129
-rw-r--r--compiler/record-minmax/tests/RecordFunction.test.cpp8
-rw-r--r--compiler/souschef/include/souschef/Data/Gaussian.h21
-rw-r--r--compiler/souschef/include/souschef/DataChef.def3
-rw-r--r--compiler/souschef/src/Gaussian.cpp85
-rw-r--r--compiler/souschef/src/LexicalCast.cpp13
-rw-r--r--compiler/stdex/CMakeLists.txt16
-rw-r--r--compiler/stdex/README.md22
-rw-r--r--compiler/stdex/include/stdex/Set.h55
-rw-r--r--compiler/stdex/src/Memory.test.cpp60
-rw-r--r--compiler/tf2circle/CMakeLists.txt1
-rw-r--r--compiler/tf2circle/requires.cmake1
-rw-r--r--compiler/tf2circle/src/tf2circle.cpp12
-rw-r--r--compiler/tf2nnpkg/CMakeLists.txt1
-rw-r--r--compiler/tf2nnpkg/requires.cmake1
-rw-r--r--compiler/tf2nnpkg/src/tf2nnpkg.cpp11
-rw-r--r--compiler/tf2tflite/CMakeLists.txt1
-rw-r--r--compiler/tf2tflite/requires.cmake1
-rw-r--r--compiler/tf2tflite/src/Driver.cpp12
-rwxr-xr-xcompiler/tf2tfliteV2/tf2tfliteV2.py11
-rw-r--r--compiler/tfinfo-v2/CMakeLists.txt1
-rw-r--r--compiler/tfinfo-v2/include/tfinfo-v2/TensorSignature.h2
-rw-r--r--compiler/tfinfo-v2/requires.cmake1
-rw-r--r--compiler/tfinfo-v2/src/TFInfo_v2.test.cpp4
-rw-r--r--compiler/tfinfo-v2/src/TensorInfoLoader.cpp10
-rw-r--r--compiler/tfinfo/CMakeLists.txt2
-rw-r--r--compiler/tfinfo/include/nnkit/support/tftestinfo/ParsedTensor.h6
-rw-r--r--compiler/tfinfo/requires.cmake1
-rw-r--r--compiler/tfinfo/src/TensorInfoParser.cpp3
-rw-r--r--compiler/tfkit/CMakeLists.txt1
-rw-r--r--compiler/tfkit/src/ConvertCommand.cpp9
-rw-r--r--compiler/tfkit/src/Main.cpp13
-rw-r--r--compiler/tfkit/src/PackCommand.cpp4
-rw-r--r--compiler/tfkit/src/Support.cpp7
-rw-r--r--compiler/tfkit/src/Support.hpp2
-rw-r--r--compiler/tfkit/src/UnpackCommand.cpp8
-rw-r--r--compiler/tfl-inspect/driver/Driver.cpp4
-rw-r--r--compiler/tflchef/core/src/Convert.cpp2
-rw-r--r--compiler/tflchef/core/src/CustomOp/BroadcastTo.cpp61
-rw-r--r--compiler/tflchef/core/src/CustomOp/BroadcastTo.h49
-rw-r--r--compiler/tflchef/core/src/ModelChef.cpp54
-rw-r--r--compiler/tflchef/core/src/Op/BidirectionalSequenceLSTM.cpp47
-rw-r--r--compiler/tflchef/core/src/Op/BidirectionalSequenceLSTM.h53
-rw-r--r--compiler/tflchef/core/src/Op/FakeQuant.cpp41
-rw-r--r--compiler/tflchef/core/src/Op/FakeQuant.h49
-rw-r--r--compiler/tflchef/core/src/Op/LocalResponseNormalization.h2
-rw-r--r--compiler/tflchef/core/src/Op/Squeeze.cpp2
-rw-r--r--compiler/tflchef/core/src/Op/StridedSlice.cpp6
-rw-r--r--compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.cpp4
-rw-r--r--compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.h2
-rw-r--r--compiler/tflchef/core/src/OpChef.def3
-rw-r--r--compiler/tflchef/core/src/OpChefs.h3
-rw-r--r--compiler/tflchef/proto/tflchef.proto21
-rw-r--r--compiler/tflchef/tests/short_int_datatype/test.recipe44
-rw-r--r--compiler/tflchef/tests/short_int_datatype/test.reverse0
-rw-r--r--compiler/tflchef/tflite/CMakeLists.txt1
-rw-r--r--compiler/tflchef/tflite/src/Convert.cpp3
-rw-r--r--compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.cpp67
-rw-r--r--compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.h39
-rw-r--r--compiler/tflchef/tflite/src/Op/FakeQuant.cpp50
-rw-r--r--compiler/tflchef/tflite/src/Op/FakeQuant.h39
-rw-r--r--compiler/tflchef/tflite/src/Op/Maximum.cpp9
-rw-r--r--compiler/tflchef/tflite/src/Op/Minimum.cpp7
-rw-r--r--compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.cpp8
-rw-r--r--compiler/tflchef/tflite/src/TFliteOpChefs.h2
-rw-r--r--compiler/tflchef/tflite/src/TFliteOpRegistry.h2
-rw-r--r--compiler/tflchef/tools/console/CMakeLists.txt11
-rw-r--r--compiler/tflchef/tools/console/Driver.cpp10
-rw-r--r--compiler/tflchef/tools/console/Driver.test.cpp41
-rw-r--r--compiler/tflchef/tools/file/CMakeLists.txt2
-rw-r--r--compiler/tflchef/tools/file/Driver.cpp8
-rw-r--r--compiler/tflchef/tools/reverse/CMakeLists.txt2
-rw-r--r--compiler/tflchef/tools/reverse/Driver.cpp4
-rw-r--r--compiler/tfldump/README.md1
-rw-r--r--compiler/tfldump/src/OpPrinter.cpp50
-rw-r--r--compiler/tflite2circle/CMakeLists.txt1
-rw-r--r--compiler/tflite2circle/driver/Driver.cpp16
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions.h2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/AddOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/ArgMaxOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/ArgMinOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.cpp42
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.h32
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/CastOptions.cpp4
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/ConcatenationOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/Conv2DOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/DepthwiseConv2DOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/DivOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/FakeQuantOptions.cpp35
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/FakeQuantOptions.h31
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/FullyConnectedOptions.cpp4
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/L2NormalizeOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/MulOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/Pool2DOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/SubOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.cpp4
-rw-r--r--compiler/tflite2circle/src/BuildBuiltinOptions/UniqueOptions.cpp2
-rw-r--r--compiler/tflite2circle/src/CircleModel.cpp20
-rw-r--r--compiler/tflite2circle/src/DataLookup.cpp6
-rw-r--r--compiler/tflite2circle/src/DataLookup.h20
-rw-r--r--compiler/tflite2circle/src/TFLBuiltinOptions.lst4
-rw-r--r--compiler/vconone/CMakeLists.txt2
-rw-r--r--compiler/vconone/src/version.cpp2
l---------compute/.clang-format1
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h101
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h107
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h124
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h12
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h92
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/core/UtilsEx.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h1
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h5
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h1
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h8
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h29
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h5
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h3
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h4
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h130
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h5
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h1
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h7
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h4
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h14
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h1
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h4
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h3
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h2
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h4
-rw-r--r--compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h6
-rw-r--r--compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp32
-rw-r--r--compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h96
-rw-r--r--compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl7210
-rw-r--r--compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h1235
-rw-r--r--compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl2733
-rw-r--r--compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl88
-rw-r--r--compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl346
-rw-r--r--compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h223
-rw-r--r--compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl102
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp10
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp2
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp5
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp3
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp4
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp3
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp6
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp133
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp8
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp3
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp4
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp292
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp8
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp3
-rw-r--r--compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp6
-rw-r--r--compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp2
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp13
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp11
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp3
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp190
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp5
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp3
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp11
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp8
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp6
-rw-r--r--compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp9
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp15
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp5
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp2
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp2
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp3
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp3
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp6
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp1
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp171
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp4
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp2
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp2
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp2
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp2
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp110
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp6
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp1
-rw-r--r--compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp4
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp7
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp3
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp3
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp2
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp15
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp2
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp3
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp3
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp4
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp8
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp6
-rw-r--r--compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp1
-rw-r--r--compute/cker/include/cker/Types.h2
-rw-r--r--compute/cker/include/cker/Utils.h35
-rw-r--r--compute/cker/include/cker/operation/AveragePool.h123
-rw-r--r--compute/cker/include/cker/operation/BinaryArithmeticOps.h53
-rw-r--r--compute/cker/include/cker/operation/BroadcastTo.h2
-rw-r--r--compute/cker/include/cker/operation/Conv.h14
-rw-r--r--compute/cker/include/cker/operation/DepthwiseConv.h1
-rw-r--r--compute/cker/include/cker/operation/Einsum.h2
-rw-r--r--compute/cker/include/cker/operation/Fill.h3
-rw-r--r--compute/cker/include/cker/operation/Helper/RandomDistributions.h4
-rw-r--r--compute/cker/include/cker/operation/Helper/RandomOp.h4
-rw-r--r--compute/cker/include/cker/operation/Helper/RandomOpCpu.h4
-rw-r--r--compute/cker/include/cker/operation/Quantize.h250
-rw-r--r--compute/cker/include/cker/operation/Reduce.h39
-rw-r--r--compute/cker/include/cker/operation/ResizeBilinear.h85
-rw-r--r--compute/cker/include/cker/operation/SoftMax.h355
-rw-r--r--compute/cker/include/cker/operation/StatelessRandomUniform.h4
-rw-r--r--compute/cker/include/cker/operation/Transpose.h2
-rw-r--r--compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h598
-rw-r--r--compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h6
-rw-r--r--compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h2138
-rw-r--r--compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h16
-rw-r--r--compute/cker/include/cker/operation/reference/Conv.h110
-rw-r--r--docs/conf.py2
-rw-r--r--docs/howto/how-to-build-runtime-tizen-gbs-rpi4.md6
-rw-r--r--docs/howto/how-to-build-runtime.md2
-rw-r--r--docs/howto/how-to-cross-build-runtime-for-aarch64.md6
-rw-r--r--docs/howto/how-to-cross-build-runtime-for-arm.md30
-rw-r--r--docs/release/1.13/index.rst13
-rw-r--r--docs/release/1.13/release-note-1.13.0.md12
-rw-r--r--docs/release/1.14/index.rst13
-rw-r--r--docs/release/1.14/release-note-1.14.0.md10
-rw-r--r--docs/release/1.15/index.rst13
-rw-r--r--docs/release/1.15/release-note-1.15.0.md42
-rw-r--r--docs/release/index.rst1
-rw-r--r--docs/runtime/backend-api.md17
-rw-r--r--docs/runtime/supported-operations-backend.md58
-rw-r--r--infra/cmake/modules/IdentifyPlatform.cmake4
-rw-r--r--infra/cmake/packages/ARMComputeSourceConfig.cmake2
-rw-r--r--infra/cmake/packages/AbseilSourceConfig.cmake11
-rw-r--r--infra/cmake/packages/OouraFFTSourceConfig.cmake19
-rw-r--r--infra/command/build-docker-image2
-rw-r--r--infra/command/format43
-rw-r--r--infra/command/gen-coverage-report5
-rw-r--r--infra/docker/xenial/Dockerfile67
-rw-r--r--infra/nncc/command/utcount6
-rw-r--r--infra/nnfw/cmake/CfgOptionFlags.cmake3
-rw-r--r--infra/nnfw/cmake/buildtool/config/config_i686-tizen.cmake17
-rw-r--r--infra/nnfw/cmake/options/options_i686-tizen.cmake12
-rw-r--r--infra/nnfw/cmake/packages/ARMComputeConfig.cmake2
-rw-r--r--infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake4
-rw-r--r--infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLite/CMakeLists.txt (renamed from infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt)50
-rw-r--r--infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLiteConfig.cmake44
-rw-r--r--infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLiteConfigVersion.cmake9
-rw-r--r--infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake107
-rw-r--r--infra/nnfw/cmake/packages/XnnpackConfig.cmake3
-rw-r--r--infra/packaging/build2
-rw-r--r--infra/packaging/preset/202006302
-rw-r--r--infra/packaging/preset/20200731_windows2
-rw-r--r--infra/packaging/preset/2021040653
-rw-r--r--infra/packaging/preset/20210406_windows66
-rw-r--r--infra/packaging/res/tf2nnpkg.202006302
-rw-r--r--infra/packaging/res/tf2nnpkg.20210406130
-rwxr-xr-xinfra/scripts/build_android_runtime_release.sh10
-rwxr-xr-xinfra/scripts/common.sh12
-rw-r--r--infra/scripts/compiler_modules.sh10
-rwxr-xr-xinfra/scripts/docker_build_cross_aarch64_runtime.sh2
-rwxr-xr-xinfra/scripts/docker_build_cross_arm_runtime.sh2
-rwxr-xr-xinfra/scripts/docker_build_cross_arm_runtime_release.sh2
-rwxr-xr-xinfra/scripts/docker_build_cross_coverage.sh2
-rwxr-xr-xinfra/scripts/docker_build_nncc.sh4
-rwxr-xr-xinfra/scripts/docker_build_test_x64.sh2
-rwxr-xr-xinfra/scripts/docker_build_tizen_cross.sh2
-rwxr-xr-xinfra/scripts/docker_collect_nnpkg_resources.sh6
-rwxr-xr-xinfra/scripts/docker_coverage_report.sh2
-rwxr-xr-xinfra/scripts/test_coverage.sh6
-rwxr-xr-xinfra/scripts/test_ubuntu_runtime.sh22
-rwxr-xr-xinfra/scripts/test_ubuntu_runtime_mixed.sh10
-rwxr-xr-xinfra/scripts/tizen_xu4_test.sh11
-rw-r--r--nnpackage/examples/README.md32
-rw-r--r--nnpackage/examples/v1.0.0/add/add.tflite (renamed from nnpackage/examples/one_op_in_tflite/add.tflite)bin460 -> 460 bytes
-rw-r--r--nnpackage/examples/v1.0.0/add/metadata/MANIFEST7
-rw-r--r--nnpackage/examples/v1.0.0/add_invalid_manifest/add.tflitebin0 -> 460 bytes
-rw-r--r--nnpackage/examples/v1.0.0/add_invalid_manifest/metadata/MANIFEST7
-rw-r--r--nnpackage/examples/v1.0.0/if_dynamic/if_dynamic.tflitebin0 -> 34320 bytes
-rw-r--r--nnpackage/examples/v1.0.0/if_dynamic/metadata/MANIFEST7
-rw-r--r--nnpackage/examples/v1.0.0/if_dynamic/metadata/tc/expected.h5bin0 -> 11000 bytes
-rw-r--r--nnpackage/examples/v1.0.0/if_dynamic/metadata/tc/input.h5bin0 -> 12088 bytes
-rw-r--r--nnpackage/examples/v1.0.0/while_dynamic/metadata/MANIFEST7
-rw-r--r--nnpackage/examples/v1.0.0/while_dynamic/metadata/tc/expected.h5bin0 -> 11000 bytes
-rw-r--r--nnpackage/examples/v1.0.0/while_dynamic/metadata/tc/input.h5bin0 -> 12088 bytes
-rw-r--r--nnpackage/examples/v1.0.0/while_dynamic/while_dynamic.tflitebin0 -> 12252 bytes
-rw-r--r--nnpackage/examples/v1.1.0/one_op_in_tflite/add.tflitebin0 -> 460 bytes
-rw-r--r--nnpackage/examples/v1.1.0/one_op_in_tflite/metadata/MANIFEST (renamed from nnpackage/examples/one_op_in_tflite/metadata/MANIFEST)0
-rw-r--r--nnpackage/examples/v1.1.0/one_op_in_tflite/metadata/config.cfg (renamed from nnpackage/examples/one_op_in_tflite/metadata/config.cfg)0
-rw-r--r--packaging/nnfw.spec17
-rw-r--r--res/TensorFlowLiteRecipes/BroadcastTo_000/test.recipe24
-rw-r--r--res/TensorFlowLiteRecipes/ExpandDims_004/test.recipe30
-rw-r--r--res/TensorFlowLiteRecipes/FakeQuant_000/test.recipe25
-rw-r--r--res/TensorFlowLiteRecipes/FakeQuant_000/test.reverse0
-rw-r--r--res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_000/test.recipe63
-rw-r--r--res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_000/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_001/test.recipe63
-rw-r--r--res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_001/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_000/test.recipe92
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_000/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_001/test.recipe92
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_001/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_002/test.recipe92
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_002/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_003/test.recipe92
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_003/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Min_Max_000/test.recipe121
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Min_Max_000/test.rule8
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Relu6_000/test.recipe85
-rw-r--r--res/TensorFlowLiteRecipes/Net_Conv_Relu6_000/test.rule6
-rw-r--r--res/TensorFlowLiteRecipes/Net_DwConv_BN_000/test.recipe91
-rw-r--r--res/TensorFlowLiteRecipes/Net_DwConv_BN_000/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_DwConv_BN_001/test.recipe91
-rw-r--r--res/TensorFlowLiteRecipes/Net_DwConv_BN_001/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.recipe39
-rw-r--r--res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.rule2
-rw-r--r--res/TensorFlowLiteRecipes/Net_Maximum_Minimum_000/test.recipe86
-rw-r--r--res/TensorFlowLiteRecipes/Net_Maximum_Minimum_000/test.rule7
-rw-r--r--res/TensorFlowLiteRecipes/Net_Preactivation_BN_000/test.recipe5
-rw-r--r--res/TensorFlowLiteRecipes/Net_Reshape_Neg_000/test.recipe35
-rw-r--r--res/TensorFlowLiteRecipes/Net_Reshape_Reshape_000/test.recipe42
-rw-r--r--res/TensorFlowLiteRecipes/Net_Reshape_Reshape_000/test.rule5
-rw-r--r--res/TensorFlowLiteRecipes/Net_Squeeze_Squeeze_000/test.recipe29
-rw-r--r--res/TensorFlowLiteRecipes/Net_Squeeze_Squeeze_000/test.rule6
-rw-r--r--res/TensorFlowLiteRecipes/Net_StridedSlice_StridedSlice_000/test.recipe77
-rw-r--r--res/TensorFlowLiteRecipes/Net_StridedSlice_StridedSlice_000/test.rule5
-rw-r--r--res/TensorFlowLiteRecipes/Net_TConv_BN_002/test.recipe156
-rw-r--r--res/TensorFlowLiteRecipes/Net_TConv_BN_002/test.rule8
-rw-r--r--res/TensorFlowLiteRecipes/Part_Add_Sqrt_000/test.recipe48
-rw-r--r--res/TensorFlowLiteRecipes/Part_Add_Sqrt_Rsqrt_000/test.recipe68
-rw-r--r--res/TensorFlowLiteRecipes/Part_Add_Sub_000/test.recipe67
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_000/test.recipe27
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_001/test.recipe47
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_002/test.recipe47
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_003/test.recipe47
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_004/test.recipe38
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_000/test.recipe56
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_001/test.recipe61
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_002/test.recipe71
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_003/test.recipe47
-rw-r--r--res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_004/test.recipe41
-rw-r--r--res/TensorFlowLiteRecipes/Slice_001/test.recipe37
-rw-r--r--res/TensorFlowLiteRecipes/Slice_001/test.reverse0
-rw-r--r--res/TensorFlowLiteRecipes/Squeeze_001/test.recipe18
-rw-r--r--res/TensorFlowLiteRecipes/Squeeze_001/test.reverse0
-rw-r--r--res/TensorFlowPythonExamples/examples/Bidirectional_LSTM/__init__.py6
-rw-r--r--res/TensorFlowPythonExamples/examples/fake_quant_with_min_max_vars/__init__.py27
-rw-r--r--res/TensorFlowPythonModels/examples/minimum-maximum/__init__.py15
-rwxr-xr-x[-rw-r--r--]res/TensorFlowPythonModels/tfpem.py2
l---------runtime/contrib/.clang-format1
-rw-r--r--runtime/contrib/android/api/Prebuilt.mk16
-rw-r--r--runtime/contrib/android/api/build.gradle2
-rw-r--r--runtime/contrib/style_transfer_app/CMakeLists.txt2
l---------runtime/libs/.clang-format1
-rw-r--r--runtime/libs/benchmark/src/Phases.cpp2
-rw-r--r--runtime/libs/misc/include/misc/RandomGenerator.h1
-rw-r--r--runtime/libs/misc/src/RandomGenerator.cpp28
-rw-r--r--runtime/libs/profiling/CMakeLists.txt1
-rw-r--r--runtime/libs/rua/anchor/CMakeLists.txt1
-rw-r--r--runtime/libs/rua/dyn/CMakeLists.txt1
-rw-r--r--runtime/libs/tflite/CMakeLists.txt1
-rw-r--r--runtime/libs/tflite/include/tflite/CopyInputInitializer.h47
-rw-r--r--runtime/libs/tflite/include/tflite/OutputResetter.h44
-rw-r--r--runtime/libs/tflite/include/tflite/RandomInputInitializer.h49
-rw-r--r--runtime/libs/tflite/include/tflite/TensorShapeUtils.h64
-rw-r--r--runtime/libs/tflite/src/CopyInputInitializer.cpp68
-rw-r--r--runtime/libs/tflite/src/OutputResetter.cpp64
-rw-r--r--runtime/libs/tflite/src/RandomInputInitializer.cpp65
-rw-r--r--runtime/libs/tflite/src/RandomTestRunner.cpp250
-rw-r--r--runtime/libs/tflite/src/TensorShapeUtils.cpp45
-rw-r--r--runtime/nnapi-header/include/NeuralNetworksEx.h3
l---------runtime/onert/api/.clang-format1
-rw-r--r--runtime/onert/api/include/nnfw_version.h2
-rw-r--r--runtime/onert/api/src/CustomKernel.cc13
-rw-r--r--runtime/onert/api/src/CustomKernel.h17
-rw-r--r--runtime/onert/api/src/CustomKernelRegistry.cc45
-rw-r--r--runtime/onert/api/src/CustomKernelRegistry.h30
-rw-r--r--runtime/onert/api/src/nnfw_api_internal.cc39
-rw-r--r--runtime/onert/api/src/nnfw_api_internal.h13
-rw-r--r--runtime/onert/backend/acl_cl/Backend.h14
-rw-r--r--runtime/onert/backend/acl_cl/BackendContext.cc230
-rw-r--r--runtime/onert/backend/acl_cl/BackendContext.h18
-rw-r--r--runtime/onert/backend/acl_cl/CLTimer.h6
-rw-r--r--runtime/onert/backend/acl_cl/Config.cc2
-rw-r--r--runtime/onert/backend/acl_cl/ConstantInitializer.cc8
-rw-r--r--runtime/onert/backend/acl_cl/KernelGenerator.cc262
-rw-r--r--runtime/onert/backend/acl_cl/KernelGenerator.h13
-rw-r--r--runtime/onert/backend/acl_cl/Optimizer.cc15
-rw-r--r--runtime/onert/backend/acl_cl/TensorBuilder.h2
-rw-r--r--runtime/onert/backend/acl_cl/TensorManager.h18
-rw-r--r--runtime/onert/backend/acl_cl/operand/CLSubTensor.cc5
-rw-r--r--runtime/onert/backend/acl_cl/operand/CLSubTensor.h4
-rw-r--r--runtime/onert/backend/acl_cl/operand/CLTensor.cc2
-rw-r--r--runtime/onert/backend/acl_cl/operand/CLTensor.h4
-rw-r--r--runtime/onert/backend/acl_cl/operand/ICLTensor.h1
-rw-r--r--runtime/onert/backend/acl_common/AclActivationBuilder.h10
-rw-r--r--runtime/onert/backend/acl_common/AclConstantInitializer.cc90
-rw-r--r--runtime/onert/backend/acl_common/AclConstantInitializer.h170
-rw-r--r--runtime/onert/backend/acl_common/AclInternalBufferManager.h6
-rw-r--r--runtime/onert/backend/acl_common/AclKernelGen.h109
-rw-r--r--runtime/onert/backend/acl_common/AclLinearMemoryManager.h23
-rw-r--r--runtime/onert/backend/acl_common/AclMemoryManager.h18
-rw-r--r--runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h2
-rw-r--r--runtime/onert/backend/acl_common/AclTensorBuilder.h22
-rw-r--r--runtime/onert/backend/acl_common/AclTensorManager.h24
-rw-r--r--runtime/onert/backend/acl_common/Convert.cc26
-rw-r--r--runtime/onert/backend/acl_common/IACLTensor.cc30
-rw-r--r--runtime/onert/backend/acl_common/IACLTensor.h31
-rw-r--r--runtime/onert/backend/acl_common/Swizzle.h4
-rw-r--r--runtime/onert/backend/acl_neon/Backend.h14
-rw-r--r--runtime/onert/backend/acl_neon/BackendContext.cc233
-rw-r--r--runtime/onert/backend/acl_neon/BackendContext.h18
-rw-r--r--runtime/onert/backend/acl_neon/ConstantInitializer.cc12
-rw-r--r--runtime/onert/backend/acl_neon/KernelGenerator.cc262
-rw-r--r--runtime/onert/backend/acl_neon/KernelGenerator.h12
-rw-r--r--runtime/onert/backend/acl_neon/Optimizer.cc18
-rw-r--r--runtime/onert/backend/acl_neon/TensorBuilder.h2
-rw-r--r--runtime/onert/backend/acl_neon/TensorManager.h12
-rw-r--r--runtime/onert/backend/acl_neon/operand/INETensor.h1
-rw-r--r--runtime/onert/backend/acl_neon/operand/NESubTensor.cc5
-rw-r--r--runtime/onert/backend/acl_neon/operand/NESubTensor.h4
-rw-r--r--runtime/onert/backend/acl_neon/operand/NETensor.cc2
-rw-r--r--runtime/onert/backend/acl_neon/operand/NETensor.h4
-rw-r--r--runtime/onert/backend/cpu/Backend.h16
-rw-r--r--runtime/onert/backend/cpu/BackendContext.cc103
-rw-r--r--runtime/onert/backend/cpu/BackendContext.h22
-rw-r--r--runtime/onert/backend/cpu/ConstantInitializer.h35
-rw-r--r--runtime/onert/backend/cpu/ExternalContext.h10
-rw-r--r--runtime/onert/backend/cpu/KernelGenerator.cc270
-rw-r--r--runtime/onert/backend/cpu/KernelGenerator.h17
-rw-r--r--runtime/onert/backend/cpu/StaticTensorManager.h4
-rw-r--r--runtime/onert/backend/cpu/Tensor.h6
-rw-r--r--runtime/onert/backend/cpu/TensorBuilder.cc90
-rw-r--r--runtime/onert/backend/cpu/TensorBuilder.h42
-rw-r--r--runtime/onert/backend/cpu/ops/AddNLayer.cc10
-rw-r--r--runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc13
-rw-r--r--runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc15
-rw-r--r--runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc11
-rw-r--r--runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc83
-rw-r--r--runtime/onert/backend/cpu/ops/BroadcastToLayer.cc17
-rw-r--r--runtime/onert/backend/cpu/ops/CompareLayer.cc49
-rw-r--r--runtime/onert/backend/cpu/ops/ConcatLayer.cc17
-rw-r--r--runtime/onert/backend/cpu/ops/ConvolutionLayer.cc92
-rw-r--r--runtime/onert/backend/cpu/ops/ConvolutionLayer.h2
-rw-r--r--runtime/onert/backend/cpu/ops/DepthToSpaceLayer.cc5
-rw-r--r--runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc94
-rw-r--r--runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h11
-rw-r--r--runtime/onert/backend/cpu/ops/EinsumLayer.cc9
-rw-r--r--runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc43
-rw-r--r--runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc35
-rw-r--r--runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc90
-rw-r--r--runtime/onert/backend/cpu/ops/FillLayer.cc20
-rw-r--r--runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc97
-rw-r--r--runtime/onert/backend/cpu/ops/FusedBatchNormLayer.cc11
-rw-r--r--runtime/onert/backend/cpu/ops/GatherLayer.cc10
-rw-r--r--runtime/onert/backend/cpu/ops/L2NormLayer.cc14
-rw-r--r--runtime/onert/backend/cpu/ops/LSTMLayer.cc197
-rw-r--r--runtime/onert/backend/cpu/ops/LSTMLayer.h51
-rw-r--r--runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc13
-rw-r--r--runtime/onert/backend/cpu/ops/MatrixBandPartLayer.cc14
-rw-r--r--runtime/onert/backend/cpu/ops/MeanLayer.cc24
-rw-r--r--runtime/onert/backend/cpu/ops/OneHotLayer.cc6
-rw-r--r--runtime/onert/backend/cpu/ops/OneHotLayer.h4
-rw-r--r--runtime/onert/backend/cpu/ops/OperationUtils.cc81
-rw-r--r--runtime/onert/backend/cpu/ops/OperationUtils.h26
-rw-r--r--runtime/onert/backend/cpu/ops/PackLayer.cc7
-rw-r--r--runtime/onert/backend/cpu/ops/PadLayer.cc53
-rw-r--r--runtime/onert/backend/cpu/ops/PoolLayer.cc67
-rw-r--r--runtime/onert/backend/cpu/ops/PowLayer.cc10
-rw-r--r--runtime/onert/backend/cpu/ops/QuantizeLayer.cc100
-rw-r--r--runtime/onert/backend/cpu/ops/QuantizeLayer.h55
-rw-r--r--runtime/onert/backend/cpu/ops/RangeLayer.cc12
-rw-r--r--runtime/onert/backend/cpu/ops/RankLayer.cc4
-rw-r--r--runtime/onert/backend/cpu/ops/ReduceLayer.cc47
-rw-r--r--runtime/onert/backend/cpu/ops/ReduceLayer.h2
-rw-r--r--runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc21
-rw-r--r--runtime/onert/backend/cpu/ops/ReverseLayer.cc9
-rw-r--r--runtime/onert/backend/cpu/ops/SelectLayer.cc15
-rw-r--r--runtime/onert/backend/cpu/ops/ShapeLayer.cc11
-rw-r--r--runtime/onert/backend/cpu/ops/SliceLayer.cc24
-rw-r--r--runtime/onert/backend/cpu/ops/SoftMaxLayer.cc110
-rw-r--r--runtime/onert/backend/cpu/ops/SoftMaxLayer.h6
-rw-r--r--runtime/onert/backend/cpu/ops/SpaceToBatchNDLayer.cc28
-rw-r--r--runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc5
-rw-r--r--runtime/onert/backend/cpu/ops/SplitLayer.cc10
-rw-r--r--runtime/onert/backend/cpu/ops/SplitVLayer.cc12
-rw-r--r--runtime/onert/backend/cpu/ops/SquaredDiffLayer.cc5
-rw-r--r--runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc9
-rw-r--r--runtime/onert/backend/cpu/ops/StridedSliceLayer.cc17
-rw-r--r--runtime/onert/backend/cpu/ops/TileLayer.cc5
-rw-r--r--runtime/onert/backend/cpu/ops/TransposeLayer.cc20
-rw-r--r--runtime/onert/backend/cpu/ops/UnpackLayer.cc8
-rw-r--r--runtime/onert/backend/ruy/Backend.h16
-rw-r--r--runtime/onert/backend/ruy/BackendContext.cc103
-rw-r--r--runtime/onert/backend/ruy/BackendContext.h22
-rw-r--r--runtime/onert/backend/ruy/Config.cc2
-rw-r--r--runtime/onert/backend/ruy/ExternalContext.h10
-rw-r--r--runtime/onert/backend/ruy/KernelGenerator.cc72
-rw-r--r--runtime/onert/backend/ruy/KernelGenerator.h19
-rw-r--r--runtime/onert/backend/ruy/StaticTensorManager.h4
-rw-r--r--runtime/onert/backend/ruy/Tensor.h6
-rw-r--r--runtime/onert/backend/ruy/TensorBuilder.h42
-rw-r--r--runtime/onert/backend/ruy/ops/ConvolutionLayer.cc14
-rw-r--r--runtime/onert/backend/ruy/ops/FullyConnectedLayer.cc14
-rw-r--r--runtime/onert/backend/xnnpack/Backend.h16
-rw-r--r--runtime/onert/backend/xnnpack/BackendContext.cc103
-rw-r--r--runtime/onert/backend/xnnpack/BackendContext.h26
-rw-r--r--runtime/onert/backend/xnnpack/Config.cc2
-rw-r--r--runtime/onert/backend/xnnpack/ConstantInitializer.h35
-rw-r--r--runtime/onert/backend/xnnpack/ExternalContext.cc2
-rw-r--r--runtime/onert/backend/xnnpack/KernelGenerator.cc64
-rw-r--r--runtime/onert/backend/xnnpack/KernelGenerator.h19
-rw-r--r--runtime/onert/backend/xnnpack/StaticTensorManager.h4
-rw-r--r--runtime/onert/backend/xnnpack/Tensor.h6
-rw-r--r--runtime/onert/backend/xnnpack/TensorBuilder.cc90
-rw-r--r--runtime/onert/backend/xnnpack/TensorBuilder.h42
-rw-r--r--runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc29
-rw-r--r--runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc41
-rw-r--r--runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc14
-rw-r--r--runtime/onert/backend/xnnpack/ops/Layer.h2
-rw-r--r--runtime/onert/core/CMakeLists.txt12
-rw-r--r--runtime/onert/core/include/backend/Backend.h4
-rw-r--r--runtime/onert/core/include/backend/BackendContext.h64
-rw-r--r--runtime/onert/core/include/backend/IConfig.h2
-rw-r--r--runtime/onert/core/include/backend/IDynamicTensorManager.h60
-rw-r--r--runtime/onert/core/include/backend/IMemoryManager.h49
-rw-r--r--runtime/onert/core/include/backend/IPortableTensor.h7
-rw-r--r--runtime/onert/core/include/backend/ITensor.h18
-rw-r--r--runtime/onert/core/include/backend/ITensorManager.h52
-rw-r--r--runtime/onert/core/include/backend/basic/Allocator.h (renamed from runtime/onert/core/include/backend/cpu_common/Allocator.h)10
-rw-r--r--runtime/onert/core/include/backend/basic/BackendContextHelpers.h251
-rw-r--r--runtime/onert/core/include/backend/basic/DynamicTensorManager.h (renamed from runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h)18
-rw-r--r--runtime/onert/core/include/backend/basic/IMemoryPlanner.h (renamed from runtime/onert/core/include/backend/cpu_common/IMemoryPlanner.h)4
-rw-r--r--runtime/onert/core/include/backend/basic/KernelGeneratorBase.h (renamed from runtime/onert/core/include/backend/cpu_common/KernelGeneratorBase.h)40
-rw-r--r--runtime/onert/core/include/backend/basic/MemoryManager.h (renamed from runtime/onert/core/include/backend/cpu_common/MemoryManager.h)11
-rw-r--r--runtime/onert/core/include/backend/basic/StaticTensorManager.h (renamed from runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h)20
-rw-r--r--runtime/onert/core/include/backend/basic/Tensor.h (renamed from runtime/onert/core/include/backend/cpu_common/Tensor.h)44
-rw-r--r--runtime/onert/core/include/backend/basic/TensorBuilder.h71
-rw-r--r--runtime/onert/core/include/backend/basic/TensorRegistry.h (renamed from runtime/onert/core/include/backend/cpu_common/TensorRegistry.h)12
-rw-r--r--runtime/onert/core/include/backend/cpu_common/BackendContextHelpers.h193
-rw-r--r--runtime/onert/core/include/backend/cpu_common/ConstantInitializer.h58
-rw-r--r--runtime/onert/core/include/backend/cpu_common/ConstantInitializerBase.h234
-rw-r--r--runtime/onert/core/include/compiler/BackendManager.h10
-rw-r--r--runtime/onert/core/include/compiler/CodeMap.h16
-rw-r--r--runtime/onert/core/include/compiler/Compiler.h8
-rw-r--r--runtime/onert/core/include/compiler/ExecutionBuilder.h5
-rw-r--r--runtime/onert/core/include/compiler/GraphLowerInfo.h (renamed from runtime/onert/core/include/ir/LowerInfoMap.h)22
-rw-r--r--runtime/onert/core/include/compiler/LoweredGraph.h57
-rw-r--r--runtime/onert/core/include/compiler/OperandLowerInfo.h (renamed from runtime/onert/core/include/ir/operand/LowerInfo.h)20
-rw-r--r--runtime/onert/core/include/compiler/OperationLowerInfo.h (renamed from runtime/onert/core/include/ir/operation/LowerInfo.h)24
-rw-r--r--runtime/onert/core/include/compiler/PermuteFactor.h (renamed from runtime/onert/core/include/ir/operand/PermuteFactor.h)28
-rw-r--r--runtime/onert/core/include/compiler/StaticShapeInferer.h22
-rw-r--r--runtime/onert/core/include/exec/DynamicShapeInferer.h17
-rw-r--r--runtime/onert/core/include/exec/FunctionSequence.h4
-rw-r--r--runtime/onert/core/include/exec/IExecutor.h8
-rw-r--r--runtime/onert/core/include/exec/IODescription.h4
-rw-r--r--runtime/onert/core/include/ir/Data.h8
-rw-r--r--runtime/onert/core/include/ir/Graph.h38
-rw-r--r--runtime/onert/core/include/ir/Index.h34
-rw-r--r--runtime/onert/core/include/ir/OpSequence.h102
-rw-r--r--runtime/onert/core/include/ir/OpSequences.h91
-rw-r--r--runtime/onert/core/include/ir/Operand.h3
-rw-r--r--runtime/onert/core/include/ir/OperandIndexSequence.h2
-rw-r--r--runtime/onert/core/include/ir/OperandInfo.h4
-rw-r--r--runtime/onert/core/include/ir/OperationVisitor.h10
-rw-r--r--runtime/onert/core/include/ir/Shape.h13
-rw-r--r--runtime/onert/core/include/ir/Sparsity.h4
-rw-r--r--runtime/onert/core/include/ir/TypeInfo.h44
-rw-r--r--runtime/onert/core/include/util/Config.lst2
-rw-r--r--runtime/onert/core/include/util/Exceptions.h2
-rw-r--r--runtime/onert/core/include/util/ITimer.h2
-rw-r--r--runtime/onert/core/include/util/Index.h8
-rw-r--r--runtime/onert/core/include/util/ObjectManager.h128
-rw-r--r--runtime/onert/core/include/util/ShapeInference.h5
-rw-r--r--runtime/onert/core/include/util/TracingCtx.h11
-rw-r--r--runtime/onert/core/include/util/logging.h16
-rw-r--r--runtime/onert/core/src/backend/BackendContext.cc7
-rw-r--r--runtime/onert/core/src/backend/ITensor.cc11
-rw-r--r--runtime/onert/core/src/backend/basic/Allocator.cc (renamed from runtime/onert/core/src/backend/cpu_common/Allocator.cc)6
-rw-r--r--runtime/onert/core/src/backend/basic/BackendContextHelpers.cc (renamed from runtime/onert/core/src/backend/cpu_common/BackendContextHelpers.cc)2
-rw-r--r--runtime/onert/core/src/backend/basic/DynamicTensorManager.cc (renamed from runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc)35
-rw-r--r--runtime/onert/core/src/backend/basic/MemoryManager.cc (renamed from runtime/onert/core/src/backend/cpu_common/MemoryManager.cc)24
-rw-r--r--runtime/onert/core/src/backend/basic/MemoryPlanner.cc (renamed from runtime/onert/core/src/backend/cpu_common/MemoryPlanner.cc)27
-rw-r--r--runtime/onert/core/src/backend/basic/MemoryPlanner.h (renamed from runtime/onert/core/src/backend/cpu_common/MemoryPlanner.h)14
-rw-r--r--runtime/onert/core/src/backend/basic/MemoryPlanner.test.cc (renamed from runtime/onert/core/src/backend/cpu_common/MemoryPlanner.test.cc)8
-rw-r--r--runtime/onert/core/src/backend/basic/MemoryPlannerFactory.cc (renamed from runtime/onert/core/src/backend/cpu_common/MemoryPlannerFactory.cc)4
-rw-r--r--runtime/onert/core/src/backend/basic/MemoryPlannerFactory.h (renamed from runtime/onert/core/src/backend/cpu_common/MemoryPlannerFactory.h)12
-rw-r--r--runtime/onert/core/src/backend/basic/StaticTensorManager.cc (renamed from runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc)18
-rw-r--r--runtime/onert/core/src/backend/basic/Tensor.cc (renamed from runtime/onert/core/src/backend/cpu_common/Tensor.cc)55
-rw-r--r--runtime/onert/core/src/backend/basic/TensorBuilder.cc (renamed from runtime/onert/backend/ruy/TensorBuilder.cc)21
-rw-r--r--runtime/onert/core/src/backend/builtin/Backend.h (renamed from runtime/onert/core/src/backend/controlflow/Backend.h)23
-rw-r--r--runtime/onert/core/src/backend/builtin/BackendContext.cc58
-rw-r--r--runtime/onert/core/src/backend/builtin/BackendContext.h (renamed from runtime/onert/core/src/backend/controlflow/BackendContext.h)33
-rw-r--r--runtime/onert/core/src/backend/builtin/Config.cc (renamed from runtime/onert/core/src/backend/controlflow/Config.cc)6
-rw-r--r--runtime/onert/core/src/backend/builtin/Config.h (renamed from runtime/onert/core/src/backend/controlflow/Config.h)10
-rw-r--r--runtime/onert/core/src/backend/builtin/ConstantInitializer.h (renamed from runtime/onert/backend/ruy/ConstantInitializer.h)14
-rw-r--r--runtime/onert/core/src/backend/builtin/DynamicTensorManager.h (renamed from runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h)14
-rw-r--r--runtime/onert/core/src/backend/builtin/ExternalContext.h (renamed from runtime/onert/core/src/backend/controlflow/ExternalContext.h)20
-rw-r--r--runtime/onert/core/src/backend/builtin/IOTensor.cc (renamed from runtime/onert/core/src/backend/controlflow/IOTensor.cc)10
-rw-r--r--runtime/onert/core/src/backend/builtin/IOTensor.h (renamed from runtime/onert/core/src/backend/controlflow/IOTensor.h)23
-rw-r--r--runtime/onert/core/src/backend/builtin/KernelGenerator.cc (renamed from runtime/onert/core/src/backend/controlflow/KernelGenerator.cc)49
-rw-r--r--runtime/onert/core/src/backend/builtin/KernelGenerator.h (renamed from runtime/onert/core/src/backend/controlflow/KernelGenerator.h)23
-rw-r--r--runtime/onert/core/src/backend/builtin/Tensor.h (renamed from runtime/onert/core/include/backend/IStaticTensorManager.h)17
-rw-r--r--runtime/onert/core/src/backend/builtin/TensorBuilder.cc (renamed from runtime/onert/core/src/backend/controlflow/TensorBuilder.cc)28
-rw-r--r--runtime/onert/core/src/backend/builtin/TensorBuilder.h (renamed from runtime/onert/core/src/backend/controlflow/TensorBuilder.h)25
-rw-r--r--runtime/onert/core/src/backend/builtin/TensorRegistry.h (renamed from runtime/onert/core/src/backend/controlflow/TensorRegistry.h)24
-rw-r--r--runtime/onert/core/src/backend/builtin/UserTensor.cc (renamed from runtime/onert/core/src/backend/controlflow/UserTensor.cc)8
-rw-r--r--runtime/onert/core/src/backend/builtin/UserTensor.h (renamed from runtime/onert/core/src/backend/controlflow/UserTensor.h)16
-rw-r--r--runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc (renamed from runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc)10
-rw-r--r--runtime/onert/core/src/backend/builtin/kernel/IfLayer.h (renamed from runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h)10
-rw-r--r--runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc (renamed from runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc)43
-rw-r--r--runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.h (renamed from runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h)31
-rw-r--r--runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc (renamed from runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc)12
-rw-r--r--runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h (renamed from runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h)16
-rw-r--r--runtime/onert/core/src/backend/controlflow/BackendContext.cc142
-rw-r--r--runtime/onert/core/src/backend/controlflow/ConstantInitializer.h35
-rw-r--r--runtime/onert/core/src/backend/controlflow/Tensor.h36
-rw-r--r--runtime/onert/core/src/backend/cpu_common/ConstantInitializer.cc58
-rw-r--r--runtime/onert/core/src/backend/cpu_common/ConstantInitializerBase.cc117
-rw-r--r--runtime/onert/core/src/compiler/BackendManager.cc54
-rw-r--r--runtime/onert/core/src/compiler/Compiler.cc103
-rw-r--r--runtime/onert/core/src/compiler/ExecutorFactory.cc419
-rw-r--r--runtime/onert/core/src/compiler/ExecutorFactory.h14
-rw-r--r--runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc118
-rw-r--r--runtime/onert/core/src/compiler/Fp32ToFp16Converter.h8
-rw-r--r--runtime/onert/core/src/compiler/HEScheduler.cc46
-rw-r--r--runtime/onert/core/src/compiler/HEScheduler.h26
-rw-r--r--runtime/onert/core/src/compiler/Linear.cc37
-rw-r--r--runtime/onert/core/src/compiler/Linear.h13
-rw-r--r--runtime/onert/core/src/compiler/LoweredGraph.cc507
-rw-r--r--runtime/onert/core/src/compiler/ManualScheduler.cc22
-rw-r--r--runtime/onert/core/src/compiler/ManualScheduler.h4
-rw-r--r--runtime/onert/core/src/compiler/OperationLowerInfo.cc (renamed from runtime/onert/core/src/ir/operation/LowerInfo.cc)13
-rw-r--r--runtime/onert/core/src/compiler/ParamChecker.h73
-rw-r--r--runtime/onert/core/src/compiler/PermuteFactor.cc28
-rw-r--r--runtime/onert/core/src/compiler/ShapeValidator.cc181
-rw-r--r--runtime/onert/core/src/compiler/StaticShapeInferer.cc198
-rw-r--r--runtime/onert/core/src/compiler/TensorRegistries.h23
-rw-r--r--runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc31
-rw-r--r--runtime/onert/core/src/compiler/pass/ConstantInsertionPass.h7
-rw-r--r--runtime/onert/core/src/compiler/pass/ConstantLoweringPass.cc19
-rw-r--r--runtime/onert/core/src/compiler/pass/LoweredOperandPass.h2
-rw-r--r--runtime/onert/core/src/compiler/pass/LoweredOperationPass.h2
-rw-r--r--runtime/onert/core/src/compiler/pass/OperandPass.cc2
-rw-r--r--runtime/onert/core/src/compiler/pass/OperationPass.cc2
-rw-r--r--runtime/onert/core/src/compiler/pass/Pass.h2
-rw-r--r--runtime/onert/core/src/compiler/pass/PassRunner.cc2
-rw-r--r--runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc86
-rw-r--r--runtime/onert/core/src/compiler/pass/PermutationEliminationPass.h2
-rw-r--r--runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc67
-rw-r--r--runtime/onert/core/src/compiler/pass/PermutationInsertionPass.h4
-rw-r--r--runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc121
-rw-r--r--runtime/onert/core/src/compiler/pass/UnusedOperandEliminationPass.cc64
-rw-r--r--runtime/onert/core/src/compiler/pass/UnusedOperandEliminationPass.h54
-rw-r--r--runtime/onert/core/src/dumper/dot/DotBuilder.cc19
-rw-r--r--runtime/onert/core/src/dumper/dot/DotBuilder.h2
-rw-r--r--runtime/onert/core/src/dumper/dot/DotDumper.cc33
-rw-r--r--runtime/onert/core/src/dumper/dot/DotDumper.h4
-rw-r--r--runtime/onert/core/src/dumper/dot/DotSubgraphInfo.cc58
-rw-r--r--runtime/onert/core/src/dumper/dot/DotSubgraphInfo.h61
-rw-r--r--runtime/onert/core/src/dumper/dot/OperandNode.cc3
-rw-r--r--runtime/onert/core/src/dumper/dot/OperandNode.h1
-rw-r--r--runtime/onert/core/src/dumper/dot/OperationNode.cc3
-rw-r--r--runtime/onert/core/src/dumper/text/GraphDumper.cc92
-rw-r--r--runtime/onert/core/src/dumper/text/GraphDumper.h55
-rw-r--r--runtime/onert/core/src/exec/DataflowExecutor.cc66
-rw-r--r--runtime/onert/core/src/exec/DataflowExecutor.h5
-rw-r--r--runtime/onert/core/src/exec/DynamicShapeInferer.cc101
-rw-r--r--runtime/onert/core/src/exec/ExecTime.h2
-rw-r--r--runtime/onert/core/src/exec/Execution.cc18
-rw-r--r--runtime/onert/core/src/exec/ExecutionObservee.cc13
-rw-r--r--runtime/onert/core/src/exec/ExecutionObservee.h4
-rw-r--r--runtime/onert/core/src/exec/ExecutionObservers.cc83
-rw-r--r--runtime/onert/core/src/exec/ExecutionObservers.h19
-rw-r--r--runtime/onert/core/src/exec/ExecutorBase.cc20
-rw-r--r--runtime/onert/core/src/exec/ExecutorBase.h12
-rw-r--r--runtime/onert/core/src/exec/FunctionSequence.cc20
-rw-r--r--runtime/onert/core/src/exec/IPermuteFunction.h36
-rw-r--r--runtime/onert/core/src/exec/JSONExecTime.h6
-rw-r--r--runtime/onert/core/src/exec/LinearExecutor.cc23
-rw-r--r--runtime/onert/core/src/exec/LinearExecutor.h7
-rw-r--r--runtime/onert/core/src/exec/ParallelExecutor.cc34
-rw-r--r--runtime/onert/core/src/exec/ParallelExecutor.h3
-rw-r--r--runtime/onert/core/src/exec/feature/nchw/Reader.h31
-rw-r--r--runtime/onert/core/src/exec/feature/nchw/View.h2
-rw-r--r--runtime/onert/core/src/exec/feature/nhwc/Reader.h31
-rw-r--r--runtime/onert/core/src/exec/feature/nhwc/View.h2
-rw-r--r--runtime/onert/core/src/interp/InterpExecutor.cc12
-rw-r--r--runtime/onert/core/src/interp/InterpExecutor.h4
-rw-r--r--runtime/onert/core/src/interp/Interpreter.cc10
-rw-r--r--runtime/onert/core/src/interp/Tensor.cc4
-rw-r--r--runtime/onert/core/src/interp/Tensor.h22
-rw-r--r--runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc19
-rw-r--r--runtime/onert/core/src/interp/operations/Concat.cc22
-rw-r--r--runtime/onert/core/src/interp/operations/Conv2D.cc16
-rw-r--r--runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc20
-rw-r--r--runtime/onert/core/src/interp/operations/ElementwiseActivations.cc4
-rw-r--r--runtime/onert/core/src/interp/operations/FullyConnected.cc24
-rw-r--r--runtime/onert/core/src/interp/operations/Gather.cc8
-rw-r--r--runtime/onert/core/src/interp/operations/InstanceNorm.cc6
-rw-r--r--runtime/onert/core/src/interp/operations/Pad.cc2
-rw-r--r--runtime/onert/core/src/interp/operations/Pool2D.cc10
-rw-r--r--runtime/onert/core/src/interp/operations/Softmax.cc16
-rw-r--r--runtime/onert/core/src/interp/operations/TransposeConv.cc14
-rw-r--r--runtime/onert/core/src/ir/Graph.cc123
-rw-r--r--runtime/onert/core/src/ir/GraphIterator.cc121
-rw-r--r--runtime/onert/core/src/ir/GraphIterator.h90
-rw-r--r--runtime/onert/core/src/ir/OpSequence.cc95
-rw-r--r--runtime/onert/core/src/ir/OpSequences.cc124
-rw-r--r--runtime/onert/core/src/ir/Operand.cc6
-rw-r--r--runtime/onert/core/src/ir/OperandIndexSequence.cc4
-rw-r--r--runtime/onert/core/src/ir/Operands.cc2
-rw-r--r--runtime/onert/core/src/ir/Operation.cc4
-rw-r--r--runtime/onert/core/src/ir/OperationCloner.cc26
-rw-r--r--runtime/onert/core/src/ir/OperationCloner.h14
-rw-r--r--runtime/onert/core/src/ir/OperationDumper.cc90
-rw-r--r--runtime/onert/core/src/ir/OperationValidator.cc102
-rw-r--r--runtime/onert/core/src/ir/OperationValidator.h5
-rw-r--r--runtime/onert/core/src/ir/Operations.cc9
-rw-r--r--runtime/onert/core/src/ir/Padding.cc10
-rw-r--r--runtime/onert/core/src/ir/Shape.cc33
-rw-r--r--runtime/onert/core/src/ir/TypeInfo.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/AddN.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/ArgMinMax.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/BCQFullyConnected.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/BCQGather.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/BatchMatMul.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/BatchToSpaceND.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/BinaryArithmetic.cc10
-rw-r--r--runtime/onert/core/src/ir/operation/BroadcastTo.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Comparison.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Concat.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Conv2D.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/ConvertFp16ToFp32.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/ConvertFp32ToFp16.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Custom.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/DepthToSpace.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/DepthwiseConv2D.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Einsum.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/ElementwiseActivation.cc26
-rw-r--r--runtime/onert/core/src/ir/operation/ElementwiseBinary.cc10
-rw-r--r--runtime/onert/core/src/ir/operation/ElementwiseUnary.cc40
-rw-r--r--runtime/onert/core/src/ir/operation/EmbeddingLookup.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/ExpandDims.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Fill.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/FullyConnected.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/FusedBatchNorm.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Gather.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/HashtableLookup.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/If.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/InstanceNorm.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/L2Normalization.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/LSTM.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/LocalResponseNormalization.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/LogSoftmax.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/MatrixBandPart.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/OneHot.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/PReLU.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Pack.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Pad.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Permute.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Pool2D.cc8
-rw-r--r--runtime/onert/core/src/ir/operation/Pow.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/RNN.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Range.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Rank.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Reduce.cc16
-rw-r--r--runtime/onert/core/src/ir/operation/Reshape.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/ResizeBilinear.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Reverse.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Select.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Shape.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Slice.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Softmax.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/SpaceToBatchND.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/SpaceToDepth.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Split.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/SplitV.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/SquaredDifference.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Squeeze.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/StatelessRandomUniform.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/StridedSlice.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Tile.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/TopKV2.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Transpose.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/TransposeConv.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/Unpack.cc2
-rw-r--r--runtime/onert/core/src/ir/operation/While.cc2
-rw-r--r--runtime/onert/core/src/ir/verifier/Verifier.cc30
-rw-r--r--runtime/onert/core/src/ir/verifier/Verifier.h2
-rw-r--r--runtime/onert/core/src/util/ChromeTracingEventWriter.cc195
-rw-r--r--runtime/onert/core/src/util/EventCollector.cc75
-rw-r--r--runtime/onert/core/src/util/EventCollector.h65
-rw-r--r--runtime/onert/core/src/util/EventRecorder.cc4
-rw-r--r--runtime/onert/core/src/util/EventRecorder.h44
-rw-r--r--runtime/onert/core/src/util/EventWriter.cc542
-rw-r--r--runtime/onert/core/src/util/EventWriter.h29
-rw-r--r--runtime/onert/core/src/util/MDTableEventWriter.cc365
-rw-r--r--runtime/onert/core/src/util/SNPEEventWriter.cc185
-rw-r--r--runtime/onert/core/src/util/ShapeInference.cc46
-rw-r--r--runtime/onert/core/src/util/TracingCtx.cc1
l---------runtime/onert/frontend/.clang-format1
-rw-r--r--runtime/onert/frontend/base_loader/include/base_loader.h127
-rw-r--r--runtime/onert/frontend/circle/CMakeLists.txt10
-rw-r--r--runtime/onert/frontend/circle/include/circle_loader.h2
-rw-r--r--runtime/onert/frontend/circle/src/circle_loader.cc4
-rw-r--r--runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc9
-rw-r--r--runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.h1
-rw-r--r--runtime/onert/frontend/tflite/CMakeLists.txt10
-rw-r--r--runtime/onert/frontend/tflite/include/tflite_loader.h2
-rw-r--r--runtime/onert/frontend/tflite/src/tflite_loader.cc4
l---------runtime/onert/sample/.clang-format1
l---------runtime/onert/test/.clang-format1
-rw-r--r--runtime/onert/test/core/compiler/HEScheduler.cc67
-rw-r--r--runtime/onert/test/core/compiler/pass/UnusedOperandEliminationPass.cc45
-rw-r--r--runtime/onert/test/core/exec/ExecInstance.cc2
-rw-r--r--runtime/onert/test/core/exec/ExecTime.test.cc4
-rw-r--r--runtime/onert/test/core/interp/ExecManager.cc8
-rw-r--r--runtime/onert/test/core/ir/Graph.cc (renamed from runtime/onert/test/graph/Graph.cc)18
-rw-r--r--runtime/onert/test/core/ir/LayoutSet.cc (renamed from runtime/onert/test/graph/operand/LayoutSet.cc)9
-rw-r--r--runtime/onert/test/core/ir/MockNode.h (renamed from runtime/onert/test/graph/MockNode.h)0
-rw-r--r--runtime/onert/test/core/ir/OperandIndexSet.cc (renamed from runtime/onert/test/graph/operand/IndexSet.cc)0
-rw-r--r--runtime/onert/test/core/ir/OperandSet.cc (renamed from runtime/onert/test/graph/operand/Set.cc)0
-rw-r--r--runtime/onert/test/core/ir/OperationSet.cc (renamed from runtime/onert/test/graph/operation/Set.cc)2
-rw-r--r--runtime/onert/test/core/ir/SetIO.cc (renamed from runtime/onert/test/graph/operation/SetIO.cc)0
-rw-r--r--runtime/onert/test/core/ir/Shape.cc (renamed from runtime/onert/test/ir/Shape.cc)0
-rw-r--r--runtime/onert/test/core/ir/UseDef.cc (renamed from runtime/onert/test/graph/operand/UseDef.cc)4
-rw-r--r--runtime/onert/test/core/ir/Verifier.cc (renamed from runtime/onert/test/graph/verifier/Verifier.cc)12
-rw-r--r--runtime/onert/test/core/util/Index.cc (renamed from runtime/onert/test/graph/Index.cc)0
-rw-r--r--runtime/onert/test/core/util/ObjectManager.cc211
-rw-r--r--runtime/onert/test/core/util/ShapeInference.cc (renamed from runtime/onert/test/util/ShapeInference.cc)0
-rw-r--r--runtime/onert/test/util/ObjectManager.cc97
l---------tests/.clang-format1
-rw-r--r--tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon1
-rw-r--r--tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon1
-rw-r--r--tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon1
-rw-r--r--tests/nnfw_api/CMakeLists.txt22
-rw-r--r--tests/nnfw_api/src/CircleGen.cc59
-rw-r--r--tests/nnfw_api/src/CircleGen.h10
-rw-r--r--tests/nnfw_api/src/ValidationTestAddModelLoaded.cc2
-rw-r--r--tests/nnfw_api/src/ValidationTestMultipleSessions.cc54
-rw-r--r--tests/nnfw_api/src/fixtures.h10
-rw-r--r--tests/nnfw_api/src/one_op_tests/Add.cc65
-rw-r--r--tests/nnfw_api/src/one_op_tests/AveragePool2D.cc98
-rw-r--r--tests/nnfw_api/src/one_op_tests/Conv2D.cc96
-rw-r--r--tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc193
-rw-r--r--tests/nnfw_api/src/one_op_tests/Mul.cc145
-rw-r--r--tests/nnfw_api/src/one_op_tests/Pad.cc77
-rw-r--r--tests/nnfw_api/src/one_op_tests/PadV2.cc46
-rw-r--r--tests/nnfw_api/src/one_op_tests/Quantize.cc79
-rw-r--r--tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc37
-rw-r--r--tests/nnfw_api/src/one_op_tests/Slice.cc157
-rw-r--r--tests/nnfw_api/src/one_op_tests/Softmax.cc94
-rw-r--r--tests/nnfw_api/src/one_op_tests/Sub.cc145
-rw-r--r--tests/nnfw_api/src/one_op_tests/While.cc49
-rw-r--r--tests/nnfw_api/src/one_op_tests/WhileTestModel.h82
-rw-r--r--tests/scripts/CMakeLists.txt4
-rw-r--r--tests/scripts/benchmark.sh2
-rw-r--r--tests/scripts/command/prepare-model18
-rw-r--r--tests/scripts/command/verify-tflite4
-rw-r--r--tests/scripts/list/nnapi_test.aarch64.list (renamed from tests/scripts/list/frameworktest_list.aarch64.acl_cl.txt)0
-rw-r--r--tests/scripts/list/nnapi_test.armv7l.list (renamed from tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt)0
-rw-r--r--tests/scripts/list/tflite_comparator.aarch64.acl_cl.list41
-rw-r--r--tests/scripts/list/tflite_comparator.aarch64.acl_neon.list (renamed from tests/scripts/list/frameworktest_list.aarch64.acl_neon.txt)3
-rw-r--r--tests/scripts/list/tflite_comparator.aarch64.cpu.list (renamed from tests/scripts/list/frameworktest_list.aarch64.cpu.txt)0
-rw-r--r--tests/scripts/list/tflite_comparator.armv7l.acl_cl.list41
-rw-r--r--tests/scripts/list/tflite_comparator.armv7l.acl_neon.list (renamed from tests/scripts/list/frameworktest_list.armv7l.acl_neon.txt)3
-rw-r--r--tests/scripts/list/tflite_comparator.armv7l.cpu.list (renamed from tests/scripts/list/frameworktest_list.armv7l.cpu.txt)0
-rw-r--r--tests/scripts/list/tflite_comparator.noarch.interp.list (renamed from tests/scripts/list/frameworktest_list.noarch.interp.txt)0
-rw-r--r--tests/scripts/list/tflite_comparator.x86_64.cpu.list (renamed from tests/scripts/list/frameworktest_list.x86_64.cpu.txt)0
-rw-r--r--tests/scripts/list/tflite_loader_list.aarch64.txt35
-rw-r--r--tests/scripts/list/tflite_loader_list.armv7l.txt35
-rw-r--r--tests/scripts/models/nnfw_api_gtest/add/config.sh1
-rw-r--r--tests/scripts/models/nnfw_api_gtest/add_invalid_manifest/config.sh1
-rw-r--r--tests/scripts/models/nnfw_api_gtest/add_no_manifest/config.sh1
-rw-r--r--tests/scripts/models/nnfw_api_gtest/if_dynamic/config.sh1
-rw-r--r--tests/scripts/models/nnfw_api_gtest/while_dynamic/config.sh1
-rw-r--r--tests/tools/nnpackage_run/src/allocation.h2
-rw-r--r--tests/tools/nnpackage_run/src/h5formatter.cc13
-rw-r--r--tests/tools/nnpackage_run/src/h5formatter.h2
-rw-r--r--tests/tools/nnpackage_run/src/nnfw_util.cc4
-rw-r--r--tests/tools/nnpackage_run/src/nnpackage_run.cc4
-rw-r--r--tests/tools/nnpackage_run/src/randomgen.h2
-rw-r--r--tests/tools/tflite_comparator/CMakeLists.txt23
-rw-r--r--tests/tools/tflite_comparator/src/args.cc (renamed from tests/tools/tflite_loader/src/args.cc)4
-rw-r--r--tests/tools/tflite_comparator/src/args.h (renamed from tests/tools/tflite_loader/src/args.h)0
-rw-r--r--tests/tools/tflite_comparator/src/tflite_comparator.cc (renamed from tests/tools/tflite_loader/src/tflite_loader.cc)14
-rw-r--r--tests/tools/tflite_loader/CMakeLists.txt23
-rw-r--r--tests/tools/tflite_run/src/tflite_run.cc71
-rw-r--r--tests/tools/tflite_vanilla_run/CMakeLists.txt2
-rw-r--r--tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc2
l---------tools/.clang-format1
-rw-r--r--tools/cross/aarch64/sources.list.trusty11
-rw-r--r--tools/cross/arm/sources.list.trusty11
-rwxr-xr-xtools/cross/install_rootfs.sh5
-rwxr-xr-xtools/nnpackage_tool/gen_golden/gen_golden.py9
-rw-r--r--tools/pareto_profiler/README.md95
-rw-r--r--tools/pareto_profiler/estimator/Hlps.py257
-rw-r--r--tools/pareto_profiler/estimator/brute_force_profiler.py71
-rw-r--r--tools/pareto_profiler/estimator/hlps_sampler.py99
-rw-r--r--tools/pareto_profiler/estimator/pareto.py84
-rw-r--r--tools/pareto_profiler/estimator/profile_args.py37
-rw-r--r--tools/pareto_profiler/estimator/random_sampler.py60
-rw-r--r--tools/pareto_profiler/estimator/runner.py148
-rw-r--r--tools/pareto_profiler/estimator/utils.py201
-rw-r--r--tools/pareto_profiler/generator/gen_oplist.py165
-rw-r--r--tools/pareto_profiler/generator/operations_map.json36
-rw-r--r--tools/stab/README.md54
-rw-r--r--tools/stab/backend_profiler.py43
-rw-r--r--tools/stab/backend_scheduler.py156
-rw-r--r--tools/stab/nnpkg_helper.py56
-rw-r--r--tools/stab/op_list.txt2
-rw-r--r--tools/stab/op_list_parser.py40
-rw-r--r--tools/stab/remote.py102
-rw-r--r--tools/stab/stab.py73
-rwxr-xr-xtools/tflitefile_tool/select_operator.py5
2521 files changed, 73583 insertions, 19570 deletions
diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml
index cd34d792f..a42e23645 100644
--- a/.ahub/tcchecker-tca/config.yaml
+++ b/.ahub/tcchecker-tca/config.yaml
@@ -5,7 +5,7 @@ test:
testFW: GTEST
testCaseFolder:
- ./compute/test/cker
- - ./runtime/onert/core/src/backend/cpu_common
+ - ./runtime/onert/core/src/backend/basic
- ./runtime/onert/frontend/nnapi
- ./runtime/onert/test/core/compiler
- ./runtime/onert/test/core/exec
@@ -31,7 +31,7 @@ test:
- functionName:
starts:
- TEST
-
+
negativeTestCase:
- condition:
- testName:
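Note on the config above: with testFW set to GTEST and the functionName rule matching names that start with TEST, the checker counts GTest cases found under the listed testCaseFolder paths (now including runtime/onert/core/src/backend/basic after the rename from cpu_common). A minimal sketch of a case such a rule would match — suite and test names here are hypothetical, not from this commit:

    #include <gtest/gtest.h>

    // Counted by tcchecker-tca because the macro name starts with "TEST"
    // and the file would live under a listed testCaseFolder, e.g.
    // runtime/onert/core/src/backend/basic (hypothetical placement).
    TEST(BasicBackendMemoryPlanner, ClaimAndRelease)
    {
      EXPECT_TRUE(true); // placeholder assertion
    }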
diff --git a/.clang-format b/.clang-format
index 5699ccff8..9243c9a2b 100644
--- a/.clang-format
+++ b/.clang-format
@@ -1,4 +1,3 @@
----
Language: Cpp
BasedOnStyle: Google
AccessModifierOffset: -2
@@ -21,17 +20,18 @@ AlwaysBreakTemplateDeclarations: false
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
- AfterClass: true
- AfterControlStatement: true
- AfterEnum: true
- AfterFunction: true
- AfterNamespace: true
- AfterObjCDeclaration: false
- AfterStruct: true
- AfterUnion: false
- BeforeCatch: true
- BeforeElse: true
- IndentBraces: false
+ AfterClass: true
+ AfterControlStatement: true
+ AfterEnum: true
+ AfterFunction: true
+ AfterNamespace: true
+ AfterObjCDeclaration: false
+ AfterStruct: true
+ AfterUnion: false
+ AfterExternBlock: false
+ BeforeCatch: true
+ BeforeElse: true
+ IndentBraces: false
BreakBeforeBraces: Custom
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
@@ -40,12 +40,13 @@ BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
+ConstructorInitializerIndentWidth: 2
+ContinuationIndentWidth: 2
Cpp11BracedListStyle: true
DerivePointerAlignment: false
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
+FixNamespaceComments: true
IncludeCategories:
- Regex: '^"(llvm|llvm-c|clang|clang-c)/'
Priority: 2
@@ -75,6 +76,7 @@ PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Right
ReflowComments: true
SortIncludes: false
+SortUsingDeclarations: false
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
@@ -86,5 +88,5 @@ SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Cpp11
-TabWidth: 4
+TabWidth: 2
UseTab: Never
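Note on the settings above: lowering ConstructorInitializerIndentWidth and ContinuationIndentWidth from 4 to 2 is what produces the many two-space re-indentation hunks in the headers further down (see the angkor Overlay.h and Shape.h diffs), and the newly added FixNamespaceComments: true makes clang-format append a closing-namespace comment. A minimal sketch, using a hypothetical namespace and a class modeled on the angkor Shape diff below, as it formats under the new rules:

    #include <cstdint>

    namespace example
    {

    class Shape
    {
    public:
      // The wrapped initializer list now continues at 2 spaces instead of 4.
      Shape(uint32_t depth, uint32_t height, uint32_t width)
        : _depth{depth}, _height{height}, _width{width}
      {
        // DO NOTHING
      }

    private:
      uint32_t _depth, _height, _width;
    };

    } // namespace example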
diff --git a/.clang-format.8 b/.clang-format.8
deleted file mode 100644
index d2db97619..000000000
--- a/.clang-format.8
+++ /dev/null
@@ -1,92 +0,0 @@
-Language: Cpp
-BasedOnStyle: Google
-AccessModifierOffset: -2
-AlignAfterOpenBracket: Align
-AlignEscapedNewlinesLeft: true
-AlignConsecutiveAssignments: false
-AlignConsecutiveDeclarations: false
-AlignOperands: true
-AlignTrailingComments: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortBlocksOnASingleLine: false
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: false
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakTemplateDeclarations: false
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
- AfterClass: true
- AfterControlStatement: true
- AfterEnum: true
- AfterFunction: true
- AfterNamespace: true
- AfterObjCDeclaration: false
- AfterStruct: true
- AfterUnion: false
- AfterExternBlock: false
- BeforeCatch: true
- BeforeElse: true
- IndentBraces: false
-BreakBeforeBraces: Custom
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit: 100
-CommentPragmas: '^ IWYU pragma:'
-ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 2
-ContinuationIndentWidth: 2
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false
-DisableFormat: false
-ExperimentalAutoDetectBinPacking: false
-FixNamespaceComments: false
-IncludeCategories:
- - Regex: '^"(llvm|llvm-c|clang|clang-c)/'
- Priority: 2
- - Regex: '^(<|"(gtest|isl|json)/)'
- Priority: 3
- - Regex: '.*'
- Priority: 1
-IndentCaseLabels: true
-IndentWidth: 2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: true
-MacroBlockBegin: ''
-MacroBlockEnd: ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 60
-PointerAlignment: Right
-ReflowComments: true
-SortIncludes: false
-SortUsingDeclarations: false
-SpaceAfterCStyleCast: false
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeParens: ControlStatements
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 1
-SpacesInAngles: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard: Cpp11
-TabWidth: 2
-UseTab: Never
diff --git a/Makefile.template b/Makefile.template
index 8e88e9092..0d601ded1 100644
--- a/Makefile.template
+++ b/Makefile.template
@@ -130,7 +130,7 @@ ifneq ($(EXT_ACL_FOLDER),)
mkdir -p $(OVERLAY_FOLDER)/lib
cp $(EXT_ACL_FOLDER)/* $(OVERLAY_FOLDER)/lib
# Make stamp file
- printf "20.05" > $(OVERLAY_FOLDER)/ARMCOMPUTE.stamp
+ printf "21.02" > $(OVERLAY_FOLDER)/ARMCOMPUTE.stamp
endif
NNFW_WORKSPACE="$(WORKSPACE)" NNFW_INSTALL_PREFIX=$(INSTALL_PATH) ./nnfw configure \
diff --git a/README.md b/README.md
index bbeb66c82..e3ed259c7 100644
--- a/README.md
+++ b/README.md
@@ -54,8 +54,3 @@ Thank you in advance!
- Please post questions, issues, or suggestions into [Issues](https://github.com/Samsung/ONE/issues). This is the best way to communicate with the developer.
- You can also have an open discussion with community members through [gitter.im](https://gitter.im/Samsung/ONE) channel.
-
-## Hall of Fame
-
-[![](https://sourcerer.io/fame/lemmaa/Samsung/ONE/images/0)](https://sourcerer.io/fame/lemmaa/Samsung/ONE/links/0)[![](https://sourcerer.io/fame/lemmaa/Samsung/ONE/images/1)](https://sourcerer.io/fame/lemmaa/Samsung/ONE/links/1)[![](https://sourcerer.io/fame/lemmaa/Samsung/ONE/images/2)](https://sourcerer.io/fame/lemmaa/Samsung/ONE/links/2)[![](https://sourcerer.io/fame/lemmaa/Samsung/ONE/images/3)](https://sourcerer.io/fame/lemmaa/Samsung/ONE/links/3)[![](https://sourcerer.io/fame/lemmaa/Samsung/ONE/images/4)](https://sourcerer.io/fame/lemmaa/Samsung/ONE/links/4)[![](https://sourcerer.io/fame/lemmaa/Samsung/ONE/images/5)](https://sourcerer.io/fame/lemmaa/Samsung/ONE/links/5)[![](https://sourcerer.io/fame/lemmaa/Samsung/ONE/images/6)](https://sourcerer.io/fame/lemmaa/Samsung/ONE/links/6)[![](https://sourcerer.io/fame/lemmaa/Samsung/ONE/images/7)](https://sourcerer.io/fame/lemmaa/Samsung/ONE/links/7)
-
diff --git a/compiler/.ahub/tcchecker-tca/config.yaml b/compiler/.ahub/tcchecker-tca/config.yaml
index ef681de1a..9d3e6b8eb 100644
--- a/compiler/.ahub/tcchecker-tca/config.yaml
+++ b/compiler/.ahub/tcchecker-tca/config.yaml
@@ -8,6 +8,7 @@ test:
- ./arser
- ./circle2circle
- ./circle-quantizer
+ - ./crew
- ./cwrap
- ./foder
- ./hermes
@@ -19,6 +20,8 @@ test:
- ./logo-core
- ./luci
- ./luci-interpreter
+ - ./luci-eval-driver
+ - ./luci-pass-value-test
- ./luci-value-test
- ./mio-circle
- ./mio-tflite
@@ -30,7 +33,6 @@ test:
- ./record-minmax
- ./safemain
- ./souschef
- - ./stdex
- ./tflite2circle
testFile:
diff --git a/compiler/angkor/include/nncc/core/ADT/feature/Overlay.h b/compiler/angkor/include/nncc/core/ADT/feature/Overlay.h
index 93d86f56b..0af13c56a 100644
--- a/compiler/angkor/include/nncc/core/ADT/feature/Overlay.h
+++ b/compiler/angkor/include/nncc/core/ADT/feature/Overlay.h
@@ -34,7 +34,7 @@ template <typename T> class Overlay final : public View<T>
{
public:
explicit Overlay(const Shape &shape, const Layout &layout, T *base)
- : View<T>{shape, layout}, _base{base}
+ : View<T>{shape, layout}, _base{base}
{
// DO NOTHING
}
diff --git a/compiler/angkor/include/nncc/core/ADT/feature/Shape.h b/compiler/angkor/include/nncc/core/ADT/feature/Shape.h
index 319326308..7d086b9b7 100644
--- a/compiler/angkor/include/nncc/core/ADT/feature/Shape.h
+++ b/compiler/angkor/include/nncc/core/ADT/feature/Shape.h
@@ -35,7 +35,7 @@ class Shape
{
public:
Shape(uint32_t depth, uint32_t height, uint32_t width)
- : _depth{depth}, _height{height}, _width{width}
+ : _depth{depth}, _height{height}, _width{width}
{
// DO NOTHING
}
diff --git a/compiler/angkor/include/nncc/core/ADT/kernel/Overlay.h b/compiler/angkor/include/nncc/core/ADT/kernel/Overlay.h
index e348a8769..0684277fa 100644
--- a/compiler/angkor/include/nncc/core/ADT/kernel/Overlay.h
+++ b/compiler/angkor/include/nncc/core/ADT/kernel/Overlay.h
@@ -35,7 +35,7 @@ template <typename T, typename InputIt> class Overlay final : public View<T>
{
public:
explicit Overlay(const Shape &shape, const Layout &layout, InputIt it)
- : _impl{shape, layout}, _it{it}
+ : _impl{shape, layout}, _it{it}
{
// DO NOTHING
}
diff --git a/compiler/angkor/include/nncc/core/ADT/kernel/Shape.h b/compiler/angkor/include/nncc/core/ADT/kernel/Shape.h
index d485d526b..92f90970a 100644
--- a/compiler/angkor/include/nncc/core/ADT/kernel/Shape.h
+++ b/compiler/angkor/include/nncc/core/ADT/kernel/Shape.h
@@ -35,7 +35,7 @@ class Shape
{
public:
Shape(uint32_t count, uint32_t depth, uint32_t height, uint32_t width)
- : _count{count}, _depth{depth}, _height{height}, _width{width}
+ : _count{count}, _depth{depth}, _height{height}, _width{width}
{
// DO NOTHING
}
diff --git a/compiler/angkor/include/nncc/core/ADT/tensor/Overlay.h b/compiler/angkor/include/nncc/core/ADT/tensor/Overlay.h
index 11ee5350c..5fa36bbc9 100644
--- a/compiler/angkor/include/nncc/core/ADT/tensor/Overlay.h
+++ b/compiler/angkor/include/nncc/core/ADT/tensor/Overlay.h
@@ -32,7 +32,7 @@ template <typename T> class Overlay final : public View<T>
{
public:
explicit Overlay(const Shape &shape, const Layout &layout, T *base)
- : View<T>{shape, layout}, _base{base}
+ : View<T>{shape, layout}, _base{base}
{
// DO NOTHING
}
diff --git a/compiler/angkor/include/nncc/core/ADT/tensor/View.h b/compiler/angkor/include/nncc/core/ADT/tensor/View.h
index 4c9a91539..8407df3be 100644
--- a/compiler/angkor/include/nncc/core/ADT/tensor/View.h
+++ b/compiler/angkor/include/nncc/core/ADT/tensor/View.h
@@ -36,7 +36,7 @@ template <typename T> class View : public Reader<T>, public Accessor<T>
{
public:
explicit View(const Shape &shape, const Layout &layout)
- : _shape{shape}, _layout{std::move(layout)}
+ : _shape{shape}, _layout{std::move(layout)}
{
// DO NOTHING
}
diff --git a/compiler/angkor/src/ADT/feature/Overlay.test.cpp b/compiler/angkor/src/ADT/feature/Overlay.test.cpp
index 8ba28bf5a..1ac62f856 100644
--- a/compiler/angkor/src/ADT/feature/Overlay.test.cpp
+++ b/compiler/angkor/src/ADT/feature/Overlay.test.cpp
@@ -30,7 +30,7 @@ TEST(ADT_FEATURE_OVERLAY, ctor)
const Shape shape{4, 6, 3};
int data[4 * 6 * 3] = {
- 0,
+ 0,
};
auto overlay = make_overlay<int, CHWLayout>(shape, data);
@@ -44,7 +44,7 @@ TEST(ADT_FEATURE_OVERLAY, read)
const Shape shape{4, 6, 3};
int data[4 * 6 * 3] = {
- 0,
+ 0,
};
const auto overlay = make_overlay<int, CHWLayout>(shape, data);
@@ -60,7 +60,7 @@ TEST(ADT_FEATURE_OVERLAY, access)
const Shape shape{4, 6, 3};
int data[4 * 6 * 3] = {
- 0,
+ 0,
};
auto overlay = make_overlay<int, CHWLayout>(shape, data);
diff --git a/compiler/angkor/src/ADT/kernel/Overlay.test.cpp b/compiler/angkor/src/ADT/kernel/Overlay.test.cpp
index 4e9bd8dbd..7129fe242 100644
--- a/compiler/angkor/src/ADT/kernel/Overlay.test.cpp
+++ b/compiler/angkor/src/ADT/kernel/Overlay.test.cpp
@@ -30,7 +30,7 @@ TEST(ADT_KERNEL_OVERLAY, ctor)
const Shape shape{2, 4, 6, 3};
int data[2 * 4 * 6 * 3] = {
- 0,
+ 0,
};
auto overlay = make_overlay<int, NCHWLayout>(shape, data);
@@ -45,7 +45,7 @@ TEST(ADT_KERNEL_OVERLAY, read)
const Shape shape{2, 4, 6, 3};
int data[2 * 4 * 6 * 3] = {
- 0,
+ 0,
};
const auto overlay = make_overlay<int, NCHWLayout>(shape, data);
@@ -61,7 +61,7 @@ TEST(ADT_KERNEL_OVERLAY, access)
const Shape shape{2, 4, 6, 3};
int data[2 * 4 * 6 * 3] = {
- 0,
+ 0,
};
auto overlay = make_overlay<int, NCHWLayout>(shape, data);
diff --git a/compiler/angkor/src/ADT/tensor/Overlay.test.cpp b/compiler/angkor/src/ADT/tensor/Overlay.test.cpp
index 57cd1e6f9..d5369dffc 100644
--- a/compiler/angkor/src/ADT/tensor/Overlay.test.cpp
+++ b/compiler/angkor/src/ADT/tensor/Overlay.test.cpp
@@ -31,7 +31,7 @@ TEST(ADT_TENSOR_OVERLAY, ctor)
const Shape shape{2, 3};
int data[2 * 3] = {
- 0,
+ 0,
};
auto view = make_overlay<int, LexicalLayout>(shape, data);
@@ -43,7 +43,7 @@ TEST(ADT_TENSOR_OVERLAY, read)
const Shape shape{2, 3};
int data[2 * 3] = {
- 0,
+ 0,
};
const auto view = make_overlay<int, LexicalLayout>(shape, data);
@@ -61,7 +61,7 @@ TEST(ADT_TENSOR_OVERLAY, access)
const Shape shape{2, 3};
int data[2 * 3] = {
- 0,
+ 0,
};
auto view = make_overlay<int, LexicalLayout>(shape, data);
diff --git a/compiler/arser/CMakeLists.txt b/compiler/arser/CMakeLists.txt
index 63d19f538..7eda21564 100644
--- a/compiler/arser/CMakeLists.txt
+++ b/compiler/arser/CMakeLists.txt
@@ -4,12 +4,14 @@ add_library(arser INTERFACE)
# It means that a developer who want to link arser just need to add one line.
# target_link_library(another-users-target arser)
target_include_directories(arser INTERFACE include/)
+target_link_libraries(arser INTERFACE nncc_coverage)
if(NOT ENABLE_TEST)
return()
endif(NOT ENABLE_TEST)
nnas_find_package(GTest REQUIRED)
-set(TESTS "${CMAKE_CURRENT_SOURCE_DIR}/tests/arser.test.cpp")
+set(TESTS "${CMAKE_CURRENT_SOURCE_DIR}/tests/arser.test.cpp"
+ "${CMAKE_CURRENT_SOURCE_DIR}/tests/HelpMessage.test.cpp")
GTest_AddTest(arser_test ${TESTS})
-target_include_directories(arser_test PRIVATE include)
+target_link_libraries(arser_test arser)
diff --git a/compiler/arser/include/arser/arser.h b/compiler/arser/include/arser/arser.h
index 64bb557c4..f2a7a2b85 100644
--- a/compiler/arser/include/arser/arser.h
+++ b/compiler/arser/include/arser/arser.h
@@ -14,6 +14,9 @@
* limitations under the License.
*/
+#ifndef __ARSER_H__
+#define __ARSER_H__
+
#include <iostream>
#include <sstream>
@@ -29,7 +32,11 @@
#include <cstring>
-namespace
+#include <cassert>
+
+namespace arser
+{
+namespace internal
{
template <typename T> T lexical_cast(const std::string &str)
@@ -41,7 +48,7 @@ template <typename T> T lexical_cast(const std::string &str)
return data;
}
-template <> bool lexical_cast(const std::string &str)
+template <> inline bool lexical_cast(const std::string &str)
{
bool data = true;
if (str == "false" || str == "False" || str == "FALSE" || str == "0")
template <> inline std::string to_string(const char *value) { return std::string(value); }
template <> inline std::string to_string(const bool value) { return value ? "true" : "false"; }
-} // namespace
+/**
+ * @brief Returns the string with the leading dashes removed.
+ *
+ * If there are no leading dashes, the string is returned as is.
+ */
+inline std::string remove_dash(const std::string &str)
+{
+ std::string ret{str};
+ auto pos = ret.find_first_not_of('-');
+ if (pos == std::string::npos)
+ return ret;
+ return ret.substr(pos);
+}
+
+/**
+ * @brief Returns the string created by concatenating the elements of a vector with commas.
+ */
+inline std::string make_comma_concatenated(const std::vector<std::string> &vec)
+{
+ std::ostringstream oss;
+ std::copy(vec.begin(), std::prev(vec.end()), std::ostream_iterator<std::string>(oss, ", "));
+ oss << vec.back();
+ return oss.str();
+}
+
+} // namespace internal
+} // namespace arser
namespace arser
{
@@ -116,10 +149,41 @@ enum class DataType
class Arser;
+/**
+ * Argument
+ * ├── positional argument
+ * └── optional argument [ dash at the beginning of the string ]
+ * ├── long option [ two or more dashes ]
+ * └── short option [ one dash ]
+ *
+ * An Argument has two types - positional argument and optional argument.
+ *
+ * The two types are distinguished by whether there is a dash('-') at the beginning of the
+ * string.
+ *
+ * Likewise, an optional argument has two forms - long option and short option - which are
+ * distinguished by the number of dashes.
+ */
class Argument
{
public:
- explicit Argument(const std::string &arg_name) : _name{arg_name} {}
+ explicit Argument(const std::string &arg_name) : _long_name{arg_name}, _names{arg_name} {}
+ explicit Argument(const std::string &short_name, const std::string &long_name)
+ : _short_name{short_name}, _long_name{long_name}, _names{short_name, long_name}
+ {
+ }
+ explicit Argument(const std::string &short_name, const std::string &long_name,
+ const std::vector<std::string> &names)
+ : _short_name{short_name}, _long_name{long_name}, _names{names}
+ {
+ // 'names' must have 'short_name' and 'long_name'.
+ auto it = std::find(names.begin(), names.end(), short_name);
+ assert(it != names.end());
+ it = std::find(names.begin(), names.end(), long_name);
+ assert(it != names.end());
+ // to avoid an unused-variable warning.
+ (void)it;
+ }
Argument &nargs(uint32_t num)
{
@@ -190,7 +254,7 @@ public:
{
if ((_nargs <= 1 && TypeName<T>::Get() == _type) ||
(_nargs > 1 && TypeName<std::vector<T>>::Get() == _type))
- _values.emplace_back(::to_string(value));
+ _values.emplace_back(internal::to_string(value));
else
{
throw std::runtime_error("Type mismatch. "
@@ -207,7 +271,7 @@ public:
if ((_nargs <= 1 && TypeName<T>::Get() == _type) ||
(_nargs > 1 && TypeName<std::vector<T>>::Get() == _type))
{
- _values.emplace_back(::to_string(value));
+ _values.emplace_back(internal::to_string(value));
default_value(values...);
}
else
@@ -222,7 +286,11 @@ public:
}
private:
- std::string _name;
+ // The '_names' vector contains all of the option names specified by the user,
+ // and '_long_name' and '_short_name' are selected from among them.
+ std::string _long_name;
+ std::string _short_name;
+ std::vector<std::string> _names;
std::string _type;
std::string _help_message;
std::function<void(void)> _func;
@@ -238,33 +306,113 @@ class Arser
{
public:
explicit Arser(const std::string &program_description = {})
- : _program_description{program_description}
+ : _program_description{program_description}
{
- add_argument("--help").help("Show help message and exit").nargs(0);
+ add_argument("-h", "--help").help("Show help message and exit").nargs(0);
}
Argument &add_argument(const std::string &arg_name)
{
- if (arg_name.at(0) != '-')
+ if (arg_name.at(0) != '-') /* positional */
{
_positional_arg_vec.emplace_back(arg_name);
_arg_map[arg_name] = &_positional_arg_vec.back();
}
- else
+ else /* optional */
{
+ // The name of an optional argument must be 2 or more characters long,
+ // and must be recognizable as an option; e.g. '-' and '--' alone are rejected.
+ if (arg_name.size() < 2)
+ {
+ throw std::runtime_error("Too short name. The length of argument name must be 2 or more.");
+ }
+ if (arg_name == "--")
+ {
+ throw std::runtime_error(
+ "Too short name. Option name must contain at least one character other than dash.");
+ }
_optional_arg_vec.emplace_back(arg_name);
+ _optional_arg_vec.back()._short_name = arg_name;
_arg_map[arg_name] = &_optional_arg_vec.back();
}
return *_arg_map[arg_name];
}
+ Argument &add_argument(const std::vector<std::string> &arg_name_vec)
+ {
+ assert(arg_name_vec.size() >= 2);
+ std::string long_opt, short_opt;
+ // find long and short option
+ for (const auto &arg_name : arg_name_vec)
+ {
+ if (arg_name.at(0) != '-')
+ {
+ throw std::runtime_error("Invalid argument. "
+ "Positional argument cannot have short option.");
+ }
+ assert(arg_name.size() >= 2);
+ if (long_opt.empty() && arg_name.at(0) == '-' && arg_name.at(1) == '-')
+ {
+ long_opt = arg_name;
+ }
+ if (short_opt.empty() && arg_name.at(0) == '-' && arg_name.at(1) != '-')
+ {
+ short_opt = arg_name;
+ }
+ }
+ // If one of the two is empty, fill it with the non-empty one for pretty printing.
+ if (long_opt.empty())
+ {
+ assert(not short_opt.empty());
+ long_opt = short_opt;
+ }
+ if (short_opt.empty())
+ {
+ assert(not long_opt.empty());
+ short_opt = long_opt;
+ }
+
+ _optional_arg_vec.emplace_back(short_opt, long_opt, arg_name_vec);
+ for (const auto &arg_name : arg_name_vec)
+ {
+ _arg_map[arg_name] = &_optional_arg_vec.back();
+ }
+ return _optional_arg_vec.back();
+ }
+
+ template <typename... Ts> Argument &add_argument(const std::string &arg_name, Ts... arg_names)
+ {
+ if (sizeof...(arg_names) == 0)
+ {
+ return add_argument(arg_name);
+ }
+ // sizeof...(arg_names) > 0
+ else
+ {
+ return add_argument(std::vector<std::string>{arg_name, arg_names...});
+ }
+ }
+
+ void validate_arguments(void)
+ {
+ // Positional arguments are implicitly required; the 'required' flag must not be set on them.
+ for (const auto &arg : _positional_arg_vec)
+ {
+ if (arg._is_required)
+ {
+ throw std::runtime_error("Invalid arguments. Positional argument must always be required.");
+ }
+ }
+ }
+
void parse(int argc, char **argv)
{
+ validate_arguments();
_program_name = argv[0];
_program_name.erase(0, _program_name.find_last_of("/\\") + 1);
if (argc >= 2)
{
- if (!std::strcmp(argv[1], "--help"))
+ if (!std::strcmp(argv[1], "--help") || !std::strcmp(argv[1], "-h"))
{
std::cout << *this;
std::exit(0);
@@ -274,7 +422,7 @@ public:
for (const auto &arg : _arg_map)
{
const auto &func = arg.second->_func;
- if (func && !std::strcmp(argv[1], arg.second->_name.c_str()))
+ if (func && !std::strcmp(argv[1], arg.first.c_str()))
{
func();
std::exit(0);
@@ -354,14 +502,111 @@ public:
template <typename T> T get(const std::string &arg_name);
+ friend std::ostream &operator<<(std::ostream &stream, const Arser &parser)
+ {
+ // print description
+ if (!parser._program_description.empty())
+ {
+ stream << "What " << parser._program_name << " does: " << parser._program_description
+ << "\n\n";
+ }
+ /*
+ ** print usage
+ */
+ stream << "Usage: ./" << parser._program_name << " ";
+ // required optional argument
+ for (const auto &arg : parser._optional_arg_vec)
+ {
+ if (!arg._is_required)
+ continue;
+ stream << arg._short_name << " ";
+ std::string arg_name = arser::internal::remove_dash(arg._long_name);
+ std::for_each(arg_name.begin(), arg_name.end(),
+ [&stream](const char &c) { stream << static_cast<char>(::toupper(c)); });
+ stream << " ";
+ }
+ // rest of the optional argument
+ for (const auto &arg : parser._optional_arg_vec)
+ {
+ if (arg._is_required)
+ continue;
+ stream << "[" << arg._short_name;
+ if (arg._nargs)
+ {
+ stream << " ";
+ std::string arg_name = arser::internal::remove_dash(arg._long_name);
+ std::for_each(arg_name.begin(), arg_name.end(),
+ [&stream](const char &c) { stream << static_cast<char>(::toupper(c)); });
+ }
+ stream << "]"
+ << " ";
+ }
+ // positional argument
+ for (const auto &arg : parser._positional_arg_vec)
+ {
+ stream << arg._long_name << " ";
+ }
+ stream << "\n\n";
+ /*
+ ** print argument list and its help message
+ */
+ // get the length of the longest argument
+ size_t length_of_longest_arg = 0;
+ for (const auto &arg : parser._positional_arg_vec)
+ {
+ length_of_longest_arg = std::max(length_of_longest_arg,
+ arser::internal::make_comma_concatenated(arg._names).size());
+ }
+ for (const auto &arg : parser._optional_arg_vec)
+ {
+ length_of_longest_arg = std::max(length_of_longest_arg,
+ arser::internal::make_comma_concatenated(arg._names).size());
+ }
+
+ const size_t message_width = 60;
+ // positional argument
+ if (!parser._positional_arg_vec.empty())
+ {
+ stream << "[Positional argument]" << std::endl;
+ for (const auto &arg : parser._positional_arg_vec)
+ {
+ stream.width(length_of_longest_arg);
+ stream << std::left << arser::internal::make_comma_concatenated(arg._names) << "\t";
+ for (size_t i = 0; i < arg._help_message.length(); i += message_width)
+ {
+ if (i)
+ stream << std::string(length_of_longest_arg, ' ') << "\t";
+ stream << arg._help_message.substr(i, message_width) << std::endl;
+ }
+ }
+ stream << std::endl;
+ }
+ // optional argument
+ if (!parser._optional_arg_vec.empty())
+ {
+ stream << "[Optional argument]" << std::endl;
+ for (const auto &arg : parser._optional_arg_vec)
+ {
+ stream.width(length_of_longest_arg);
+ stream << std::left << arser::internal::make_comma_concatenated(arg._names) << "\t";
+ for (size_t i = 0; i < arg._help_message.length(); i += message_width)
+ {
+ if (i)
+ stream << std::string(length_of_longest_arg, ' ') << "\t";
+ stream << arg._help_message.substr(i, message_width) << std::endl;
+ }
+ }
+ }
+
+ return stream;
+ }
+
private:
std::string _program_name;
std::string _program_description;
std::list<Argument> _positional_arg_vec;
std::list<Argument> _optional_arg_vec;
std::map<std::string, Argument *> _arg_map;
-
- friend std::ostream &operator<<(std::ostream &, const Arser &);
};
template <typename T> T Arser::get_impl(const std::string &arg_name, T *)
@@ -369,7 +614,8 @@ template <typename T> T Arser::get_impl(const std::string &arg_name, T *)
auto arg = _arg_map.find(arg_name);
if (arg == _arg_map.end())
throw std::runtime_error("Invalid argument. "
- "There is no argument you are looking for.");
+ "There is no argument you are looking for: " +
+ arg_name);
if (arg->second->_type != TypeName<T>::Get())
throw std::runtime_error("Type mismatch. "
@@ -383,7 +629,7 @@ template <typename T> T Arser::get_impl(const std::string &arg_name, T *)
"You must make sure that the argument is given before accessing it. "
"You can do it by calling arser[\"argument\"].");
- return ::lexical_cast<T>(arg->second->_values[0]);
+ return internal::lexical_cast<T>(arg->second->_values[0]);
}
template <typename T> std::vector<T> Arser::get_impl(const std::string &arg_name, std::vector<T> *)
@@ -391,7 +637,8 @@ template <typename T> std::vector<T> Arser::get_impl(const std::string &arg_name
auto arg = _arg_map.find(arg_name);
if (arg == _arg_map.end())
throw std::runtime_error("Invalid argument. "
- "There is no argument you are looking for.");
+ "There is no argument you are looking for: " +
+ arg_name);
if (arg->second->_type != TypeName<std::vector<T>>::Get())
throw std::runtime_error("Type mismatch. "
@@ -399,7 +646,7 @@ template <typename T> std::vector<T> Arser::get_impl(const std::string &arg_name
std::vector<T> data;
std::transform(arg->second->_values.begin(), arg->second->_values.end(), std::back_inserter(data),
- [](std::string str) -> T { return ::lexical_cast<T>(str); });
+ [](std::string str) -> T { return internal::lexical_cast<T>(str); });
return data;
}
@@ -408,100 +655,6 @@ template <typename T> T Arser::get(const std::string &arg_name)
return get_impl(arg_name, static_cast<T *>(nullptr));
}
-std::ostream &operator<<(std::ostream &stream, const Arser &parser)
-{
- // print description
- if (!parser._program_description.empty())
- {
- stream << "What " << parser._program_name << " does: " << parser._program_description << "\n\n";
- }
- /*
- ** print usage
- */
- stream << "Usage: ./" << parser._program_name << " ";
- // required optional argument
- for (const auto &arg : parser._optional_arg_vec)
- {
- if (!arg._is_required)
- continue;
- stream << arg._name << " ";
- std::string arg_name = arg._name.substr(2);
- std::for_each(arg_name.begin(), arg_name.end(),
- [&stream](const char &c) { stream << static_cast<char>(::toupper(c)); });
- stream << " ";
- }
- // rest of the optional argument
- for (const auto &arg : parser._optional_arg_vec)
- {
- if (arg._is_required)
- continue;
- stream << "[" << arg._name;
- if (arg._nargs)
- {
- stream << " ";
- std::string arg_name = arg._name.substr(2);
- std::for_each(arg_name.begin(), arg_name.end(),
- [&stream](const char &c) { stream << static_cast<char>(::toupper(c)); });
- }
- stream << "]"
- << " ";
- }
- // positional arguement
- for (const auto &arg : parser._positional_arg_vec)
- {
- stream << arg._name << " ";
- }
- stream << "\n\n";
- /*
- ** print argument list and its help message
- */
- // get the length of the longest argument
- size_t length_of_longest_arg = 0;
- for (const auto &arg : parser._positional_arg_vec)
- {
- length_of_longest_arg = std::max(length_of_longest_arg, arg._name.length());
- }
- for (const auto &arg : parser._optional_arg_vec)
- {
- length_of_longest_arg = std::max(length_of_longest_arg, arg._name.length());
- }
-
- const size_t message_width = 60;
- // positional argument
- if (!parser._positional_arg_vec.empty())
- {
- stream << "[Positional argument]" << std::endl;
- for (const auto &arg : parser._positional_arg_vec)
- {
- stream.width(length_of_longest_arg);
- stream << std::left << arg._name << "\t";
- for (size_t i = 0; i < arg._help_message.length(); i += message_width)
- {
- if (i)
- stream << std::string(length_of_longest_arg, ' ') << "\t";
- stream << arg._help_message.substr(i, message_width) << std::endl;
- }
- }
- std::cout << std::endl;
- }
- // optional argument
- if (!parser._optional_arg_vec.empty())
- {
- stream << "[Optional argument]" << std::endl;
- for (const auto &arg : parser._optional_arg_vec)
- {
- stream.width(length_of_longest_arg);
- stream << std::left << arg._name << "\t";
- for (size_t i = 0; i < arg._help_message.length(); i += message_width)
- {
- if (i)
- stream << std::string(length_of_longest_arg, ' ') << "\t";
- stream << arg._help_message.substr(i, message_width) << std::endl;
- }
- }
- }
-
- return stream;
-}
-
} // namespace arser
+
+#endif // __ARSER_H__
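
Taken together, the changes above let a single Argument carry a short option, a long option, and extra aliases, while the old single-name registration keeps working. Below is a minimal usage sketch assuming only the API visible in this header; the program, option, and argument names are illustrative, not taken from any tool in this commit.

#include "arser/arser.h"

#include <iostream>

int main(int argc, char **argv)
{
  arser::Arser arser{"demo of multi-name options"};

  // One Argument reachable under both aliases; parse() throws if it is missing.
  arser.add_argument("-i", "--input_path")
    .nargs(1)
    .type(arser::DataType::STR)
    .required()
    .help("input file path");

  // Positional argument: no leading dash, implicitly required.
  arser.add_argument("level").type(arser::DataType::INT32).help("verbosity level");

  arser.parse(argc, argv);

  // Any registered alias resolves to the same parsed value.
  std::cout << arser.get<std::string>("-i") << std::endl;
  std::cout << arser.get<int>("level") << std::endl;
  return 0;
}

Since the usage line is built from the short name plus the upper-cased internal::remove_dash(_long_name) metavar, this sketch should print a usage string of roughly the form "Usage: ./demo -i INPUT_PATH [-h] level".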
diff --git a/compiler/arser/tests/HelpMessage.test.cpp b/compiler/arser/tests/HelpMessage.test.cpp
new file mode 100644
index 000000000..45cf840e6
--- /dev/null
+++ b/compiler/arser/tests/HelpMessage.test.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "arser/arser.h"
+
+#include "Prompt.h"
+
+using namespace arser;
+
+/**
+ * [WARNING] DO NOT GIVE THE ARSER '-h' OR '--help' OPTION IN THE TESTS BELOW.
+ *
+ * arser exits with code 0 when the '-h' option is given, which forces googletest to report a pass.
+ */
+
+TEST(HelpMessageTest, Default)
+{
+ /* arrange */
+ Arser arser;
+
+ arser.add_argument("--dummy").nargs(0).help("Dummy optional argument");
+
+ std::ostringstream oss;
+ std::string expected_out = "Usage: ./arser [-h] [--dummy] \n"
+ "\n"
+ "[Optional argument]\n"
+ "-h, --help Show help message and exit\n"
+ "--dummy \tDummy optional argument\n";
+
+ test::Prompt prompt("./arser --dummy");
+ /* act */
+ arser.parse(prompt.argc(), prompt.argv());
+ oss << arser;
+
+ /* assert */
+ EXPECT_EQ(expected_out, oss.str());
+}
+
+TEST(HelpMessageTest, ShortOption)
+{
+ /* arrange */
+ Arser arser;
+
+ arser.add_argument("-v", "--verbose").nargs(0).help("Provides additional details");
+
+ std::ostringstream oss;
+ std::string expected_out = "Usage: ./arser [-h] [-v] \n"
+ "\n"
+ "[Optional argument]\n"
+ "-h, --help \tShow help message and exit\n"
+ "-v, --verbose\tProvides additional details\n";
+
+ test::Prompt prompt("./arser -v");
+ /* act */
+ arser.parse(prompt.argc(), prompt.argv());
+ oss << arser;
+
+ /* assert */
+ EXPECT_EQ(expected_out, oss.str());
+}
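
These expected strings also pin down the alignment rule: every name list is padded with std::left and stream.width to the longest one before the tab. What they do not exercise is a long help text, which operator<< prints in message_width (60) character slices, indenting each continuation line by the longest-name width. A hedged sketch of that behavior, with an invented option name:

#include "arser/arser.h"

#include <iostream>

int main(void)
{
  arser::Arser arser;

  // Hypothetical option whose help text exceeds 60 characters.
  arser.add_argument("--log_level")
    .nargs(1)
    .type(arser::DataType::INT32)
    .help("Sets the verbosity of diagnostics; larger values produce "
          "progressively more detailed traces of argument parsing");

  // The help text is emitted in 60-character slices; every slice after
  // the first starts with spaces matching the longest argument name.
  std::cout << arser;
  return 0;
}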
diff --git a/compiler/arser/tests/Prompt.h b/compiler/arser/tests/Prompt.h
new file mode 100644
index 000000000..d816f199c
--- /dev/null
+++ b/compiler/arser/tests/Prompt.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ARSER_PROMPT_H__
+#define __ARSER_PROMPT_H__
+
+#include <iterator>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace arser
+{
+namespace test
+{
+
+class Prompt
+{
+public:
+ Prompt(const std::string &command)
+ {
+ std::istringstream iss(command);
+ std::vector<std::string> token(std::istream_iterator<std::string>{iss},
+ std::istream_iterator<std::string>());
+ _arg = std::move(token);
+ _argv.reserve(_arg.size());
+ for (const auto &t : _arg)
+ {
+ _argv.push_back(const_cast<char *>(t.data()));
+ }
+ }
+ int argc(void) const { return _argv.size(); }
+ char **argv(void) { return _argv.data(); }
+
+private:
+ std::vector<char *> _argv;
+ std::vector<std::string> _arg;
+};
+
+} // namespace test
+} // namespace arser
+
+#endif // __ARSER_PROMPT_H__
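
Prompt exists so the tests can synthesize the argc/argv pair that main() would receive from one command string. Note that argv() hands out pointers into the stored _arg strings, so the Prompt object must outlive every use of that array. A minimal sketch, with an illustrative option name:

#include "Prompt.h"

#include "arser/arser.h"

int main(void)
{
  arser::Arser arser;
  arser.add_argument("--count").nargs(1).type(arser::DataType::INT32);

  // Whitespace-tokenized: argv[0] = "./tool", argv[1] = "--count", argv[2] = "3"
  arser::test::Prompt prompt("./tool --count 3");
  arser.parse(prompt.argc(), prompt.argv());

  return arser.get<int>("--count"); // 3
}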
diff --git a/compiler/arser/tests/arser.test.cpp b/compiler/arser/tests/arser.test.cpp
index 28bee4238..b37d0dec3 100644
--- a/compiler/arser/tests/arser.test.cpp
+++ b/compiler/arser/tests/arser.test.cpp
@@ -54,8 +54,8 @@ TEST(BasicTest, option)
Arser arser;
arser.add_argument("--verbose")
- .nargs(0)
- .help("It provides additional details as to what the executable is doing");
+ .nargs(0)
+ .help("It provides additional details as to what the executable is doing");
Prompt prompt("./executable --verbose");
/* act */
@@ -71,13 +71,13 @@ TEST(BasicTest, OptionalArgument)
Arser arser;
arser.add_argument("--volume")
- .nargs(1)
- .type(arser::DataType::INT32)
- .help("Set a volume as you provided.");
+ .nargs(1)
+ .type(arser::DataType::INT32)
+ .help("Set a volume as you provided.");
arser.add_argument("--frequency")
- .nargs(1)
- .type(arser::DataType::FLOAT)
- .help("Set a frequency as you provided.");
+ .nargs(1)
+ .type(arser::DataType::FLOAT)
+ .help("Set a frequency as you provided.");
Prompt prompt("./radio --volume 5 --frequency 128.5");
/* act */
@@ -99,9 +99,9 @@ TEST(BasicTest, NonRequiredOptionalArgument)
Arser arser;
arser.add_argument("--weight")
- .nargs(1)
- .type(arser::DataType::INT32)
- .help("Set a volume as you provided.");
+ .nargs(1)
+ .type(arser::DataType::INT32)
+ .help("Set a volume as you provided.");
Prompt prompt("./radio"); // empty argument
/* act */
@@ -117,10 +117,10 @@ TEST(BasicTest, RequiredOptionalArgument)
Arser arser;
arser.add_argument("--volume")
- .nargs(1)
- .type(arser::DataType::INT32)
- .required()
- .help("Set a volume as you provided.");
+ .nargs(1)
+ .type(arser::DataType::INT32)
+ .required()
+ .help("Set a volume as you provided.");
Prompt prompt("./radio");
/* act */ /* assert */
@@ -152,20 +152,20 @@ TEST(BasicTest, MultipleOptionalArgument)
Arser arser;
arser.add_argument("--input_path")
- .nargs(1)
- .type(arser::DataType::STR)
- .help("input path of this program.")
- .required();
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("input path of this program.")
+ .required();
arser.add_argument("--output_path")
- .nargs(1)
- .type(arser::DataType::STR)
- .help("output path of this program.")
- .required(true);
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("output path of this program.")
+ .required(true);
arser.add_argument("--training_data")
- .nargs(5)
- .type(arser::DataType::INT32_VEC)
- .help("give traning data to this program.")
- .required();
+ .nargs(5)
+ .type(arser::DataType::INT32_VEC)
+ .help("give traning data to this program.")
+ .required();
Prompt prompt("./ml --input_path /I/am/in.put --output_path I/am/out.put "
"--training_data 2 43 234 3 334");
@@ -191,9 +191,9 @@ TEST(BasicTest, MultipleFloatValue)
Arser arser;
arser.add_argument("--add_float")
- .nargs(2)
- .type(arser::DataType::FLOAT_VEC)
- .help("Add two float numbers.");
+ .nargs(2)
+ .type(arser::DataType::FLOAT_VEC)
+ .help("Add two float numbers.");
Prompt prompt("./calculator --add_float 3.2 5.4");
/* act */
@@ -213,9 +213,9 @@ TEST(BasicTest, MultipleStringValue)
Arser arser;
arser.add_argument("--three_color")
- .nargs(3)
- .type(arser::DataType::STR_VEC)
- .help("insert your three favorite color");
+ .nargs(3)
+ .type(arser::DataType::STR_VEC)
+ .help("insert your three favorite color");
Prompt prompt("./color_factory --three_color red blue yellow");
/* act */
@@ -255,8 +255,8 @@ TEST(BasicTest, ExitWithFunctionCallWithBind)
Arser arser;
arser.add_argument("--version")
- .help("Show version and exit")
- .exit_with(std::bind(printVersion, "1.2.0"));
+ .help("Show version and exit")
+ .exit_with(std::bind(printVersion, "1.2.0"));
Prompt prompt("./arser --version");
/* act */ /* assert */
@@ -286,34 +286,34 @@ TEST(BasicTest, DefaultValue)
Arser arser;
arser.add_argument("--delivery")
- .nargs(3)
- .type(arser::DataType::STR_VEC)
- .default_value("pizza", "chicken", "hamburger")
- .help("Enter three foods that you want to deliver");
+ .nargs(3)
+ .type(arser::DataType::STR_VEC)
+ .default_value("pizza", "chicken", "hamburger")
+ .help("Enter three foods that you want to deliver");
arser.add_argument("--assistant")
- .type(arser::DataType::STR)
- .default_value("Bixby")
- .help("Enter name of your assistant");
+ .type(arser::DataType::STR)
+ .default_value("Bixby")
+ .help("Enter name of your assistant");
arser.add_argument("--sound")
- .type(arser::DataType::BOOL)
- .nargs(1)
- .default_value(true)
- .help("Sound on/off");
+ .type(arser::DataType::BOOL)
+ .nargs(1)
+ .default_value(true)
+ .help("Sound on/off");
arser.add_argument("--number")
- .type(arser::DataType::INT32_VEC)
- .nargs(4)
- .default_value(1, 2, 3, 4)
- .help("Enter the number that you want to call");
+ .type(arser::DataType::INT32_VEC)
+ .nargs(4)
+ .default_value(1, 2, 3, 4)
+ .help("Enter the number that you want to call");
arser.add_argument("--time")
- .type(arser::DataType::INT32_VEC)
- .nargs(3)
- .default_value(0, 0, 0)
- .help("Current time(H/M/S)");
+ .type(arser::DataType::INT32_VEC)
+ .nargs(3)
+ .default_value(0, 0, 0)
+ .help("Current time(H/M/S)");
arser.add_argument("--name")
- .type(arser::DataType::STR)
- .nargs(1)
- .default_value("no name")
- .help("Enter your name");
+ .type(arser::DataType::STR)
+ .nargs(1)
+ .default_value("no name")
+ .help("Enter your name");
Prompt prompt("/phone --time 1 52 34 --name arser");
/* act */
@@ -342,3 +342,102 @@ TEST(BasicTest, DefaultValue)
// 1 string, 1 argument
EXPECT_EQ("arser", arser.get<std::string>("--name"));
}
+
+TEST(BasicTest, shortOption)
+{
+ /* arrange */
+ Arser arser;
+
+ arser.add_argument("--input_path", "-i")
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("input path of this program.")
+ .required();
+ arser.add_argument("--output_path", "-o")
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("output path of this program.")
+ .required(true);
+
+ Prompt prompt("./driver -i /I/am/in.put --output_path I/am/out.put");
+ /* act */
+ arser.parse(prompt.argc(), prompt.argv());
+ /* assert */
+ EXPECT_TRUE(arser["--input_path"]);
+ EXPECT_EQ("/I/am/in.put", arser.get<std::string>("--input_path"));
+ EXPECT_TRUE(arser["--output_path"]);
+ EXPECT_EQ("I/am/out.put", arser.get<std::string>("--output_path"));
+}
+
+TEST(BasicTest, shortMultipleOption)
+{
+ /* arrange */
+ Arser arser;
+
+ arser.add_argument("--input_path", "-i", "--input", "--in")
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("input path of this program.")
+ .required();
+ arser.add_argument("--output_path", "-o")
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("output path of this program.")
+ .required(true);
+
+ Prompt prompt("./driver --in /I/am/in.put -o I/am/out.put");
+ /* act */
+ arser.parse(prompt.argc(), prompt.argv());
+ /* assert */
+ EXPECT_TRUE(arser["--input"]);
+ EXPECT_EQ("/I/am/in.put", arser.get<std::string>("--input"));
+ EXPECT_TRUE(arser["--output_path"]);
+ EXPECT_EQ("I/am/out.put", arser.get<std::string>("--output_path"));
+}
+
+TEST(BasicTest, OptWithRequiredDuplicate)
+{
+ /* arrange */
+ Arser arser;
+
+ arser.add_argument("--input_path", "-i", "--input", "--in")
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("input path of this program.")
+ .required();
+ arser.add_argument("--output_path", "-o")
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("output path of this program.")
+ .required(true);
+
+ Prompt prompt("./driver --in /I/am/in.put -o I/am/out.put -i /I/am/duplicate");
+ /* act */ /* assert */
+ EXPECT_THROW(arser.parse(prompt.argc(), prompt.argv()), std::runtime_error);
+}
+
+TEST(BasicTest, OptWithNonRequiredDuplicate)
+{
+ /* arrange */
+ Arser arser;
+
+ arser.add_argument("--input_path", "-i", "--input", "--in")
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("input path of this program.");
+ /* .required() */
+ arser.add_argument("--output_path", "-o")
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("output path of this program.")
+ .required(true);
+
+ Prompt prompt("./driver --in /I/am/in.put -o I/am/out.put -i /I/am/duplicate");
+ /* act */
+ arser.parse(prompt.argc(), prompt.argv());
+ /* assert */
+ EXPECT_TRUE(arser["--input"]);
+ EXPECT_EQ("/I/am/duplicate", arser.get<std::string>("--input"));
+ EXPECT_TRUE(arser["--output_path"]);
+ EXPECT_EQ("I/am/out.put", arser.get<std::string>("--output_path"));
+}
diff --git a/compiler/bino/include/bino.h b/compiler/bino/include/bino.h
index fc22d1285..bf540dffe 100644
--- a/compiler/bino/include/bino.h
+++ b/compiler/bino/include/bino.h
@@ -33,8 +33,8 @@ public:
public:
template <typename T>
auto operator()(const std::pair<T, T> &p) const
- -> decltype(std::make_pair(std::declval<Callable>()(p.first),
- std::declval<Callable>()(p.second)))
+ -> decltype(std::make_pair(std::declval<Callable>()(p.first),
+ std::declval<Callable>()(p.second)))
{
return std::make_pair(f(p.first), f(p.second));
}
diff --git a/compiler/caffegen/CMakeLists.txt b/compiler/caffegen/CMakeLists.txt
index 334174dcd..b963b5294 100644
--- a/compiler/caffegen/CMakeLists.txt
+++ b/compiler/caffegen/CMakeLists.txt
@@ -7,7 +7,6 @@ endif(NOT Caffe_FOUND)
file(GLOB_RECURSE SOURCES "src/*.cpp")
add_executable(caffegen ${SOURCES})
-target_link_libraries(caffegen stdex)
target_link_libraries(caffegen cli)
# NOTE "Caffe" package provides both caffe and caffeproto target
# NOTE "caffeproto" is linked to "caffe"
diff --git a/compiler/caffegen/src/Driver.cpp b/compiler/caffegen/src/Driver.cpp
index 81b01e6f1..17e3ebb7f 100644
--- a/compiler/caffegen/src/Driver.cpp
+++ b/compiler/caffegen/src/Driver.cpp
@@ -20,12 +20,12 @@
#include "MergeCommand.h"
#include <cli/App.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <map>
#include <string>
-using stdex::make_unique;
+using std::make_unique;
int main(int argc, char **argv)
{
diff --git a/compiler/circle-inspect/driver/Driver.cpp b/compiler/circle-inspect/driver/Driver.cpp
index 72cfa28a3..a450fd9e0 100644
--- a/compiler/circle-inspect/driver/Driver.cpp
+++ b/compiler/circle-inspect/driver/Driver.cpp
@@ -29,11 +29,11 @@
int entry(int argc, char **argv)
{
arser::Arser arser{
- "circle-inspect allows users to retrieve various information from a Circle model file"};
+ "circle-inspect allows users to retrieve various information from a Circle model file"};
arser.add_argument("--operators").nargs(0).help("Dump operators in circle file");
arser.add_argument("--conv2d_weight")
- .nargs(0)
- .help("Dump Conv2D series weight operators in circle file");
+ .nargs(0)
+ .help("Dump Conv2D series weight operators in circle file");
arser.add_argument("--op_version").nargs(0).help("Dump versions of the operators in circle file");
arser.add_argument("circle").type(arser::DataType::STR).help("Circle file to inspect");
diff --git a/compiler/circle-part-driver/CMakeLists.txt b/compiler/circle-part-driver/CMakeLists.txt
new file mode 100644
index 000000000..cb708742c
--- /dev/null
+++ b/compiler/circle-part-driver/CMakeLists.txt
@@ -0,0 +1,17 @@
+set(SRCS_PART_TESTER
+ src/Driver.cpp
+ src/PModelsRunner.cpp
+ )
+
+add_executable(circle_part_driver ${SRCS_PART_TESTER})
+target_link_libraries(circle_part_driver foder)
+target_link_libraries(circle_part_driver loco)
+target_link_libraries(circle_part_driver luci_import)
+target_link_libraries(circle_part_driver luci_lang)
+target_link_libraries(circle_part_driver luci_log)
+target_link_libraries(circle_part_driver luci_interpreter)
+target_link_libraries(circle_part_driver crew)
+target_link_libraries(circle_part_driver safemain)
+target_link_libraries(circle_part_driver nncc_common)
+
+install(TARGETS circle_part_driver DESTINATION bin)
diff --git a/compiler/circle-part-driver/README.md b/compiler/circle-part-driver/README.md
new file mode 100644
index 000000000..d66ecf5fa
--- /dev/null
+++ b/compiler/circle-part-driver/README.md
@@ -0,0 +1,3 @@
+# circle-part-driver
+
+_circle-part-driver_ is a test driver that runs partitioned circle models
diff --git a/compiler/circle-part-driver/requires.cmake b/compiler/circle-part-driver/requires.cmake
new file mode 100644
index 000000000..72296e32f
--- /dev/null
+++ b/compiler/circle-part-driver/requires.cmake
@@ -0,0 +1,6 @@
+require("foder")
+require("loco")
+require("luci")
+require("luci-interpreter")
+require("crew")
+require("safemain")
diff --git a/compiler/circle-part-driver/src/Driver.cpp b/compiler/circle-part-driver/src/Driver.cpp
new file mode 100644
index 000000000..a39bbf187
--- /dev/null
+++ b/compiler/circle-part-driver/src/Driver.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PModelsRunner.h"
+
+#include <luci/Log.h>
+
+#include <iostream>
+
+int entry(int argc, char **argv)
+{
+ LOGGER(l);
+
+ if (argc != 5)
+ {
+ std::cerr
+ << "Usage: " << argv[0]
+ << " <path/to/partition/config> <num_inputs> <path/to/input/prefix> <path/to/output/file>\n";
+ return EXIT_FAILURE;
+ }
+ // NOTE: input/output data file names
+ // - I/O file names are formed by appending an index to the given prefix,
+ //   like filename.ext0, filename.ext1, ...
+ // NOTE: output shapes
+ // - each output comes with a shape file named filename.ext0.shape, filename.ext1.shape, ...
+ //   holding one line of CSV text (like H,W or N,C,H,W)
+
+ const char *config_filename = argv[1];
+ const int32_t num_inputs = atoi(argv[2]);
+ const char *input_prefix = argv[3];
+ const char *output_file = argv[4];
+
+ prunner::PModelsRunner pmrunner;
+
+ INFO(l) << "Read config file: " << config_filename << std::endl;
+ if (not pmrunner.load_config(config_filename))
+ return EXIT_FAILURE;
+
+ INFO(l) << "Read input file: " << input_prefix << ", #inputs: " << num_inputs << std::endl;
+ pmrunner.load_inputs(input_prefix, num_inputs);
+
+ INFO(l) << "Run all partitioned models..." << std::endl;
+ if (!pmrunner.run())
+ return EXIT_FAILURE;
+
+ INFO(l) << "Save output file: " << output_file << std::endl;
+ pmrunner.save_outputs(output_file);
+
+ return EXIT_SUCCESS;
+}
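
To make the NOTE above concrete, the sketch below spells out how the four command line arguments map to file names; the "model.circle" prefix is hypothetical, not a path used by any test here.

#include <cstdint>

#include <string>

int main(void)
{
  const std::string input_prefix = "model.circle.input"; // argv[3], hypothetical
  const int32_t num_inputs = 2;                          // argv[2]
  for (int32_t i = 0; i < num_inputs; ++i)
  {
    // load_inputs() reads model.circle.input0, model.circle.input1, ...
    const std::string input_file = input_prefix + std::to_string(i);
  }

  const std::string output_file = "model.circle.output"; // argv[4], hypothetical
  // save_outputs() writes model.circle.output0 (raw tensor bytes) and
  // model.circle.output0.shape (one line of CSV text, e.g. "1,4,4,3")
  const std::string data_file = output_file + "0";
  const std::string shape_file = output_file + "0.shape";
  return 0;
}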
diff --git a/compiler/circle-part-driver/src/PModelsRunner.cpp b/compiler/circle-part-driver/src/PModelsRunner.cpp
new file mode 100644
index 000000000..453ce9b5f
--- /dev/null
+++ b/compiler/circle-part-driver/src/PModelsRunner.cpp
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PModelsRunner.h"
+
+#include <luci/IR/Nodes/CircleInput.h>
+#include <luci/IR/Nodes/CircleOutput.h>
+#include <luci/Importer.h>
+#include <luci/Log.h>
+#include <luci_interpreter/Interpreter.h>
+
+#include <foder/FileLoader.h>
+#include <crew/PConfig.h>
+
+#include <fstream>
+#include <iostream>
+#include <vector>
+#include <string>
+#include <stdexcept>
+
+namespace
+{
+
+void write_file(const std::string &filename, const char *data, size_t data_size)
+{
+ std::ofstream fs(filename, std::ofstream::binary);
+ if (fs.fail())
+ throw std::runtime_error("Cannot open file \"" + filename + "\".\n");
+ if (fs.write(data, data_size).fail())
+ {
+ throw std::runtime_error("Failed to write data to file \"" + filename + "\".\n");
+ }
+}
+
+std::unique_ptr<luci::Module> import_circle(const std::string &filename)
+{
+ std::ifstream fs(filename, std::ifstream::binary);
+ if (fs.fail())
+ {
+ throw std::runtime_error("Cannot open model file \"" + filename + "\".\n");
+ }
+ std::vector<char> model_data((std::istreambuf_iterator<char>(fs)),
+ std::istreambuf_iterator<char>());
+
+ return luci::Importer().importModule(circle::GetModel(model_data.data()));
+}
+
+void save_shape(const std::string &shape_filename, const luci::CircleOutput *output_node)
+{
+ if (output_node->rank() == 0)
+ {
+ write_file(shape_filename, "1", 1);
+ }
+ else
+ {
+ auto shape_str = std::to_string(output_node->dim(0).value());
+ for (uint32_t j = 1; j < output_node->rank(); j++)
+ {
+ shape_str += ",";
+ shape_str += std::to_string(output_node->dim(j).value());
+ }
+ write_file(shape_filename, shape_str.c_str(), shape_str.size());
+ }
+}
+
+template <typename NodeT> size_t tensor_size(const NodeT *node)
+{
+ uint32_t tsize = loco::size(node->dtype());
+ for (uint32_t i = 0; i < node->rank(); ++i)
+ {
+ assert(node->dim(i).known());
+ tsize *= node->dim(i).value();
+ }
+ return tsize;
+}
+
+} // namespace
+
+namespace prunner
+{
+
+bool PModelsRunner::load_config(const std::string &filename)
+{
+ if (!crew::read_ini(filename, _pconfig))
+ {
+ std::cerr << "ERROR: Invalid config ini file: '" << filename << "'" << std::endl;
+ return false;
+ }
+
+ for (auto &part : _pconfig.parts)
+ {
+ _models_to_run.push_back(part.model_file);
+ }
+ return true;
+}
+
+void PModelsRunner::load_inputs(const std::string &input_prefix, int32_t num_inputs)
+{
+ LOGGER(l);
+
+ auto its = _pconfig.source.inputs.begin();
+ for (int32_t i = 0; i < num_inputs; ++i, ++its)
+ {
+ std::string filename = input_prefix + std::to_string(i);
+
+ INFO(l) << "Load input data: " << filename << std::endl;
+ foder::FileLoader file_loader{filename};
+
+ std::string input_name = *its;
+ _data_stage[input_name] = file_loader.load();
+
+ INFO(l) << "Input: [" << input_name << "], size " << _data_stage[input_name].size()
+ << std::endl;
+ }
+}
+
+/**
+ * @brief return true if all inputs of the model are ready in _data_stage
+ */
+bool PModelsRunner::is_input_ready(const RunModel &model)
+{
+ for (auto &part : _pconfig.parts)
+ {
+ if (part.model_file != model)
+ continue;
+
+ for (auto &input : part.inputs)
+ {
+ auto it = _data_stage.find(input);
+ if (it == _data_stage.end())
+ return false;
+ }
+ }
+ return true;
+}
+
+bool PModelsRunner::run(void)
+{
+ LOGGER(l);
+
+ // for each partitioned model, if the inputs of the model are ready, run the model
+ do
+ {
+ bool found_model = false;
+
+ for (auto it = _models_to_run.begin(); it != _models_to_run.end(); ++it)
+ {
+ auto model_fname = *it;
+
+ INFO(l) << "Check model input ready: " << model_fname << std::endl;
+ if (is_input_ready(model_fname))
+ {
+ found_model = true;
+
+ INFO(l) << "Run model: " << model_fname << std::endl;
+ auto module = import_circle(model_fname);
+
+ luci_interpreter::Interpreter interpreter(module.get());
+
+ // Set input
+ // TODO support multiple subgraphs
+ assert(module->size() == 1);
+ const auto input_nodes = loco::input_nodes(module->graph());
+ int32_t num_inputs = static_cast<int32_t>(input_nodes.size());
+ for (int32_t i = 0; i < num_inputs; i++)
+ {
+ const auto *input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[i]);
+
+ auto input_name = input_node->name();
+ assert(_data_stage.find(input_name) != _data_stage.end());
+
+ auto input_data = _data_stage[input_name];
+
+ interpreter.writeInputTensor(input_node, input_data.data(), input_data.size());
+ }
+
+ // Run interpreter
+ interpreter.interpret();
+ INFO(l) << "Run model: " << model_fname << " done" << std::endl;
+
+ // Get output.
+ const auto output_nodes = loco::output_nodes(module->graph());
+ for (uint32_t i = 0; i < module->graph()->outputs()->size(); i++)
+ {
+ const auto *output_node = loco::must_cast<const luci::CircleOutput *>(output_nodes[i]);
+ auto output_name = output_node->name();
+
+ Buffer output_data(tensor_size(output_node));
+
+ interpreter.readOutputTensor(output_node, output_data.data(), output_data.size());
+
+ // The same output name must not have been staged already
+ // TODO check with multiple virtual outputs
+ assert(_data_stage.find(output_name) == _data_stage.end());
+ _data_stage[output_name] = output_data;
+ }
+
+ // We've run this model; remove it from the model list
+ _models_to_run.erase(it);
+ break;
+ }
+ }
+
+ if (not found_model)
+ {
+ std::cerr << "ERROR: model partition or configuration has problems" << std::endl;
+ return false;
+ }
+ } while (not _models_to_run.empty());
+
+ return true;
+}
+
+void PModelsRunner::save_outputs(const std::string &output_file)
+{
+ // load source model as we need to get both shape and node name
+ // TODO check for unknown shape
+ auto source_fname = _pconfig.source.model_file;
+
+ auto module = import_circle(source_fname);
+
+ const auto output_nodes = loco::output_nodes(module->graph());
+ for (uint32_t i = 0; i < module->graph()->outputs()->size(); i++)
+ {
+ const auto *output_node = loco::must_cast<const luci::CircleOutput *>(output_nodes[i]);
+
+ auto output_name = output_node->name();
+ assert(_data_stage.find(output_name) != _data_stage.end());
+
+ auto tensor_data = _data_stage[output_name];
+ auto output_filename = output_file + std::to_string(i);
+
+ write_file(output_filename, tensor_data.data(), tensor_data.size());
+ save_shape(output_filename + ".shape", output_node);
+ }
+}
+
+} // namespace prunner
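
run() above is in effect a small dataflow scheduler: a partition fires once every input name it declares is present in _data_stage, its outputs are then staged for downstream partitions, and the loop reports an error if a full pass finds nothing runnable. The same scheduling idea, stripped of luci and file I/O, is sketched below with illustrative types and names.

#include <set>
#include <stdexcept>
#include <string>
#include <vector>

struct Part
{
  std::vector<std::string> inputs;  // tensor names this part consumes
  std::vector<std::string> outputs; // tensor names this part produces
};

// 'staged' initially holds the source model's input names.
void run_all(std::vector<Part> parts, std::set<std::string> staged)
{
  while (!parts.empty())
  {
    bool fired = false;
    for (auto it = parts.begin(); it != parts.end(); ++it)
    {
      bool ready = true;
      for (const auto &in : it->inputs)
        ready = ready && (staged.count(in) > 0);
      if (!ready)
        continue;

      // "Run" the part: stage its outputs for the remaining parts.
      for (const auto &out : it->outputs)
        staged.insert(out);
      parts.erase(it); // iterator invalidated, so restart the scan
      fired = true;
      break;
    }
    if (!fired)
      throw std::runtime_error("some partition's inputs can never be satisfied");
  }
}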
diff --git a/compiler/circle-part-driver/src/PModelsRunner.h b/compiler/circle-part-driver/src/PModelsRunner.h
new file mode 100644
index 000000000..c1a45f01c
--- /dev/null
+++ b/compiler/circle-part-driver/src/PModelsRunner.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_PRUNNER_PMODELS_RUNNER_H__
+#define __CIRCLE_PRUNNER_PMODELS_RUNNER_H__
+
+#include <crew/PConfig.h>
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace prunner
+{
+
+using Buffer = std::vector<char>;
+
+using Buffers = std::map<std::string, Buffer>;
+
+using RunModel = std::string;
+
+using RunModels = std::vector<RunModel>;
+
+/**
+ * @brief PModelsRunner runs partitioned models with data from input file(s) and stores
+ * the output data to file(s)
+ */
+class PModelsRunner
+{
+public:
+ PModelsRunner() = default;
+
+public:
+ bool load_config(const std::string &filename);
+ void load_inputs(const std::string &input_prefix, int32_t num_inputs);
+ bool run(void);
+ void save_outputs(const std::string &output_file);
+
+private:
+ bool is_input_ready(const RunModel &model);
+
+private:
+ crew::PConfig _pconfig;
+ RunModels _models_to_run;
+ Buffers _data_stage;
+};
+
+} // namespace prunner
+
+#endif // __CIRCLE_PRUNNER_PMODELS_RUNNER_H__
diff --git a/compiler/circle-part-value-test/CMakeLists.txt b/compiler/circle-part-value-test/CMakeLists.txt
new file mode 100644
index 000000000..d75b17d1f
--- /dev/null
+++ b/compiler/circle-part-value-test/CMakeLists.txt
@@ -0,0 +1,99 @@
+#
+# this project validates partitioned models produced by circle-partitioner
+# with circle-part-driver and two scripts: part_eval_all.sh and part_eval_one.py
+#
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
+
+unset(RECIPE_LIST)
+unset(PARTITION_LIST)
+unset(TEST_DEPS)
+
+macro(add RECIPE_NAME PARTITION_NAME)
+ list(APPEND RECIPE_LIST ${RECIPE_NAME})
+ list(APPEND PARTITION_LIST ${PARTITION_NAME})
+endmacro(add)
+
+# Read "test.lst"
+include("test.lst")
+
+list(LENGTH RECIPE_LIST RECIPE_LENGTH)
+math(EXPR RECIPE_LENGTH_M1 "${RECIPE_LENGTH} - 1")
+
+foreach(IDX RANGE ${RECIPE_LENGTH_M1})
+ list(GET RECIPE_LIST ${IDX} RECIPE_NAME)
+ list(GET PARTITION_LIST ${IDX} PARTITION_NAME)
+
+ # NOTE about the name:
+ # Use '.recipe' name for source tflite and circle files
+ # Use '.part' name for actual test folder and test files
+
+ # Output to a folder
+ set(PARTITIONER_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${PARTITION_NAME}")
+
+ add_custom_command(OUTPUT ${PARTITIONER_OUTPUT_PATH}
+ COMMAND ${CMAKE_COMMAND} -E make_directory "${PARTITIONER_OUTPUT_PATH}"
+ COMMENT "Make directory ${PARTITIONER_OUTPUT_PATH}"
+ )
+
+ # Copy tflite
+ set(TFLITE_SRC_PATH "${ARTIFACTS_BIN_PATH}/${RECIPE_NAME}.tflite")
+ set(TFLITE_DST_PATH "${PARTITIONER_OUTPUT_PATH}/${PARTITION_NAME}.tflite")
+
+ add_custom_command(OUTPUT ${TFLITE_DST_PATH}
+ COMMAND ${CMAKE_COMMAND} -E copy "${TFLITE_SRC_PATH}" "${TFLITE_DST_PATH}"
+ DEPENDS ${TFLITE_SRC_PATH}
+ COMMENT "Copy ${RECIPE_NAME}.tflite"
+ )
+ list(APPEND TEST_DEPS ${TFLITE_DST_PATH})
+
+ # Copy circle
+ set(CIRCLE_SRC_PATH "${ARTIFACTS_BIN_PATH}/${RECIPE_NAME}.circle")
+ set(CIRCLE_DST_PATH "${PARTITIONER_OUTPUT_PATH}/${PARTITION_NAME}.circle")
+
+ add_custom_command(OUTPUT ${CIRCLE_DST_PATH}
+ COMMAND ${CMAKE_COMMAND} -E copy "${CIRCLE_SRC_PATH}" "${CIRCLE_DST_PATH}"
+ DEPENDS ${CIRCLE_SRC_PATH}
+ COMMENT "Copy ${RECIPE_NAME}.circle"
+ )
+ list(APPEND TEST_DEPS ${CIRCLE_DST_PATH})
+
+ # Copy .part
+ set(PART_FILE "${PARTITION_NAME}.part")
+ set(PART_SRC_PATH "${CMAKE_CURRENT_SOURCE_DIR}/parts/${PART_FILE}")
+ set(PART_DST_PATH "${PARTITIONER_OUTPUT_PATH}/${PART_FILE}")
+
+ add_custom_command(OUTPUT ${PART_DST_PATH}
+ COMMAND ${CMAKE_COMMAND} -E copy "${PART_SRC_PATH}" "${PART_DST_PATH}"
+ DEPENDS ${PART_SRC_PATH}
+ COMMENT "Copy ${PART_FILE}"
+ )
+ list(APPEND TEST_DEPS ${PART_DST_PATH})
+
+ # Partition connection file to generate
+ set(PARTITIONER_CONN_JSON "${PARTITIONER_OUTPUT_PATH}/${PARTITION_NAME}.conn.json")
+
+ # Run partitioner
+ add_custom_command(OUTPUT ${PARTITIONER_CONN_JSON}
+ COMMAND circle_partitioner "${PART_FILE}" "${PARTITION_NAME}.circle" "${PARTITIONER_OUTPUT_PATH}"
+ DEPENDS circle_partitioner ${PART_DST_PATH} ${CIRCLE_DST_PATH}
+ COMMENT "Parition ${RECIPE_NAME}.circle with ${PART_FILE}"
+ )
+ list(APPEND TEST_DEPS ${PARTITIONER_CONN_JSON})
+endforeach(IDX)
+
+add_custom_target(circle_part_value_test_prepare ALL DEPENDS ${TEST_DEPS})
+add_dependencies(circle_part_value_test_prepare common_artifacts_deps)
+
+# run evaluation
+add_test(NAME circle_part_value_test
+ COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/part_eval_all.sh"
+ "${CMAKE_CURRENT_BINARY_DIR}"
+ "${NNCC_OVERLAY_DIR}/venv_2_3_0"
+ "$<TARGET_FILE:circle_part_driver>"
+ ${PARTITION_LIST}
+)
diff --git a/compiler/circle-part-value-test/README.md b/compiler/circle-part-value-test/README.md
new file mode 100644
index 000000000..6322b0791
--- /dev/null
+++ b/compiler/circle-part-value-test/README.md
@@ -0,0 +1,15 @@
+# circle-part-value-test
+
+_circle-part-value-test_ evaluates partitioned models produced by circle-partitioner.
+
+### Process of evaluation
+
+The evaluation process is similar to that of _luci-value-test_.
+
+1) generates random input and stores it to reference input file(s)
+2) executes the tflite file from common-artifacts to get the reference output
+3) partitions the circle file with the .part file and writes the results to the output folder
+4) executes the produced partitioned circle models with the reference input file(s)
+5) saves the output(s) of the circle models to file(s)
+6) compares the reference output with the saved output file(s)
+7) fails the test if values differ
diff --git a/compiler/circle-part-value-test/part_eval_all.sh b/compiler/circle-part-value-test/part_eval_all.sh
new file mode 100755
index 000000000..ae8ae4731
--- /dev/null
+++ b/compiler/circle-part-value-test/part_eval_all.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# This script verifies the basic behavior of circle-partitioner
+#
+# HOW TO USE
+#
+# ./part_eval_all.sh <path/to/work_dir> <path/to/venv_dir> <path/to/driver> <TEST 1> <TEST 2> ...
+#
+# work_dir : build directory of circle-part-value-test where test materials exist
+#            (ex: build/compiler/circle-part-value-test)
+# venv_dir : python virtual environment home directory
+# driver   : path to the circle-part-driver executable
+
+VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/part_eval_one.py"
+WORKDIR="$1"; shift
+VIRTUALENV="$1"; shift
+CIRCLE_PART_DRIVER_PATH="$1"; shift
+
+TESTED=()
+PASSED=()
+FAILED=()
+
+for TESTCASE in "$@"; do
+ TESTED+=("${TESTCASE}")
+
+ # for simplicity, the folder uses the same name as ${TESTCASE}
+ TESTCASE_FOLDER="${WORKDIR}/${TESTCASE}"
+
+ PASSED_TAG="${TESTCASE_FOLDER}.passed"
+ rm -f "${PASSED_TAG}"
+
+ cat > "${TESTCASE_FOLDER}.log" <(
+ exec 2>&1
+ set -ex
+
+ # chdir into the folder as the ini file refers to the models by relative file names
+ pushd ${TESTCASE_FOLDER}
+
+ source "${VIRTUALENV}/bin/activate"
+ "${VIRTUALENV}/bin/python" "${VERIFY_SCRIPT_PATH}" \
+ --driver "${CIRCLE_PART_DRIVER_PATH}" \
+ --name "${TESTCASE}"
+
+ if [[ $? -eq 0 ]]; then
+ touch "${PASSED_TAG}"
+ fi
+
+ popd
+ )
+
+ if [[ -f "${PASSED_TAG}" ]]; then
+ PASSED+=("${TESTCASE}")
+ else
+ FAILED+=("${TESTCASE}")
+ fi
+done
+
+if [[ ${#TESTED[@]} -ne ${#PASSED[@]} ]]; then
+ echo "FAILED"
+ for TEST in "${FAILED[@]}"
+ do
+ echo "- ${TEST}"
+ done
+ exit 255
+fi
+
+echo "PASSED"
+exit 0
diff --git a/compiler/circle-part-value-test/part_eval_one.py b/compiler/circle-part-value-test/part_eval_one.py
new file mode 100755
index 000000000..b0b65fd74
--- /dev/null
+++ b/compiler/circle-part-value-test/part_eval_one.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+import numpy as np
+import tensorflow as tf
+import subprocess
+import argparse
+import traceback
+
+#
+# This script compares the execution results of the TFLite interpreter and
+# the partitioned model(s) produced from a circle model
+#
+# Basic usage for example:
+# part_eval_one.py \
+# --driver build/compiler/circle-part-driver/circle-part-driver \
+# --name test_file
+#
+parser = argparse.ArgumentParser()
+parser.add_argument('--driver', type=str, required=True)
+parser.add_argument('--name', type=str, required=True)
+args = parser.parse_args()
+
+driver = args.driver
+tflite_model = args.name + ".tflite"
+circle_model = args.name + ".circle"
+partition_conn_ini = args.name + ".conn.ini"
+
+# Build TFLite interpreter.
+interpreter = tf.lite.Interpreter(tflite_model)
+interpreter.allocate_tensors()
+
+# Generate random input data.
+num_inputs = len(interpreter.get_input_details())
+for i in range(num_inputs):
+ input_details = interpreter.get_input_details()[i]
+ if input_details["dtype"] == np.float32:
+ input_data = np.array(
+ np.random.random_sample(input_details["shape"]), input_details["dtype"])
+ elif input_details["dtype"] == np.uint8:
+ input_data = np.array(
+ np.random.randint(0, 256, size=input_details["shape"]),
+ input_details["dtype"])
+ elif input_details["dtype"] == np.bool_:
+ input_data = np.array(
+ np.random.choice(a=[True, False], size=input_details["shape"]),
+ input_details["dtype"])
+ else:
+ raise SystemExit("Unsupported input dtype")
+
+ interpreter.set_tensor(input_details["index"], input_data)
+ input_data.tofile(circle_model + ".input" + str(i))
+
+# Do inference
+interpreter.invoke()
+
+# Execute circle-part-driver.
+partition_command = [
+ driver, partition_conn_ini,
+ str(num_inputs), circle_model + ".input", circle_model + ".output"
+]
+print("Run: ")
+for arg in partition_command:
+ print(" ", arg, "\\")
+print("", flush=True)
+
+subprocess.run(partition_command, check=True)
+
+# Compare the results.
+for idx in range(len(interpreter.get_output_details())):
+ output_details = interpreter.get_output_details()[idx]
+ output_data = np.fromfile(circle_model + ".output" + str(idx),
+ output_details["dtype"])
+    with open(circle_model + ".output" + str(idx) + ".shape", 'r') as shape_file:
+        output_shape = [int(i) for i in shape_file.read().split(',')]
+ luci_output_data = np.reshape(output_data, output_shape)
+    try:
+        # float32 outputs may carry small numerical error; integer outputs must match exactly
+        if output_details["dtype"] == np.float32:
+            rtol, atol = 1.e-5, 1.e-5
+        elif output_details["dtype"] in (np.uint8, np.int32, np.int64):
+            rtol, atol = 0, 0
+        else:
+            raise SystemExit("Unsupported data type: " + str(output_details["dtype"]))
+        expected_data = interpreter.get_tensor(output_details["index"])
+        if not np.allclose(luci_output_data, expected_data, rtol=rtol, atol=atol):
+            raise SystemExit("Execution result of " + tflite_model +
+                             " does not match with " + circle_model)
+    except:
+        print(traceback.format_exc())
+        quit(255)
+
+quit(0)
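Note the file naming contract between this script and the driver, visible in the reads and writes above: inputs are dumped as <name>.circle.input<i>, and the driver is expected to produce <name>.circle.output<k> together with a comma-separated <name>.circle.output<k>.shape file. A sketch of the driver-side write under that assumption (file names illustrative):

    import numpy as np

    output = np.zeros((1, 4, 4, 8), dtype=np.float32)  # illustrative output tensor
    output.tofile("model.circle.output0")              # raw tensor data
    with open("model.circle.output0.shape", "w") as f:
        f.write(",".join(str(d) for d in output.shape))  # e.g. '1,4,4,8'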
diff --git a/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.001.part b/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.001.part
new file mode 100644
index 000000000..01b8c704e
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.001.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+ADD=acl_cl
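Each .part file is a small INI document: the [partition] section declares the available backends, the default backend, and the partitioning policy (comply=opcode), while the [OPCODE] section maps operator codes to backends. A hedged sketch of reading one with Python's configparser (the tools themselves parse these files in C++ via crew; note that configparser lowercases keys by default):

    import configparser

    cfg = configparser.ConfigParser()
    cfg.read("Net_InstanceNorm_003.001.part")  # assumes a local copy of the file above
    backends = cfg["partition"]["backends"].split(",")  # ['cpu', 'acl_cl']
    default = cfg["partition"]["default"]               # 'cpu'
    by_opcode = dict(cfg["OPCODE"])                     # {'add': 'acl_cl'}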
diff --git a/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.002.part b/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.002.part
new file mode 100644
index 000000000..dc378a448
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.002.part
@@ -0,0 +1,8 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SUB=acl_cl
+DIV=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.part b/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.part
new file mode 100644
index 000000000..d4d439d27
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Net_InstanceNorm_003.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+DIV=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Add_Sqrt_000.part b/compiler/circle-part-value-test/parts/Part_Add_Sqrt_000.part
new file mode 100644
index 000000000..402af87e9
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Add_Sqrt_000.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Add_Sqrt_Rsqrt_000.part b/compiler/circle-part-value-test/parts/Part_Add_Sqrt_Rsqrt_000.part
new file mode 100644
index 000000000..c6dba9f94
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Add_Sqrt_Rsqrt_000.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+RSQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Add_Sub_000.part b/compiler/circle-part-value-test/parts/Part_Add_Sub_000.part
new file mode 100644
index 000000000..905137ce7
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Add_Sub_000.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SUB=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_000.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_000.part
new file mode 100644
index 000000000..402af87e9
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_000.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_001.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_001.part
new file mode 100644
index 000000000..402af87e9
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_001.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_002.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_002.part
new file mode 100644
index 000000000..402af87e9
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_002.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_003.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_003.part
new file mode 100644
index 000000000..402af87e9
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_003.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_000.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_000.part
new file mode 100644
index 000000000..402af87e9
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_000.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_001.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_001.part
new file mode 100644
index 000000000..402af87e9
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_001.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_002.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_002.part
new file mode 100644
index 000000000..402af87e9
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_002.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+SQRT=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_003.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_003.part
new file mode 100644
index 000000000..0ec264c94
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_003.part
@@ -0,0 +1,7 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
+WWW=acl_cl
diff --git a/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_004.part b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_004.part
new file mode 100644
index 000000000..febab2246
--- /dev/null
+++ b/compiler/circle-part-value-test/parts/Part_Sqrt_Rsqrt_Add_004.part
@@ -0,0 +1,6 @@
+[partition]
+backends=cpu,acl_cl
+default=cpu
+comply=opcode
+
+[OPCODE]
diff --git a/compiler/circle-part-value-test/requires.cmake b/compiler/circle-part-value-test/requires.cmake
new file mode 100644
index 000000000..a9301f947
--- /dev/null
+++ b/compiler/circle-part-value-test/requires.cmake
@@ -0,0 +1,3 @@
+require("common-artifacts")
+require("circle-partitioner")
+require("circle-part-driver")
diff --git a/compiler/circle-part-value-test/test.lst b/compiler/circle-part-value-test/test.lst
new file mode 100644
index 000000000..8316560f0
--- /dev/null
+++ b/compiler/circle-part-value-test/test.lst
@@ -0,0 +1,20 @@
+# Add recipe names from /res/TensorFlowLiteRecipes to test.
+# Only add items that exist in the common-artifacts tests: tflite/circle files are copied as sources.
+#
+# add(RECIPE_NAME PARTITION_NAME)
+
+add(Part_Add_Sub_000 Part_Add_Sub_000)
+add(Part_Sqrt_Rsqrt_000 Part_Sqrt_Rsqrt_000)
+add(Part_Sqrt_Rsqrt_001 Part_Sqrt_Rsqrt_001)
+add(Part_Sqrt_Rsqrt_002 Part_Sqrt_Rsqrt_002)
+add(Part_Sqrt_Rsqrt_003 Part_Sqrt_Rsqrt_003)
+add(Part_Sqrt_Rsqrt_Add_000 Part_Sqrt_Rsqrt_Add_000)
+add(Part_Sqrt_Rsqrt_Add_001 Part_Sqrt_Rsqrt_Add_001)
+add(Part_Sqrt_Rsqrt_Add_002 Part_Sqrt_Rsqrt_Add_002)
+add(Part_Sqrt_Rsqrt_Add_003 Part_Sqrt_Rsqrt_Add_003)
+add(Part_Sqrt_Rsqrt_Add_004 Part_Sqrt_Rsqrt_Add_004)
+add(Part_Add_Sqrt_000 Part_Add_Sqrt_000)
+add(Part_Add_Sqrt_Rsqrt_000 Part_Add_Sqrt_Rsqrt_000)
+add(Net_InstanceNorm_003 Net_InstanceNorm_003)
+add(Net_InstanceNorm_003 Net_InstanceNorm_003.001)
+add(Net_InstanceNorm_003 Net_InstanceNorm_003.002)
diff --git a/compiler/circle-partitioner/CMakeLists.txt b/compiler/circle-partitioner/CMakeLists.txt
new file mode 100644
index 000000000..573e6ec12
--- /dev/null
+++ b/compiler/circle-partitioner/CMakeLists.txt
@@ -0,0 +1,17 @@
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+
+add_executable(circle_partitioner "${SOURCES}")
+target_link_libraries(circle_partitioner foder)
+target_link_libraries(circle_partitioner crew)
+target_link_libraries(circle_partitioner safemain)
+target_link_libraries(circle_partitioner luci_lang)
+target_link_libraries(circle_partitioner luci_log)
+target_link_libraries(circle_partitioner luci_import)
+target_link_libraries(circle_partitioner luci_service)
+target_link_libraries(circle_partitioner luci_export)
+target_link_libraries(circle_partitioner luci_partition)
+target_link_libraries(circle_partitioner arser)
+target_link_libraries(circle_partitioner vconone)
+target_link_libraries(circle_partitioner nncc_common)
+
+install(TARGETS circle_partitioner DESTINATION bin)
diff --git a/compiler/circle-partitioner/README.md b/compiler/circle-partitioner/README.md
new file mode 100644
index 000000000..7c387cf76
--- /dev/null
+++ b/compiler/circle-partitioner/README.md
@@ -0,0 +1,3 @@
+# circle-partitioner
+
+_circle-partitioner_ partitions a circle model into two or more circle models.
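A hedged sketch of driving it from Python (file names illustrative): the partition file and the input model are resolved relative to the work folder, and the partitioned models plus the .conn.json/.conn.ini connection files are written back into that folder.

    import subprocess

    work = "workdir"  # hypothetical folder containing the .part and .circle inputs
    subprocess.run(
        ["circle_partitioner", "Net_InstanceNorm_003.part",
         "Net_InstanceNorm_003.circle", work],
        check=True)  # raises CalledProcessError if partitioning fails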
diff --git a/compiler/circle-partitioner/requires.cmake b/compiler/circle-partitioner/requires.cmake
new file mode 100644
index 000000000..507a4d89d
--- /dev/null
+++ b/compiler/circle-partitioner/requires.cmake
@@ -0,0 +1,6 @@
+require("foder")
+require("crew")
+require("safemain")
+require("luci")
+require("arser")
+require("vconone")
diff --git a/compiler/circle-partitioner/src/CirclePartitioner.cpp b/compiler/circle-partitioner/src/CirclePartitioner.cpp
new file mode 100644
index 000000000..28ff22abc
--- /dev/null
+++ b/compiler/circle-partitioner/src/CirclePartitioner.cpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionRead.h"
+#include "PartitionExport.h"
+#include "HelperPath.h"
+#include "HelperStrings.h"
+
+#include <foder/FileLoader.h>
+
+#include <luci/Importer.h>
+#include <luci/Service/Validate.h>
+#include <luci/CircleExporter.h>
+#include <luci/CircleFileExpContract.h>
+#include <luci/Log.h>
+
+#include <arser/arser.h>
+#include <vconone/vconone.h>
+
+#include <iostream>
+#include <string>
+
+namespace
+{
+
+const char *opt_bks = "--backends";
+const char *opt_def = "--default";
+const char *opt_part = "partition";
+const char *opt_input = "input";
+const char *opt_work = "work";
+
+void print_version(void)
+{
+ std::cout << "circle-partitioner version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
+
+void build_arser(arser::Arser &arser)
+{
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
+ arser.add_argument(opt_bks)
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(false)
+ .help("Backends in CSV to use for partitioning");
+
+ arser.add_argument(opt_def)
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(false)
+ .help("Default backend to assign");
+
+ arser.add_argument(opt_part)
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("Partition file which provides backend to assign");
+ arser.add_argument(opt_input)
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("Input circle model filename");
+ arser.add_argument(opt_work)
+ .nargs(1)
+ .type(arser::DataType::STR)
+    .help("Work folder for partitioning, where input files exist and output files are produced");
+}
+
+std::unique_ptr<luci::Module> load_model(const std::string &input_path)
+{
+ // Load model from the file
+ foder::FileLoader file_loader{input_path};
+ std::vector<char> model_data = file_loader.load();
+
+ // Verify flatbuffers
+ flatbuffers::Verifier verifier{reinterpret_cast<uint8_t *>(model_data.data()), model_data.size()};
+ if (!circle::VerifyModelBuffer(verifier))
+ {
+ std::cerr << "ERROR: Invalid input file '" << input_path << "'" << std::endl;
+ return nullptr;
+ }
+
+ const circle::Model *circle_model = circle::GetModel(model_data.data());
+ if (circle_model == nullptr)
+ {
+ std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
+ return nullptr;
+ }
+
+ // Import from input Circle file
+ luci::Importer importer;
+ return importer.importModule(circle_model);
+}
+
+bool validate_module(luci::Module *module)
+{
+ for (size_t g = 0; g < module->size(); ++g)
+ {
+ auto graph = module->graph(g);
+ if (!luci::validate(graph))
+ {
+ std::cerr << "ERROR: Invalid circle model" << std::endl;
+ return false;
+ }
+ if (!luci::validate_name(graph))
+ {
+ std::cerr << "ERROR: circle model has empty name" << std::endl;
+ return false;
+ }
+ }
+
+ if (!luci::validate_unique_name(module))
+ {
+ std::cerr << "ERROR: circle model has duplicate names" << std::endl;
+ return false;
+ }
+
+ return true;
+}
+
+bool validate_partition(luci::PartitionTable &partition)
+{
+ if (partition.groups.size() == 0)
+ {
+    std::cerr << "There is no 'backends' information" << std::endl;
+ return false;
+ }
+ if (partition.default_group.empty())
+ {
+    std::cerr << "There is no 'default' backend information" << std::endl;
+ return false;
+ }
+ if (!partee::is_one_of(partition.default_group, partition.groups))
+ {
+    std::cerr << "'default' backend is not one of the 'backends' items" << std::endl;
+ return false;
+ }
+ for (auto &byopcode : partition.byopcodes)
+ {
+ if (!partee::is_one_of(byopcode.second, partition.groups))
+ {
+      std::cerr << "OPCODE " << byopcode.first << " is not assigned to one of the 'backends' items" << std::endl;
+ return false;
+ }
+ }
+ return true;
+}
+
+void dump(std::ostream &os, const luci::PartitionTable &table)
+{
+ os << "Backends:";
+ for (auto &group : table.groups)
+ {
+ os << " " << group;
+ if (table.default_group == group)
+ os << "(default)";
+ }
+ os << std::endl;
+
+ os << "Assign by OPCODE: " << std::endl;
+ for (auto &item : table.byopcodes)
+ os << " " << item.first << "=" << item.second << std::endl;
+}
+
+std::ostream &operator<<(std::ostream &os, const luci::PartitionTable &table)
+{
+ dump(os, table);
+ return os;
+}
+
+} // namespace
+
+int entry(int argc, char **argv)
+{
+ LOGGER(l);
+
+ arser::Arser arser("circle-partitioner provides circle model partitioning");
+
+ build_arser(arser);
+
+ try
+ {
+ arser.parse(argc, argv);
+ }
+ catch (const std::runtime_error &err)
+ {
+ std::cerr << err.what() << std::endl;
+ std::cerr << arser;
+ return EXIT_FAILURE;
+ }
+
+ std::string partition_file = arser.get<std::string>(opt_part);
+ std::string input_file = arser.get<std::string>(opt_input);
+ std::string work_folder = arser.get<std::string>(opt_work);
+
+ std::string partition_path = work_folder + "/" + partition_file;
+ std::string input_path = work_folder + "/" + input_file;
+
+ auto module = load_model(input_path);
+ if (module.get() == nullptr)
+ {
+ return EXIT_FAILURE;
+ }
+ if (!validate_module(module.get()))
+ {
+ return EXIT_FAILURE;
+ }
+
+ // Read partition information
+ INFO(l) << "--- Read PartitionConfig-----------------------" << std::endl;
+ auto partition = partee::read(partition_path);
+ INFO(l) << partition << std::endl;
+
+ // override with command line arguments
+ {
+ if (arser[opt_bks])
+ {
+      auto backends = arser.get<std::string>(opt_bks);
+      partition.groups = partee::csv_to_vector<std::string>(backends);
+ }
+ if (arser[opt_def])
+ {
+ partition.default_group = arser.get<std::string>(opt_def);
+ }
+ }
+ if (!validate_partition(partition))
+ {
+ return EXIT_FAILURE;
+ }
+
+ INFO(l) << "--- PartitionConfig final----------------------" << std::endl;
+ INFO(l) << partition << std::endl;
+
+ // apply partition to module
+ auto pms = luci::apply(module.get(), partition);
+
+ // validate partitioned modules
+ for (auto &pmodule : pms.pmodules)
+ {
+ for (size_t g = 0; g < pmodule.module->size(); ++g)
+ {
+ auto graph = pmodule.module->graph(g);
+ if (graph == nullptr)
+ {
+ std::cerr << "ERROR: Failed to create partition model" << std::endl;
+ return EXIT_FAILURE;
+ }
+ if (!luci::validate(graph))
+ {
+ std::cerr << "ERROR: Failed to create partition model" << std::endl;
+ return EXIT_FAILURE;
+ }
+ }
+ }
+
+ INFO(l) << "--- Partition Export---------------------------" << std::endl;
+ uint32_t idx = 1;
+ for (auto &pmodule : pms.pmodules)
+ {
+ // Export to output circle file
+ luci::CircleExporter exporter;
+
+ auto output_path = partee::make_path(work_folder, input_path, idx, pmodule.group);
+ pmodule.name = partee::get_filename_ext(output_path);
+ INFO(l) << "--- " << output_path << ": " << pmodule.name << std::endl;
+
+ luci::CircleFileExpContract contract(pmodule.module.get(), output_path);
+ if (!exporter.invoke(&contract))
+ {
+ std::cerr << "ERROR: Failed to export '" << output_path << "'" << std::endl;
+ return EXIT_FAILURE;
+ }
+ idx++;
+ }
+
+ INFO(l) << "--- Partition connection information-----------" << std::endl;
+ if (!partee::export_part_conn_json(work_folder, input_file, module.get(), pms))
+ {
+ return EXIT_FAILURE;
+ }
+ if (!partee::export_part_conn_ini(work_folder, input_file, module.get(), pms))
+ {
+ return EXIT_FAILURE;
+ }
+
+ INFO(l) << "--- Partition done-----------------------------" << std::endl << std::endl;
+
+ return EXIT_SUCCESS;
+}
diff --git a/compiler/circle-partitioner/src/HelperPath.cpp b/compiler/circle-partitioner/src/HelperPath.cpp
new file mode 100644
index 000000000..fc4bb2c70
--- /dev/null
+++ b/compiler/circle-partitioner/src/HelperPath.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "HelperPath.h"
+
+#include <cassert>
+#include <sstream>
+#include <stdlib.h>
+
+namespace partee
+{
+
+bool make_dir(const std::string &path)
+{
+ std::string command("mkdir -p ");
+ command += path;
+ int ret = ::system(command.c_str());
+ return ret == 0;
+}
+
+std::string get_filename_ext(const std::string &base)
+{
+ // find last '/' to get filename.ext
+ auto pos = base.find_last_of("/");
+ if (pos == std::string::npos)
+ return base;
+
+ return base.substr(pos + 1);
+}
+
+std::string make_path(const std::string &base, const std::string &input, uint32_t idx,
+ const std::string &backend)
+{
+ auto filename_ext = get_filename_ext(input);
+
+ // We will assume file type .circle if not given
+ // TODO maybe throw if there is no extension?
+ std::string filename = filename_ext;
+ std::string ext = "circle";
+
+ auto pos = filename_ext.find_last_of(".");
+ if (pos != std::string::npos)
+ {
+ filename = filename_ext.substr(0, pos);
+ ext = filename_ext.substr(pos + 1);
+ }
+
+  // format idx with 5-digit zero padding, e.g. '00123'
+ uint32_t length = 5;
+ auto seq = std::string(length, '0').append(std::to_string(idx));
+ auto seq_fmt = seq.substr(seq.size() - length);
+
+ return base + "/" + filename + "." + seq_fmt + "_" + backend + "." + ext;
+}
+
+} // namespace partee
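The naming scheme produced by make_path above can be previewed with a rough Python equivalent (ignoring dot-file edge cases):

    def make_path(base, input_path, idx, backend):
        # base/filename.00001_backend.ext, with a 5-digit zero-padded index
        name = input_path.rsplit("/", 1)[-1]
        stem, _, ext = name.rpartition(".")
        if not stem:
            stem, ext = name, "circle"  # assume .circle when no extension is given
        return "%s/%s.%05d_%s.%s" % (base, stem, idx, backend, ext)

    # make_path("out", "model.circle", 1, "cpu") -> 'out/model.00001_cpu.circle'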
diff --git a/compiler/circle-partitioner/src/HelperPath.h b/compiler/circle-partitioner/src/HelperPath.h
new file mode 100644
index 000000000..e38e3a903
--- /dev/null
+++ b/compiler/circle-partitioner/src/HelperPath.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_HELPER_PATH_H__
+#define __CIRCLE_HELPER_PATH_H__
+
+#include <string>
+
+namespace partee
+{
+
+/**
+ * @brief create folder
+ */
+bool make_dir(const std::string &path);
+
+/**
+ * @brief get filename part of base
+ */
+std::string get_filename_ext(const std::string &base);
+
+/**
+ * @brief Make file path from base and backend
+ */
+std::string make_path(const std::string &base, const std::string &input, uint32_t idx,
+ const std::string &backend);
+
+} // namespace partee
+
+#endif // __CIRCLE_HELPER_PATH_H__
diff --git a/compiler/circle-partitioner/src/HelperStrings.cpp b/compiler/circle-partitioner/src/HelperStrings.cpp
new file mode 100644
index 000000000..96d000c74
--- /dev/null
+++ b/compiler/circle-partitioner/src/HelperStrings.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "HelperStrings.h"
+
+#include <algorithm>
+#include <sstream>
+
+namespace partee
+{
+
+template <> std::vector<std::string> csv_to_vector(const std::string &str)
+{
+ std::vector<std::string> ret;
+ std::istringstream is(str);
+ for (std::string item; std::getline(is, item, ',');)
+ {
+ ret.push_back(item);
+ }
+ return ret;
+}
+
+bool is_one_of(const std::string &item, const std::vector<std::string> &items)
+{
+ return std::find(items.begin(), items.end(), item) != items.end();
+}
+
+} // namespace partee
diff --git a/compiler/circle-partitioner/src/HelperStrings.h b/compiler/circle-partitioner/src/HelperStrings.h
new file mode 100644
index 000000000..2af14c1ff
--- /dev/null
+++ b/compiler/circle-partitioner/src/HelperStrings.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_HELPER_STRINGS_H__
+#define __CIRCLE_HELPER_STRINGS_H__
+
+#include <string>
+#include <vector>
+
+namespace partee
+{
+
+template <typename T> std::vector<T> csv_to_vector(const std::string &str);
+
+bool is_one_of(const std::string &item, const std::vector<std::string> &items);
+
+} // namespace partee
+
+#endif // __CIRCLE_HELPER_STRINGS_H__
diff --git a/compiler/circle-partitioner/src/PartitionExport.cpp b/compiler/circle-partitioner/src/PartitionExport.cpp
new file mode 100644
index 000000000..a61451d66
--- /dev/null
+++ b/compiler/circle-partitioner/src/PartitionExport.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionExport.h"
+#include "HelperPath.h"
+
+#include <crew/PConfig.h>
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <vector>
+
+namespace
+{
+
+std::string export_file_path(const std::string &output_base, const std::string &input,
+ const std::string &ext)
+{
+ auto filename_ext = partee::get_filename_ext(input);
+ auto pos = filename_ext.find_last_of(".");
+  assert(pos != std::string::npos);
+ auto filename = filename_ext.substr(0, pos);
+ auto filepath = output_base + "/" + filename + ".conn" + ext;
+ return filepath;
+}
+
+} // namespace
+
+namespace
+{
+
+void graph_io_to_config_part(loco::Graph *graph, crew::Part &part)
+{
+ assert(graph != nullptr);
+
+ auto *gis = graph->inputs();
+ auto *gos = graph->outputs();
+ for (uint32_t i = 0; i < gis->size(); ++i)
+ {
+ auto *gi = gis->at(i);
+ assert(gi != nullptr);
+ part.inputs.push_back(gi->name());
+ }
+ for (uint32_t i = 0; i < gos->size(); ++i)
+ {
+ auto *go = gos->at(i);
+ assert(go != nullptr);
+ part.outputs.push_back(go->name());
+ }
+}
+
+void pms2config(const luci::PartedModules &pms, crew::PConfig &pconfig)
+{
+ for (auto &pmodule : pms.pmodules)
+ {
+ auto *graph = pmodule.module->graph();
+
+ crew::Part part;
+ part.model_file = pmodule.name;
+ graph_io_to_config_part(graph, part);
+
+ pconfig.parts.push_back(part);
+ }
+}
+
+} // namespace
+
+namespace partee
+{
+
+bool export_part_conn_json(const std::string &output_base, const std::string &input,
+ const luci::Module *source, luci::PartedModules &pms)
+{
+ crew::PConfig pconfig;
+
+  // TODO: is using only the main graph for graph I/O enough?
+ auto *graph = source->graph();
+
+ pconfig.source.model_file = input;
+ graph_io_to_config_part(graph, pconfig.source);
+
+ pms2config(pms, pconfig);
+
+ auto filepath_json = export_file_path(output_base, input, ".json");
+ std::ofstream fs(filepath_json.c_str(), std::ofstream::binary | std::ofstream::trunc);
+ if (not fs.good())
+ {
+ std::cerr << "ERROR: Failed to create file: " << filepath_json;
+ return false;
+ }
+ if (not write_json(fs, pconfig))
+ {
+ std::cerr << "ERROR: Failed to write json file: " << filepath_json;
+ return false;
+ }
+ fs.close();
+
+ return true;
+}
+
+bool export_part_conn_ini(const std::string &output_base, const std::string &input,
+ const luci::Module *source, luci::PartedModules &pms)
+{
+ crew::PConfig pconfig;
+
+  // TODO: is using only the main graph for graph I/O enough?
+ auto *graph = source->graph();
+
+ pconfig.source.model_file = input;
+ graph_io_to_config_part(graph, pconfig.source);
+
+ pms2config(pms, pconfig);
+
+ auto filepath_ini = export_file_path(output_base, input, ".ini");
+ std::ofstream fs(filepath_ini.c_str(), std::ofstream::binary | std::ofstream::trunc);
+ if (not fs.good())
+ {
+ std::cerr << "ERROR: Failed to create file: " << filepath_ini;
+ return false;
+ }
+ if (not write_ini(fs, pconfig))
+ {
+ std::cerr << "ERROR: Failed to write ini file: " << filepath_ini;
+ return false;
+ }
+ fs.close();
+
+ return true;
+}
+
+} // namespace partee
diff --git a/compiler/circle-partitioner/src/PartitionExport.h b/compiler/circle-partitioner/src/PartitionExport.h
new file mode 100644
index 000000000..fd287dcd3
--- /dev/null
+++ b/compiler/circle-partitioner/src/PartitionExport.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_PARTITION_EXPORT_H__
+#define __CIRCLE_PARTITION_EXPORT_H__
+
+#include <luci/Partition.h>
+
+#include <string>
+
+namespace partee
+{
+
+/**
+ * @brief This will save the partition connection to a JSON format file
+ */
+bool export_part_conn_json(const std::string &output_base, const std::string &input,
+ const luci::Module *source, luci::PartedModules &pms);
+
+/**
+ * @brief This will save the partition connection to an INI format file
+ */
+bool export_part_conn_ini(const std::string &output_base, const std::string &input,
+ const luci::Module *source, luci::PartedModules &pms);
+
+} // namespace partee
+
+#endif // __CIRCLE_PARTITION_EXPORT_H__
diff --git a/compiler/circle-partitioner/src/PartitionRead.cpp b/compiler/circle-partitioner/src/PartitionRead.cpp
new file mode 100644
index 000000000..b179ecb59
--- /dev/null
+++ b/compiler/circle-partitioner/src/PartitionRead.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionRead.h"
+#include "HelperStrings.h"
+
+#include <crew/PConfigIni.h>
+#include <crew/PConfigIniDump.h>
+#include <luci/Log.h>
+
+#include <stdexcept>
+
+namespace
+{
+
+using namespace partee;
+
+const char *_section_partition = "partition";
+const char *_section_OPCODE = "OPCODE";
+
+const char *_key_backends = "backends";
+const char *_key_default = "default";
+const char *_key_underscore = "_";
+
+luci::PartitionTable parse_table(const crew::Sections &sections)
+{
+ luci::PartitionTable table;
+
+ for (auto &section : sections)
+ {
+ if (section.name == _section_partition)
+ {
+ auto &items = section.items;
+ if (items.find(_key_backends) == items.end())
+ {
+ throw std::invalid_argument("'backends' is required");
+ }
+ if (items.find(_key_default) == items.end())
+ {
+ throw std::invalid_argument("'default' is required");
+ }
+
+ table.groups = csv_to_vector<std::string>(items.at(_key_backends));
+ table.default_group = items.at(_key_default);
+ }
+ else if (section.name == _section_OPCODE)
+ {
+ auto &items = section.items;
+
+ for (auto &item : items)
+ {
+ if (item.first == _key_underscore)
+ table.default_group = item.second;
+ else
+ {
+ table.byopcodes.emplace(item.first, item.second);
+ }
+ }
+ }
+ }
+
+ return table;
+}
+
+} // namespace
+
+namespace partee
+{
+
+luci::PartitionTable read(const std::string &path)
+{
+ LOGGER(l);
+
+ INFO(l) << "PartitionConfig: " << path << std::endl;
+
+ auto partition_config = crew::read_ini(path);
+
+ INFO(l) << partition_config << std::endl;
+
+ auto partition_table = parse_table(partition_config);
+
+ return partition_table;
+}
+
+} // namespace partee
diff --git a/compiler/stdex/include/stdex/Queue.h b/compiler/circle-partitioner/src/PartitionRead.h
index c72297bc8..9b07b328b 100644
--- a/compiler/stdex/include/stdex/Queue.h
+++ b/compiler/circle-partitioner/src/PartitionRead.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,25 +14,24 @@
* limitations under the License.
*/
-#ifndef __STDEX_QUEUE_H__
-#define __STDEX_QUEUE_H__
+#ifndef __CIRCLE_PARTITION_READ_H__
+#define __CIRCLE_PARTITION_READ_H__
-#include <queue>
+#include <luci/IR/Module.h>
+#include <luci/Partition.h>
-namespace stdex
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace partee
{
/**
- * @brief Take the front (= first) element from the queue
- * @note The queue SHOULD have at least one element
+ * @brief Reads and parses a partition file and returns a PartitionTable
*/
-template <typename T> T take(std::queue<T> &q)
-{
- auto res = q.front();
- q.pop();
- return res;
-}
+luci::PartitionTable read(const std::string &path);
-} // namespace stdex
+} // namespace partee
-#endif // __STDEX_QUEUE_H__
+#endif // __CIRCLE_PARTITION_READ_H__
diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt
index 5075b13d5..a5f5f61c4 100644
--- a/compiler/circle-quantizer/CMakeLists.txt
+++ b/compiler/circle-quantizer/CMakeLists.txt
@@ -10,6 +10,7 @@ target_link_libraries(circle-quantizer luci_import)
target_link_libraries(circle-quantizer luci_service)
target_link_libraries(circle-quantizer luci_pass)
target_link_libraries(circle-quantizer luci_export)
+target_link_libraries(circle-quantizer luci_env)
target_link_libraries(circle-quantizer arser)
target_link_libraries(circle-quantizer vconone)
diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp
index 54b38a170..720430e5a 100644
--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp
+++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp
@@ -21,6 +21,7 @@
#include <luci/Service/Validate.h>
#include <luci/CircleExporter.h>
#include <luci/CircleFileExpContract.h>
+#include <luci/UserSettings.h>
#include <oops/InternalExn.h>
#include <arser/arser.h>
@@ -57,47 +58,53 @@ int entry(int argc, char **argv)
luci::CircleOptimizer optimizer;
auto options = optimizer.options();
+ auto settings = luci::UserSettings::settings();
const std::string qdqw = "--quantize_dequantize_weights";
const std::string qwmm = "--quantize_with_minmax";
const std::string rq = "--requantize";
+ const std::string gpd = "--generate_profile_data";
+
arser::Arser arser("circle-quantizer provides circle model quantization");
arser.add_argument("--version")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("Show version information and exit")
- .exit_with(print_version);
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
arser.add_argument(qdqw)
- .nargs(3)
- .type(arser::DataType::STR_VEC)
- .required(false)
- .help("Quantize-dequantize weight values required action before quantization. "
- "Three arguments required: input_dtype(float32) "
- "output_dtype(uint8) granularity(layer, channel)");
+ .nargs(3)
+ .type(arser::DataType::STR_VEC)
+ .required(false)
+ .help("Quantize-dequantize weight values required action before quantization. "
+ "Three arguments required: input_dtype(float32) "
+ "output_dtype(uint8) granularity(layer, channel)");
arser.add_argument(qwmm)
- .nargs(3)
- .type(arser::DataType::STR_VEC)
- .required(false)
- .help("Quantize with min/max values. "
- "Three arguments required: input_dtype(float32) "
- "output_dtype(uint8) granularity(layer, channel)");
+ .nargs(3)
+ .type(arser::DataType::STR_VEC)
+ .required(false)
+ .help("Quantize with min/max values. "
+ "Three arguments required: input_dtype(float32) "
+ "output_dtype(uint8) granularity(layer, channel)");
arser.add_argument(rq)
- .nargs(2)
- .type(arser::DataType::STR_VEC)
- .required(false)
- .help("Requantize a quantized model. "
- "Two arguments required: input_dtype(int8) "
- "output_dtype(uint8)");
+ .nargs(2)
+ .type(arser::DataType::STR_VEC)
+ .required(false)
+ .help("Requantize a quantized model. "
+ "Two arguments required: input_dtype(int8) "
+ "output_dtype(uint8)");
arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
+ arser.add_argument(gpd).nargs(0).required(false).default_value(false).help(
+ "This will turn on profiling data generation.");
+
try
{
arser.parse(argc, argv);
@@ -109,13 +116,20 @@ int entry(int argc, char **argv)
return 255;
}
- if (arser[qdqw])
{
- if (arser[qwmm] || arser[rq])
+  // only one of the qdqw, qwmm, rq options can be used
+ int32_t opt_used = arser[qdqw] ? 1 : 0;
+ opt_used += arser[qwmm] ? 1 : 0;
+ opt_used += arser[rq] ? 1 : 0;
+ if (opt_used != 1)
{
print_exclusive_options();
return 255;
}
+ }
+
+ if (arser[qdqw])
+ {
auto values = arser.get<std::vector<std::string>>(qdqw);
if (values.size() != 3)
{
@@ -131,11 +145,6 @@ int entry(int argc, char **argv)
if (arser[qwmm])
{
- if (arser[qdqw] || arser[rq])
- {
- print_exclusive_options();
- return 255;
- }
auto values = arser.get<std::vector<std::string>>(qwmm);
if (values.size() != 3)
{
@@ -151,11 +160,6 @@ int entry(int argc, char **argv)
if (arser[rq])
{
- if (arser[qwmm] || arser[qdqw])
- {
- print_exclusive_options();
- return 255;
- }
auto values = arser.get<std::vector<std::string>>(rq);
if (values.size() != 2)
{
@@ -171,6 +175,9 @@ int entry(int argc, char **argv)
std::string input_path = arser.get<std::string>("input");
std::string output_path = arser.get<std::string>("output");
+ if (arser[gpd])
+ settings->set(luci::UserSettings::Key::ProfilingDataGen, true);
+
// Load model from the file
foder::FileLoader file_loader{input_path};
std::vector<char> model_data = file_loader.load();
diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp
index 5bab9f59e..70f3c8d84 100644
--- a/compiler/circle-tensordump/driver/Driver.cpp
+++ b/compiler/circle-tensordump/driver/Driver.cpp
@@ -29,14 +29,14 @@
int entry(int argc, char **argv)
{
arser::Arser arser{
- "circle-tensordump allows users to retrieve tensor information from a Circle model file"};
+ "circle-tensordump allows users to retrieve tensor information from a Circle model file"};
arser.add_argument("circle").nargs(1).type(arser::DataType::STR).help("Circle file path to dump");
arser.add_argument("--tensors").nargs(0).help("Dump to console");
arser.add_argument("--tensors_to_hdf5")
- .nargs(1)
- .type(arser::DataType::STR)
- .help("Dump to hdf5 file. Specify hdf5 file path to be dumped");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("Dump to hdf5 file. Specify hdf5 file path to be dumped");
try
{
diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp
index dee2f3620..d5c3fe6fa 100644
--- a/compiler/circle-tensordump/src/Dump.cpp
+++ b/compiler/circle-tensordump/src/Dump.cpp
@@ -253,7 +253,7 @@ void write_vector_data_to_hdf5(H5::H5File &file, std::string &group_name, std::s
return;
auto dataspace = std::make_unique<H5::DataSpace>(dims.size(), dims.data());
auto dataset = std::make_unique<H5::DataSet>(
- file.createDataSet(group_name + "/" + dataset_name, type, *dataspace));
+ file.createDataSet(group_name + "/" + dataset_name, type, *dataspace));
dataset->write(data->data(), type);
}
@@ -264,7 +264,7 @@ void write_scalar_data_to_hdf5(H5::H5File &file, std::string &group_name, std::s
{
auto dataspace = std::make_unique<H5::DataSpace>(H5S_SCALAR);
auto dataset = std::make_unique<H5::DataSet>(
- file.createDataSet(group_name + "/" + dataset_name, type, *dataspace));
+ file.createDataSet(group_name + "/" + dataset_name, type, *dataspace));
dataset->write(&data, type);
}
@@ -308,7 +308,7 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model,
// create a group for each tensor whose name is its tensor name
std::string group_name = ::mangle(tensor->name()->c_str());
std::unique_ptr<H5::Group> tensor_group =
- std::make_unique<H5::Group>(file.createGroup(group_name));
+ std::make_unique<H5::Group>(file.createGroup(group_name));
// write a buffer data
uint32_t buff_idx = tensor->buffer();
diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 3a95e2be2..bb944201e 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -11,14 +11,27 @@
## TFLITE RECIPE
Add(Net_Preactivation_BN_000 PASS fuse_preactivation_batchnorm)
+Add(Net_BroadcastTo_AddV2_000 PASS resolve_customop_add)
+Add(Net_BroadcastTo_AddV2_001 PASS resolve_customop_add)
+Add(Net_Conv_Add_Mul_000 PASS fuse_batchnorm_with_conv)
+Add(Net_Conv_Add_Mul_001 PASS fuse_batchnorm_with_conv)
+Add(Net_Conv_Add_Mul_002 PASS fuse_batchnorm_with_conv)
+Add(Net_Conv_Min_Max_000 PASS transform_min_max_to_relu6)
+Add(Net_Conv_Relu6_000 PASS fuse_activation_function)
+Add(Net_DwConv_BN_000 PASS fuse_batchnorm_with_dwconv)
+Add(Net_DwConv_BN_001 PASS fuse_batchnorm_with_dwconv)
+Add(Net_Reshape_Reshape_000 PASS remove_redundant_reshape)
+Add(Net_Squeeze_Squeeze_000 PASS substitute_squeeze_to_reshape)
Add(Net_TConv_Add_000 PASS fuse_add_with_tconv)
Add(Net_TConv_Add_001 PASS fuse_add_with_tconv)
Add(Net_TConv_Add_002 PASS fuse_add_with_tconv)
Add(Net_TConv_BN_000 PASS fuse_batchnorm_with_tconv)
Add(Net_TConv_BN_001 PASS fuse_batchnorm_with_tconv)
+Add(Net_TConv_BN_002 PASS fuse_batchnorm_with_tconv)
Add(Net_InstanceNorm_001 PASS fuse_instnorm)
Add(Net_InstanceNorm_002 PASS fuse_instnorm)
Add(Net_InstanceNorm_003 PASS fuse_instnorm)
+Add(Net_Maximum_Minimum_000 PASS transform_min_max_to_relu6)
Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul)
Add(MatMul_000 PASS resolve_customop_matmul)
Add(DepthwiseConv2D_003 PASS)
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index cde5de8fd..da05a0a9a 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -51,157 +51,266 @@ int entry(int argc, char **argv)
arser::Arser arser("circle2circle provides circle model optimization and transformations");
arser.add_argument("--version")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("Show version information and exit")
- .exit_with(print_version);
-
- arser.add_argument("--all").nargs(0).required(false).default_value(false).help(
- "Enable all optimize options");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
+ arser.add_argument("--O1").nargs(0).required(false).default_value(false).help(
+ "Enable O1 optimize options");
+
+ arser.add_argument("--fold_add_v2")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fold AddV2 operators with constant inputs");
+
+ arser.add_argument("--fold_cast")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fold Cast operators with constant input");
arser.add_argument("--fold_dequantize")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will fold dequantize op");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fold dequantize op");
+
+ arser.add_argument("--fold_sparse_to_dense")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fold SparseToDense operator");
+
+ arser.add_argument("--forward_reshape_to_unaryop")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+    .help("This will move Reshape after UnaryOp under certain conditions");
arser.add_argument("--fuse_activation_function")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will fuse Activation function to a preceding operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse Activation function to a preceding operator");
arser.add_argument("--fuse_add_with_tconv")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will fuse Add operator to Transposed Convolution operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse Add operator to Transposed Convolution operator");
+
+ arser.add_argument("--fuse_batchnorm_with_conv")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse BatchNorm operators to Convolution operator");
+
+ arser.add_argument("--fuse_batchnorm_with_dwconv")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse BatchNorm operators to Depthwise Convolution operator");
arser.add_argument("--fuse_batchnorm_with_tconv")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will fuse BatchNorm operators to Transposed Convolution operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse BatchNorm operators to Transposed Convolution operator");
arser.add_argument("--fuse_bcq")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will fuse operators and apply Binary Coded Quantization");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse operators and apply Binary Coded Quantization");
arser.add_argument("--fuse_instnorm")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will fuse operators to InstanceNorm operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse operators to InstanceNorm operator");
arser.add_argument("--make_batchnorm_gamma_positive")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will make negative gamma of BatchNorm into a small positive value (1e-10). Note "
- "that this pass can change the execution result of the model. So, use it only when the "
- "impact is known to be acceptable.");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will make negative gamma of BatchNorm into a small positive value (1e-10). Note "
+ "that this pass can change the execution result of the model. So, use it only when the "
+ "impact is known to be acceptable.");
arser.add_argument("--fuse_preactivation_batchnorm")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will fuse BatchNorm operators of pre-activations to Convolution operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse BatchNorm operators of pre-activations to Convolution operator");
+
+ arser.add_argument("--remove_redundant_reshape")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse or remove subsequent Reshape operators");
arser.add_argument("--remove_redundant_transpose")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will fuse or remove subsequent Transpose operators");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will fuse or remove subsequent Transpose operators");
+
+ arser.add_argument("--remove_unnecessary_reshape")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will remove unnecessary reshape operators");
+
+ arser.add_argument("--remove_unnecessary_slice")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will remove unnecessary slice operators");
+
+ arser.add_argument("--remove_unnecessary_strided_slice")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will remove unnecessary strided slice operators");
+
+ arser.add_argument("--remove_unnecessary_split")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will remove unnecessary split operators");
arser.add_argument("--replace_cw_mul_add_with_depthwise_conv")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will replace channel-wise mul/add with DepthwiseConv2D operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will replace channel-wise mul/add with DepthwiseConv2D operator");
arser.add_argument("--resolve_customop_add")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will convert Custom(Add) to Add operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will convert Custom(Add) to Add operator");
arser.add_argument("--resolve_customop_batchmatmul")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will convert Custom(BatchMatmul) to BatchMatmul operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will convert Custom(BatchMatmul) to BatchMatmul operator");
arser.add_argument("--resolve_customop_matmul")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will convert Custom(Matmul) to Matmul operator");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will convert Custom(Matmul) to Matmul operator");
arser.add_argument("--shuffle_weight_to_16x1float32")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will convert weight format of FullyConnected to SHUFFLED16x1FLOAT32. Note that "
- "it only converts weights whose row is a multiple of 16");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will convert weight format of FullyConnected to SHUFFLED16x1FLOAT32. Note that "
+ "it only converts weights whose row is a multiple of 16");
arser.add_argument("--substitute_pack_to_reshape")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will convert single input Pack to Reshape");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will convert single input Pack to Reshape");
+
+ arser.add_argument("--substitute_squeeze_to_reshape")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+    .help("This will convert Squeeze to Reshape under certain conditions");
+
+ arser.add_argument("--substitute_transpose_to_reshape")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will convert single input Transpose to Reshape");
+
+ arser.add_argument("--convert_nchw_to_nhwc")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Experimental: This will convert NCHW operators to NHWC under the assumption that "
+ "input model is NCHW.");
+
+ arser.add_argument("--nchw_to_nhwc_preserve_input_shape")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Preserve the input shape of the model (argument for --convert_nchw_to_nhwc).");
+
+ arser.add_argument("--nchw_to_nhwc_preserve_output_shape")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Preserve the output shape of the model (argument for --convert_nchw_to_nhwc).");
+
+ arser.add_argument("--transform_min_max_to_relu6")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Transform Minimum-Maximum pattern to Relu6 operator");
arser.add_argument("--mute_warnings")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will turn off warning messages");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will turn off warning messages");
arser.add_argument("--disable_validation")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("This will turn off operator validations. May help input model investigation.");
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will turn off operator validations. May help input model investigation.");
+
+ arser.add_argument("--generate_profile_data")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will turn on profiling data generation.");
arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
// sparsification argument
arser.add_argument("--sparsify_tensor")
- .nargs(1)
- .type(arser::DataType::STR)
- .required(false)
- .help("Tensor name that you want to sparsify");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(false)
+ .help("Tensor name that you want to sparsify");
arser.add_argument("--sparsify_traversal_order")
- .nargs(1)
- .type(arser::DataType::STR)
- .required(false)
- .default_value("0,1,2,3")
- .help("Traversal order of dimensions. Default value: 0,1,2,3");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(false)
+ .default_value("0,1,2,3")
+ .help("Traversal order of dimensions. Default value: 0,1,2,3");
arser.add_argument("--sparsify_format")
- .nargs(1)
- .type(arser::DataType::STR)
- .required(false)
- .default_value("d,s")
- .help("Format of each dimension. 'd' stands for dense, 's' stands for sparse(CSR). Default "
- "value: d,s");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(false)
+ .default_value("d,s")
+ .help("Format of each dimension. 'd' stands for dense, 's' stands for sparse(CSR). Default "
+ "value: d,s");
arser.add_argument("--sparsify_block_size")
- .nargs(1)
- .type(arser::DataType::STR)
- .required(false)
- .help("Size of each block dimension");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(false)
+ .help("Size of each block dimension");
arser.add_argument("--sparsify_block_map")
- .nargs(1)
- .type(arser::DataType::STR)
- .required(false)
- .default_value("0,1")
- .help("Map from block dimension to the original tensor dimension. Default value: 0,1");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(false)
+ .default_value("0,1")
+ .help("Map from block dimension to the original tensor dimension. Default value: 0,1");
try
{
@@ -214,7 +323,7 @@ int entry(int argc, char **argv)
return 255;
}
- if (arser.get<bool>("--all"))
+ if (arser.get<bool>("--O1"))
{
options->enable(Algorithms::FuseBCQ);
options->enable(Algorithms::FuseInstanceNorm);
@@ -224,12 +333,24 @@ int entry(int argc, char **argv)
options->enable(Algorithms::RemoveRedundantTranspose);
options->enable(Algorithms::SubstitutePackToReshape);
}
+ if (arser.get<bool>("--fold_add_v2"))
+ options->enable(Algorithms::FoldAddV2);
+ if (arser.get<bool>("--fold_cast"))
+ options->enable(Algorithms::FoldCast);
if (arser.get<bool>("--fold_dequantize"))
options->enable(Algorithms::FoldDequantize);
+ if (arser.get<bool>("--fold_sparse_to_dense"))
+ options->enable(Algorithms::FoldSparseToDense);
+ if (arser.get<bool>("--forward_reshape_to_unaryop"))
+ options->enable(Algorithms::ForwardReshapeToUnaryOp);
if (arser.get<bool>("--fuse_activation_function"))
options->enable(Algorithms::FuseActivationFunction);
+ if (arser.get<bool>("--fuse_batchnorm_with_conv"))
+ options->enable(Algorithms::FuseBatchNormWithConv);
if (arser.get<bool>("--fuse_add_with_tconv"))
options->enable(Algorithms::FuseAddWithTConv);
+ if (arser.get<bool>("--fuse_batchnorm_with_dwconv"))
+ options->enable(Algorithms::FuseBatchNormWithDwConv);
if (arser.get<bool>("--fuse_batchnorm_with_tconv"))
options->enable(Algorithms::FuseBatchNormWithTConv);
if (arser.get<bool>("--fuse_bcq"))
@@ -240,8 +361,18 @@ int entry(int argc, char **argv)
options->enable(Algorithms::MakeBatchNormGammaPositive);
if (arser.get<bool>("--fuse_preactivation_batchnorm"))
options->enable(Algorithms::FusePreActivationBatchNorm);
+ if (arser.get<bool>("--remove_redundant_reshape"))
+ options->enable(Algorithms::RemoveRedundantReshape);
if (arser.get<bool>("--remove_redundant_transpose"))
options->enable(Algorithms::RemoveRedundantTranspose);
+ if (arser.get<bool>("--remove_unnecessary_reshape"))
+ options->enable(Algorithms::RemoveUnnecessaryReshape);
+ if (arser.get<bool>("--remove_unnecessary_slice"))
+ options->enable(Algorithms::RemoveUnnecessarySlice);
+ if (arser.get<bool>("--remove_unnecessary_strided_slice"))
+ options->enable(Algorithms::RemoveUnnecessaryStridedSlice);
+ if (arser.get<bool>("--remove_unnecessary_split"))
+ options->enable(Algorithms::RemoveUnnecessarySplit);
if (arser.get<bool>("--replace_cw_mul_add_with_depthwise_conv"))
options->enable(Algorithms::ReplaceMulAddWithDepthwiseConv);
if (arser.get<bool>("--resolve_customop_add"))
@@ -254,11 +385,19 @@ int entry(int argc, char **argv)
options->enable(Algorithms::ShuffleWeightTo16x1Float32);
if (arser.get<bool>("--substitute_pack_to_reshape"))
options->enable(Algorithms::SubstitutePackToReshape);
+ if (arser.get<bool>("--substitute_squeeze_to_reshape"))
+ options->enable(Algorithms::SubstituteSqueezeToReshape);
+ if (arser.get<bool>("--substitute_transpose_to_reshape"))
+ options->enable(Algorithms::SubstituteTransposeToReshape);
+ if (arser.get<bool>("--transform_min_max_to_relu6"))
+ options->enable(Algorithms::TransformMinMaxToRelu6Pass);
if (arser.get<bool>("--mute_warnings"))
settings->set(luci::UserSettings::Key::MuteWarnings, true);
if (arser.get<bool>("--disable_validation"))
settings->set(luci::UserSettings::Key::DisableValidation, true);
+ if (arser.get<bool>("--generate_profile_data"))
+ settings->set(luci::UserSettings::Key::ProfilingDataGen, true);
std::string input_path = arser.get<std::string>("input");
std::string output_path = arser.get<std::string>("output");
@@ -284,6 +423,15 @@ int entry(int argc, char **argv)
arser.get<std::string>("--sparsify_block_map"));
}
+ if (arser.get<bool>("--convert_nchw_to_nhwc"))
+ {
+ options->enable(Algorithms::ConvertNCHWToNHWC);
+ if (arser.get<bool>("--nchw_to_nhwc_preserve_input_shape"))
+ options->param(AlgorithmParameters::NCHW_to_NHWC_preserve_input_shape, "true");
+ if (arser.get<bool>("--nchw_to_nhwc_preserve_output_shape"))
+ options->param(AlgorithmParameters::NCHW_to_NHWC_preserve_output_shape, "true");
+ }
+
// Load model from the file
foder::FileLoader file_loader{input_path};
std::vector<char> model_data;
diff --git a/compiler/circle2circle/src/TestHelper.h b/compiler/circle2circle/src/TestHelper.h
index f4dbe23a9..1e055b217 100644
--- a/compiler/circle2circle/src/TestHelper.h
+++ b/compiler/circle2circle/src/TestHelper.h
@@ -39,7 +39,7 @@ public:
{
assert(_ptr < N);
_argv[_ptr] = new char[strlen(in) + 1];
- strcpy(_argv[_ptr], in);
+ strncpy(_argv[_ptr], in, strlen(in) + 1);
_ptr++;
}
@@ -47,7 +47,7 @@ public:
private:
pchar_t _argv[N] = {
- nullptr,
+ nullptr,
};
size_t _ptr = 0;
};
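
The strcpy-to-strncpy change above does not alter behavior: the copy length is strlen(in) + 1 and the destination was allocated with exactly that size, so the terminating '\0' is copied as well. The rewrite mainly satisfies analyzers that flag unbounded strcpy. A self-contained illustration:

#include <cassert>
#include <cstring>

int main()
{
  const char *in = "hello";
  char *copy = new char[std::strlen(in) + 1];
  // n == strlen(in) + 1 copies every character plus the terminating '\0',
  // so the result is byte-for-byte identical to strcpy on this buffer.
  std::strncpy(copy, in, std::strlen(in) + 1);
  assert(std::strcmp(copy, in) == 0);
  delete[] copy;
  return 0;
}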
diff --git a/compiler/circlechef/circle/CMakeLists.txt b/compiler/circlechef/circle/CMakeLists.txt
index 2ca016b84..98a284c30 100644
--- a/compiler/circlechef/circle/CMakeLists.txt
+++ b/compiler/circlechef/circle/CMakeLists.txt
@@ -5,6 +5,5 @@ target_include_directories(circlechef_circle PUBLIC include)
target_include_directories(circlechef_circle PRIVATE src)
target_link_libraries(circlechef_circle circlechef_proto)
target_link_libraries(circlechef_circle mio_circle)
-target_link_libraries(circlechef_circle stdex)
target_link_libraries(circlechef_circle cwrap)
target_link_libraries(circlechef_circle souschef)
diff --git a/compiler/circlechef/circle/src/Convert.cpp b/compiler/circlechef/circle/src/Convert.cpp
index 77614d9b5..248687fed 100644
--- a/compiler/circlechef/circle/src/Convert.cpp
+++ b/compiler/circlechef/circle/src/Convert.cpp
@@ -33,10 +33,11 @@ circlechef::TensorType as_circlechef_type(const circle::TensorType type)
return circlechef::UINT8;
case circle::TensorType_BOOL:
return circlechef::BOOL;
+ case circle::TensorType_INT16:
+ return circlechef::INT16;
// TODO handle other types
// TensorType_FLOAT16
// TensorType_STRING
- // TensorType_INT16
// TensorType_COMPLEX64
default:
throw std::runtime_error{"unsupported tensor type"};
diff --git a/compiler/circlechef/core/CMakeLists.txt b/compiler/circlechef/core/CMakeLists.txt
index 54b3ea53d..0e8f47483 100644
--- a/compiler/circlechef/core/CMakeLists.txt
+++ b/compiler/circlechef/core/CMakeLists.txt
@@ -1,9 +1,23 @@
file(GLOB_RECURSE SOURCES "src/*.cpp")
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+list(REMOVE_ITEM SOURCES ${TESTS})
add_library(circlechef_core STATIC ${SOURCES})
target_include_directories(circlechef_core PUBLIC include)
target_include_directories(circlechef_core PRIVATE src)
-target_link_libraries(circlechef_core circlechef_proto)
-target_link_libraries(circlechef_core circlechef_log)
-target_link_libraries(circlechef_core mio_circle)
-target_link_libraries(circlechef_core souschef)
+target_link_libraries(circlechef_core PUBLIC circlechef_proto)
+target_link_libraries(circlechef_core PUBLIC circlechef_log)
+target_link_libraries(circlechef_core PUBLIC mio_circle)
+target_link_libraries(circlechef_core PUBLIC souschef)
+target_link_libraries(circlechef_core PRIVATE nncc_coverage)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(circlechef_core_test ${TESTS})
+target_include_directories(circlechef_core_test PRIVATE src)
+target_link_libraries(circlechef_core_test circlechef_core)
+target_link_libraries(circlechef_core_test nncc_coverage)
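
Two changes here beyond formatting: test sources are now excluded from the library itself, and the link interface is explicit. The PUBLIC dependencies (proto, log, mio_circle, souschef) propagate to consumers such as the new circlechef_core_test target, while nncc_coverage stays PRIVATE to the library and is linked to the test separately.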
diff --git a/compiler/circlechef/core/src/Convert.cpp b/compiler/circlechef/core/src/Convert.cpp
index 2db0a6212..d9bbd6e50 100644
--- a/compiler/circlechef/core/src/Convert.cpp
+++ b/compiler/circlechef/core/src/Convert.cpp
@@ -64,6 +64,8 @@ circle::TensorType as_circle_tensortype(const circlechef::TensorType &value)
return circle::TensorType_INT64;
case circlechef::BOOL:
return circle::TensorType_BOOL;
+ case circlechef::INT16:
+ return circle::TensorType_INT16;
default:
break;
}
diff --git a/compiler/circlechef/core/src/Convert.test.cpp b/compiler/circlechef/core/src/Convert.test.cpp
new file mode 100644
index 000000000..b17f5df44
--- /dev/null
+++ b/compiler/circlechef/core/src/Convert.test.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Convert.h"
+
+#include <gtest/gtest.h>
+
+TEST(ConvertTest, as_circle_padding)
+{
+ ASSERT_EQ(circle::Padding_SAME, as_circle_padding(circlechef::SAME));
+ ASSERT_EQ(circle::Padding_VALID, as_circle_padding(circlechef::VALID));
+}
+
+TEST(ConvertTest, as_circle_padding_NEG)
+{
+ EXPECT_THROW(as_circle_padding(static_cast<circlechef::Padding>(99)), std::runtime_error);
+}
+
+TEST(ConvertTest, as_circle_activation)
+{
+ ASSERT_EQ(circle::ActivationFunctionType_NONE, as_circle_activation(circlechef::NONE));
+ ASSERT_EQ(circle::ActivationFunctionType_RELU, as_circle_activation(circlechef::RELU));
+ ASSERT_EQ(circle::ActivationFunctionType_RELU6, as_circle_activation(circlechef::RELU6));
+}
+
+TEST(ConvertTest, as_circle_activation_NEG)
+{
+ EXPECT_THROW(as_circle_activation(static_cast<circlechef::Activation>(99)), std::runtime_error);
+}
+
+TEST(ConvertTest, as_circle_tensortype)
+{
+ ASSERT_EQ(circle::TensorType_FLOAT32, as_circle_tensortype(circlechef::FLOAT32));
+ ASSERT_EQ(circle::TensorType_INT32, as_circle_tensortype(circlechef::INT32));
+ ASSERT_EQ(circle::TensorType_UINT8, as_circle_tensortype(circlechef::UINT8));
+ ASSERT_EQ(circle::TensorType_INT64, as_circle_tensortype(circlechef::INT64));
+ ASSERT_EQ(circle::TensorType_BOOL, as_circle_tensortype(circlechef::BOOL));
+ ASSERT_EQ(circle::TensorType_INT16, as_circle_tensortype(circlechef::INT16));
+}
+
+TEST(ConvertTest, as_circle_tensortype_NEG)
+{
+ EXPECT_THROW(as_circle_tensortype(static_cast<circlechef::TensorType>(99)), std::runtime_error);
+}
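
Note the convention in these tests: the _NEG suffix marks negative cases, which cast an out-of-range value (99) into the enum and assert that the converter throws instead of silently mapping it.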
diff --git a/compiler/circlechef/core/src/ModelChef.cpp b/compiler/circlechef/core/src/ModelChef.cpp
index 4f25d62c0..d7101f618 100644
--- a/compiler/circlechef/core/src/ModelChef.cpp
+++ b/compiler/circlechef/core/src/ModelChef.cpp
@@ -51,7 +51,7 @@ class GeneratedModelImpl final : public circlechef::GeneratedModel::Impl
{
public:
GeneratedModelImpl(std::unique_ptr<flatbuffers::FlatBufferBuilder> &&builder)
- : _builder{std::move(builder)}
+ : _builder{std::move(builder)}
{
// DO NOTHING
}
@@ -90,6 +90,7 @@ DataChefRegistry &data_chef_registry(const circlechef::TensorType &type)
static DataChefRegistry fp32;
static DataChefRegistry u8;
static DataChefRegistry boolean;
+ static DataChefRegistry s16;
switch (type)
{
@@ -103,6 +104,8 @@ DataChefRegistry &data_chef_registry(const circlechef::TensorType &type)
return u8;
case circlechef::BOOL:
return boolean;
+ case circlechef::INT16:
+ return s16;
default:
break;
}
@@ -489,7 +492,7 @@ GeneratedModel cook(const ::circlechef::ModelRecipe &model_recipe)
// Initialize Data Chef Registry
#define DATA_CHEF(TYPE, NAME, FACTORY_CLASS) \
data_chef_registry(::circlechef::TYPE) \
- .add(#NAME, std::unique_ptr<FACTORY_CLASS>(new FACTORY_CLASS()));
+ .add(#NAME, std::unique_ptr<FACTORY_CLASS>(new FACTORY_CLASS()));
#include <souschef/DataChef.def>
#undef DATA_CHEF
@@ -497,7 +500,7 @@ GeneratedModel cook(const ::circlechef::ModelRecipe &model_recipe)
// Create FlatBufferBuilder
//
auto flatbuffer_builder =
- std::unique_ptr<flatbuffers::FlatBufferBuilder>(new flatbuffers::FlatBufferBuilder(1024));
+ std::unique_ptr<flatbuffers::FlatBufferBuilder>(new flatbuffers::FlatBufferBuilder(1024));
// Operand-related
std::vector<flatbuffers::Offset<::circle::Buffer>> buffer_vec;
@@ -510,7 +513,7 @@ GeneratedModel cook(const ::circlechef::ModelRecipe &model_recipe)
// Create OperatorCode with Builtin Operator
std::map<circle::BuiltinOperator, int32_t> builtin_code_map =
- gather_builtincode_map(model_recipe);
+ gather_builtincode_map(model_recipe);
for (auto const &opcode : builtin_code_map)
{
circle::OperatorCodeBuilder code_builder{*flatbuffer_builder};
@@ -592,7 +595,7 @@ GeneratedModel cook(const ::circlechef::ModelRecipe &model_recipe)
// Return "GenerateModel"
return GeneratedModel{
- std::unique_ptr<GeneratedModelImpl>(new GeneratedModelImpl(std::move(flatbuffer_builder)))};
+ std::unique_ptr<GeneratedModelImpl>(new GeneratedModelImpl(std::move(flatbuffer_builder)))};
}
} // namespace circlechef
diff --git a/compiler/circlechef/core/src/Op/BCQFullyConnected.cpp b/compiler/circlechef/core/src/Op/BCQFullyConnected.cpp
index 4c82c52cc..497cbb86b 100644
--- a/compiler/circlechef/core/src/Op/BCQFullyConnected.cpp
+++ b/compiler/circlechef/core/src/Op/BCQFullyConnected.cpp
@@ -26,9 +26,9 @@ flatbuffers::Offset<void> BCQFullyConnectedChef::value(flatbuffers::FlatBufferBu
circle::BCQFullyConnectedOptionsBuilder bcq_fully_connected_options_builder{fbb};
bcq_fully_connected_options_builder.add_weights_hidden_size(
- operation.bcq_fully_connected_options().weights_hidden_size());
+ operation.bcq_fully_connected_options().weights_hidden_size());
bcq_fully_connected_options_builder.add_fused_activation_function(
- as_circle_activation(operation.bcq_fully_connected_options().activation()));
+ as_circle_activation(operation.bcq_fully_connected_options().activation()));
return bcq_fully_connected_options_builder.Finish().Union();
}
diff --git a/compiler/circlechef/core/src/Op/BCQGather.cpp b/compiler/circlechef/core/src/Op/BCQGather.cpp
index 08f6f611f..3b343ee66 100644
--- a/compiler/circlechef/core/src/Op/BCQGather.cpp
+++ b/compiler/circlechef/core/src/Op/BCQGather.cpp
@@ -24,7 +24,7 @@ flatbuffers::Offset<void> BCQGatherChef::value(flatbuffers::FlatBufferBuilder &f
circle::BCQGatherOptionsBuilder bcq_gather_options_builder{fbb};
bcq_gather_options_builder.add_input_hidden_size(
- operation.bcq_gather_options().input_hidden_size());
+ operation.bcq_gather_options().input_hidden_size());
bcq_gather_options_builder.add_axis(operation.bcq_gather_options().axis());
return bcq_gather_options_builder.Finish().Union();
diff --git a/compiler/circlechef/core/src/Op/BatchMatMul.cpp b/compiler/circlechef/core/src/Op/BatchMatMul.cpp
index d98c0801a..645571abe 100644
--- a/compiler/circlechef/core/src/Op/BatchMatMul.cpp
+++ b/compiler/circlechef/core/src/Op/BatchMatMul.cpp
@@ -24,9 +24,9 @@ flatbuffers::Offset<void> BatchMatMulChef::value(flatbuffers::FlatBufferBuilder
circle::BatchMatMulOptionsBuilder batch_matmul_options_options_builder{fbb};
batch_matmul_options_options_builder.add_adjoint_lhs(
- operation.batch_matmul_options().adjoint_lhs());
+ operation.batch_matmul_options().adjoint_lhs());
batch_matmul_options_options_builder.add_adjoint_rhs(
- operation.batch_matmul_options().adjoint_rhs());
+ operation.batch_matmul_options().adjoint_rhs());
return batch_matmul_options_options_builder.Finish().Union();
}
diff --git a/compiler/circlechef/proto/circlechef.proto b/compiler/circlechef/proto/circlechef.proto
index 83d2dfe9c..1c14b97ff 100644
--- a/compiler/circlechef/proto/circlechef.proto
+++ b/compiler/circlechef/proto/circlechef.proto
@@ -19,6 +19,7 @@ enum TensorType {
UINT8 = 3;
INT64 = 4;
BOOL = 6;
+ INT16 = 7;
}
message TensorShape {
diff --git a/compiler/circlechef/tests/short_int_datatype/test.recipe b/compiler/circlechef/tests/short_int_datatype/test.recipe
new file mode 100644
index 000000000..e0f582527
--- /dev/null
+++ b/compiler/circlechef/tests/short_int_datatype/test.recipe
@@ -0,0 +1,32 @@
+operand {
+ name: "ifm1"
+ type: INT16
+ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+}
+operand {
+ name: "constant"
+ type: INT16
+ shape { dim: 1 dim: 4 dim: 3 dim: 4 }
+ filler {
+ tag: "gaussian"
+ arg: "3.0"
+ arg: "10.0"
+ }
+}
+operand {
+ name: "ofm"
+ type: INT16
+ shape { dim: 1 dim: 4 dim: 4 dim: 4 }
+}
+operation {
+ type: "BatchMatMul"
+ input: "ifm1"
+ input: "constant"
+ output: "ofm"
+ batch_matmul_options {
+ adjoint_lhs: false
+ adjoint_rhs: false
+ }
+}
+input: "ifm1"
+output: "ofm"
diff --git a/compiler/circlechef/tests/short_int_datatype/test.reverse b/compiler/circlechef/tests/short_int_datatype/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/compiler/circlechef/tests/short_int_datatype/test.reverse
diff --git a/compiler/circlechef/tools/console/CMakeLists.txt b/compiler/circlechef/tools/console/CMakeLists.txt
index 10168fca3..faf0a94f0 100644
--- a/compiler/circlechef/tools/console/CMakeLists.txt
+++ b/compiler/circlechef/tools/console/CMakeLists.txt
@@ -1,3 +1,12 @@
add_executable(circlechef Driver.cpp)
target_link_libraries(circlechef circlechef_core)
target_link_libraries(circlechef safemain)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(circlechef_test Driver.test.cpp Driver.cpp)
+target_link_libraries(circlechef_test circlechef_core)
diff --git a/compiler/circlechef/tools/console/Driver.cpp b/compiler/circlechef/tools/console/Driver.cpp
index 0909f5927..6aa4c3cc5 100644
--- a/compiler/circlechef/tools/console/Driver.cpp
+++ b/compiler/circlechef/tools/console/Driver.cpp
@@ -22,7 +22,7 @@
#include <iostream>
-int entry(int argc, char **argv)
+int entry_stream(std::istream &is)
{
int32_t model_version = 1;
@@ -30,7 +30,7 @@ int entry(int argc, char **argv)
// Read a model recipe from standard input
{
- google::protobuf::io::IstreamInputStream iis{&std::cin};
+ google::protobuf::io::IstreamInputStream iis{&is};
if (!google::protobuf::TextFormat::Parse(&iis, &model_recipe))
{
std::cerr << "ERROR: Failed to parse recipe" << std::endl;
@@ -56,3 +56,9 @@ int entry(int argc, char **argv)
return 0;
}
+
+int entry(int, char **)
+{
+ // forward to entry_stream
+ return entry_stream(std::cin);
+}
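
Splitting the console driver into entry_stream plus a thin entry wrapper keeps the command-line behavior unchanged (entry still reads from std::cin) while letting tests inject an in-memory stream; the new Driver.test.cpp below does exactly that with std::istringstream.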
diff --git a/compiler/circlechef/tools/console/Driver.test.cpp b/compiler/circlechef/tools/console/Driver.test.cpp
new file mode 100644
index 000000000..d8e4e657e
--- /dev/null
+++ b/compiler/circlechef/tools/console/Driver.test.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+// entry function to test from Driver.cpp
+int entry_stream(std::istream &is);
+
+TEST(CircleChefDriverTest, entry_empty_NEG)
+{
+ std::istringstream empty_input("");
+
+ ASSERT_EQ(0, entry_stream(empty_input));
+}
+
+TEST(CircleChefDriverTest, entry_invaid_NEG)
+{
+ std::istringstream empty_input("invalid: input");
+
+ ASSERT_NE(0, entry_stream(empty_input));
+}
+
+TEST(CircleChefDriverTest, entry_invaid_version_NEG)
+{
+ std::istringstream empty_input("version: 9999");
+
+ ASSERT_NE(0, entry_stream(empty_input));
+}
diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp
index bcc0c7ae9..76d0f3f7f 100644
--- a/compiler/circlechef/tools/file/Driver.cpp
+++ b/compiler/circlechef/tools/file/Driver.cpp
@@ -29,8 +29,8 @@ int entry(int argc, char **argv)
{
arser::Arser arser;
arser.add_argument("recipe")
- .type(arser::DataType::STR)
- .help("Source recipe file path to convert");
+ .type(arser::DataType::STR)
+ .help("Source recipe file path to convert");
arser.add_argument("circle").type(arser::DataType::STR).help("Target circle file path");
try
diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp
index 8a2b85fc7..639e0af6f 100644
--- a/compiler/circlechef/tools/reverse/Driver.cpp
+++ b/compiler/circlechef/tools/reverse/Driver.cpp
@@ -26,8 +26,8 @@ int entry(int argc, char **argv)
{
arser::Arser arser;
arser.add_argument("circle")
- .type(arser::DataType::STR)
- .help("Source circle file path to convert");
+ .type(arser::DataType::STR)
+ .help("Source circle file path to convert");
arser.add_argument("recipe").type(arser::DataType::STR).help("Target recipe file path");
try
diff --git a/compiler/circledump/README.md b/compiler/circledump/README.md
index 686e918ac..e31c2d560 100644
--- a/compiler/circledump/README.md
+++ b/compiler/circledump/README.md
@@ -67,5 +67,4 @@ O T(3) ofm
- mio-circle
- safemain
-- stdex
- FlatBuffers
diff --git a/compiler/circledump/src/Dump.cpp b/compiler/circledump/src/Dump.cpp
index f8e2d61f3..42b4ad97a 100644
--- a/compiler/circledump/src/Dump.cpp
+++ b/compiler/circledump/src/Dump.cpp
@@ -18,6 +18,7 @@
#include "Read.h"
#include "OpPrinter.h"
+#include "MetadataPrinter.h"
#include <ostream>
@@ -362,6 +363,7 @@ void dump_model(std::ostream &os, const circle::Model *model)
auto opcodes = reader.opcodes();
auto buffers = reader.buffers();
+ auto metadata = reader.metadata();
// dump operator_codes
os << "Operator Codes: [order] OpCodeName (OpCode Enum)" << std::endl;
@@ -395,6 +397,26 @@ void dump_model(std::ostream &os, const circle::Model *model)
}
os << std::endl;
+ // dump metadata
+ if (metadata != nullptr)
+ {
+ os << "metadata : B(index) name" << std::endl;
+ for (uint32_t i = 0; i < metadata->Length(); ++i)
+ {
+ const auto buff_id = metadata->Get(i)->buffer();
+ const auto metadata_name = metadata->Get(i)->name()->str();
+ os << "B(" << buff_id << ") " << metadata_name << std::endl;
+
+ const uint8_t *buff_data;
+ reader.buffer_info(buff_id, &buff_data);
+ if (auto meta_prn = MetadataPrinterRegistry::get().lookup(metadata_name))
+ {
+ meta_prn->print(buff_data, os);
+ }
+ }
+ os << std::endl;
+ }
+
for (uint32_t sg = 0; sg < num_subgraph; ++sg)
{
reader.select_subgraph(sg);
diff --git a/compiler/circledump/src/MetadataPrinter.cpp b/compiler/circledump/src/MetadataPrinter.cpp
new file mode 100644
index 000000000..f2df9bc16
--- /dev/null
+++ b/compiler/circledump/src/MetadataPrinter.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "MetadataPrinter.h"
+
+#include <cassert>
+#include <string>
+#include <vector>
+
+namespace circledump
+{
+
+class SourceTablePrinter : public MetadataPrinter
+{
+public:
+ /**
+ * The source table consists of the following parts:
+ * - [ entry_number : uint32_t ]
+ * - [ id : uint32_t ][ length : uint32_t ][ data : 'length' Bytes ] * entry_number
+ */
+ virtual void print(const uint8_t *buffer, std::ostream &os) const override
+ {
+ if (buffer)
+ {
+ os << " [node_id : node_name]" << std::endl;
+ auto cur = buffer;
+ // entry number
+ const uint32_t num = *reinterpret_cast<const uint32_t *>(cur);
+ cur += sizeof(uint32_t);
+ for (uint32_t entry = 0; entry < num; entry++)
+ {
+ // id
+ const uint32_t node_id = *reinterpret_cast<const uint32_t *>(cur);
+ cur += sizeof(uint32_t);
+ // length
+ const uint32_t len = *reinterpret_cast<const uint32_t *>(cur);
+ cur += sizeof(uint32_t);
+ assert(len != 0);
+ // data
+ // non-empty 'data' has trailing '\0'. Let's exclude it.
+ std::string node_name = std::string(cur, cur + len - 1);
+ cur += len;
+
+ // print
+ os << " [" << node_id << " : " << node_name << "]" << std::endl;
+ }
+ }
+ }
+};
+
+class OpTablePrinter : public MetadataPrinter
+{
+public:
+ /**
+ * The op table consists of the following parts:
+ * - [ entry_number : uint32_t ]
+ * - [ id : uint32_t ][ length : uint32_t ][ origin_ids : length * uint32_t ] * entry_number
+ */
+ virtual void print(const uint8_t *buffer, std::ostream &os) const override
+ {
+ if (buffer)
+ {
+ os << " [node_id : origin_ids]" << std::endl;
+ auto cur = buffer;
+ // entry number
+ const uint32_t num = *reinterpret_cast<const uint32_t *>(cur);
+ cur += sizeof(uint32_t);
+ for (uint32_t entry = 0; entry < num; entry++)
+ {
+ // id
+ const uint32_t node_id = *reinterpret_cast<const uint32_t *>(cur);
+ cur += sizeof(uint32_t);
+ // length
+ const uint32_t len = *reinterpret_cast<const uint32_t *>(cur);
+ cur += sizeof(uint32_t);
+ assert(len != 0);
+ // origin_ids
+ std::vector<uint32_t> origin_ids;
+ for (uint32_t o = 0; o < len; o++)
+ {
+ origin_ids.push_back(*reinterpret_cast<const uint32_t *>(cur));
+ cur += sizeof(uint32_t);
+ }
+
+ // print
+ os << " [" << node_id << " : ";
+ uint32_t i = 0;
+ for (const auto &id : origin_ids)
+ {
+ if (i++)
+ os << ", ";
+ os << id;
+ }
+ os << "]" << std::endl;
+ }
+ }
+ }
+};
+
+MetadataPrinterRegistry::MetadataPrinterRegistry()
+{
+ _metadata_map["ONE_source_table"] = std::make_unique<SourceTablePrinter>();
+ _metadata_map["ONE_op_table"] = std::make_unique<OpTablePrinter>();
+}
+
+} // namespace circledump
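
To make the layouts documented in the two comment blocks above concrete, the following standalone sketch (not part of the commit) hand-assembles a one-entry ONE_source_table buffer in host byte order, matching the reinterpret_cast reads in the printer, and feeds it through the new registry. The node id 7 and the name conv2d_1 are invented for illustration:

#include "MetadataPrinter.h" // header added by this commit (circledump/src)

#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  std::vector<uint8_t> buf;
  auto push_u32 = [&buf](uint32_t v) {
    const uint8_t *p = reinterpret_cast<const uint8_t *>(&v);
    buf.insert(buf.end(), p, p + sizeof(uint32_t));
  };

  const char name[] = "conv2d_1"; // hypothetical node name: 8 chars + '\0'
  push_u32(1);            // entry_number
  push_u32(7);            // node id (made up for illustration)
  push_u32(sizeof(name)); // length, counting the trailing '\0'
  buf.insert(buf.end(), name, name + sizeof(name));

  if (auto printer = circledump::MetadataPrinterRegistry::get().lookup("ONE_source_table"))
    printer->print(buf.data(), std::cout); // prints "  [7 : conv2d_1]"

  return 0;
}

OpTablePrinter consumes the same framing, except that its length field counts uint32_t origin ids rather than bytes.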
diff --git a/compiler/circledump/src/MetadataPrinter.h b/compiler/circledump/src/MetadataPrinter.h
new file mode 100644
index 000000000..1dca2ca1e
--- /dev/null
+++ b/compiler/circledump/src/MetadataPrinter.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLEDUMP_METADATA_PRINTER_H__
+#define __CIRCLEDUMP_METADATA_PRINTER_H__
+
+#include <ostream>
+#include <string>
+#include <map>
+#include <memory>
+
+namespace circledump
+{
+
+class MetadataPrinter
+{
+public:
+ virtual void print(const uint8_t * /* buffer */, std::ostream &) const = 0;
+};
+
+class MetadataPrinterRegistry
+{
+public:
+ MetadataPrinterRegistry();
+
+public:
+ const MetadataPrinter *lookup(std::string op) const
+ {
+ if (_metadata_map.find(op) == _metadata_map.end())
+ return nullptr;
+
+ return _metadata_map.at(op).get();
+ }
+
+public:
+ static MetadataPrinterRegistry &get()
+ {
+ static MetadataPrinterRegistry me;
+ return me;
+ }
+
+private:
+ std::map<std::string /* metadata name */, std::unique_ptr<MetadataPrinter>> _metadata_map;
+};
+
+} // namespace circledump
+
+#endif // __CIRCLEDUMP_METADATA_PRINTER_H__
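
The registry mirrors the existing OpPrinterRegistry pattern: a function-local static singleton keyed by metadata name, whose lookup returns nullptr for unknown names, so dump_model can simply skip any metadata buffer it has no printer for.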
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index ef22baaee..5319bb88d 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -90,6 +90,26 @@ public:
}
};
+class BidirectionalSequenceLSTMPrinter : public OpPrinter
+{
+public:
+ void options(const circle::Operator *op, std::ostream &os) const override
+ {
+ if (auto *params = op->builtin_options_as_BidirectionalSequenceLSTMOptions())
+ {
+ os << " ";
+ os << "Activation(" << EnumNameActivationFunctionType(params->fused_activation_function())
+ << ") ";
+ os << "cell_clip(" << params->cell_clip() << ") ";
+ os << "proj_clip(" << params->proj_clip() << ") ";
+ os << "time_major(" << params->time_major() << ") ";
+ os << "asymmetric_quantize_inputs(" << params->asymmetric_quantize_inputs() << ") ";
+ os << "merge_outputs(" << params->merge_outputs() << ") ";
+ os << std::endl;
+ }
+ }
+};
+
class CastPrinter : public OpPrinter
{
public:
@@ -279,7 +299,7 @@ public:
os << "Stride.H(" << conv_params->stride_h() << ") ";
os << "DepthMultiplier(" << conv_params->depth_multiplier() << ") ";
os << "Dilation.W(" << conv_params->dilation_w_factor() << ") ";
- os << "Dilation.H(" << conv_params->dilation_h_factor() << ")";
+ os << "Dilation.H(" << conv_params->dilation_h_factor() << ") ";
os << "Activation("
<< EnumNameActivationFunctionType(conv_params->fused_activation_function()) << ") ";
os << std::endl;
@@ -287,6 +307,25 @@ public:
}
};
+class FakeQuantPrinter : public OpPrinter
+{
+public:
+ void options(const circle::Operator *op, std::ostream &os) const override
+ {
+ if (auto *params = op->builtin_options_as_FakeQuantOptions())
+ {
+ os << " ";
+ os << "Min(" << params->min() << ") ";
+ os << "Max(" << params->max() << ") ";
+ os << "NumBits(" << params->num_bits() << ") ";
+ os << std::boolalpha;
+ os << "NarrowRange(" << params->narrow_range() << ") ";
+ os << std::noboolalpha;
+ os << std::endl;
+ }
+ }
+};
+
class FullyConnectedPrinter : public OpPrinter
{
public:
@@ -720,6 +759,8 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[circle::BuiltinOperator_ARG_MIN] = make_unique<ArgMinPrinter>();
_op_map[circle::BuiltinOperator_AVERAGE_POOL_2D] = make_unique<Pool2DPrinter>();
_op_map[circle::BuiltinOperator_BATCH_MATMUL] = make_unique<BatchMatMulPrinter>();
+ _op_map[circle::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM] =
+ make_unique<BidirectionalSequenceLSTMPrinter>();
_op_map[circle::BuiltinOperator_CAST] = make_unique<CastPrinter>();
// There is no Option for CEIL
_op_map[circle::BuiltinOperator_CONCATENATION] = make_unique<ConcatenationPrinter>();
@@ -728,6 +769,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[circle::BuiltinOperator_DEPTHWISE_CONV_2D] = make_unique<DepthwiseConv2DPrinter>();
// There is no Option for DEQUANTIZE
_op_map[circle::BuiltinOperator_DIV] = make_unique<DivPrinter>();
+ _op_map[circle::BuiltinOperator_FAKE_QUANT] = make_unique<FakeQuantPrinter>();
// There is no Option for FLOOR
// There is no Option for FLOOR_MOD
_op_map[circle::BuiltinOperator_FULLY_CONNECTED] = make_unique<FullyConnectedPrinter>();
@@ -737,7 +779,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[circle::BuiltinOperator_L2_POOL_2D] = make_unique<Pool2DPrinter>();
_op_map[circle::BuiltinOperator_LEAKY_RELU] = make_unique<LeakyReluPrinter>();
_op_map[circle::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION] =
- make_unique<LocalResponseNormalizationPrinter>();
+ make_unique<LocalResponseNormalizationPrinter>();
// There is no Option for LOG
// There is no Option for LOGISTIC
// There is no Option for LOG_SOFTMAX
@@ -761,7 +803,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[circle::BuiltinOperator_RESHAPE] = make_unique<ReshapePrinter>();
_op_map[circle::BuiltinOperator_RESIZE_BILINEAR] = make_unique<ResizeBilinearPrinter>();
_op_map[circle::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR] =
- make_unique<ResizeNearestNeighborPrinter>();
+ make_unique<ResizeNearestNeighborPrinter>();
_op_map[circle::BuiltinOperator_REVERSE_SEQUENCE] = make_unique<ReverseSequencePrinter>();
// There is no Option for ROUND
// There is no Option for SELECT
@@ -782,7 +824,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
// There is no Option for TOPK_V2
_op_map[circle::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM] =
- make_unique<UnidirectionalSequenceLSTMPrinter>();
+ make_unique<UnidirectionalSequenceLSTMPrinter>();
_op_map[circle::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
_op_map[circle::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
_op_map[circle::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
diff --git a/compiler/circledump/src/Read.cpp b/compiler/circledump/src/Read.cpp
index 053225536..db8298585 100644
--- a/compiler/circledump/src/Read.cpp
+++ b/compiler/circledump/src/Read.cpp
@@ -81,6 +81,7 @@ Reader::Reader(const circle::Model *model)
_version = model->version();
_subgraphs = model->subgraphs();
_buffers = model->buffers();
+ _metadata = model->metadata();
auto opcodes = model->operator_codes();
for (const ::circle::OperatorCode *opcode : *opcodes)
diff --git a/compiler/circledump/src/Read.h b/compiler/circledump/src/Read.h
index dd1ef20b6..be0e15827 100644
--- a/compiler/circledump/src/Read.h
+++ b/compiler/circledump/src/Read.h
@@ -52,6 +52,7 @@ private:
using CircleBuffers_t = flatbuffers::Vector<flatbuffers::Offset<circle::Buffer>>;
using CircleTensors_t = flatbuffers::Vector<flatbuffers::Offset<circle::Tensor>>;
using CircleOperators_t = flatbuffers::Vector<flatbuffers::Offset<circle::Operator>>;
+ using CircleMetadata_t = flatbuffers::Vector<flatbuffers::Offset<circle::Metadata>>;
public:
Reader(const circle::Model *model);
@@ -68,6 +69,7 @@ public:
const std::vector<int32_t> &inputs() const { return _inputs; }
const std::vector<int32_t> &outputs() const { return _outputs; }
const circle::DataFormat &data_format() const { return _data_format; }
+ const CircleMetadata_t *metadata() const { return _metadata; }
uint32_t num_subgraph() const { return _subgraphs->Length(); }
@@ -87,6 +89,7 @@ private:
const CircleBuffers_t *_buffers{nullptr};
const CircleTensors_t *_tensors{nullptr};
const CircleOperators_t *_operators{nullptr};
+ const CircleMetadata_t *_metadata{nullptr};
uint32_t _subgraph_index;
std::string _subgraph_name;
diff --git a/compiler/cli/CMakeLists.txt b/compiler/cli/CMakeLists.txt
index 22948fff9..2ab8c0529 100644
--- a/compiler/cli/CMakeLists.txt
+++ b/compiler/cli/CMakeLists.txt
@@ -12,4 +12,3 @@ endif(NOT GTest_FOUND)
GTest_AddTEst(cli_test ${TESTS})
target_link_libraries(cli_test cli)
-target_link_libraries(cli_test stdex)
diff --git a/compiler/cli/src/App.test.cpp b/compiler/cli/src/App.test.cpp
index fe2d44179..59e5da3bd 100644
--- a/compiler/cli/src/App.test.cpp
+++ b/compiler/cli/src/App.test.cpp
@@ -16,7 +16,7 @@
#include "cli/App.h"
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
@@ -52,7 +52,7 @@ TEST(APP, run)
cli::App app("test");
std::string args;
- app.insert("record", stdex::make_unique<RecordCommand>(3, args));
+ app.insert("record", std::make_unique<RecordCommand>(3, args));
const char *argv[] = {"record", "hello", "world"};
diff --git a/compiler/coco/core/CMakeLists.txt b/compiler/coco/core/CMakeLists.txt
index 8c6844733..a81d366c9 100644
--- a/compiler/coco/core/CMakeLists.txt
+++ b/compiler/coco/core/CMakeLists.txt
@@ -7,7 +7,6 @@ target_include_directories(coco_core PUBLIC include)
# NOTE Some coco_core PUBLIC headers include angkor headers
target_link_libraries(coco_core PUBLIC angkor)
target_link_libraries(coco_core PRIVATE pepper_assert)
-target_link_libraries(coco_core PRIVATE stdex)
# Let's apply nncc common compile options
# NOTE This will enable strict compilation (warnings as error).
# Please refer to top-level CMakeLists.txt for details
@@ -22,4 +21,3 @@ nnas_find_package(GTest REQUIRED)
GTest_AddTest(coco_core_test ${TESTS})
target_link_libraries(coco_core_test coco_core)
-target_link_libraries(coco_core_test stdex)
diff --git a/compiler/coco/core/include/coco/IR/FeatureShape.h b/compiler/coco/core/include/coco/IR/FeatureShape.h
index 015fc709d..3c8e9accd 100644
--- a/compiler/coco/core/include/coco/IR/FeatureShape.h
+++ b/compiler/coco/core/include/coco/IR/FeatureShape.h
@@ -31,13 +31,13 @@ class FeatureShape : public nncc::core::ADT::feature::Shape
{
public:
FeatureShape(uint32_t depth, uint32_t height, uint32_t width)
- : Shape{depth, height, width}, _batch{1}
+ : Shape{depth, height, width}, _batch{1}
{
// DO NOTHING
}
FeatureShape(uint32_t batch, uint32_t depth, uint32_t height, uint32_t width)
- : Shape{depth, height, width}, _batch{batch}
+ : Shape{depth, height, width}, _batch{batch}
{
// DO NOTHING
}
diff --git a/compiler/coco/core/include/coco/IR/Locatable.h b/compiler/coco/core/include/coco/IR/Locatable.h
index b80a4a360..549802776 100644
--- a/compiler/coco/core/include/coco/IR/Locatable.h
+++ b/compiler/coco/core/include/coco/IR/Locatable.h
@@ -24,7 +24,7 @@ namespace coco
/**
* @brief Return the associated instruction if exists.
- */
+ */
struct Locatable
{
virtual ~Locatable() = default;
diff --git a/compiler/coco/core/include/coco/IR/Ops.h b/compiler/coco/core/include/coco/IR/Ops.h
index 01ac92b7f..39dce5272 100644
--- a/compiler/coco/core/include/coco/IR/Ops.h
+++ b/compiler/coco/core/include/coco/IR/Ops.h
@@ -407,6 +407,6 @@ public:
const Sqrt *asSqrt(void) const override { return this; }
};
-} // namesapce coco
+} // namespace coco
#endif // __COCO_IR_OPS_H__
diff --git a/compiler/coco/core/include/coco/IR/Padding2D.h b/compiler/coco/core/include/coco/IR/Padding2D.h
index b764656cc..68a3481f1 100644
--- a/compiler/coco/core/include/coco/IR/Padding2D.h
+++ b/compiler/coco/core/include/coco/IR/Padding2D.h
@@ -32,7 +32,7 @@ public:
public:
Padding2D(uint32_t top, uint32_t bottom, uint32_t left, uint32_t right)
- : _top{top}, _bottom{bottom}, _left{left}, _right{right}
+ : _top{top}, _bottom{bottom}, _left{left}, _right{right}
{
// DO NOTHING
}
diff --git a/compiler/coco/core/src/ADT/PtrList.test.cpp b/compiler/coco/core/src/ADT/PtrList.test.cpp
index dcbad8b90..904dd6e1d 100644
--- a/compiler/coco/core/src/ADT/PtrList.test.cpp
+++ b/compiler/coco/core/src/ADT/PtrList.test.cpp
@@ -25,7 +25,7 @@ namespace
struct Object
{
};
-}
+} // namespace
TEST(ADT_PTR_LIST, ctor)
{
diff --git a/compiler/coco/core/src/ADT/PtrManager.test.cpp b/compiler/coco/core/src/ADT/PtrManager.test.cpp
index bb9056f29..5a9f09d4e 100644
--- a/compiler/coco/core/src/ADT/PtrManager.test.cpp
+++ b/compiler/coco/core/src/ADT/PtrManager.test.cpp
@@ -61,7 +61,7 @@ struct ObjectManager final : public coco::PtrManager<Object>
void free(Object *o) { release(o); }
};
-}
+} // namespace
TEST(ADT_PTR_MANAGER, usecase)
{
diff --git a/compiler/coco/core/src/IR/BagManager.cpp b/compiler/coco/core/src/IR/BagManager.cpp
index 10fe69d57..8cfb0c09c 100644
--- a/compiler/coco/core/src/IR/BagManager.cpp
+++ b/compiler/coco/core/src/IR/BagManager.cpp
@@ -16,14 +16,14 @@
#include "coco/IR/BagManager.h"
-#include <stdex/Memory.h>
+#include <memory>
namespace coco
{
Bag *BagManager::create(uint32_t size)
{
- auto bag = stdex::make_unique<Bag>(size);
+ auto bag = std::make_unique<Bag>(size);
modulize(bag.get());
return take(std::move(bag));
}
diff --git a/compiler/coco/core/src/IR/BlockManager.cpp b/compiler/coco/core/src/IR/BlockManager.cpp
index 5e3b88173..d1bcacb32 100644
--- a/compiler/coco/core/src/IR/BlockManager.cpp
+++ b/compiler/coco/core/src/IR/BlockManager.cpp
@@ -16,8 +16,7 @@
#include "coco/IR/BlockManager.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace coco
@@ -25,7 +24,7 @@ namespace coco
Block *BlockManager::create(void)
{
- auto blk = stdex::make_unique<Block>();
+ auto blk = std::make_unique<Block>();
modulize(blk.get());
return take(std::move(blk));
}
diff --git a/compiler/coco/core/src/IR/Conv2D.test.cpp b/compiler/coco/core/src/IR/Conv2D.test.cpp
index df0a2470b..5bf06ca9f 100644
--- a/compiler/coco/core/src/IR/Conv2D.test.cpp
+++ b/compiler/coco/core/src/IR/Conv2D.test.cpp
@@ -20,11 +20,9 @@
#include <vector>
#include <memory>
-#include <stdex/Memory.h>
-
#include <gtest/gtest.h>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/coco/core/src/IR/Def.test.cpp b/compiler/coco/core/src/IR/Def.test.cpp
index 98455c09e..443fdcb95 100644
--- a/compiler/coco/core/src/IR/Def.test.cpp
+++ b/compiler/coco/core/src/IR/Def.test.cpp
@@ -19,13 +19,13 @@
#include "coco/IR/FeatureObject.h"
-#include <stdex/Memory.h>
+#include <memory>
#include "Producer.mock.h"
#include <gtest/gtest.h>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/coco/core/src/IR/InputManager.cpp b/compiler/coco/core/src/IR/InputManager.cpp
index 6d5b9470b..0530deeda 100644
--- a/compiler/coco/core/src/IR/InputManager.cpp
+++ b/compiler/coco/core/src/IR/InputManager.cpp
@@ -16,14 +16,14 @@
#include "coco/IR/InputManager.h"
-#include <stdex/Memory.h>
+#include <memory>
namespace coco
{
Input *InputManager::create(const nncc::core::ADT::tensor::Shape &shape)
{
- auto input = stdex::make_unique<Input>(shape);
+ auto input = std::make_unique<Input>(shape);
modulize(input.get());
return take(std::move(input));
}
diff --git a/compiler/coco/core/src/IR/Module.cpp b/compiler/coco/core/src/IR/Module.cpp
index 0b65ceedc..420cf6f0c 100644
--- a/compiler/coco/core/src/IR/Module.cpp
+++ b/compiler/coco/core/src/IR/Module.cpp
@@ -16,9 +16,9 @@
#include "coco/IR/Module.h"
-#include <stdex/Memory.h>
+#include <memory>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/coco/core/src/IR/ObjectManager.cpp b/compiler/coco/core/src/IR/ObjectManager.cpp
index 1b7215a04..38c3a9bcc 100644
--- a/compiler/coco/core/src/IR/ObjectManager.cpp
+++ b/compiler/coco/core/src/IR/ObjectManager.cpp
@@ -19,11 +19,10 @@
#include "coco/IR/FeatureObject.h"
#include "coco/IR/KernelObject.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
-using stdex::make_unique;
+using std::make_unique;
namespace coco
{
diff --git a/compiler/coco/core/src/IR/OpManager.cpp b/compiler/coco/core/src/IR/OpManager.cpp
index c87b704fe..911f999c7 100644
--- a/compiler/coco/core/src/IR/OpManager.cpp
+++ b/compiler/coco/core/src/IR/OpManager.cpp
@@ -16,13 +16,12 @@
#include "coco/IR/OpManager.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
#include <queue>
#include <set>
-using stdex::make_unique;
+using std::make_unique;
namespace coco
{
diff --git a/compiler/coco/core/src/IR/Ops.test.cpp b/compiler/coco/core/src/IR/Ops.test.cpp
index ae979b2bf..cfbd3ca70 100644
--- a/compiler/coco/core/src/IR/Ops.test.cpp
+++ b/compiler/coco/core/src/IR/Ops.test.cpp
@@ -21,11 +21,9 @@
#include <vector>
#include <memory>
-#include <stdex/Memory.h>
-
#include <gtest/gtest.h>
-using stdex::make_unique;
+using std::make_unique;
/**
* Section: Add Op
diff --git a/compiler/coco/core/src/IR/OutputManager.cpp b/compiler/coco/core/src/IR/OutputManager.cpp
index 86b9580ac..5dd51c378 100644
--- a/compiler/coco/core/src/IR/OutputManager.cpp
+++ b/compiler/coco/core/src/IR/OutputManager.cpp
@@ -16,14 +16,14 @@
#include "coco/IR/OutputManager.h"
-#include <stdex/Memory.h>
+#include <memory>
namespace coco
{
Output *OutputManager::create(const nncc::core::ADT::tensor::Shape &shape)
{
- auto output = stdex::make_unique<Output>(shape);
+ auto output = std::make_unique<Output>(shape);
modulize(output.get());
return take(std::move(output));
}
diff --git a/compiler/coco/core/src/IR/Part.test.cpp b/compiler/coco/core/src/IR/Part.test.cpp
index 87e0e1516..4348d4db2 100644
--- a/compiler/coco/core/src/IR/Part.test.cpp
+++ b/compiler/coco/core/src/IR/Part.test.cpp
@@ -17,11 +17,11 @@
#include "coco/IR/Part.h"
#include "coco/IR/Op.h"
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/coco/core/src/IR/Use.test.cpp b/compiler/coco/core/src/IR/Use.test.cpp
index 3191e9852..b7026385f 100644
--- a/compiler/coco/core/src/IR/Use.test.cpp
+++ b/compiler/coco/core/src/IR/Use.test.cpp
@@ -21,11 +21,11 @@
#include "Consumer.mock.h"
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/coco/generic/CMakeLists.txt b/compiler/coco/generic/CMakeLists.txt
index 02fbf67f5..c65c84c06 100644
--- a/compiler/coco/generic/CMakeLists.txt
+++ b/compiler/coco/generic/CMakeLists.txt
@@ -5,7 +5,6 @@ list(REMOVE_ITEM SOURCES ${TESTS})
add_library(coco_generic SHARED ${SOURCES})
target_include_directories(coco_generic PUBLIC include)
target_link_libraries(coco_generic PUBLIC coco_core)
-target_link_libraries(coco_generic PRIVATE stdex)
target_link_libraries(coco_generic PRIVATE nncc_common)
if(NOT ENABLE_TEST)
@@ -17,6 +16,3 @@ nnas_find_package(GTest REQUIRED)
GTest_AddTest(coco_generic_test ${TESTS})
target_link_libraries(coco_generic_test coco_generic)
-# stdex is a PRIVATE dependency of coco_generic, and thus is not linked to coco_generic_test
-# even though coco_generic_test is linked to coco_generic
-target_link_libraries(coco_generic_test stdex)
diff --git a/compiler/coco/generic/src/IR/Data.cpp b/compiler/coco/generic/src/IR/Data.cpp
index b71947253..5ab7069ee 100644
--- a/compiler/coco/generic/src/IR/Data.cpp
+++ b/compiler/coco/generic/src/IR/Data.cpp
@@ -19,13 +19,12 @@
#include <nncc/core/ADT/kernel/NCHWLayout.h>
#include <nncc/core/ADT/kernel/Overlay.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <map>
using namespace nncc::core::ADT;
-using stdex::make_unique;
+using std::make_unique;
namespace
{
@@ -71,7 +70,7 @@ public:
private:
std::map<const coco::Bag *, std::unique_ptr<std::vector<uint8_t>>> _data;
};
-}
+} // namespace
namespace
{
diff --git a/compiler/common-artifacts/CMakeLists.txt b/compiler/common-artifacts/CMakeLists.txt
index ec9e3cf85..e93a66ef0 100644
--- a/compiler/common-artifacts/CMakeLists.txt
+++ b/compiler/common-artifacts/CMakeLists.txt
@@ -171,9 +171,9 @@ foreach(RECIPE IN ITEMS ${RECIPES})
if(DEFINED RULE_SOURCE_PATH)
# Copy .rule
add_custom_command(OUTPUT ${RULE_BINARY_PATH}
- COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}"
- DEPENDS ${RULE_SOURCE_PATH}
- COMMENT "Generate ${RULE_FILE}"
+ COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}"
+ DEPENDS ${RULE_SOURCE_PATH}
+ COMMENT "Generate ${RULE_FILE}"
)
list(APPEND TEST_DEPS ${RULE_BINARY_PATH})
endif()
@@ -188,21 +188,21 @@ foreach(RECIPE IN ITEMS ${RECIPES})
list(APPEND TEST_DEPS ${TFLITE_OUTPUT_PATH})
if(NOT DEFINED NO_CIRCLIZE_${RECIPE})
- # Generate .circle
- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH} ${CIRCLE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH}
- COMMENT "Generate ${CIRCLE_FILE}"
- )
- set(MODEL_FORMAT "circle")
- list(APPEND TEST_DEPS ${CIRCLE_OUTPUT_PATH})
+ # Generate .circle
+ add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
+ COMMAND $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH} ${CIRCLE_OUTPUT_PATH}
+ DEPENDS $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH}
+ COMMENT "Generate ${CIRCLE_FILE}"
+ )
+ set(MODEL_FORMAT "circle")
+ list(APPEND TEST_DEPS ${CIRCLE_OUTPUT_PATH})
endif()
else()
# Generate .circle
add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH}
- COMMENT "Generate ${CIRCLE_FILE}"
+ COMMAND $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH}
+ DEPENDS $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH}
+ COMMENT "Generate ${CIRCLE_FILE}"
)
list(APPEND TEST_DEPS ${CIRCLE_OUTPUT_PATH})
endif()
@@ -213,7 +213,7 @@ foreach(RECIPE IN ITEMS ${RECIPES})
if(NOT DEFINED NO_OPTIMIZE_${RECIPE})
# Generate optimized .circle
add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:circle2circle> --all ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
+ COMMAND $<TARGET_FILE:circle2circle> --O1 ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_OUTPUT_PATH}
COMMENT "Generate ${OPT_CIRCLE_FILE}"
)
@@ -224,54 +224,43 @@ foreach(RECIPE IN ITEMS ${RECIPES})
set(MODEL_FILE "${RECIPE}${OPT_FORMAT}.${MODEL_FORMAT}")
set(MODEL_PATH "${CMAKE_CURRENT_BINARY_DIR}/${MODEL_FILE}")
set(NNPKG_FILE "${RECIPE}${OPT_FORMAT}")
- set(NNPKG_PATH "${CMAKE_CURRENT_BINARY_DIR}/${NNPKG_FILE}")
+ set(NNPKG_DIR "${CMAKE_CURRENT_BINARY_DIR}/${NNPKG_FILE}")
+ set(NNPKG_MODEL "${NNPKG_DIR}/${MODEL_FILE}")
+
+ # Generate nnpackage directory
+ add_custom_command(OUTPUT ${NNPKG_DIR}
+ COMMAND ${CMAKE_COMMAND} -E make_directory ${NNPKG_DIR}
+ DEPENDS ${MODEL_PATH}
+ COMMENT "Generate ${RECIPE} nnpackage directory"
+ )
+ list(APPEND TEST_DEPS ${NNPKG_DIR})
- add_custom_command(OUTPUT ${NNPKG_PATH}
+ add_custom_command(OUTPUT ${NNPKG_MODEL}
COMMAND ${MODEL2NNPKG} ${MODEL_PATH}
- DEPENDS ${MODEL2NNPKG} ${MODEL_PATH}
+ DEPENDS ${MODEL2NNPKG} ${MODEL_PATH} ${NNPKG_DIR}
COMMENT "Generate ${RECIPE} nnpackage"
)
- list(APPEND TEST_DEPS ${NNPKG_PATH})
-
- set(INPUT_HDF5_FILE "${RECIPE}${OPT_FORMAT}.input.h5")
- set(INPUT_BIN_PATH "${CMAKE_CURRENT_BINARY_DIR}/${INPUT_HDF5_FILE}")
-
- set(EXPECTED_HDF5_FILE "${RECIPE}${OPT_FORMAT}.expected.h5")
- set(EXPECTED_BIN_PATH "${CMAKE_CURRENT_BINARY_DIR}/${EXPECTED_HDF5_FILE}")
+ list(APPEND TEST_DEPS ${NNPKG_MODEL})
if(NOT DEFINED NO_TCGEN_${RECIPE})
- # Generate input.h5, expected.h5
- add_custom_command(OUTPUT ${INPUT_BIN_PATH} ${EXPECTED_BIN_PATH}
- COMMAND $<TARGET_FILE:testDataGenerator> ${MODEL_FILE}
- DEPENDS $<TARGET_FILE:testDataGenerator> ${MODEL_FILE}
- COMMENT "Generate ${INPUT_BIN_PATH} and ${EXPECTED_BIN_PATH}"
- )
-
# Generate test directory
- set(TC_DIRECTORY "${NNPKG_PATH}/metadata/tc")
+ set(TC_DIRECTORY "${NNPKG_DIR}/metadata/tc")
add_custom_command(OUTPUT ${TC_DIRECTORY}
COMMAND ${CMAKE_COMMAND} -E make_directory ${TC_DIRECTORY}
- DEPENDS ${NNPKG_PATH}
+ DEPENDS ${NNPKG_DIR}
COMMENT "Generate ${RECIPE} nnpackage test directory"
)
+ list(APPEND TEST_DEPS ${TC_DIRECTORY})
- # Move input hdf5 file to test directory
- set(INPUT_NNPKG_PATH "${TC_DIRECTORY}/input.h5")
- add_custom_command(OUTPUT ${INPUT_NNPKG_PATH}
- COMMAND ${CMAKE_COMMAND} -E rename ${INPUT_BIN_PATH} ${INPUT_NNPKG_PATH}
- DEPENDS ${INPUT_BIN_PATH} ${TC_DIRECTORY}
- COMMENT "Move ${INPUT_HDF5_FILE} to nnpackage"
- )
-
- # Move expected hdf5 file to test directory
- set(EXPECTED_NNPKG_PATH "${TC_DIRECTORY}/expected.h5")
- add_custom_command(OUTPUT ${EXPECTED_NNPKG_PATH}
- COMMAND ${CMAKE_COMMAND} -E rename ${EXPECTED_BIN_PATH} ${EXPECTED_NNPKG_PATH}
- DEPENDS ${EXPECTED_BIN_PATH} ${TC_DIRECTORY}
- COMMENT "Move ${EXPECTED_HDF5_FILE} to nnpackage"
+ # Generate input.h5, expected.h5
+ set(INPUT_HDF5_FILE "${TC_DIRECTORY}/input.h5")
+ set(EXPECTED_HDF5_FILE "${TC_DIRECTORY}/expected.h5")
+ add_custom_command(OUTPUT ${INPUT_HDF5_FILE} ${EXPECTED_HDF5_FILE}
+ COMMAND $<TARGET_FILE:testDataGenerator> --input_data ${INPUT_HDF5_FILE} --expected_data ${EXPECTED_HDF5_FILE} ${MODEL_FILE}
+ DEPENDS $<TARGET_FILE:testDataGenerator> ${MODEL_FILE} ${TC_DIRECTORY}
+ COMMENT "Generate ${INPUT_HDF5_FILE} and ${EXPECTED_HDF5_FILE}"
)
- list(APPEND TEST_DEPS ${TC_DIRECTORY} ${INPUT_BIN_PATH} ${EXPECTED_BIN_PATH}
- ${INPUT_NNPKG_PATH} ${EXPECTED_NNPKG_PATH})
+ list(APPEND TEST_DEPS ${INPUT_HDF5_FILE} ${EXPECTED_HDF5_FILE})
endif()
endforeach()
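
The net effect of this restructuring: the nnpackage directory and its metadata/tc subdirectory are created as explicit build steps, and testDataGenerator now writes input.h5 and expected.h5 straight into the tc directory through the new --input_data and --expected_data options, instead of writing them next to the model and renaming them afterwards. That drops two rename steps and their intermediate files from the dependency graph.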
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index 34a4d2c6a..b9b758fe7 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -28,6 +28,7 @@ tcgenerate(BatchMatMul_000)
tcgenerate(BatchMatMulV2_000)
tcgenerate(BatchMatMulV2_001)
tcgenerate(BatchToSpaceND_000)
+tcgenerate(BroadcastTo_000) # luci-interpreter doesn't support custom operator
tcgenerate(Cast_000)
tcgenerate(Cast_001)
tcgenerate(Ceil_000)
@@ -41,6 +42,8 @@ tcgenerate(ExpandDims_000)
tcgenerate(ExpandDims_001)
tcgenerate(ExpandDims_002)
tcgenerate(ExpandDims_003)
+tcgenerate(ExpandDims_004)
+tcgenerate(FakeQuant_000) # runtime and luci-interpreter don't support it yet
tcgenerate(Fill_000)
tcgenerate(Fill_001)
tcgenerate(FloorMod_000)
@@ -60,6 +63,9 @@ tcgenerate(MatrixSetDiag_000)
tcgenerate(MaxPoolWithArgMax_000)
tcgenerate(MaxPoolWithArgMax_001)
tcgenerate(MaxPoolWithArgMax_002)
+tcgenerate(Mean_dynamic_000) # TestDataGenerator does not support unknown dimension
+tcgenerate(Mean_dynamic_001) # TestDataGenerator does not support unknown dimension
+tcgenerate(Mean_U8_dynamic_000) # TestDataGenerator does not support unknown dimension
tcgenerate(NonMaxSuppressionV4_000)
tcgenerate(NonMaxSuppressionV4_001)
tcgenerate(NonMaxSuppressionV5_000)
@@ -67,10 +73,8 @@ tcgenerate(NonMaxSuppressionV5_001)
tcgenerate(MirrorPad_000)
tcgenerate(Mul_U8_000)
tcgenerate(Neg_000)
+tcgenerate(Net_BroadcastTo_AddV2_001) # luci-interpreter doesn't support custom operator
tcgenerate(Net_Dangle_001)
-tcgenerate(Net_InstanceNorm_001)
-tcgenerate(Net_InstanceNorm_002)
-tcgenerate(Net_InstanceNorm_003)
tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim
tcgenerate(OneHot_000)
tcgenerate(OneHot_001)
@@ -85,24 +89,26 @@ tcgenerate(ReduceAny_000)
tcgenerate(ReduceAny_001)
tcgenerate(ReduceAny_002)
tcgenerate(ReduceAny_003)
-tcgenerate(ReduceAny_dynamic_000)
-tcgenerate(ReduceAny_dynamic_001)
-tcgenerate(ReduceAny_dynamic_002)
-tcgenerate(ReduceAny_dynamic_003)
+tcgenerate(ReduceAny_dynamic_000) # TestDataGenerator does not support unknown dimension
+tcgenerate(ReduceAny_dynamic_001) # TestDataGenerator does not support unknown dimension
+tcgenerate(ReduceAny_dynamic_002) # TestDataGenerator does not support unknown dimension
+tcgenerate(ReduceAny_dynamic_003) # TestDataGenerator does not support unknown dimension
tcgenerate(ReduceMax_000)
-tcgenerate(ReduceMax_dynamic_000)
+tcgenerate(ReduceMax_dynamic_000) # TestDataGenerator does not support unknown dimension
tcgenerate(ReduceMin_000)
-tcgenerate(ReduceMin_dynamic_000)
+tcgenerate(ReduceMin_dynamic_000) # TestDataGenerator does not support unknown dimension
tcgenerate(ReduceProd_000)
tcgenerate(ReduceProd_001)
tcgenerate(ReduceProd_002)
tcgenerate(ReduceProd_003)
-tcgenerate(ReduceProd_dynamic_000)
-tcgenerate(ReduceProd_dynamic_001)
-tcgenerate(ReduceProd_dynamic_002)
-tcgenerate(ReduceProd_dynamic_003)
+tcgenerate(ReduceProd_dynamic_000) # TestDataGenerator does not support unknown dimension
+tcgenerate(ReduceProd_dynamic_001) # TestDataGenerator does not support unknown dimension
+tcgenerate(ReduceProd_dynamic_002) # TestDataGenerator does not support unknown dimension
+tcgenerate(ReduceProd_dynamic_003) # TestDataGenerator does not support unknown dimension
+tcgenerate(ReLU_dynamic_000) # TestDataGenerator does not support unknown dimension
+tcgenerate(ReLU6_dynamic_000) # TestDataGenerator does not support unknown dimension
tcgenerate(ReLUN1To1_000)
-tcgenerate(ReLUN1To1_dynamic_000)
+tcgenerate(ReLUN1To1_dynamic_000) # TestDataGenerator does not support unknown dimension
tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option
tcgenerate(ReverseSequence_000)
tcgenerate(ReverseV2_000)
@@ -117,6 +123,7 @@ tcgenerate(SelectV2_001)
tcgenerate(SelectV2_002)
tcgenerate(Shape_000)
tcgenerate(Sin_000)
+tcgenerate(Slice_001) # luci-interpreter doesn't support Slice with -1
tcgenerate(SpaceToBatchND_000)
tcgenerate(SpaceToBatchND_001)
tcgenerate(SpaceToBatchND_002)
@@ -124,11 +131,10 @@ tcgenerate(SpaceToBatchND_003)
tcgenerate(SparseToDense_000)
tcgenerate(SplitV_000)
tcgenerate(Square_000)
-tcgenerate(SquaredDifference_000)
tcgenerate(Sum_000)
tcgenerate(Sum_001)
-tcgenerate(Sum_dynamic_000)
-tcgenerate(Sum_dynamic_001)
+tcgenerate(Sum_dynamic_000) # TestDataGenerator does not support unknown dimension
+tcgenerate(Sum_dynamic_001) # TestDataGenerator does not support unknown dimension
tcgenerate(Tile_000)
tcgenerate(Tile_U8_000)
tcgenerate(TopKV2_000)
diff --git a/compiler/common-artifacts/src/TestDataGenerator.cpp b/compiler/common-artifacts/src/TestDataGenerator.cpp
index f8f014442..be6bb5ba9 100644
--- a/compiler/common-artifacts/src/TestDataGenerator.cpp
+++ b/compiler/common-artifacts/src/TestDataGenerator.cpp
@@ -34,7 +34,7 @@ namespace
uint32_t element_num(std::vector<hsize_t> &vec)
{
return static_cast<uint32_t>(
- std::accumulate(std::begin(vec), std::end(vec), 1, std::multiplies<uint32_t>()));
+ std::accumulate(std::begin(vec), std::end(vec), 1, std::multiplies<uint32_t>()));
}
H5::PredType hdf5_dtype_cast(const loco::DataType loco_dtype)
@@ -94,10 +94,20 @@ int entry(int argc, char **argv)
{
arser::Arser arser;
arser.add_argument("circle").type(arser::DataType::STR).help("Circle file you want to test");
+ arser.add_argument("--input_data")
+ .required(true)
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("Path to generate input data h5 file");
+ arser.add_argument("--expected_data")
+ .required(true)
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("Path to generate expected data h5 file");
arser.add_argument("--fixed_seed")
- .required(false)
- .nargs(0)
- .help("Put a fixed seed into the random number generator");
+ .required(false)
+ .nargs(0)
+ .help("Put a fixed seed into the random number generator");
try
{
@@ -111,8 +121,6 @@ int entry(int argc, char **argv)
}
std::string circle_file = arser.get<std::string>("circle");
- size_t last_dot_index = circle_file.find_last_of(".");
- std::string prefix = circle_file.substr(0, last_dot_index);
// load circle file
foder::FileLoader file_loader{circle_file};
@@ -144,17 +152,17 @@ int entry(int argc, char **argv)
* ㄴDATA ...
*/
// create random data and dump into hdf5 file
- H5::H5File input_file{prefix + ".input.h5", H5F_ACC_TRUNC};
+ H5::H5File input_file{arser.get<std::string>("--input_data"), H5F_ACC_TRUNC};
std::unique_ptr<H5::Group> input_name_group =
- std::make_unique<H5::Group>(input_file.createGroup("name"));
+ std::make_unique<H5::Group>(input_file.createGroup("name"));
std::unique_ptr<H5::Group> input_value_group =
- std::make_unique<H5::Group>(input_file.createGroup("value"));
+ std::make_unique<H5::Group>(input_file.createGroup("value"));
- H5::H5File output_file{prefix + ".expected.h5", H5F_ACC_TRUNC};
+ H5::H5File output_file{arser.get<std::string>("--expected_data"), H5F_ACC_TRUNC};
std::unique_ptr<H5::Group> output_name_group =
- std::make_unique<H5::Group>(output_file.createGroup("name"));
+ std::make_unique<H5::Group>(output_file.createGroup("name"));
std::unique_ptr<H5::Group> output_value_group =
- std::make_unique<H5::Group>(output_file.createGroup("value"));
+ std::make_unique<H5::Group>(output_file.createGroup("value"));
std::random_device rd; // used to obtain a seed for the random number engine
uint32_t input_index = 0;
@@ -187,7 +195,7 @@ int entry(int argc, char **argv)
auto dataspace = std::make_unique<H5::DataSpace>(dims.size(), dims.data());
auto dtype = hdf5_dtype_cast(input_node->dtype());
auto dataset = std::make_unique<H5::DataSet>(
- input_file.createDataSet("value/" + std::to_string(input_index), dtype, *dataspace));
+ input_file.createDataSet("value/" + std::to_string(input_index), dtype, *dataspace));
auto data_size = ::element_num(dims);
auto dtype_size = loco::size(input_node->dtype());
@@ -241,7 +249,7 @@ int entry(int argc, char **argv)
auto dataspace = std::make_unique<H5::DataSpace>(dims.size(), dims.data());
auto dtype = hdf5_dtype_cast(output_node->dtype());
auto dataset = std::make_unique<H5::DataSet>(
- output_file.createDataSet("value/" + std::to_string(output_index), dtype, *dataspace));
+ output_file.createDataSet("value/" + std::to_string(output_index), dtype, *dataspace));
uint32_t tensor_bytesize = loco::size(output_node->dtype());
tensor_bytesize *= ::element_num(dims);
diff --git a/compiler/crew/CMakeLists.txt b/compiler/crew/CMakeLists.txt
new file mode 100644
index 000000000..1824d86ab
--- /dev/null
+++ b/compiler/crew/CMakeLists.txt
@@ -0,0 +1,20 @@
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+list(REMOVE_ITEM SOURCES ${TESTS})
+
+add_library(crew STATIC ${SOURCES})
+target_include_directories(crew PRIVATE src)
+target_include_directories(crew PUBLIC include)
+target_link_libraries(crew PRIVATE foder)
+target_link_libraries(crew PRIVATE nncc_common)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(crew_test ${TESTS})
+target_include_directories(crew_test PRIVATE src)
+target_link_libraries(crew_test nncc_common)
+target_link_libraries(crew_test crew)
diff --git a/compiler/crew/README.md b/compiler/crew/README.md
new file mode 100644
index 000000000..29691929d
--- /dev/null
+++ b/compiler/crew/README.md
@@ -0,0 +1,36 @@
+# crew
+
+_crew_ is a circle partitioning Configuration REader and Writer library.
+
+### Supported formats
+
+Currently, _crew_ supports the following formats and functionalities:
+- INI read
+- INI write
+- JSON write
+
+_crew_ supports a limited portion of the JSON and INI formats, just enough to
+access circle partition configuration files.
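+
+A minimal usage sketch (the file path below is hypothetical; the API is
+declared in include/crew/PConfig.h):
+
+```cpp
+#include <crew/PConfig.h>
+
+#include <iostream>
+
+int main(void)
+{
+  crew::PConfig config;
+  // "partition.ini" is a hypothetical path; read_ini returns false when the
+  // expected sections are missing, and throws if the file cannot be read
+  if (!crew::read_ini("partition.ini", config))
+    return 1;
+
+  // re-emit the configuration as INI and as JSON
+  crew::write_ini(std::cout, config);
+  crew::write_json(std::cout, config);
+  return 0;
+}
+```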
diff --git a/compiler/crew/include/crew/PConfig.h b/compiler/crew/include/crew/PConfig.h
new file mode 100644
index 000000000..9ff875574
--- /dev/null
+++ b/compiler/crew/include/crew/PConfig.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CREW_PCONFIG_H__
+#define __CREW_PCONFIG_H__
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace crew
+{
+
+struct Part
+{
+ std::string model_file;
+ std::vector<std::string> inputs;
+ std::vector<std::string> outputs;
+};
+
+using Parts = std::vector<Part>;
+using Source = Part;
+
+struct PConfig
+{
+ Source source;
+ Parts parts;
+};
+
+/**
+ * @brief Read config as ini file, return false if failed
+ */
+bool read_ini(const std::string &path, PConfig &config);
+
+/**
+ * @brief Write config as ini file, return false if failed
+ */
+bool write_ini(std::ostream &os, const PConfig &config);
+
+/**
+ * @brief Write config as json file, return false if failed
+ */
+bool write_json(std::ostream &os, const PConfig &config);
+
+} // namespace crew
+
+#endif // __CREW_PCONFIG_H__
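The `PConfig` structures above are plain data. As a minimal sketch (not part of the patch; all file and tensor names are illustrative), one could build a config by hand and serialize it with the declared functions:

```cpp
// Sketch only: names such as "net.circle" and "in_0" are illustrative.
#include <crew/PConfig.h>

#include <iostream>

int main(void)
{
  crew::PConfig config;

  // the source model and its graph-level inputs/outputs
  config.source.model_file = "net.circle";
  config.source.inputs = {"in_0"};
  config.source.outputs = {"out_0"};

  // one partitioned sub-model covering the same I/O
  crew::Part part;
  part.model_file = "net.00001.circle";
  part.inputs = {"in_0"};
  part.outputs = {"out_0"};
  config.parts.push_back(part);

  // emits [source], [models], and one section per part
  crew::write_ini(std::cout, config);
  return 0;
}
```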
diff --git a/compiler/crew/include/crew/PConfigIni.h b/compiler/crew/include/crew/PConfigIni.h
new file mode 100644
index 000000000..45a54e115
--- /dev/null
+++ b/compiler/crew/include/crew/PConfigIni.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CREW_PCONFIG_INI_H__
+#define __CREW_PCONFIG_INI_H__
+
+#include <iostream>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace crew
+{
+
+using KeyValues = std::unordered_map<std::string, std::string>;
+
+struct Section
+{
+ std::string name;
+ KeyValues items;
+};
+
+using Sections = std::vector<Section>;
+
+/**
+ * @brief Read Config INI from a null-terminated string and return Sections
+ */
+Sections read_ini(const char *data, size_t length);
+/**
+ * @brief Read Config INI from a file and return Sections
+ */
+Sections read_ini(const std::string &path);
+
+/**
+ * @brief Write Config INI with Sections to ostream
+ */
+void write_ini(std::ostream &os, const Sections &sections);
+/**
+ * @brief Write Config INI with Sections to file, throw if failed
+ */
+void write_ini(const std::string &path, const Sections &sections);
+
+/**
+ * @brief Find a section by name, return an empty section if not found
+ */
+Section find(const Sections &sections, const std::string &name);
+
+/**
+ * @brief Find the value for the given key, return an empty string if not found
+ */
+std::string find(const Section &section, const std::string &key);
+
+} // namespace crew
+
+#endif // __CREW_PCONFIG_INI_H__
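For illustration (not part of the patch), a short sketch of the lower-level `Sections` API declared above, parsing an in-memory INI string:

```cpp
// Sketch only: the INI content below is made up for the example.
#include <crew/PConfigIni.h>

#include <cassert>
#include <string>

int main(void)
{
  const std::string ini = "[source]\nfile=model.circle\ni1=input_0\n";
  crew::Sections sections = crew::read_ini(ini.c_str(), ini.size());

  // find() returns an empty Section (or an empty string) when nothing matches
  crew::Section source = crew::find(sections, "source");
  assert(source.name == "source");
  assert(crew::find(source, "file") == "model.circle");
  assert(crew::find(source, "o1").empty());
  return 0;
}
```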
diff --git a/compiler/crew/include/crew/PConfigIniDump.h b/compiler/crew/include/crew/PConfigIniDump.h
new file mode 100644
index 000000000..0755c6b20
--- /dev/null
+++ b/compiler/crew/include/crew/PConfigIniDump.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CREW_PCONFIG_INI_DUMP_H__
+#define __CREW_PCONFIG_INI_DUMP_H__
+
+#include "PConfigIni.h"
+
+#include <iostream>
+
+namespace crew
+{
+
+void dump(std::ostream &os, const Sections &sections);
+
+} // namespace crew
+
+std::ostream &operator<<(std::ostream &os, const crew::Sections &sections);
+
+#endif // __CREW_PCONFIG_INI_DUMP_H__
diff --git a/compiler/crew/requires.cmake b/compiler/crew/requires.cmake
new file mode 100644
index 000000000..27406d465
--- /dev/null
+++ b/compiler/crew/requires.cmake
@@ -0,0 +1 @@
+require("foder")
diff --git a/compiler/crew/src/PConfig.cpp b/compiler/crew/src/PConfig.cpp
new file mode 100644
index 000000000..b8e7c3e44
--- /dev/null
+++ b/compiler/crew/src/PConfig.cpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crew/PConfig.h"
+#include "crew/PConfigIni.h"
+
+#include "PConfigJson.h"
+
+#include <utility>
+
+namespace
+{
+
+bool read_part(const crew::Section &section, crew::Part &part)
+{
+  // construct the Part from the given section
+ part.model_file = crew::find(section, "file");
+ if (part.model_file.empty())
+ return false;
+
+  // read inputs (i1, i2, ...) of the Part
+ for (int32_t i = 1;; ++i)
+ {
+ std::string item = "i" + std::to_string(i);
+ std::string input = crew::find(section, item);
+ if (input.empty())
+ break;
+
+ part.inputs.push_back(input);
+ }
+  // read outputs (o1, o2, ...) of the Part
+ for (int32_t i = 1;; ++i)
+ {
+ std::string item = "o" + std::to_string(i);
+ std::string output = crew::find(section, item);
+ if (output.empty())
+ break;
+
+ part.outputs.push_back(output);
+ }
+ return true;
+}
+
+} // namespace
+
+namespace
+{
+
+void write_part(crew::JsonExport &je, const crew::Part &part)
+{
+ std::vector<std::string> graph_inputs;
+ std::vector<std::string> graph_outputs;
+
+ for (auto &input : part.inputs)
+ {
+ graph_inputs.push_back(input);
+ }
+ for (auto &output : part.outputs)
+ {
+ graph_outputs.push_back(output);
+ }
+
+ je.key_val("file", part.model_file.c_str(), true);
+ je.key_val("inputs", graph_inputs, true);
+ je.key_val("outputs", graph_outputs, false);
+}
+
+void write_parts(crew::JsonExport &je, const crew::Parts &parts)
+{
+ uint32_t idx = 1;
+ uint32_t size = parts.size();
+ for (auto &part : parts)
+ {
+ je.open_brace();
+ write_part(je, part);
+ je.close_brace(idx < size);
+ idx++;
+ }
+}
+
+} // namespace
+
+namespace
+{
+
+void part_to_section_io(const crew::Part &part, crew::Section &section)
+{
+ uint32_t idx = 1;
+ for (auto &input : part.inputs)
+ {
+ std::string key = "i" + std::to_string(idx);
+ section.items.emplace(key, input);
+ idx++;
+ }
+ idx = 1;
+ for (auto &output : part.outputs)
+ {
+ std::string key = "o" + std::to_string(idx);
+ section.items.emplace(key, output);
+ idx++;
+ }
+}
+
+} // namespace
+
+namespace crew
+{
+
+bool read_ini(const std::string &path, PConfig &pconfig)
+{
+ auto sections = crew::read_ini(path);
+
+ auto section_source = crew::find(sections, "source");
+ auto section_models = crew::find(sections, "models");
+ if (section_source.name != "source" || section_models.name != "models")
+ {
+ return false;
+ }
+
+ if (!read_part(section_source, pconfig.source))
+ {
+ return false;
+ }
+
+ // get models list
+ std::vector<std::string> models;
+ for (int32_t i = 1;; ++i)
+ {
+ std::string item = "m" + std::to_string(i);
+ std::string model = crew::find(section_models, item);
+ if (model.empty())
+ break;
+
+ models.push_back(model);
+ }
+
+ for (auto &model : models)
+ {
+ auto section_model = crew::find(sections, model);
+
+ Part part;
+ if (!read_part(section_model, part))
+ {
+ return false;
+ }
+ pconfig.parts.push_back(part);
+ }
+
+ return true;
+}
+
+bool write_ini(std::ostream &os, const PConfig &pconfig)
+{
+ crew::Sections sections;
+
+ // make [source]
+ crew::Section section_source;
+ section_source.name = "source";
+ section_source.items["file"] = pconfig.source.model_file;
+ part_to_section_io(pconfig.source, section_source);
+ sections.push_back(section_source);
+
+ // make [models]
+ crew::Section section_models;
+ section_models.name = "models";
+ uint32_t idx = 1;
+ for (auto &part : pconfig.parts)
+ {
+ std::string key = "m" + std::to_string(idx);
+ section_models.items[key] = part.model_file;
+ idx++;
+ }
+ sections.push_back(section_models);
+
+ for (auto &part : pconfig.parts)
+ {
+ // make circle model section
+ crew::Section section_model;
+ section_model.name = part.model_file;
+ section_model.items["file"] = part.model_file;
+ part_to_section_io(part, section_model);
+ sections.push_back(section_model);
+ }
+
+ write_ini(os, sections);
+
+ return true;
+}
+
+bool write_json(std::ostream &os, const PConfig &pconfig)
+{
+ crew::JsonExport je(os);
+
+ je.open_brace();
+ {
+ je.open_brace("source");
+ write_part(je, pconfig.source);
+ je.close_brace(true);
+ }
+ {
+ je.open_bracket("parts");
+ write_parts(je, pconfig.parts);
+ je.close_bracket(false);
+ }
+ je.close_brace(false);
+
+ return true;
+}
+
+} // namespace crew
diff --git a/compiler/crew/src/PConfigIni.cpp b/compiler/crew/src/PConfigIni.cpp
new file mode 100644
index 000000000..f0e3e8e01
--- /dev/null
+++ b/compiler/crew/src/PConfigIni.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crew/PConfigIni.h"
+#include "crew/PConfigIniDump.h"
+
+#include <foder/FileLoader.h>
+
+#include <cassert>
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+namespace crew
+{
+
+Sections read_ini(const char *data, size_t length)
+{
+ assert(data != nullptr);
+ assert(length > 0);
+
+  auto buffer = std::vector<char>();
+  buffer.resize(length + 1); // resize, not reserve: writing through data() must be well-defined
+ char *pbuffer = buffer.data();
+ memcpy(pbuffer, data, length);
+ // add null at end to be sure
+ *(pbuffer + length) = 0;
+
+ Sections sections;
+ Section section;
+
+ std::string string_line;
+
+ const char *delim = "\r\n";
+ const char *one_line = std::strtok(pbuffer, delim);
+ while (one_line != nullptr)
+ {
+ if (*one_line == '[')
+ {
+ if (!section.name.empty())
+ {
+ sections.push_back(section);
+ }
+ section.name.clear();
+ section.items.clear();
+
+ string_line = one_line + 1;
+ auto pos = string_line.find(']');
+ assert(pos != std::string::npos);
+ if (pos != std::string::npos)
+ {
+ section.name = string_line.substr(0, pos);
+ }
+ }
+ else if (*one_line == '#' || *one_line == ';')
+ {
+ // Comment line, do nothing
+ }
+    else if (*one_line) // string length is not 0
+ {
+ if (section.name.empty())
+ throw std::runtime_error("Invalid INI file");
+
+ string_line = one_line;
+ auto pos = string_line.find('=');
+ assert(pos != std::string::npos);
+ if (pos != std::string::npos)
+ {
+ auto key = string_line.substr(0, pos);
+ auto val = string_line.substr(pos + 1);
+ section.items.emplace(key, val);
+ }
+ }
+
+ one_line = std::strtok(nullptr, delim);
+ }
+ if (!section.name.empty())
+ {
+ sections.push_back(section);
+ }
+
+ return sections;
+}
+
+Sections read_ini(const std::string &path)
+{
+ foder::FileLoader file_loader{path};
+ // load will throw if error while opening
+ auto ini_data = file_loader.load();
+
+ return read_ini(ini_data.data(), ini_data.size());
+}
+
+void write_ini(std::ostream &os, const Sections &sections)
+{
+ std::stringstream ss;
+
+ ss << sections;
+
+ std::string strss = ss.str();
+
+ os.write(strss.c_str(), strss.length());
+}
+
+void write_ini(const std::string &filepath, const Sections &sections)
+{
+ std::ofstream fs(filepath.c_str(), std::ofstream::binary | std::ofstream::trunc);
+ if (not fs.good())
+ {
+ std::string msg = "Failed to create file: " + filepath;
+ throw std::runtime_error(msg);
+ }
+
+ write_ini(fs, sections);
+
+ fs.close();
+}
+
+Section find(const Sections &sections, const std::string &name)
+{
+ for (auto &section : sections)
+ {
+ if (section.name == name)
+ return section;
+ }
+ Section not_found;
+ return not_found;
+}
+
+std::string find(const Section &section, const std::string &key)
+{
+ for (auto &item : section.items)
+ {
+ if (item.first == key)
+ return item.second;
+ }
+ return "";
+}
+
+} // namespace crew
diff --git a/compiler/crew/src/PConfigIni.test.cpp b/compiler/crew/src/PConfigIni.test.cpp
new file mode 100644
index 000000000..bdd2ccc1f
--- /dev/null
+++ b/compiler/crew/src/PConfigIni.test.cpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crew/PConfigIni.h"
+#include "crew/PConfigIniDump.h"
+
+#include <gtest/gtest.h>
+
+#include <sstream>
+#include <stdexcept>
+
+TEST(ConfigIniTest, read_ini_non_exist_file)
+{
+ EXPECT_THROW(crew::read_ini("/hello/world/not_a_file"), std::runtime_error);
+}
+
+TEST(ConfigIniTest, read_ini_simple)
+{
+ std::stringstream ss;
+
+ ss << "[hello]\nkey=world\n";
+
+ auto str = ss.str();
+ auto sections = crew::read_ini(str.c_str(), str.length());
+ ASSERT_EQ(1UL, sections.size());
+
+ auto its = sections.begin();
+ ASSERT_NE(sections.end(), its);
+ EXPECT_TRUE("hello" == its->name);
+ ASSERT_EQ(1UL, its->items.size());
+
+ auto it = its->items.begin();
+ ASSERT_NE(its->items.end(), it);
+ EXPECT_TRUE("key" == it->first);
+ EXPECT_TRUE("world" == it->second);
+}
+
+TEST(ConfigIniTest, read_ini_simple_NEG)
+{
+ std::stringstream ss;
+
+ ss << "key=value\nhello=world\n";
+
+ auto str = ss.str();
+
+ EXPECT_THROW(crew::read_ini(str.c_str(), str.length()), std::runtime_error);
+}
+
+TEST(ConfigIniTest, read_ini_comment)
+{
+ std::stringstream ss;
+
+ ss << "[hello]\n;comment=skip\n#comment=skip\nkey=world\n";
+
+ auto str = ss.str();
+ auto sections = crew::read_ini(str.c_str(), str.length());
+ ASSERT_EQ(1UL, sections.size());
+
+ auto its = sections.begin();
+ ASSERT_NE(sections.end(), its);
+ EXPECT_TRUE("hello" == its->name);
+ ASSERT_EQ(1UL, its->items.size());
+
+ auto it = its->items.begin();
+ ASSERT_NE(its->items.end(), it);
+ EXPECT_TRUE("key" == it->first);
+ EXPECT_TRUE("world" == it->second);
+}
+
+TEST(ConfigIniTest, write_ini_file_error_NEG)
+{
+ crew::Sections sections;
+ EXPECT_THROW(crew::write_ini("/abc/def/cannot_access", sections), std::runtime_error);
+}
diff --git a/compiler/crew/src/PConfigIniDump.cpp b/compiler/crew/src/PConfigIniDump.cpp
new file mode 100644
index 000000000..5b7a1cb6d
--- /dev/null
+++ b/compiler/crew/src/PConfigIniDump.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crew/PConfigIniDump.h"
+
+namespace crew
+{
+
+/**
+ * @brief Dump content of sections
+ */
+void dump(std::ostream &os, const Sections &sections)
+{
+ for (auto &section : sections)
+ {
+ os << "[" << section.name << "]" << std::endl;
+ for (auto &item : section.items)
+ {
+ os << item.first << "=" << item.second << std::endl;
+ }
+ os << std::endl;
+ }
+}
+
+} // namespace crew
+
+std::ostream &operator<<(std::ostream &os, const crew::Sections &sections)
+{
+ crew::dump(os, sections);
+ return os;
+}
diff --git a/compiler/crew/src/PConfigIniDump.test.cpp b/compiler/crew/src/PConfigIniDump.test.cpp
new file mode 100644
index 000000000..25cf4736b
--- /dev/null
+++ b/compiler/crew/src/PConfigIniDump.test.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "crew/PConfigIni.h"
+#include "crew/PConfigIniDump.h"
+
+#include <gtest/gtest.h>
+
+#include <sstream>
+#include <stdexcept>
+
+TEST(ConfigIniDumpTest, dump_sections)
+{
+ crew::Sections sections;
+ crew::Section section;
+
+ section.name = "hello";
+ section.items["key"] = "value";
+
+ sections.push_back(section);
+
+ std::stringstream ss;
+
+ ss << sections;
+
+ // there's extra \n at end of each section
+ ASSERT_TRUE(ss.str() == "[hello]\nkey=value\n\n");
+}
diff --git a/compiler/crew/src/PConfigJson.cpp b/compiler/crew/src/PConfigJson.cpp
new file mode 100644
index 000000000..5af0ebddd
--- /dev/null
+++ b/compiler/crew/src/PConfigJson.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PConfigJson.h"
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+namespace
+{
+
+const char _CLF = '\n'; // Control Line Feed
+const char _DQU = '\"'; // Double QUotation
+
+} // namespace
+
+namespace crew
+{
+
+void JsonExport::indent(void)
+{
+ for (uint32_t i = 0; i < _indent; ++i)
+ _os << " ";
+}
+
+void JsonExport::open_brace(void)
+{
+ indent();
+
+ _os << "{" << _CLF;
+ _indent++;
+}
+
+void JsonExport::open_brace(const std::string &key)
+{
+ indent();
+
+ _os << _DQU << key << _DQU << " : {" << _CLF;
+ _indent++;
+}
+
+void JsonExport::open_bracket(const std::string &key)
+{
+ indent();
+
+ _os << _DQU << key << _DQU << " : [" << _CLF;
+ _indent++;
+}
+
+void JsonExport::close_bracket(bool cont)
+{
+ _indent--;
+ indent();
+
+ _os << "]";
+ if (cont)
+ _os << ",";
+ _os << _CLF;
+}
+
+void JsonExport::close_brace(bool cont)
+{
+ _indent--;
+ indent();
+
+ _os << "}";
+ if (cont)
+ _os << ",";
+ _os << _CLF;
+}
+
+void JsonExport::key_val(const std::string &key, const std::string &value, bool cont)
+{
+ indent();
+
+ _os << _DQU << key << _DQU << " : " << _DQU << value << _DQU;
+ if (cont)
+ _os << ",";
+ _os << _CLF;
+}
+
+void JsonExport::key_val(const std::string &key, const std::vector<std::string> &l, bool cont)
+{
+ indent();
+
+ _os << _DQU << key << _DQU << " : [ ";
+ bool comma = false;
+ for (auto &v : l)
+ {
+ if (comma)
+ _os << ", ";
+ else
+ comma = true;
+ _os << _DQU << v << _DQU;
+ }
+ _os << " ]";
+ if (cont)
+ _os << ",";
+ _os << _CLF;
+}
+
+} // namespace crew
diff --git a/compiler/crew/src/PConfigJson.h b/compiler/crew/src/PConfigJson.h
new file mode 100644
index 000000000..c5c49d096
--- /dev/null
+++ b/compiler/crew/src/PConfigJson.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CREW_PCONFIG_JSON_H__
+#define __CREW_PCONFIG_JSON_H__
+
+#include <ostream>
+#include <string>
+#include <vector>
+
+namespace crew
+{
+
+class JsonExport
+{
+public:
+ JsonExport(std::ostream &os) : _os(os) {}
+
+private:
+ void indent(void);
+
+public:
+ void open_brace(void);
+ void open_brace(const std::string &key);
+ void open_bracket(const std::string &key);
+ void close_bracket(bool cont);
+ void close_brace(bool cont);
+ void key_val(const std::string &key, const std::string &value, bool cont);
+ void key_val(const std::string &key, const std::vector<std::string> &l, bool cont);
+
+private:
+ std::ostream &_os;
+ uint32_t _indent = 0;
+};
+
+} // namespace crew
+
+#endif // __CREW_PCONFIG_JSON_H__
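To show how the `cont` flags are meant to chain elements (mirroring the nesting `write_json` in PConfig.cpp uses), a small sketch follows; the emitted text in the comments is approximate and the "version" key is hypothetical:

```cpp
// Sketch only: PConfigJson.h is an internal (src/) header, so this builds
// inside the crew source tree; the "version" key is hypothetical.
#include "PConfigJson.h"

#include <iostream>

int main(void)
{
  crew::JsonExport je(std::cout);

  je.open_brace();                        // {
  je.open_bracket("parts");               //   "parts" : [
  je.open_brace();                        //     {
  je.key_val("file", "p1.circle", false); //       "file" : "p1.circle"
  je.close_brace(false);                  //     }
  je.close_bracket(true);                 //   ],  cont=true appends the comma
  je.key_val("version", "1", false);      //   "version" : "1"
  je.close_brace(false);                  // }
  return 0;
}
```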
diff --git a/compiler/crew/src/PConfigJson.test.cpp b/compiler/crew/src/PConfigJson.test.cpp
new file mode 100644
index 000000000..f8afabc3d
--- /dev/null
+++ b/compiler/crew/src/PConfigJson.test.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PConfigJson.h"
+
+#include <gtest/gtest.h>
+
+#include <sstream>
+
+TEST(ConfigJsonTest, empty)
+{
+ std::stringstream ss;
+ crew::JsonExport je(ss);
+
+ je.open_brace();
+ je.close_brace(true);
+
+ ASSERT_TRUE(ss.str() == "{\n},\n");
+}
+
+TEST(ConfigJsonTest, keyvalue)
+{
+ std::stringstream ss;
+ crew::JsonExport je(ss);
+
+ je.open_brace("hello");
+ je.key_val("key", "value", true);
+ je.close_brace(true);
+
+ ASSERT_TRUE(ss.str() == "\"hello\" : {\n \"key\" : \"value\",\n},\n");
+}
+
+TEST(ConfigJsonTest, keyvaluearray)
+{
+ std::stringstream ss;
+ crew::JsonExport je(ss);
+ std::vector<std::string> vs = {"1", "2"};
+
+ je.open_brace("hello");
+ je.key_val("key", vs, true);
+ je.close_brace(true);
+
+ ASSERT_TRUE(ss.str() == "\"hello\" : {\n \"key\" : [ \"1\", \"2\" ],\n},\n");
+}
+
+TEST(ConfigJsonTest, bracket)
+{
+ std::stringstream ss;
+ crew::JsonExport je(ss);
+
+ je.open_bracket("hello");
+ je.close_bracket(true);
+
+ ASSERT_TRUE(ss.str() == "\"hello\" : [\n],\n");
+}
diff --git a/compiler/cwrap/src/Fildes.test.cpp b/compiler/cwrap/src/Fildes.test.cpp
index f9fa20f9e..c487f064a 100644
--- a/compiler/cwrap/src/Fildes.test.cpp
+++ b/compiler/cwrap/src/Fildes.test.cpp
@@ -44,7 +44,7 @@ int make_temp(char *name_template)
return fd;
}
-} // namespace make_temp
+} // namespace
TEST(FildesTest, default_constructor)
{
diff --git a/compiler/enco/cli/CMakeLists.txt b/compiler/enco/cli/CMakeLists.txt
index 5a43ab655..6777f329b 100644
--- a/compiler/enco/cli/CMakeLists.txt
+++ b/compiler/enco/cli/CMakeLists.txt
@@ -5,7 +5,6 @@ target_include_directories(enco-cli PRIVATE src)
target_link_libraries(enco-cli enco_intf_cmdline)
target_link_libraries(enco-cli enco_intf_frontend)
target_link_libraries(enco-cli enco_core)
-target_link_libraries(enco-cli stdex)
target_link_libraries(enco-cli dl)
# Let's use project-wide compile options
target_link_libraries(enco-cli nncc_common)
diff --git a/compiler/enco/cli/src/Driver.cpp b/compiler/enco/cli/src/Driver.cpp
index 185bb13b9..fe6cefb8c 100644
--- a/compiler/enco/cli/src/Driver.cpp
+++ b/compiler/enco/cli/src/Driver.cpp
@@ -135,8 +135,7 @@ private:
} // namespace
-#include <stdex/Memory.h>
-
+#include <memory>
#include <map>
#include <iostream>
@@ -153,7 +152,7 @@ static int entry(int argc, char **argv)
std::map<std::string, std::function<void(const std::string &arg)>> argparse;
argparse["--frontend"] = [&](const std::string &path) {
- frontend_zone = stdex::make_unique<FrontendZone>(path);
+ frontend_zone = std::make_unique<FrontendZone>(path);
};
argparse["--frontend-arg"] = [&](const std::string &arg) { frontend_zone->append(arg); };
diff --git a/compiler/enco/core/CMakeLists.txt b/compiler/enco/core/CMakeLists.txt
index f437e687a..25dad2bc6 100644
--- a/compiler/enco/core/CMakeLists.txt
+++ b/compiler/enco/core/CMakeLists.txt
@@ -17,7 +17,6 @@ target_link_libraries(enco_core PUBLIC coco_generic)
# These libraries are linked for internal use, and thus does not appear in public headers.
target_link_libraries(enco_core PRIVATE pp)
target_link_libraries(enco_core PRIVATE morph)
-target_link_libraries(enco_core PRIVATE stdex)
# Let's use nncc project-wide build options
target_link_libraries(enco_core PRIVATE nncc_common)
diff --git a/compiler/enco/core/src/ANN/Binder.h b/compiler/enco/core/src/ANN/Binder.h
index 71b95676b..be9f705c7 100644
--- a/compiler/enco/core/src/ANN/Binder.h
+++ b/compiler/enco/core/src/ANN/Binder.h
@@ -32,7 +32,7 @@ class ANNBinder
{
public:
ANNBinder(coco::Block *block, std::unique_ptr<ann::Module> &&module)
- : _block{block}, _module{std::move(module)}
+ : _block{block}, _module{std::move(module)}
{
// DO NOTHING
}
diff --git a/compiler/enco/core/src/ANN/Context.cpp b/compiler/enco/core/src/ANN/Context.cpp
index d4d1882fa..b6d2a3d42 100644
--- a/compiler/enco/core/src/ANN/Context.cpp
+++ b/compiler/enco/core/src/ANN/Context.cpp
@@ -16,12 +16,12 @@
#include "ANN/Context.h"
-#include <stdex/Memory.h>
+#include <memory>
ANNBinder *ANNContext::create(coco::Block *blk)
{
- auto mod = stdex::make_unique<ann::Module>();
- auto obj = stdex::make_unique<ANNBinder>(blk, std::move(mod));
+ auto mod = std::make_unique<ann::Module>();
+ auto obj = std::make_unique<ANNBinder>(blk, std::move(mod));
auto ptr = obj.get();
_binders.emplace_back(std::move(obj));
diff --git a/compiler/enco/core/src/ANN/Context.test.cpp b/compiler/enco/core/src/ANN/Context.test.cpp
index 7fd26f30c..252d92290 100644
--- a/compiler/enco/core/src/ANN/Context.test.cpp
+++ b/compiler/enco/core/src/ANN/Context.test.cpp
@@ -33,7 +33,7 @@ public:
protected:
std::unique_ptr<coco::Module> m;
};
-}
+} // namespace
TEST_F(ANNContextTest, constructor)
{
diff --git a/compiler/enco/core/src/ANN/IR/OperandInventory.cpp b/compiler/enco/core/src/ANN/IR/OperandInventory.cpp
index c7ad38811..4399c3900 100644
--- a/compiler/enco/core/src/ANN/IR/OperandInventory.cpp
+++ b/compiler/enco/core/src/ANN/IR/OperandInventory.cpp
@@ -16,9 +16,9 @@
#include "ANN/IR/OperandInventory.h"
-#include <stdex/Memory.h>
+#include <memory>
-using stdex::make_unique;
+using std::make_unique;
namespace ann
{
diff --git a/compiler/enco/core/src/ANN/IR/Operation.h b/compiler/enco/core/src/ANN/IR/Operation.h
index cacc2b794..a1f1d46e2 100644
--- a/compiler/enco/core/src/ANN/IR/Operation.h
+++ b/compiler/enco/core/src/ANN/IR/Operation.h
@@ -38,7 +38,7 @@ public:
public:
Operation(const Code &code, std::initializer_list<OperandID> inputs,
std::initializer_list<OperandID> outputs)
- : _code{code}, _inputs{inputs}, _outputs{outputs}
+ : _code{code}, _inputs{inputs}, _outputs{outputs}
{
// DO NOTHING
}
diff --git a/compiler/enco/core/src/ANN/IR/OperationInventory.cpp b/compiler/enco/core/src/ANN/IR/OperationInventory.cpp
index 37d48c170..93108dfb7 100644
--- a/compiler/enco/core/src/ANN/IR/OperationInventory.cpp
+++ b/compiler/enco/core/src/ANN/IR/OperationInventory.cpp
@@ -16,9 +16,9 @@
#include "OperationInventory.h"
-#include <stdex/Memory.h>
+#include <memory>
-using stdex::make_unique;
+using std::make_unique;
namespace ann
{
diff --git a/compiler/enco/core/src/ANN/IR/WeightInventory.cpp b/compiler/enco/core/src/ANN/IR/WeightInventory.cpp
index d8809ac08..edcb16aed 100644
--- a/compiler/enco/core/src/ANN/IR/WeightInventory.cpp
+++ b/compiler/enco/core/src/ANN/IR/WeightInventory.cpp
@@ -16,9 +16,9 @@
#include "WeightInventory.h"
-#include <stdex/Memory.h>
+#include <memory>
-using stdex::make_unique;
+using std::make_unique;
namespace ann
{
diff --git a/compiler/enco/core/src/AsmCode.h b/compiler/enco/core/src/AsmCode.h
index c43892888..6d57f1851 100644
--- a/compiler/enco/core/src/AsmCode.h
+++ b/compiler/enco/core/src/AsmCode.h
@@ -27,7 +27,7 @@ class AsmCode
{
public:
AsmCode(const std::string &filename, const std::string &varname)
- : _filename{filename}, _varname{varname}
+ : _filename{filename}, _varname{varname}
{
// DO NOTHING
}
diff --git a/compiler/enco/core/src/Backend.cpp b/compiler/enco/core/src/Backend.cpp
index d4bec7447..77374fecd 100644
--- a/compiler/enco/core/src/Backend.cpp
+++ b/compiler/enco/core/src/Backend.cpp
@@ -44,13 +44,12 @@
#include "Transforms/Split.h"
#include "Transforms/GlobalDataGeneration.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <stdexcept>
#include <iostream>
#include <fstream>
-using stdex::make_unique;
+using std::make_unique;
using namespace enco;
namespace
@@ -168,7 +167,7 @@ void BackendImpl::compile(coco::Module *m, coco::Data *d)
ofs << CppCode{data_var, code(sess)} << std::endl;
}
-} // namespace enco
+} // namespace
#include <iostream>
diff --git a/compiler/enco/core/src/CodeIndex.h b/compiler/enco/core/src/CodeIndex.h
index 7f2da6463..ed8f24109 100644
--- a/compiler/enco/core/src/CodeIndex.h
+++ b/compiler/enco/core/src/CodeIndex.h
@@ -30,7 +30,7 @@ public:
public:
CodeIndex(const coco::BlockIndex &blk_ind, const coco::InstrIndex &ins_ind)
- : _blk_ind{blk_ind}, _ins_ind{ins_ind}
+ : _blk_ind{blk_ind}, _ins_ind{ins_ind}
{
}
diff --git a/compiler/enco/core/src/CppGen/Host.cpp b/compiler/enco/core/src/CppGen/Host.cpp
index 37e0583d7..7f9456239 100644
--- a/compiler/enco/core/src/CppGen/Host.cpp
+++ b/compiler/enco/core/src/CppGen/Host.cpp
@@ -18,8 +18,7 @@
#include <pp/EnclosedDocument.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <map>
#include <string>
@@ -157,7 +156,7 @@ public:
public:
TransferLoop(uint32_t count, uint32_t src_step, uint32_t dst_step)
- : _count{count}, _step{src_step, dst_step}
+ : _count{count}, _step{src_step, dst_step}
{
// DO NOTHING
}
@@ -293,7 +292,7 @@ std::unique_ptr<pp::MultiLineText> HostBlockCompiler::compile(const coco::Block
{
InstrPrinter prn{_mem};
- auto res = stdex::make_unique<pp::LinearDocument>();
+ auto res = std::make_unique<pp::LinearDocument>();
for (auto ins = blk->instr()->head(); ins; ins = ins->next())
{
diff --git a/compiler/enco/core/src/CppGen/Subnet.cpp b/compiler/enco/core/src/CppGen/Subnet.cpp
index 9a636c6ae..599b0794e 100644
--- a/compiler/enco/core/src/CppGen/Subnet.cpp
+++ b/compiler/enco/core/src/CppGen/Subnet.cpp
@@ -21,11 +21,10 @@
#include <pp/LinearDocument.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <sstream>
-using stdex::make_unique;
+using std::make_unique;
using enco::concat;
#define S(content) #content
@@ -117,7 +116,7 @@ class ScalarOperandDecl final : public CodeFragment
{
public:
ScalarOperandDecl(const std::string &model, const ann::DType &dtype)
- : _model{model}, _dtype{dtype}
+ : _model{model}, _dtype{dtype}
{
// DO NOTHING
}
@@ -150,7 +149,7 @@ class TensorOperandDecl final : public CodeFragment
public:
TensorOperandDecl(const std::string &model, const ann::DType &dtype,
const nncc::core::ADT::tensor::Shape &shape)
- : _model{model}, _dtype{dtype}, _shape{shape}
+ : _model{model}, _dtype{dtype}, _shape{shape}
{
// DO NOTHING
}
@@ -194,7 +193,7 @@ class WeightDecl final : public CodeFragment
public:
WeightDecl(const std::string &model, const ann::OperandID &id, const std::string &base,
const std::string &size)
- : _model{model}, _id{id}, _base{base}, _size{size}
+ : _model{model}, _id{id}, _base{base}, _size{size}
{
// DO NOTHING
}
diff --git a/compiler/enco/core/src/Session.cpp b/compiler/enco/core/src/Session.cpp
index 034f23892..18af87ace 100644
--- a/compiler/enco/core/src/Session.cpp
+++ b/compiler/enco/core/src/Session.cpp
@@ -16,12 +16,10 @@
#include "Session.h"
-#include <stdex/Memory.h>
-
#include <map>
#include <memory>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/enco/core/src/Support/Debugging.cpp b/compiler/enco/core/src/Support/Debugging.cpp
index bd65a27d8..9a9a7745e 100644
--- a/compiler/enco/core/src/Support/Debugging.cpp
+++ b/compiler/enco/core/src/Support/Debugging.cpp
@@ -77,7 +77,7 @@ pp::LinearDocument operator<<(const SectionBuilder &builder, Callback cb)
}
SectionBuilder section(const std::string &tag) { return SectionBuilder{tag}; }
-}
+} // namespace
/**
* SECTION: Bag
diff --git a/compiler/enco/core/src/Transforms/FeatureUnification.cpp b/compiler/enco/core/src/Transforms/FeatureUnification.cpp
index 1a7a0a8a4..9e4a8e19f 100644
--- a/compiler/enco/core/src/Transforms/FeatureUnification.cpp
+++ b/compiler/enco/core/src/Transforms/FeatureUnification.cpp
@@ -17,14 +17,13 @@
#include "FeatureUnification.h"
#include "IRUtils.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <set>
#include <vector>
#include <cassert>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/enco/core/src/Transforms/GlobalDataGeneration.cpp b/compiler/enco/core/src/Transforms/GlobalDataGeneration.cpp
index 152477a51..cb5a0a9a9 100644
--- a/compiler/enco/core/src/Transforms/GlobalDataGeneration.cpp
+++ b/compiler/enco/core/src/Transforms/GlobalDataGeneration.cpp
@@ -18,11 +18,10 @@
#include "Split.h"
#include "Dims.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <map>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/enco/core/src/Transforms/Split.cpp b/compiler/enco/core/src/Transforms/Split.cpp
index b57b8f882..714c27a72 100644
--- a/compiler/enco/core/src/Transforms/Split.cpp
+++ b/compiler/enco/core/src/Transforms/Split.cpp
@@ -22,13 +22,13 @@
#include <coco/IR.h>
#include <nncc/core/ADT/kernel/NHWCLayout.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <map>
#include <stdexcept>
#include <functional>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
@@ -337,8 +337,8 @@ public:
auto ofm = binder->addOperand<float>(_ofm);
binder->addOperation(
- ann::Operation::Code::DEPTHWISE_CONV_2D,
- {ifm, ker, bias, left, right, top, bottom, hstride, vstride, multiplier, fuse}, {ofm});
+ ann::Operation::Code::DEPTHWISE_CONV_2D,
+ {ifm, ker, bias, left, right, top, bottom, hstride, vstride, multiplier, fuse}, {ofm});
}
private:
diff --git a/compiler/enco/core/src/Transforms/Split.h b/compiler/enco/core/src/Transforms/Split.h
index b4e1d7baf..85ad2684f 100644
--- a/compiler/enco/core/src/Transforms/Split.h
+++ b/compiler/enco/core/src/Transforms/Split.h
@@ -43,6 +43,6 @@ struct PhaseConstructionPass final : public Pass
void run(const SessionID &sess) const override { split_into_phases(code(sess)); }
};
-} // namespace enco;
+} // namespace enco
#endif // __SPLIT_H__
diff --git a/compiler/enco/frontend/caffe/CMakeLists.txt b/compiler/enco/frontend/caffe/CMakeLists.txt
index ce43a41d3..9722392a1 100644
--- a/compiler/enco/frontend/caffe/CMakeLists.txt
+++ b/compiler/enco/frontend/caffe/CMakeLists.txt
@@ -16,7 +16,6 @@ target_link_libraries(enco_caffe_frontend enco_intf_frontend)
target_link_libraries(enco_caffe_frontend enco_intf_cmdline)
target_link_libraries(enco_caffe_frontend morph)
target_link_libraries(enco_caffe_frontend caffeproto)
-target_link_libraries(enco_caffe_frontend stdex)
nnas_find_package(GTest QUIET)
diff --git a/compiler/enco/frontend/caffe/src/Context.h b/compiler/enco/frontend/caffe/src/Context.h
index aca57ce6f..7cf27ead4 100644
--- a/compiler/enco/frontend/caffe/src/Context.h
+++ b/compiler/enco/frontend/caffe/src/Context.h
@@ -81,8 +81,8 @@ public:
explicit GraphBuilderContext(coco::Module *module, coco::Data *data, coco::Block *block,
ShapeContext &shape_ctx, StoreContext &bag_ctx,
WeightContext &weight_ctx)
- : _module(module), _data(data), _block(block), _shape_ctx(shape_ctx), _bag_ctx(bag_ctx),
- _weight_ctx(weight_ctx)
+ : _module(module), _data(data), _block(block), _shape_ctx(shape_ctx), _bag_ctx(bag_ctx),
+ _weight_ctx(weight_ctx)
{
// DO NOTHING
}
diff --git a/compiler/enco/frontend/caffe/src/Entry.cpp b/compiler/enco/frontend/caffe/src/Entry.cpp
index 2bdb73eac..41e174bc4 100644
--- a/compiler/enco/frontend/caffe/src/Entry.cpp
+++ b/compiler/enco/frontend/caffe/src/Entry.cpp
@@ -19,8 +19,7 @@
#include <cmdline/View.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <fstream>
#include <cassert>
@@ -28,7 +27,7 @@ extern "C" std::unique_ptr<enco::Frontend> make_frontend(const cmdline::View &cm
{
assert(cmdline.size() == 2);
- auto frontend = stdex::make_unique<Frontend>();
+ auto frontend = std::make_unique<Frontend>();
// Fill prototxt
{
diff --git a/compiler/enco/frontend/caffe/src/GraphBuilderRegistry.cpp b/compiler/enco/frontend/caffe/src/GraphBuilderRegistry.cpp
index e9db31177..d9a1c9617 100644
--- a/compiler/enco/frontend/caffe/src/GraphBuilderRegistry.cpp
+++ b/compiler/enco/frontend/caffe/src/GraphBuilderRegistry.cpp
@@ -25,9 +25,9 @@
#include "Layer/Scale.h"
#include "Layer/BatchNorm.h"
-#include <stdex/Memory.h>
+#include <memory>
-using stdex::make_unique;
+using std::make_unique;
namespace caffeimport
{
diff --git a/compiler/enco/frontend/caffe/src/Layer/Convolution.cpp b/compiler/enco/frontend/caffe/src/Layer/Convolution.cpp
index 9fb096d49..807cce44d 100644
--- a/compiler/enco/frontend/caffe/src/Layer/Convolution.cpp
+++ b/compiler/enco/frontend/caffe/src/Layer/Convolution.cpp
@@ -101,7 +101,7 @@ void ConvolutionBuilder::build(const ::caffe::LayerParameter &layer,
auto ker_dst = data->f32()->access(ker_obj);
auto ker_src = kernel::OverlayFactory<float, kernel::NCHWLayout>::make(
- ker_obj->shape(), ker_blob->mutable_data()->begin());
+ ker_obj->shape(), ker_blob->mutable_data()->begin());
for (uint32_t n = 0; n < ker_obj->shape().count(); ++n)
{
diff --git a/compiler/enco/frontend/tflite/CMakeLists.txt b/compiler/enco/frontend/tflite/CMakeLists.txt
index 77159879e..ea10fbc4b 100644
--- a/compiler/enco/frontend/tflite/CMakeLists.txt
+++ b/compiler/enco/frontend/tflite/CMakeLists.txt
@@ -19,7 +19,6 @@ target_link_libraries(enco_tflite_frontend enco_intf_frontend)
target_link_libraries(enco_tflite_frontend enco_intf_cmdline)
target_link_libraries(enco_tflite_frontend flatbuffers)
target_link_libraries(enco_tflite_frontend enco_tflite_schema)
-target_link_libraries(enco_tflite_frontend stdex)
target_link_libraries(enco_tflite_frontend morph)
target_link_libraries(enco_tflite_frontend cwrap)
diff --git a/compiler/enco/frontend/tflite/src/Context.cpp b/compiler/enco/frontend/tflite/src/Context.cpp
index ef030dc5d..588c3c44b 100644
--- a/compiler/enco/frontend/tflite/src/Context.cpp
+++ b/compiler/enco/frontend/tflite/src/Context.cpp
@@ -48,7 +48,7 @@ void TensorContext::prepare(const tflite::SubGraph *graph)
}
TflOpCodeContext::TflOpCodeContext(
- const flatbuffers::Vector<flatbuffers::Offset<tflite::OperatorCode>> *opcodes)
+ const flatbuffers::Vector<flatbuffers::Offset<tflite::OperatorCode>> *opcodes)
{
for (const tflite::OperatorCode *opcode : *opcodes)
{
diff --git a/compiler/enco/frontend/tflite/src/Context.h b/compiler/enco/frontend/tflite/src/Context.h
index f72385f9a..caeac4ab5 100644
--- a/compiler/enco/frontend/tflite/src/Context.h
+++ b/compiler/enco/frontend/tflite/src/Context.h
@@ -135,8 +135,8 @@ public:
explicit GraphBuilderContext(coco::Module *m, coco::Data *d, coco::Block *block,
TensorBags &tensor_bags, TensorContext &tensor_context,
TflBufferContext &buffer_context, const tflite::SubGraph *graph)
- : _m(m), _d(d), _block(block), _tensor_bags(tensor_bags), _tensor_context(tensor_context),
- _buffer_context(buffer_context), _graph(graph)
+ : _m(m), _d(d), _block(block), _tensor_bags(tensor_bags), _tensor_context(tensor_context),
+ _buffer_context(buffer_context), _graph(graph)
{
// DO NOTHING
}
diff --git a/compiler/enco/frontend/tflite/src/Entry.cpp b/compiler/enco/frontend/tflite/src/Entry.cpp
index c69e18074..74d3096ab 100644
--- a/compiler/enco/frontend/tflite/src/Entry.cpp
+++ b/compiler/enco/frontend/tflite/src/Entry.cpp
@@ -19,12 +19,11 @@
#include <cmdline/View.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <fstream>
#include <cassert>
-using stdex::make_unique;
+using std::make_unique;
extern "C" std::unique_ptr<enco::Frontend> make_frontend(const cmdline::View &cmdline)
{
diff --git a/compiler/enco/frontend/tflite/src/Frontend.test.cpp b/compiler/enco/frontend/tflite/src/Frontend.test.cpp
index aee6099e7..1bc774629 100644
--- a/compiler/enco/frontend/tflite/src/Frontend.test.cpp
+++ b/compiler/enco/frontend/tflite/src/Frontend.test.cpp
@@ -16,11 +16,11 @@
#include "Frontend.h"
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
diff --git a/compiler/enco/frontend/tflite/src/GraphBuilderRegistry.h b/compiler/enco/frontend/tflite/src/GraphBuilderRegistry.h
index 1ae882e89..ca4f74fc5 100644
--- a/compiler/enco/frontend/tflite/src/GraphBuilderRegistry.h
+++ b/compiler/enco/frontend/tflite/src/GraphBuilderRegistry.h
@@ -29,11 +29,11 @@
#include "Op/Div.h"
#include <schema_generated.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <map>
-using stdex::make_unique;
+using std::make_unique;
namespace tflimport
{
@@ -68,7 +68,7 @@ private:
// add GraphBuilder for each tflite operation.
_builder_map[tflite::BuiltinOperator_CONV_2D] = make_unique<Conv2DGraphBuilder>();
_builder_map[tflite::BuiltinOperator_DEPTHWISE_CONV_2D] =
- make_unique<DepthwiseConv2DGraphBuilder>();
+ make_unique<DepthwiseConv2DGraphBuilder>();
_builder_map[tflite::BuiltinOperator_AVERAGE_POOL_2D] = make_unique<AvgPool2DGraphBuilder>();
_builder_map[tflite::BuiltinOperator_MAX_POOL_2D] = make_unique<MaxPool2DGraphBuilder>();
_builder_map[tflite::BuiltinOperator_CONCATENATION] = make_unique<ConcatenationGraphBuilder>();
diff --git a/compiler/enco/frontend/tflite/src/Op/AveragePool2D.cpp b/compiler/enco/frontend/tflite/src/Op/AveragePool2D.cpp
index 16f68fcdb..6f8223f10 100644
--- a/compiler/enco/frontend/tflite/src/Op/AveragePool2D.cpp
+++ b/compiler/enco/frontend/tflite/src/Op/AveragePool2D.cpp
@@ -102,7 +102,7 @@ void AvgPool2DGraphBuilder::build(const tflite::Operator *op, GraphBuilderContex
coco_avgpool2d->stride()->horizontal(params->stride_w());
coco::Padding2D padding =
- pool2D_padding(params, ifm_shape, params->filter_width(), params->filter_height());
+ pool2D_padding(params, ifm_shape, params->filter_width(), params->filter_height());
coco_avgpool2d->pad()->top(padding.top());
coco_avgpool2d->pad()->bottom(padding.bottom());
diff --git a/compiler/enco/frontend/tflite/src/Op/Conv2D.cpp b/compiler/enco/frontend/tflite/src/Op/Conv2D.cpp
index e9516c0e9..d1f97597f 100644
--- a/compiler/enco/frontend/tflite/src/Op/Conv2D.cpp
+++ b/compiler/enco/frontend/tflite/src/Op/Conv2D.cpp
@@ -171,7 +171,7 @@ void Conv2DGraphBuilder::build(const tflite::Operator *op, GraphBuilderContext *
// fused activation
coco::FeatureObject *act_output =
- build_activation(conv_params->fused_activation_function(), blk, last_obj);
+ build_activation(conv_params->fused_activation_function(), blk, last_obj);
// Create Copy Instr of last_obj to Output Object
auto copy_ins = instr_builder(m).copy(ofm_obj, act_output);
diff --git a/compiler/enco/frontend/tflite/src/Op/DepthwiseConv2D.cpp b/compiler/enco/frontend/tflite/src/Op/DepthwiseConv2D.cpp
index e3d7b263e..bc903c380 100644
--- a/compiler/enco/frontend/tflite/src/Op/DepthwiseConv2D.cpp
+++ b/compiler/enco/frontend/tflite/src/Op/DepthwiseConv2D.cpp
@@ -138,8 +138,8 @@ void DepthwiseConv2DGraphBuilder::build(const tflite::Operator *op,
auto wc = new_shape.width() * new_shape.depth();
ker_spn[n * hwc + h * wc + w * new_shape.depth() + c] =
- buffer.ptr[tfl_n * hw * new_shape.count() + /* new_shape.count() is old c */
- h * new_shape.width() * new_shape.count() + w * new_shape.count() + tfl_c];
+ buffer.ptr[tfl_n * hw * new_shape.count() + /* new_shape.count() is old c */
+ h * new_shape.width() * new_shape.count() + w * new_shape.count() + tfl_c];
}
}
}
@@ -220,7 +220,7 @@ void DepthwiseConv2DGraphBuilder::build(const tflite::Operator *op,
// fused activation
coco::FeatureObject *act_output =
- build_activation(dconv_params->fused_activation_function(), blk, last_obj);
+ build_activation(dconv_params->fused_activation_function(), blk, last_obj);
// Create Copy Instr of last_obj to Output Object
auto copy_ins = instr_builder(m).copy(ofm_obj, act_output);
diff --git a/compiler/enco/frontend/tflite/src/Op/MaxPool2D.cpp b/compiler/enco/frontend/tflite/src/Op/MaxPool2D.cpp
index ee4406425..41e0cde17 100644
--- a/compiler/enco/frontend/tflite/src/Op/MaxPool2D.cpp
+++ b/compiler/enco/frontend/tflite/src/Op/MaxPool2D.cpp
@@ -99,7 +99,7 @@ void MaxPool2DGraphBuilder::build(const tflite::Operator *op, GraphBuilderContex
coco_maxpool2d->stride()->horizontal(params->stride_w());
coco::Padding2D padding =
- pool2D_padding(params, ifm_shape, params->filter_width(), params->filter_height());
+ pool2D_padding(params, ifm_shape, params->filter_width(), params->filter_height());
coco_maxpool2d->pad()->top(padding.top());
coco_maxpool2d->pad()->bottom(padding.bottom());
diff --git a/compiler/enco/test/basic/000/CMakeLists.txt b/compiler/enco/test/basic/000/CMakeLists.txt
index 20ba3c571..95e9cb0b0 100644
--- a/compiler/enco/test/basic/000/CMakeLists.txt
+++ b/compiler/enco/test/basic/000/CMakeLists.txt
@@ -11,7 +11,6 @@ set(LIB_TARGET ${PREFIX}-lib)
add_library(${PREFIX}-frontend SHARED enco.test.cpp)
target_link_libraries(${PREFIX}-frontend enco_intf_cmdline)
target_link_libraries(${PREFIX}-frontend enco_intf_frontend)
-target_link_libraries(${PREFIX}-frontend stdex)
# NOTE BYPRODUCTS are not specified in order to enforce source code generation
add_custom_command(OUTPUT ${GENERATED_CPP} ${GENERATED_ASM} ${GENERATED_BIN}
diff --git a/compiler/enco/test/basic/000/enco.test.cpp b/compiler/enco/test/basic/000/enco.test.cpp
index 3dbf96613..84c28d0f7 100644
--- a/compiler/enco/test/basic/000/enco.test.cpp
+++ b/compiler/enco/test/basic/000/enco.test.cpp
@@ -19,7 +19,7 @@
#include <nncc/core/ADT/tensor/LexicalLayout.h>
-#include <stdex/Memory.h>
+#include <memory>
using namespace nncc::core::ADT;
@@ -77,5 +77,5 @@ struct Frontend final : public enco::Frontend
extern "C" std::unique_ptr<enco::Frontend> make_frontend(const cmdline::View &cmdline)
{
- return stdex::make_unique<Frontend>();
+ return std::make_unique<Frontend>();
}
diff --git a/compiler/enco/test/binder.cpp b/compiler/enco/test/binder.cpp
index c8c72fc8b..f04cfa4f6 100644
--- a/compiler/enco/test/binder.cpp
+++ b/compiler/enco/test/binder.cpp
@@ -46,9 +46,9 @@ void Network_invoke(Network *net);
#include <nncc/core/ADT/tensor/LexicalLayout.h>
#include <nncc/core/ADT/tensor/Overlay.h>
-#include <stdex/Memory.h>
+#include <memory>
-using stdex::make_unique;
+using std::make_unique;
using namespace nncc::core::ADT;
namespace
diff --git a/compiler/enco/test/caffe/CMakeLists.txt b/compiler/enco/test/caffe/CMakeLists.txt
index ee49b6b28..d552d6ec8 100644
--- a/compiler/enco/test/caffe/CMakeLists.txt
+++ b/compiler/enco/test/caffe/CMakeLists.txt
@@ -123,7 +123,6 @@ foreach(PREFIX IN ITEMS ${CANDIDATES})
target_link_libraries(${BINDER_TARGET} nnkit_intf_backend)
target_link_libraries(${BINDER_TARGET} ann_api)
target_link_libraries(${BINDER_TARGET} ann_ref_static)
- target_link_libraries(${BINDER_TARGET} stdex)
set_target_properties(${BINDER_TARGET} PROPERTIES OUTPUT_NAME ${PREFIX})
list(APPEND TESTS ${PREFIX})
diff --git a/compiler/enco/test/tflite/CMakeLists.txt b/compiler/enco/test/tflite/CMakeLists.txt
index d5a96a6da..81d5ed2a2 100644
--- a/compiler/enco/test/tflite/CMakeLists.txt
+++ b/compiler/enco/test/tflite/CMakeLists.txt
@@ -90,7 +90,6 @@ foreach(PREFIX IN ITEMS ${CANDIDATES})
target_link_libraries(${BINDER_TARGET} nnkit_intf_backend)
target_link_libraries(${BINDER_TARGET} ann_api)
target_link_libraries(${BINDER_TARGET} ann_ref_static)
- target_link_libraries(${BINDER_TARGET} stdex)
set_target_properties(${BINDER_TARGET} PROPERTIES OUTPUT_NAME ${PREFIX})
list(APPEND TESTS ${PREFIX})
diff --git a/compiler/encodump/CMakeLists.txt b/compiler/encodump/CMakeLists.txt
index 58fe17a51..a4ad441b2 100644
--- a/compiler/encodump/CMakeLists.txt
+++ b/compiler/encodump/CMakeLists.txt
@@ -13,5 +13,4 @@ target_include_directories(encodump PRIVATE src)
target_link_libraries(encodump enco_intf_frontend)
target_link_libraries(encodump enco_core)
target_link_libraries(encodump safemain)
-target_link_libraries(encodump stdex)
target_link_libraries(encodump dl)
diff --git a/compiler/encodump/src/Driver.cpp b/compiler/encodump/src/Driver.cpp
index f27cbe904..2928d1d25 100644
--- a/compiler/encodump/src/Driver.cpp
+++ b/compiler/encodump/src/Driver.cpp
@@ -137,8 +137,7 @@ private:
} // namespace
-#include <stdex/Memory.h>
-
+#include <memory>
#include <map>
#include <iostream>
@@ -163,7 +162,7 @@ int entry(int argc, char **argv)
std::map<std::string, std::function<void(const std::string &arg)>> argparse;
argparse["--frontend"] = [&](const std::string &path) {
- frontend_zone = stdex::make_unique<FrontendZone>(path);
+ frontend_zone = std::make_unique<FrontendZone>(path);
};
argparse["--frontend-arg"] = [&](const std::string &arg) { frontend_zone->append(arg); };
diff --git a/compiler/exo/CMakeLists.txt b/compiler/exo/CMakeLists.txt
index 79c75ef2e..e686cbb83 100644
--- a/compiler/exo/CMakeLists.txt
+++ b/compiler/exo/CMakeLists.txt
@@ -39,7 +39,6 @@ target_include_directories(exo PRIVATE src)
target_link_libraries(exo PUBLIC exo_tflite_fbs)
target_link_libraries(exo PUBLIC exo_circle_fbs)
target_link_libraries(exo PUBLIC loco)
-target_link_libraries(exo PRIVATE stdex)
target_link_libraries(exo PRIVATE pepper_str)
target_link_libraries(exo PRIVATE pepper_strcast)
target_link_libraries(exo PRIVATE locoex_customop)
@@ -64,7 +63,6 @@ nnas_find_package(GTest REQUIRED)
GTest_AddTest(exo_test ${TESTS})
target_include_directories(exo_test PRIVATE src)
-target_link_libraries(exo_test stdex)
target_link_libraries(exo_test pepper_str)
target_link_libraries(exo_test exo)
target_link_libraries(exo_test hermes_std)
diff --git a/compiler/exo/requires.cmake b/compiler/exo/requires.cmake
index 6378b942d..3116c5757 100644
--- a/compiler/exo/requires.cmake
+++ b/compiler/exo/requires.cmake
@@ -1,4 +1,3 @@
-require("stdex")
require("loco")
require("locoex-customop")
require("logo")
diff --git a/compiler/exo/src/Circle/CircleExporter.cpp b/compiler/exo/src/Circle/CircleExporter.cpp
index 797749090..cfcb9a258 100644
--- a/compiler/exo/src/Circle/CircleExporter.cpp
+++ b/compiler/exo/src/Circle/CircleExporter.cpp
@@ -18,16 +18,15 @@
#include "CircleExporterImpl.h"
-#include <stdex/Memory.h>
-
#include <oops/InternalExn.h>
+#include <memory>
#include <fstream>
namespace exo
{
-CircleExporter::CircleExporter(loco::Graph *graph) : _impl(stdex::make_unique<Impl>(graph))
+CircleExporter::CircleExporter(loco::Graph *graph) : _impl(std::make_unique<Impl>(graph))
{
// NOTHING TO DO
}
diff --git a/compiler/exo/src/Circle/CircleExporterImpl.cpp b/compiler/exo/src/Circle/CircleExporterImpl.cpp
index 4cba33da1..a93931597 100644
--- a/compiler/exo/src/Circle/CircleExporterImpl.cpp
+++ b/compiler/exo/src/Circle/CircleExporterImpl.cpp
@@ -88,7 +88,7 @@ encodeOperatorCodes(FlatBufferBuilder &builder, std::unordered_map<OpCode, uint3
INTERNAL_EXN("Cannot find code for customop even though opcode is BuiltinOperator_CUSTOM");
operator_codes_vec[idx] =
- CreateOperatorCode(builder, it.first.opcode, builder.CreateString(custom_code->second));
+ CreateOperatorCode(builder, it.first.opcode, builder.CreateString(custom_code->second));
}
}
return builder.CreateVector(operator_codes_vec);
@@ -148,7 +148,7 @@ void CircleExporter::Impl::exportGraph(loco::Graph *graph)
// encode operator codes
auto operator_codes =
- encodeOperatorCodes(_builder, gd._operator_codes, gd._custom_operator_codes);
+ encodeOperatorCodes(_builder, gd._operator_codes, gd._custom_operator_codes);
// Subgraphs
Offset<SubGraph> subgraph = exportSubgraph(gd);
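Note: many hunks in this commit change no tokens at all; they only re-wrap long expressions so that continuation lines are indented by two spaces instead of four, and split one-line empty bodies (such as { /* DO NOTHING */}) across lines. A trivial illustration of the new continuation indent, using hypothetical names:

    #include <string>

    static std::string join(const std::string &a, const std::string &b)
    {
      return a + ", " + b;
    }

    int main()
    {
      // Before: a wrapped expression continued under a 4-space indent:
      //     auto text =
      //         join("operator_codes", "custom_operator_codes");
      // After this commit: a 2-space continuation indent.
      auto text =
        join("operator_codes", "custom_operator_codes");
      return text.empty() ? 1 : 0;
    }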
diff --git a/compiler/exo/src/Circle/CircleExporterUtils.cpp b/compiler/exo/src/Circle/CircleExporterUtils.cpp
index 12b204ce7..079f115f6 100644
--- a/compiler/exo/src/Circle/CircleExporterUtils.cpp
+++ b/compiler/exo/src/Circle/CircleExporterUtils.cpp
@@ -78,13 +78,13 @@ circle::Padding getOpPadding(const loco::Padding2D *pad, const loco::Stride<2> *
//
// NOTE input and output 'feature' map are shape of NHWC
bool same_padding_criterion_1 =
- (static_cast<uint32_t>(ofm._dims[1]) == (ifm._dims[1] - 1) / stride->vertical() + 1) &&
- (static_cast<uint32_t>(ofm._dims[2]) == (ifm._dims[2] - 1) / stride->horizontal() + 1);
+ (static_cast<uint32_t>(ofm._dims[1]) == (ifm._dims[1] - 1) / stride->vertical() + 1) &&
+ (static_cast<uint32_t>(ofm._dims[2]) == (ifm._dims[2] - 1) / stride->horizontal() + 1);
// For same padding, rear padding is same or bigger than front padding by at most 1
bool same_padding_criterion_2 =
- (pad->top() <= pad->bottom()) && (pad->bottom() <= pad->top() + 1) &&
- (pad->left() <= pad->right()) && (pad->right() <= pad->left() + 1);
+ (pad->top() <= pad->bottom()) && (pad->bottom() <= pad->top() + 1) &&
+ (pad->left() <= pad->right()) && (pad->right() <= pad->left() + 1);
if (same_padding_criterion_1 && same_padding_criterion_2)
return circle::Padding_SAME;
@@ -123,8 +123,7 @@ void registerGraphIOName(loco::Graph *graph, SerializedModelData &gd)
gd._data_format = circle::DataFormat::DataFormat_CHANNELS_LAST;
}
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace
@@ -150,7 +149,7 @@ private:
void set_tensor_index(loco::Node *node, const TFLTensorIndex &tensor_id)
{
assert(node->annot<TFLTensorIndexAnnotation>() == nullptr);
- node->annot(stdex::make_unique<TFLTensorIndexAnnotation>(tensor_id));
+ node->annot(std::make_unique<TFLTensorIndexAnnotation>(tensor_id));
}
TFLTensorIndex get_tensor_index(loco::Node *node)
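Note: the same_padding_criterion_1 and same_padding_criterion_2 checks reformatted above encode the usual SAME-padding rule: each output extent equals ceil(input / stride), computed in integer arithmetic as (input - 1) / stride + 1, and rear padding may exceed front padding by at most one. A standalone sketch of the check, reduced to a single spatial axis (function and parameter names are illustrative):

    #include <cstdint>

    // SAME padding along one axis. Assumes ifm >= 1 and stride >= 1.
    bool same_padding_1d(uint32_t ifm, uint32_t ofm, uint32_t stride, uint32_t front, uint32_t rear)
    {
      // ofm == ceil(ifm / stride), written with integer arithmetic
      bool criterion_1 = (ofm == (ifm - 1) / stride + 1);
      // rear padding equals front padding, or exceeds it by exactly one
      bool criterion_2 = (front <= rear) && (rear <= front + 1);
      return criterion_1 && criterion_2;
    }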
diff --git a/compiler/exo/src/Circle/CircleOperationExporter.cpp b/compiler/exo/src/Circle/CircleOperationExporter.cpp
index 390e2ec99..8b7337011 100644
--- a/compiler/exo/src/Circle/CircleOperationExporter.cpp
+++ b/compiler/exo/src/Circle/CircleOperationExporter.cpp
@@ -89,13 +89,19 @@ public:
void visit(loco::ReLU *) final;
void visit(loco::ReLU6 *) final;
void visit(loco::Tanh *) final;
- void visit(loco::Push *) final { /* DO NOTHING */}
- void visit(loco::Pull *) final { /* DO NOTHING */}
+ void visit(loco::Push *) final
+ { /* DO NOTHING */
+ }
+ void visit(loco::Pull *) final
+ { /* DO NOTHING */
+ }
void visit(loco::FeatureEncode *) final;
void visit(loco::FeatureDecode *) final;
void visit(loco::FilterEncode *) final;
void visit(loco::DepthwiseFilterEncode *) final;
- void visit(loco::ConstGen *) final { /* skip, everything is done in exportOpDefinedTensors */}
+ void visit(loco::ConstGen *) final
+ { /* skip, everything is done in exportOpDefinedTensors */
+ }
void visit(loco::MaxPool2D *) final;
void visit(loco::AvgPool2D *) final;
void visit(loco::Conv2D *) final;
@@ -235,7 +241,7 @@ void OperationExporter::visit(locoex::TFLFullyConnected *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
auto options =
- CreateFullyConnectedOptions(builder, to_circle_actfunc(node->fusedActivationFunction()));
+ CreateFullyConnectedOptions(builder, to_circle_actfunc(node->fusedActivationFunction()));
// Make FULLY_CONNECTED operator
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
@@ -375,8 +381,8 @@ void OperationExporter::visit(locoex::TFLTranspose *node)
auto options = CreateTransposeOptions(builder);
auto op_offset =
- CreateOperator(builder, op_idx, inputs, outputs,
- circle::BuiltinOptions::BuiltinOptions_TransposeOptions, options.Union());
+ CreateOperator(builder, op_idx, inputs, outputs,
+ circle::BuiltinOptions::BuiltinOptions_TransposeOptions, options.Union());
gd._operators.push_back(op_offset);
}
@@ -393,7 +399,7 @@ void OperationExporter::visit(locoex::TFLTransposeConv *node)
auto outputs = builder.CreateVector(outputs_vec);
circle::Padding padding = getOpPadding(node->padding());
auto options =
- CreateTransposeConvOptions(builder, padding, node->stride()->w(), node->stride()->h());
+ CreateTransposeConvOptions(builder, padding, node->stride()->w(), node->stride()->h());
// Make TRANSPOSE_CONV operator
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
@@ -405,7 +411,7 @@ template <class TFLPool2D>
void OperationExporter::export_pool_2d(TFLPool2D *node, circle::BuiltinOperator builtin_op)
{
EXO_ASSERT(builtin_op == circle::BuiltinOperator_MAX_POOL_2D ||
- builtin_op == circle::BuiltinOperator_AVERAGE_POOL_2D,
+ builtin_op == circle::BuiltinOperator_AVERAGE_POOL_2D,
"should be maxpool or avgpool");
EXO_ASSERT(node->padding() != locoex::Padding::UNDEFINED, "Padding is not set");
@@ -481,10 +487,10 @@ void OperationExporter::visit(loco::MaxPool2D *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
circle::Padding padding = getOpPadding(
- node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
- auto options = CreatePool2DOptions(builder, padding, node->stride()->horizontal(),
- node->stride()->vertical(), node->window()->horizontal(),
- node->window()->vertical());
+ node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
+ auto options =
+ CreatePool2DOptions(builder, padding, node->stride()->horizontal(), node->stride()->vertical(),
+ node->window()->horizontal(), node->window()->vertical());
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
circle::BuiltinOptions_Pool2DOptions, options.Union());
gd._operators.push_back(op_offset);
@@ -501,10 +507,10 @@ void OperationExporter::visit(loco::AvgPool2D *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
circle::Padding padding = getOpPadding(
- node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
- auto options = CreatePool2DOptions(builder, padding, node->stride()->horizontal(),
- node->stride()->vertical(), node->window()->horizontal(),
- node->window()->vertical());
+ node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
+ auto options =
+ CreatePool2DOptions(builder, padding, node->stride()->horizontal(), node->stride()->vertical(),
+ node->window()->horizontal(), node->window()->vertical());
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
circle::BuiltinOptions_Pool2DOptions, options.Union());
gd._operators.push_back(op_offset);
@@ -527,7 +533,7 @@ void OperationExporter::visit(loco::Conv2D *node)
std::vector<float> bias_vec_data(bias_vec_size); // initialized as zero vector
auto bias_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(bias_vec_data.data()), raw_bias_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(bias_vec_data.data()), raw_bias_vec_size);
auto bias_buffer_offset = CreateBuffer(builder, bias_vec_offset);
@@ -539,7 +545,7 @@ void OperationExporter::visit(loco::Conv2D *node)
auto name_offset = builder.CreateString("t_" + std::to_string(bias_tensor_id));
auto bias_tensor_offset =
- CreateTensor(builder, bias_vec_shape_offset, TensorType_FLOAT32, bias_buffer_id, name_offset);
+ CreateTensor(builder, bias_vec_shape_offset, TensorType_FLOAT32, bias_buffer_id, name_offset);
gd._tensors.push_back(bias_tensor_offset);
// Make input, output and options for operator
@@ -549,9 +555,9 @@ void OperationExporter::visit(loco::Conv2D *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
circle::Padding padding = getOpPadding(
- node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
- auto options = CreateConv2DOptions(builder, padding, node->stride()->horizontal(),
- node->stride()->vertical());
+ node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
+ auto options =
+ CreateConv2DOptions(builder, padding, node->stride()->horizontal(), node->stride()->vertical());
// Make CONV_2D operator
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
@@ -581,7 +587,7 @@ void OperationExporter::visit(loco::TransposedConv2D *node)
}
auto outshape_vec_offset = builder.CreateVector(
- reinterpret_cast<uint8_t *>(outshape_vec_data.data()), raw_outshape_vec_size);
+ reinterpret_cast<uint8_t *>(outshape_vec_data.data()), raw_outshape_vec_size);
auto outshape_buffer_offset = CreateBuffer(builder, outshape_vec_offset);
@@ -630,7 +636,7 @@ void OperationExporter::visit(loco::DepthwiseConv2D *node)
size_t raw_bias_vec_size = bias_vec_size * sizeof(int32_t);
std::vector<float> bias_vec_data(bias_vec_size);
auto bias_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(bias_vec_data.data()), raw_bias_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(bias_vec_data.data()), raw_bias_vec_size);
auto bias_buffer_offset = CreateBuffer(builder, bias_vec_offset);
@@ -642,7 +648,7 @@ void OperationExporter::visit(loco::DepthwiseConv2D *node)
auto name_offset = builder.CreateString("t_" + std::to_string(bias_tensor_id));
auto bias_tensor_offset =
- CreateTensor(builder, bias_vec_shape_offset, TensorType_FLOAT32, bias_buffer_id, name_offset);
+ CreateTensor(builder, bias_vec_shape_offset, TensorType_FLOAT32, bias_buffer_id, name_offset);
gd._tensors.push_back(bias_tensor_offset);
std::vector<int32_t> inputs_vec{get_tensor_index(node->ifm()), get_tensor_index(node->ker()),
@@ -651,13 +657,13 @@ void OperationExporter::visit(loco::DepthwiseConv2D *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
circle::Padding padding = getOpPadding(
- node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
+ node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
int32_t ifm_channel_size = ShapeInference::get(node->ifm())._dims[3];
// multiplier = bias_vec_size(output_size)/ifm_channel_size
auto options =
- CreateDepthwiseConv2DOptions(builder, padding, node->stride()->horizontal(),
- node->stride()->vertical(), bias_vec_size / ifm_channel_size);
+ CreateDepthwiseConv2DOptions(builder, padding, node->stride()->horizontal(),
+ node->stride()->vertical(), bias_vec_size / ifm_channel_size);
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
circle::BuiltinOptions_DepthwiseConv2DOptions, options.Union());
@@ -691,7 +697,7 @@ void OperationExporter::visit(loco::TensorReduce *node)
size_t raw_axes_vec_size = axes_vec_size * sizeof(int32_t);
auto axes_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(axes_vec.data()), raw_axes_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(axes_vec.data()), raw_axes_vec_size);
auto axes_buffer_offset = CreateBuffer(builder, axes_vec_offset);
@@ -703,7 +709,7 @@ void OperationExporter::visit(loco::TensorReduce *node)
auto name_offset = builder.CreateString("t_" + std::to_string(axes_tensor_id));
auto axes_tensor_offset =
- CreateTensor(builder, axes_vec_shape_offset, TensorType_INT32, axes_buffer_id, name_offset);
+ CreateTensor(builder, axes_vec_shape_offset, TensorType_INT32, axes_buffer_id, name_offset);
gd._tensors.push_back(axes_tensor_offset);
std::vector<int32_t> inputs_vec{get_tensor_index(node->input()), axes_tensor_id};
@@ -766,7 +772,7 @@ void exportAsTranspose(loco::Node *node, FlatBufferBuilder &builder,
constexpr size_t raw_perm_vec_size = perm_vec_size * sizeof(int32_t);
auto perm_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(perm_vec_data.data()), raw_perm_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(perm_vec_data.data()), raw_perm_vec_size);
auto perm_buffer_offset = CreateBuffer(builder, perm_vec_offset);
@@ -778,7 +784,7 @@ void exportAsTranspose(loco::Node *node, FlatBufferBuilder &builder,
auto name_offset = builder.CreateString("t_" + std::to_string(perm_tensor_id));
auto perm_tensor_offset =
- CreateTensor(builder, perm_vec_shape_offset, TensorType_INT32, perm_buffer_id, name_offset);
+ CreateTensor(builder, perm_vec_shape_offset, TensorType_INT32, perm_buffer_id, name_offset);
gd._tensors.push_back(perm_tensor_offset);
// Create permutation node
@@ -792,7 +798,7 @@ void exportAsTranspose(loco::Node *node, FlatBufferBuilder &builder,
constexpr auto options_type = circle::BuiltinOptions::BuiltinOptions_TransposeOptions;
auto transpose_offset =
- CreateOperator(builder, op_idx, inputs, outputs, options_type, options.Union());
+ CreateOperator(builder, op_idx, inputs, outputs, options_type, options.Union());
gd._operators.push_back(transpose_offset);
}
@@ -878,11 +884,11 @@ void exportAsReshape(loco::Node *node, FlatBufferBuilder &builder,
// but also by input.
auto input_shape_shape_vec_offset =
- builder.CreateVector(std::vector<int32_t>{(int32_t)new_shape_vec.size()});
+ builder.CreateVector(std::vector<int32_t>{(int32_t)new_shape_vec.size()});
size_t input_shape_vec_size = new_shape_vec.size() * sizeof(int32_t);
auto input_shape_input_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(new_shape_vec.data()), input_shape_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(new_shape_vec.data()), input_shape_vec_size);
auto input_shape_buffer_offset = CreateBuffer(builder, input_shape_input_vec_offset);
const auto input_shape_buffer_id = static_cast<uint32_t>(gd._buffers.size());
@@ -891,7 +897,7 @@ void exportAsReshape(loco::Node *node, FlatBufferBuilder &builder,
auto input_shape_tensor_id = static_cast<int32_t>(gd._tensors.size());
auto name_offset = builder.CreateString("t_" + std::to_string(input_shape_tensor_id));
auto input_shape_tensor_offset = CreateTensor(
- builder, input_shape_shape_vec_offset, TensorType_INT32, input_shape_buffer_id, name_offset);
+ builder, input_shape_shape_vec_offset, TensorType_INT32, input_shape_buffer_id, name_offset);
gd._tensors.push_back(input_shape_tensor_offset);
uint32_t op_idx = gd.registerBuiltinOpcode(circle::BuiltinOperator_RESHAPE);
@@ -1093,7 +1099,7 @@ void OperationExporter::visit(loco::TensorConstantPad *node)
auto padding_shape_vec_ptr = builder.CreateVector(std::vector<int32_t>{padding_vec_size, 2});
// create tensor
auto padding_tensor_ptr =
- CreateTensor(builder, padding_shape_vec_ptr, TensorType_INT32, padding_buffer_id);
+ CreateTensor(builder, padding_shape_vec_ptr, TensorType_INT32, padding_buffer_id);
// get tensor id
const auto padding_tensor_id = static_cast<int32_t>(gd._tensors.size());
diff --git a/compiler/exo/src/Circle/CircleTypeInference.cpp b/compiler/exo/src/Circle/CircleTypeInference.cpp
index a1e92b884..d3d01b4af 100644
--- a/compiler/exo/src/Circle/CircleTypeInference.cpp
+++ b/compiler/exo/src/Circle/CircleTypeInference.cpp
@@ -31,8 +31,6 @@
#include <oops/InternalExn.h>
-#include <stdex/Memory.h>
-
#include <stdexcept>
#include <type_traits>
diff --git a/compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp b/compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp
index e3884c3cc..aa2cad705 100644
--- a/compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp
+++ b/compiler/exo/src/Conversion/DepthwiseConv2DConverter.cpp
@@ -75,9 +75,9 @@ bool DepthwiseConv2DConverter::convert(loco::DepthwiseConv2D *origin)
reshape->tensor(filter_dec);
int32_t new_shape[4] = {
- 1, static_cast<int32_t>(filter_shape.height().value()),
- static_cast<int32_t>(filter_shape.width().value()),
- static_cast<int32_t>(filter_shape.depth().value() * filter_shape.multiplier().value())};
+ 1, static_cast<int32_t>(filter_shape.height().value()),
+ static_cast<int32_t>(filter_shape.width().value()),
+ static_cast<int32_t>(filter_shape.depth().value() * filter_shape.multiplier().value())};
locoex::set_new_shape(reshape, new_shape, 4);
tfl_dw_conv2d->filter(reshape);
diff --git a/compiler/exo/src/Convert.cpp b/compiler/exo/src/Convert.cpp
index 45f0481f4..3a578eee8 100644
--- a/compiler/exo/src/Convert.cpp
+++ b/compiler/exo/src/Convert.cpp
@@ -32,7 +32,7 @@
#include <logo/RemoveForwardNodePass.h>
#include <logo/Phase.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace exo
{
@@ -49,40 +49,40 @@ void convert_to_TFLNodes(loco::Graph *graph)
logo::Phase phase;
{
// prepare type and shape before conversion
- phase.emplace_back(stdex::make_unique<TypeInferencePass>());
- phase.emplace_back(stdex::make_unique<ShapeInferencePass>());
+ phase.emplace_back(std::make_unique<TypeInferencePass>());
+ phase.emplace_back(std::make_unique<ShapeInferencePass>());
// Add converters for canonical nodes. Note: Not all loco canonical nodes are listed.
- phase.emplace_back(stdex::make_unique<AvgPool2DConverter>());
- phase.emplace_back(stdex::make_unique<ConstGenConverter>());
- phase.emplace_back(stdex::make_unique<Conv2DConverter>());
- phase.emplace_back(stdex::make_unique<DepthwiseConv2DConverter>());
+ phase.emplace_back(std::make_unique<AvgPool2DConverter>());
+ phase.emplace_back(std::make_unique<ConstGenConverter>());
+ phase.emplace_back(std::make_unique<Conv2DConverter>());
+ phase.emplace_back(std::make_unique<DepthwiseConv2DConverter>());
// TODO loco::DepthwiseFilterEncode
- phase.emplace_back(stdex::make_unique<EltwiseAddConverter>());
- phase.emplace_back(stdex::make_unique<EltwiseDivConverter>());
- phase.emplace_back(stdex::make_unique<EltwiseMaxConverter>());
- phase.emplace_back(stdex::make_unique<EltwiseMulConverter>());
- phase.emplace_back(stdex::make_unique<EltwiseSqrtConverter>());
- phase.emplace_back(stdex::make_unique<EltwiseSubConverter>());
- phase.emplace_back(stdex::make_unique<FeatureBiasAddConverter>());
+ phase.emplace_back(std::make_unique<EltwiseAddConverter>());
+ phase.emplace_back(std::make_unique<EltwiseDivConverter>());
+ phase.emplace_back(std::make_unique<EltwiseMaxConverter>());
+ phase.emplace_back(std::make_unique<EltwiseMulConverter>());
+ phase.emplace_back(std::make_unique<EltwiseSqrtConverter>());
+ phase.emplace_back(std::make_unique<EltwiseSubConverter>());
+ phase.emplace_back(std::make_unique<FeatureBiasAddConverter>());
// TODO loco::FixedReshape
- phase.emplace_back(stdex::make_unique<MatMulConverter>());
- phase.emplace_back(stdex::make_unique<MaxPool2DConverter>());
- phase.emplace_back(stdex::make_unique<ReluConverter>());
- phase.emplace_back(stdex::make_unique<Relu6Converter>());
+ phase.emplace_back(std::make_unique<MatMulConverter>());
+ phase.emplace_back(std::make_unique<MaxPool2DConverter>());
+ phase.emplace_back(std::make_unique<ReluConverter>());
+ phase.emplace_back(std::make_unique<Relu6Converter>());
// TODO loco::Tanh
- phase.emplace_back(stdex::make_unique<TensorConcatConverter>());
+ phase.emplace_back(std::make_unique<TensorConcatConverter>());
// TODO loco::TensorBiasAdd
- phase.emplace_back(stdex::make_unique<TensorBroadcastConverter>());
- phase.emplace_back(stdex::make_unique<TensorReduceConverter>());
+ phase.emplace_back(std::make_unique<TensorBroadcastConverter>());
+ phase.emplace_back(std::make_unique<TensorReduceConverter>());
// TODO loco::TensorSoftmax
- phase.emplace_back(stdex::make_unique<TensorTransposeConverter>());
- phase.emplace_back(stdex::make_unique<TransposedConv2DConverter>());
+ phase.emplace_back(std::make_unique<TensorTransposeConverter>());
+ phase.emplace_back(std::make_unique<TransposedConv2DConverter>());
// Add optimization below
- phase.emplace_back(stdex::make_unique<logo::SimplifyDomainConversionPass>());
- phase.emplace_back(stdex::make_unique<logo::RemoveForwardNodePass>());
- phase.emplace_back(stdex::make_unique<logo::RemoveDeadNodePass>());
+ phase.emplace_back(std::make_unique<logo::SimplifyDomainConversionPass>());
+ phase.emplace_back(std::make_unique<logo::RemoveForwardNodePass>());
+ phase.emplace_back(std::make_unique<logo::RemoveDeadNodePass>());
}
logo::PhaseRunner<logo::PhaseStrategy::Restart> phase_runner{graph};
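Note: convert_to_TFLNodes builds a logo::Phase (an ordered collection of passes owned through std::unique_ptr) and hands it to a PhaseRunner with the Restart strategy, which re-runs the set until no pass reports a change. A compact sketch of that shape with a stand-in Pass interface, since logo's headers are not reproduced here:

    #include <memory>
    #include <vector>

    struct Pass
    {
      virtual ~Pass() = default;
      virtual bool run() = 0; // true if the pass changed the graph
    };

    struct TypeInferencePass final : Pass
    {
      bool run() override { return false; }
    };

    int main()
    {
      // Passes are registered exactly as above: emplace_back + make_unique
      std::vector<std::unique_ptr<Pass>> phase;
      phase.emplace_back(std::make_unique<TypeInferencePass>());

      // "Restart"-style driver: loop until a full sweep makes no change
      bool changed = true;
      while (changed)
      {
        changed = false;
        for (auto &p : phase)
          changed = p->run() || changed;
      }
      return 0;
    }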
diff --git a/compiler/exo/src/Dialect/IR/CircleNodes.h b/compiler/exo/src/Dialect/IR/CircleNodes.h
index 7be093103..c93bd1ab0 100644
--- a/compiler/exo/src/Dialect/IR/CircleNodes.h
+++ b/compiler/exo/src/Dialect/IR/CircleNodes.h
@@ -53,8 +53,8 @@ private:
* @brief INSTANCE_NORM in circle
*/
class CircleInstanceNorm final
- : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::INSTANCE_NORM>>,
- public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
+ : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::INSTANCE_NORM>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
/// @note Currently only support FLOAT32 as input node
diff --git a/compiler/exo/src/Dialect/IR/TFLNodes.h b/compiler/exo/src/Dialect/IR/TFLNodes.h
index 41a11e7c0..1642eb1f4 100644
--- a/compiler/exo/src/Dialect/IR/TFLNodes.h
+++ b/compiler/exo/src/Dialect/IR/TFLNodes.h
@@ -129,7 +129,9 @@ class TFLAveragePool2D final : public FixedArityNode<1, TFLNodeImpl<TFLOpcode::A
public TFLNodeMixin<TFLNodeTrait::FusedActFunc>
{
public:
- TFLAveragePool2D() : _padding(Padding::UNDEFINED) { /* empty */}
+ TFLAveragePool2D() : _padding(Padding::UNDEFINED)
+ { /* empty */
+ }
public:
loco::Node *value(void) const { return at(0)->node(); }
@@ -240,9 +242,9 @@ private:
* @brief DEPTHWISE_CONV_2D in TensorFlow Lite
*/
class TFLDepthwiseConv2D final
- : public FixedArityNode<3, TFLNodeImpl<TFLOpcode::DEPTHWISE_CONV_2D>>,
- public TFLNodeMixin<TFLNodeTrait::FusedActFunc>,
- public TFLNodeMixin<TFLNodeTrait::Bias>
+ : public FixedArityNode<3, TFLNodeImpl<TFLOpcode::DEPTHWISE_CONV_2D>>,
+ public TFLNodeMixin<TFLNodeTrait::FusedActFunc>,
+ public TFLNodeMixin<TFLNodeTrait::Bias>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
@@ -325,7 +327,9 @@ class TFLMaxPool2D final : public FixedArityNode<1, TFLNodeImpl<TFLOpcode::MAX_P
public TFLNodeMixin<TFLNodeTrait::FusedActFunc>
{
public:
- TFLMaxPool2D() : _padding(Padding::UNDEFINED) { /* empty */}
+ TFLMaxPool2D() : _padding(Padding::UNDEFINED)
+ { /* empty */
+ }
public:
loco::Node *value(void) const { return at(0)->node(); }
@@ -463,7 +467,7 @@ public:
};
class TFLSquaredDifference final
- : public FixedArityNode<2, TFLNodeImpl<TFLOpcode::SQUARED_DIFFERENCE>>
+ : public FixedArityNode<2, TFLNodeImpl<TFLOpcode::SQUARED_DIFFERENCE>>
{
public:
TFLSquaredDifference() = default;
diff --git a/compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.test.cpp b/compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.test.cpp
index b68728b47..5a7e71dcf 100644
--- a/compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.test.cpp
+++ b/compiler/exo/src/Dialect/Service/TFLShapeInferenceRule.test.cpp
@@ -26,8 +26,6 @@
#include <loco/Service/CanonicalShapeInferenceRule.h>
#include <loco/Service/MultiDialectShapeInferenceRule.h>
-#include <stdex/Memory.h>
-
#include <gtest/gtest.h>
TEST(TFLShapeInferenceRuleTest, minimal_with_TFLRelu)
@@ -53,7 +51,7 @@ TEST(TFLShapeInferenceRuleTest, minimal_with_TFLRelu)
loco::MultiDialectShapeInferenceRule rules;
rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(locoex::TFLDialect::get(), &tfl_rule);
+ .bind(locoex::TFLDialect::get(), &tfl_rule);
loco::apply(&rules).to(graph.g.get());
@@ -98,7 +96,7 @@ TEST(TFLShapeInferenceRuleTest, avgpool2d_valid)
loco::MultiDialectShapeInferenceRule rules;
rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(locoex::TFLDialect::get(), &tfl_rule);
+ .bind(locoex::TFLDialect::get(), &tfl_rule);
loco::apply(&rules).to(graph.g.get());
@@ -145,7 +143,7 @@ TEST(TFLShapeInferenceRuleTest, avgpool2d_same)
loco::MultiDialectShapeInferenceRule rules;
rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(locoex::TFLDialect::get(), &tfl_rule);
+ .bind(locoex::TFLDialect::get(), &tfl_rule);
loco::apply(&rules).to(graph.g.get());
diff --git a/compiler/exo/src/Dialect/Service/TFLTypeInferenceRule.test.cpp b/compiler/exo/src/Dialect/Service/TFLTypeInferenceRule.test.cpp
index 9326e5e58..df7aee49c 100644
--- a/compiler/exo/src/Dialect/Service/TFLTypeInferenceRule.test.cpp
+++ b/compiler/exo/src/Dialect/Service/TFLTypeInferenceRule.test.cpp
@@ -24,8 +24,6 @@
#include <loco/IR/CanonicalDialect.h>
#include <loco/Service/TypeInference.h>
-#include <stdex/Memory.h>
-
#include <gtest/gtest.h>
TEST(TFLTypeInferenceRuleTest, minimal_with_TFLRelu)
diff --git a/compiler/exo/src/ExoFormattedGraph.h b/compiler/exo/src/ExoFormattedGraph.h
index 714e483b5..ec4173329 100644
--- a/compiler/exo/src/ExoFormattedGraph.h
+++ b/compiler/exo/src/ExoFormattedGraph.h
@@ -19,7 +19,7 @@
#include <locop/FormattedGraph.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace exo
{
@@ -47,7 +47,7 @@ public:
public:
std::unique_ptr<locop::NodeSummaryBuilder> create(const locop::SymbolTable *tlb) const final
{
- return stdex::make_unique<NodeSummaryBuilder>(tlb);
+ return std::make_unique<NodeSummaryBuilder>(tlb);
}
};
diff --git a/compiler/exo/src/ExoOptimize.cpp b/compiler/exo/src/ExoOptimize.cpp
index d7278e900..752693f38 100644
--- a/compiler/exo/src/ExoOptimize.cpp
+++ b/compiler/exo/src/ExoOptimize.cpp
@@ -22,7 +22,7 @@
#include <logo/Phase.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace exo
{
@@ -32,36 +32,36 @@ void optimize(loco::Graph *g)
logo::Phase phase;
{
// prepare type and shape before optimization
- phase.emplace_back(stdex::make_unique<TypeInferencePass>());
- phase.emplace_back(stdex::make_unique<ShapeInferencePass>());
+ phase.emplace_back(std::make_unique<TypeInferencePass>());
+ phase.emplace_back(std::make_unique<ShapeInferencePass>());
- phase.emplace_back(stdex::make_unique<FoldReshapeOfConstPass>());
- phase.emplace_back(stdex::make_unique<FoldTransposeOfConstPass>());
+ phase.emplace_back(std::make_unique<FoldReshapeOfConstPass>());
+ phase.emplace_back(std::make_unique<FoldTransposeOfConstPass>());
if (get<Knob::UseFuseBiasAddPass>())
{
- phase.emplace_back(stdex::make_unique<FuseBiasAddPass>());
+ phase.emplace_back(std::make_unique<FuseBiasAddPass>());
}
if (get<Knob::UseFuseInstanceNormPass>())
{
- phase.emplace_back(stdex::make_unique<FuseInstanceNormPass>());
+ phase.emplace_back(std::make_unique<FuseInstanceNormPass>());
}
if (get<Knob::UseFuseReluPass>())
{
- phase.emplace_back(stdex::make_unique<FuseReluPass>());
+ phase.emplace_back(std::make_unique<FuseReluPass>());
}
- phase.emplace_back(stdex::make_unique<FuseRsqrtPass>());
+ phase.emplace_back(std::make_unique<FuseRsqrtPass>());
if (get<Knob::UseFuseSquaredDifferencePass>())
{
- phase.emplace_back(stdex::make_unique<FuseSquaredDifferencePass>());
+ phase.emplace_back(std::make_unique<FuseSquaredDifferencePass>());
}
- phase.emplace_back(stdex::make_unique<MergeConcatNodesPass>());
+ phase.emplace_back(std::make_unique<MergeConcatNodesPass>());
- phase.emplace_back(stdex::make_unique<logo::RemoveDeadNodePass>());
+ phase.emplace_back(std::make_unique<logo::RemoveDeadNodePass>());
}
logo::PhaseRunner<logo::PhaseStrategy::Restart> phase_runner{g};
diff --git a/compiler/exo/src/GraphBlock.cpp b/compiler/exo/src/GraphBlock.cpp
index 0a45ce8ad..b26f2e8b6 100644
--- a/compiler/exo/src/GraphBlock.cpp
+++ b/compiler/exo/src/GraphBlock.cpp
@@ -19,7 +19,7 @@
#include "Check.h"
#include <loco.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace
{
@@ -114,7 +114,7 @@ template <FeatureLayout T> loco::FeatureEncode *make_feature_encode(loco::Node *
EXO_ASSERT(input_for_encode != nullptr, "input should not be nullptr");
loco::Graph *g = input_for_encode->graph();
- auto encoder = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Feature>>();
+ auto encoder = std::make_unique<loco::PermutingEncoder<loco::Domain::Feature>>();
encoder->perm(perm<T>());
@@ -130,7 +130,7 @@ template <FeatureLayout T> loco::FeatureDecode *make_feature_decode(loco::Node *
EXO_ASSERT(input_for_decode != nullptr, "input should not be nullptr");
loco::Graph *g = input_for_decode->graph();
- auto decoder = stdex::make_unique<loco::PermutingDecoder<loco::Domain::Feature>>();
+ auto decoder = std::make_unique<loco::PermutingDecoder<loco::Domain::Feature>>();
decoder->perm(perm<T>());
@@ -146,7 +146,7 @@ template <FilterLayout T> loco::FilterEncode *make_filter_encode(loco::Node *inp
EXO_ASSERT(input_for_encode != nullptr, "filter should not be nullptr");
loco::Graph *g = input_for_encode->graph();
- auto encoder = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
+ auto encoder = std::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
encoder->perm(perm<T>());
@@ -162,7 +162,7 @@ template <FilterLayout T> loco::FilterDecode *make_filter_decode(loco::Node *inp
EXO_ASSERT(input_for_decode != nullptr, "filter should not be nullptr");
loco::Graph *g = input_for_decode->graph();
- auto decoder = stdex::make_unique<loco::PermutingDecoder<loco::Domain::Filter>>();
+ auto decoder = std::make_unique<loco::PermutingDecoder<loco::Domain::Filter>>();
decoder->perm(perm<T>());
@@ -179,7 +179,7 @@ loco::DepthwiseFilterDecode *make_dw_filter_decode(loco::Node *input_for_decode)
EXO_ASSERT(input_for_decode != nullptr, "filter should not be nullptr");
loco::Graph *g = input_for_decode->graph();
- auto decoder = stdex::make_unique<loco::PermutingDecoder<loco::Domain::DepthwiseFilter>>();
+ auto decoder = std::make_unique<loco::PermutingDecoder<loco::Domain::DepthwiseFilter>>();
decoder->perm(perm<T>());
@@ -195,7 +195,7 @@ template <MatrixLayout T> loco::MatrixEncode *make_matrix_encode(loco::Node *inp
EXO_ASSERT(input_for_encode != nullptr, "input should not be nullptr");
loco::Graph *g = input_for_encode->graph();
- auto encoder = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Matrix>>();
+ auto encoder = std::make_unique<loco::PermutingEncoder<loco::Domain::Matrix>>();
encoder->perm(perm<T>());
@@ -211,7 +211,7 @@ template <MatrixLayout T> loco::MatrixDecode *make_matrix_decode(loco::Node *inp
EXO_ASSERT(input_for_decode != nullptr, "input should not be nullptr");
loco::Graph *g = input_for_decode->graph();
- auto decoder = stdex::make_unique<loco::PermutingDecoder<loco::Domain::Matrix>>();
+ auto decoder = std::make_unique<loco::PermutingDecoder<loco::Domain::Matrix>>();
decoder->perm(perm<T>());
diff --git a/compiler/exo/src/GraphBlock.h b/compiler/exo/src/GraphBlock.h
index b771c821b..96e4b0831 100644
--- a/compiler/exo/src/GraphBlock.h
+++ b/compiler/exo/src/GraphBlock.h
@@ -72,7 +72,7 @@ template <MatrixLayout T> loco::MatrixEncode *make_matrix_encode(loco::Node *inp
/// @brief Create a loco::MatrixDecode of given layout
template <MatrixLayout T> loco::MatrixDecode *make_matrix_decode(loco::Node *input_for_decode);
-} // exo
+} // namespace exo
//
// DomainConverter
diff --git a/compiler/exo/src/Log.cpp b/compiler/exo/src/Log.cpp
index aa762968b..cbe9ecb73 100644
--- a/compiler/exo/src/Log.cpp
+++ b/compiler/exo/src/Log.cpp
@@ -17,7 +17,6 @@
#include "Log.h"
#include <hermes/ConsoleReporter.h>
-#include <stdex/Memory.h>
#include <cstdlib>
#include <iostream>
diff --git a/compiler/exo/src/LogHelper.cpp b/compiler/exo/src/LogHelper.cpp
index 7520b7ec8..153356632 100644
--- a/compiler/exo/src/LogHelper.cpp
+++ b/compiler/exo/src/LogHelper.cpp
@@ -72,7 +72,7 @@ namespace exo
FormattedGraph fmt(loco::Graph *g)
{
- auto node_summary_builder = stdex::make_unique<NodeSummaryBuilderFactory>();
+ auto node_summary_builder = std::make_unique<NodeSummaryBuilderFactory>();
return std::move(locop::fmt<locop::LinearV1>(g).with(std::move(node_summary_builder)));
}
diff --git a/compiler/exo/src/LoggingContext.cpp b/compiler/exo/src/LoggingContext.cpp
index 1c14d97b9..120a50e7b 100644
--- a/compiler/exo/src/LoggingContext.cpp
+++ b/compiler/exo/src/LoggingContext.cpp
@@ -18,7 +18,8 @@
#include "Log.h" // To use LoggerConfig
#include <hermes/ConsoleReporter.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace exo
{
@@ -30,11 +31,11 @@ hermes::Context *LoggingContext::get(void)
if (ctx == nullptr)
{
ctx = new hermes::Context;
- ctx->sinks()->append(stdex::make_unique<hermes::ConsoleReporter>());
- ctx->config(stdex::make_unique<LoggerConfig>());
+ ctx->sinks()->append(std::make_unique<hermes::ConsoleReporter>());
+ ctx->config(std::make_unique<LoggerConfig>());
}
return ctx;
}
-} // namespac exo
+} // namespace exo
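Note: LoggingContext::get lazily constructs a single process-wide hermes::Context on first call, then attaches a console sink and a logger config through std::make_unique. A sketch of the same lazy-singleton shape, with placeholder types standing in for the hermes classes:

    #include <memory>
    #include <vector>

    struct Sink
    {
      virtual ~Sink() = default;
    };

    struct ConsoleReporter final : Sink
    {
    };

    struct Context
    {
      std::vector<std::unique_ptr<Sink>> sinks;
    };

    Context *logging_context(void)
    {
      static Context *ctx = nullptr; // built once, then reused
      if (ctx == nullptr)
      {
        ctx = new Context;
        ctx->sinks.push_back(std::make_unique<ConsoleReporter>());
      }
      return ctx;
    }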
diff --git a/compiler/exo/src/Pass/FoldTransposeOfConstPass.cpp b/compiler/exo/src/Pass/FoldTransposeOfConstPass.cpp
index 005c42944..66c99121e 100644
--- a/compiler/exo/src/Pass/FoldTransposeOfConstPass.cpp
+++ b/compiler/exo/src/Pass/FoldTransposeOfConstPass.cpp
@@ -124,7 +124,7 @@ void fold_transpose_of_const(locoex::TFLTranspose *transpose)
index_orig.at(perm->at<S32>(axis)) = index_new.at(axis);
const_new->at<FLOAT32>(l.offset(shape_new, index_new)) =
- const_orig->at<FLOAT32>(l.offset(shape_orig, index_orig));
+ const_orig->at<FLOAT32>(l.offset(shape_orig, index_orig));
}
// replace
diff --git a/compiler/exo/src/Pass/FuseBiasAddPass.cpp b/compiler/exo/src/Pass/FuseBiasAddPass.cpp
index 6338dff5d..0e797dc80 100644
--- a/compiler/exo/src/Pass/FuseBiasAddPass.cpp
+++ b/compiler/exo/src/Pass/FuseBiasAddPass.cpp
@@ -136,7 +136,7 @@ public:
Fuser(LatterT *latter)
{
static_assert(std::is_same<LatterT, locoex::TFLAdd>::value ||
- std::is_same<LatterT, locoex::TFLSub>::value,
+ std::is_same<LatterT, locoex::TFLSub>::value,
"wrong template type");
_latter = latter;
@@ -185,7 +185,7 @@ template <class LatterT> locoex::TFLConst *Fuser<LatterT>::create_fused_bias_con
for (uint32_t x = 0; x < bias->dim(0).value(); x++)
new_bias->at<loco::DataType::FLOAT32>(x) = calc<LatterT>(
- bias->at<loco::DataType::FLOAT32>(x), _const_node->at<loco::DataType::FLOAT32>(x));
+ bias->at<loco::DataType::FLOAT32>(x), _const_node->at<loco::DataType::FLOAT32>(x));
}
return new_bias;
@@ -252,14 +252,14 @@ struct Collector final : public locoex::TFLNodeMutableVisitor<void>
void setCandidate(FormerT *former, LatterT *latter, locoex::TFLConst *const_node)
{
static_assert(std::is_same<LatterT, locoex::TFLAdd>::value ||
- std::is_same<LatterT, locoex::TFLSub>::value,
+ std::is_same<LatterT, locoex::TFLSub>::value,
"wrong template type");
if (!check_act_func(former))
return;
auto depth =
- loco::shape_get(as_loco_node(former)).template as<loco::TensorShape>().dim(3).value();
+ loco::shape_get(as_loco_node(former)).template as<loco::TensorShape>().dim(3).value();
auto const_shape = loco::shape_get(const_node).template as<loco::TensorShape>();
if (const_shape.rank() == 1 and const_shape.dim(0) == depth)
diff --git a/compiler/exo/src/Pass/FuseInstanceNormPass.cpp b/compiler/exo/src/Pass/FuseInstanceNormPass.cpp
index 04d4a62cd..40aa9144f 100644
--- a/compiler/exo/src/Pass/FuseInstanceNormPass.cpp
+++ b/compiler/exo/src/Pass/FuseInstanceNormPass.cpp
@@ -291,7 +291,7 @@ bool InstanceNormPattern::matched()
CHECK_OR_FALSE(add_as_variance);
CHECK_OR_FALSE(
- fill(&mean_as_variance, &const_as_epsilon).with_commutative_args_of(add_as_variance));
+ fill(&mean_as_variance, &const_as_epsilon).with_commutative_args_of(add_as_variance));
CHECK_OR_FALSE(const_as_epsilon->dtype() == loco::DataType::FLOAT32);
// TODO Support regarding broadcast
@@ -317,7 +317,7 @@ bool InstanceNormPattern::matched()
locoex::TFLMul *mul_gamma_should_be = nullptr;
locoex::TFLMean *mean_of_ifm_should_be = nullptr;
CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_ifm_should_be)
- .with_commutative_args_of(mul_as_scaled_mean));
+ .with_commutative_args_of(mul_as_scaled_mean));
CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be);
CHECK_OR_FALSE(mean_of_ifm == mean_of_ifm_should_be);
#undef CHECK_OR_FALSE
diff --git a/compiler/exo/src/Pass/FuseReluPass.test.cpp b/compiler/exo/src/Pass/FuseReluPass.test.cpp
index 6f83d4dd0..fd6f88d9c 100644
--- a/compiler/exo/src/Pass/FuseReluPass.test.cpp
+++ b/compiler/exo/src/Pass/FuseReluPass.test.cpp
@@ -73,8 +73,8 @@ template <class FusedTFLType, locoex::FusedActFunc FusedActFunc> void test()
{
static_assert((std::is_same<FusedTFLType, locoex::TFLRelu>::value &&
FusedActFunc == locoex::FusedActFunc::RELU) ||
- (std::is_same<FusedTFLType, locoex::TFLRelu6>::value &&
- FusedActFunc == locoex::FusedActFunc::RELU6),
+ (std::is_same<FusedTFLType, locoex::TFLRelu6>::value &&
+ FusedActFunc == locoex::FusedActFunc::RELU6),
"wrong template type");
exo::test::TestGraph g;
diff --git a/compiler/exo/src/Pass/MergeConcatNodesPass.cpp b/compiler/exo/src/Pass/MergeConcatNodesPass.cpp
index 8945fcfce..5885332a6 100644
--- a/compiler/exo/src/Pass/MergeConcatNodesPass.cpp
+++ b/compiler/exo/src/Pass/MergeConcatNodesPass.cpp
@@ -39,8 +39,8 @@ bool canMerge(locoex::TFLConcatenation *node1, locoex::TFLConcatenation *node2)
case locoex::FusedActFunc::RELU6:
return true;
- // case locoex::FusedActFunc::TANH:
- // return false;
+ // case locoex::FusedActFunc::TANH:
+ // return false;
default:
INTERNAL_EXN_V("Unknown FusedActFunc", oops::to_uint32(node1->fusedActivationFunction()));
diff --git a/compiler/exo/src/Pass/ShapeInferencePass.cpp b/compiler/exo/src/Pass/ShapeInferencePass.cpp
index bc60f91c4..367d7da91 100644
--- a/compiler/exo/src/Pass/ShapeInferencePass.cpp
+++ b/compiler/exo/src/Pass/ShapeInferencePass.cpp
@@ -49,9 +49,9 @@ bool ShapeInferencePass::run(loco::Graph *g)
loco::MultiDialectShapeInferenceRule rules;
rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(locoex::TFLDialect::get(), &tfl_rule)
- .bind(locoex::CircleDialect::get(), &circle_rule)
- .bind(locoex::COpDialect::get(), &cop_rule);
+ .bind(locoex::TFLDialect::get(), &tfl_rule)
+ .bind(locoex::CircleDialect::get(), &circle_rule)
+ .bind(locoex::COpDialect::get(), &cop_rule);
return loco::apply(&rules).to(g);
}
diff --git a/compiler/exo/src/Pass/TypeInferencePass.cpp b/compiler/exo/src/Pass/TypeInferencePass.cpp
index 31d4f13b6..52a9d0c33 100644
--- a/compiler/exo/src/Pass/TypeInferencePass.cpp
+++ b/compiler/exo/src/Pass/TypeInferencePass.cpp
@@ -47,9 +47,9 @@ bool TypeInferencePass::run(loco::Graph *g)
loco::MultiDialectTypeInferenceRule rules;
rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(locoex::TFLDialect::get(), &tfl_rule)
- .bind(locoex::CircleDialect::get(), &circle_rule)
- .bind(locoex::COpDialect::get(), &cop_rule);
+ .bind(locoex::TFLDialect::get(), &tfl_rule)
+ .bind(locoex::CircleDialect::get(), &circle_rule)
+ .bind(locoex::COpDialect::get(), &cop_rule);
return loco::apply(&rules).to(g);
}
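Note: both inference passes bind one rule per dialect on a multi-dialect registry; bind returns the registry by reference, which is what allows the chained .bind(...).bind(...) form re-indented above. A generic sketch of that fluent interface (types and names here are placeholders, not loco's API):

    #include <map>

    struct Rule
    {
    };

    class MultiDialectRules
    {
    public:
      MultiDialectRules &bind(int dialect_id, const Rule *rule)
      {
        _rules[dialect_id] = rule;
        return *this; // returning *this enables chained bind() calls
      }

    private:
      std::map<int, const Rule *> _rules;
    };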
diff --git a/compiler/exo/src/ProgressReporter.h b/compiler/exo/src/ProgressReporter.h
index b0f420df9..83f327309 100644
--- a/compiler/exo/src/ProgressReporter.h
+++ b/compiler/exo/src/ProgressReporter.h
@@ -28,7 +28,7 @@ class ProgressReporter : public logo::PhaseEventListener
{
public:
ProgressReporter(loco::Graph *graph, logo::PhaseStrategy strategy)
- : _graph{graph}, _strategy{strategy}
+ : _graph{graph}, _strategy{strategy}
{
// DO NOTHING
}
diff --git a/compiler/exo/src/TFLite/TFLExporter.cpp b/compiler/exo/src/TFLite/TFLExporter.cpp
index cf002b3e1..71131b725 100644
--- a/compiler/exo/src/TFLite/TFLExporter.cpp
+++ b/compiler/exo/src/TFLite/TFLExporter.cpp
@@ -18,16 +18,15 @@
#include "TFLExporterImpl.h"
-#include <stdex/Memory.h>
-
#include <oops/InternalExn.h>
+#include <memory>
#include <fstream>
namespace exo
{
-TFLExporter::TFLExporter(loco::Graph *graph) : _impl(stdex::make_unique<Impl>(graph))
+TFLExporter::TFLExporter(loco::Graph *graph) : _impl(std::make_unique<Impl>(graph))
{
// NOTHING TO DO
}
diff --git a/compiler/exo/src/TFLite/TFLExporterImpl.cpp b/compiler/exo/src/TFLite/TFLExporterImpl.cpp
index 07adbfb9d..1f6d1bd59 100644
--- a/compiler/exo/src/TFLite/TFLExporterImpl.cpp
+++ b/compiler/exo/src/TFLite/TFLExporterImpl.cpp
@@ -88,7 +88,7 @@ encodeOperatorCodes(FlatBufferBuilder &builder, std::unordered_map<OpCode, uint3
INTERNAL_EXN("Cannot find code for custom op");
operator_codes_vec[idx] =
- CreateOperatorCode(builder, it.first.opcode, builder.CreateString(custom_code->second));
+ CreateOperatorCode(builder, it.first.opcode, builder.CreateString(custom_code->second));
}
}
return builder.CreateVector(operator_codes_vec);
@@ -146,7 +146,7 @@ void TFLExporter::Impl::exportGraph(loco::Graph *graph)
// encode operator codes
auto operator_codes =
- encodeOperatorCodes(_builder, gd._operator_codes, gd._custom_operator_codes);
+ encodeOperatorCodes(_builder, gd._operator_codes, gd._custom_operator_codes);
// Subgraphs
Offset<SubGraph> subgraph = exportSubgraph(gd);
diff --git a/compiler/exo/src/TFLite/TFLExporterImpl.test.cpp b/compiler/exo/src/TFLite/TFLExporterImpl.test.cpp
index 866ede6a2..c337b38d3 100644
--- a/compiler/exo/src/TFLite/TFLExporterImpl.test.cpp
+++ b/compiler/exo/src/TFLite/TFLExporterImpl.test.cpp
@@ -23,7 +23,8 @@
#include "Knob.h"
#include <loco/IR/PermutingCodec.h>
-#include <stdex/Memory.h>
+
+#include <memory>
#include <gtest/gtest.h>
@@ -56,7 +57,7 @@ template <> loco::FeatureEncode *TFLExporterImplTests::make_node(void)
{
loco::FeatureEncode *encode_layer = graph()->nodes()->create<loco::FeatureEncode>();
- auto encoder = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Feature>>();
+ auto encoder = std::make_unique<loco::PermutingEncoder<loco::Domain::Feature>>();
(*encoder->perm())[loco::FeatureAxis::Count] = 0;
(*encoder->perm())[loco::FeatureAxis::Depth] = 1;
(*encoder->perm())[loco::FeatureAxis::Height] = 2;
@@ -70,7 +71,7 @@ template <> loco::FeatureDecode *TFLExporterImplTests::make_node(void)
{
loco::FeatureDecode *decode_layer = graph()->nodes()->create<loco::FeatureDecode>();
- auto decoder = stdex::make_unique<loco::PermutingDecoder<loco::Domain::Feature>>();
+ auto decoder = std::make_unique<loco::PermutingDecoder<loco::Domain::Feature>>();
(*decoder->perm())[loco::FeatureAxis::Count] = 0;
(*decoder->perm())[loco::FeatureAxis::Depth] = 1;
(*decoder->perm())[loco::FeatureAxis::Height] = 2;
@@ -227,7 +228,7 @@ TEST(TFLExporterImplTest, Transpose_simple)
auto bufs = (model->buffers());
auto *perm_buf =
- reinterpret_cast<const int32_t *>(bufs->Get(perm_tensor->buffer())->data()->data());
+ reinterpret_cast<const int32_t *>(bufs->Get(perm_tensor->buffer())->data()->data());
ASSERT_EQ(1, perm_buf[0]);
ASSERT_EQ(2, perm_buf[1]);
@@ -285,7 +286,7 @@ TEST(TFLExporterImplTest, Transpose_from_FilterEncode_FilterDecode)
auto bufs = (model->buffers());
auto *perm_buf =
- reinterpret_cast<const int32_t *>(bufs->Get(perm_tensor->buffer())->data()->data());
+ reinterpret_cast<const int32_t *>(bufs->Get(perm_tensor->buffer())->data()->data());
ASSERT_EQ(3, perm_buf[0]);
ASSERT_EQ(0, perm_buf[1]);
ASSERT_EQ(1, perm_buf[2]);
diff --git a/compiler/exo/src/TFLite/TFLExporterUtils.cpp b/compiler/exo/src/TFLite/TFLExporterUtils.cpp
index d35afc9aa..daec03c40 100644
--- a/compiler/exo/src/TFLite/TFLExporterUtils.cpp
+++ b/compiler/exo/src/TFLite/TFLExporterUtils.cpp
@@ -78,13 +78,13 @@ tflite::Padding getOpPadding(const loco::Padding2D *pad, const loco::Stride<2> *
//
// NOTE input and output 'feature' map are shape of NHWC
bool same_padding_criterion_1 =
- (static_cast<uint32_t>(ofm._dims[1]) == (ifm._dims[1] - 1) / stride->vertical() + 1) &&
- (static_cast<uint32_t>(ofm._dims[2]) == (ifm._dims[2] - 1) / stride->horizontal() + 1);
+ (static_cast<uint32_t>(ofm._dims[1]) == (ifm._dims[1] - 1) / stride->vertical() + 1) &&
+ (static_cast<uint32_t>(ofm._dims[2]) == (ifm._dims[2] - 1) / stride->horizontal() + 1);
// For same padding, rear padding is same or bigger than front padding by at most 1
bool same_padding_criterion_2 =
- (pad->top() <= pad->bottom()) && (pad->bottom() <= pad->top() + 1) &&
- (pad->left() <= pad->right()) && (pad->right() <= pad->left() + 1);
+ (pad->top() <= pad->bottom()) && (pad->bottom() <= pad->top() + 1) &&
+ (pad->left() <= pad->right()) && (pad->right() <= pad->left() + 1);
if (same_padding_criterion_1 && same_padding_criterion_2)
return tflite::Padding_SAME;
@@ -120,8 +120,7 @@ void registerGraphIOName(loco::Graph *graph, SerializedModelData &gd)
}
}
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace
@@ -147,7 +146,7 @@ private:
void set_tensor_index(loco::Node *node, const TFLTensorIndex &tensor_id)
{
assert(node->annot<TFLTensorIndexAnnotation>() == nullptr);
- node->annot(stdex::make_unique<TFLTensorIndexAnnotation>(tensor_id));
+ node->annot(std::make_unique<TFLTensorIndexAnnotation>(tensor_id));
}
TFLTensorIndex get_tensor_index(loco::Node *node)
diff --git a/compiler/exo/src/TFLite/TFLOperationExporter.cpp b/compiler/exo/src/TFLite/TFLOperationExporter.cpp
index 79b5b6287..b7a0ffea8 100644
--- a/compiler/exo/src/TFLite/TFLOperationExporter.cpp
+++ b/compiler/exo/src/TFLite/TFLOperationExporter.cpp
@@ -81,13 +81,19 @@ public:
void visit(loco::ReLU *) final;
void visit(loco::ReLU6 *) final;
void visit(loco::Tanh *) final;
- void visit(loco::Push *) final { /* DO NOTHING */}
- void visit(loco::Pull *) final { /* DO NOTHING */}
+ void visit(loco::Push *) final
+ { /* DO NOTHING */
+ }
+ void visit(loco::Pull *) final
+ { /* DO NOTHING */
+ }
void visit(loco::FeatureEncode *) final;
void visit(loco::FeatureDecode *) final;
void visit(loco::FilterEncode *) final;
void visit(loco::DepthwiseFilterEncode *) final;
- void visit(loco::ConstGen *) final { /* skip, everything is done in exportOpDefinedTensors */}
+ void visit(loco::ConstGen *) final
+ { /* skip, everything is done in exportOpDefinedTensors */
+ }
void visit(loco::MaxPool2D *) final;
void visit(loco::AvgPool2D *) final;
void visit(loco::Conv2D *) final;
@@ -227,7 +233,7 @@ void OperationExporter::visit(locoex::TFLFullyConnected *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
auto options =
- CreateFullyConnectedOptions(builder, to_tflite_actfunc(node->fusedActivationFunction()));
+ CreateFullyConnectedOptions(builder, to_tflite_actfunc(node->fusedActivationFunction()));
// Make FULLY_CONNECTED operator
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
@@ -367,8 +373,8 @@ void OperationExporter::visit(locoex::TFLTranspose *node)
auto options = CreateTransposeOptions(builder);
auto op_offset =
- CreateOperator(builder, op_idx, inputs, outputs,
- tflite::BuiltinOptions::BuiltinOptions_TransposeOptions, options.Union());
+ CreateOperator(builder, op_idx, inputs, outputs,
+ tflite::BuiltinOptions::BuiltinOptions_TransposeOptions, options.Union());
gd._operators.push_back(op_offset);
}
@@ -385,7 +391,7 @@ void OperationExporter::visit(locoex::TFLTransposeConv *node)
auto outputs = builder.CreateVector(outputs_vec);
tflite::Padding padding = getOpPadding(node->padding());
auto options =
- CreateTransposeConvOptions(builder, padding, node->stride()->w(), node->stride()->h());
+ CreateTransposeConvOptions(builder, padding, node->stride()->w(), node->stride()->h());
// Make TRANSPOSE_CONV operator
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
@@ -397,7 +403,7 @@ template <class TFLPool2D>
void OperationExporter::export_pool_2d(TFLPool2D *node, tflite::BuiltinOperator builtin_op)
{
EXO_ASSERT(builtin_op == tflite::BuiltinOperator_MAX_POOL_2D ||
- builtin_op == tflite::BuiltinOperator_AVERAGE_POOL_2D,
+ builtin_op == tflite::BuiltinOperator_AVERAGE_POOL_2D,
"should be maxpool or avgpool");
EXO_ASSERT(node->padding() != locoex::Padding::UNDEFINED, "Padding is not set");
@@ -458,10 +464,10 @@ void OperationExporter::visit(loco::MaxPool2D *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
tflite::Padding padding = getOpPadding(
- node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
- auto options = CreatePool2DOptions(builder, padding, node->stride()->horizontal(),
- node->stride()->vertical(), node->window()->horizontal(),
- node->window()->vertical());
+ node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
+ auto options =
+ CreatePool2DOptions(builder, padding, node->stride()->horizontal(), node->stride()->vertical(),
+ node->window()->horizontal(), node->window()->vertical());
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
tflite::BuiltinOptions_Pool2DOptions, options.Union());
gd._operators.push_back(op_offset);
@@ -478,10 +484,10 @@ void OperationExporter::visit(loco::AvgPool2D *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
tflite::Padding padding = getOpPadding(
- node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
- auto options = CreatePool2DOptions(builder, padding, node->stride()->horizontal(),
- node->stride()->vertical(), node->window()->horizontal(),
- node->window()->vertical());
+ node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
+ auto options =
+ CreatePool2DOptions(builder, padding, node->stride()->horizontal(), node->stride()->vertical(),
+ node->window()->horizontal(), node->window()->vertical());
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
tflite::BuiltinOptions_Pool2DOptions, options.Union());
gd._operators.push_back(op_offset);
@@ -504,7 +510,7 @@ void OperationExporter::visit(loco::Conv2D *node)
std::vector<float> bias_vec_data(bias_vec_size); // initialized as zero vector
auto bias_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(bias_vec_data.data()), raw_bias_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(bias_vec_data.data()), raw_bias_vec_size);
auto bias_buffer_offset = CreateBuffer(builder, bias_vec_offset);
@@ -516,7 +522,7 @@ void OperationExporter::visit(loco::Conv2D *node)
auto name_offset = builder.CreateString("t_" + std::to_string(bias_tensor_id));
auto bias_tensor_offset =
- CreateTensor(builder, bias_vec_shape_offset, TensorType_FLOAT32, bias_buffer_id, name_offset);
+ CreateTensor(builder, bias_vec_shape_offset, TensorType_FLOAT32, bias_buffer_id, name_offset);
gd._tensors.push_back(bias_tensor_offset);
// Make input, output and options for operator
@@ -526,9 +532,9 @@ void OperationExporter::visit(loco::Conv2D *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
tflite::Padding padding = getOpPadding(
- node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
- auto options = CreateConv2DOptions(builder, padding, node->stride()->horizontal(),
- node->stride()->vertical());
+ node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
+ auto options =
+ CreateConv2DOptions(builder, padding, node->stride()->horizontal(), node->stride()->vertical());
// Make CONV_2D operator
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
@@ -558,7 +564,7 @@ void OperationExporter::visit(loco::TransposedConv2D *node)
}
auto outshape_vec_offset = builder.CreateVector(
- reinterpret_cast<uint8_t *>(outshape_vec_data.data()), raw_outshape_vec_size);
+ reinterpret_cast<uint8_t *>(outshape_vec_data.data()), raw_outshape_vec_size);
auto outshape_buffer_offset = CreateBuffer(builder, outshape_vec_offset);
@@ -607,7 +613,7 @@ void OperationExporter::visit(loco::DepthwiseConv2D *node)
size_t raw_bias_vec_size = bias_vec_size * sizeof(int32_t);
std::vector<float> bias_vec_data(bias_vec_size);
auto bias_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(bias_vec_data.data()), raw_bias_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(bias_vec_data.data()), raw_bias_vec_size);
auto bias_buffer_offset = CreateBuffer(builder, bias_vec_offset);
@@ -619,7 +625,7 @@ void OperationExporter::visit(loco::DepthwiseConv2D *node)
auto name_offset = builder.CreateString("t_" + std::to_string(bias_tensor_id));
auto bias_tensor_offset =
- CreateTensor(builder, bias_vec_shape_offset, TensorType_FLOAT32, bias_buffer_id, name_offset);
+ CreateTensor(builder, bias_vec_shape_offset, TensorType_FLOAT32, bias_buffer_id, name_offset);
gd._tensors.push_back(bias_tensor_offset);
std::vector<int32_t> inputs_vec{get_tensor_index(node->ifm()), get_tensor_index(node->ker()),
@@ -628,13 +634,13 @@ void OperationExporter::visit(loco::DepthwiseConv2D *node)
auto inputs = builder.CreateVector(inputs_vec);
auto outputs = builder.CreateVector(outputs_vec);
tflite::Padding padding = getOpPadding(
- node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
+ node->pad(), node->stride(), ShapeInference::get(node->ifm()), ShapeInference::get(node));
int32_t ifm_channel_size = ShapeInference::get(node->ifm())._dims[3];
// multiplier = bias_vec_size(output_size)/ifm_channel_size
auto options =
- CreateDepthwiseConv2DOptions(builder, padding, node->stride()->horizontal(),
- node->stride()->vertical(), bias_vec_size / ifm_channel_size);
+ CreateDepthwiseConv2DOptions(builder, padding, node->stride()->horizontal(),
+ node->stride()->vertical(), bias_vec_size / ifm_channel_size);
auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
tflite::BuiltinOptions_DepthwiseConv2DOptions, options.Union());
@@ -668,7 +674,7 @@ void OperationExporter::visit(loco::TensorReduce *node)
size_t raw_axes_vec_size = axes_vec_size * sizeof(int32_t);
auto axes_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(axes_vec.data()), raw_axes_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(axes_vec.data()), raw_axes_vec_size);
auto axes_buffer_offset = CreateBuffer(builder, axes_vec_offset);
@@ -680,7 +686,7 @@ void OperationExporter::visit(loco::TensorReduce *node)
auto name_offset = builder.CreateString("t_" + std::to_string(axes_tensor_id));
auto axes_tensor_offset =
- CreateTensor(builder, axes_vec_shape_offset, TensorType_INT32, axes_buffer_id, name_offset);
+ CreateTensor(builder, axes_vec_shape_offset, TensorType_INT32, axes_buffer_id, name_offset);
gd._tensors.push_back(axes_tensor_offset);
std::vector<int32_t> inputs_vec{get_tensor_index(node->input()), axes_tensor_id};
@@ -743,7 +749,7 @@ void exportAsTranspose(loco::Node *node, FlatBufferBuilder &builder,
constexpr size_t raw_perm_vec_size = perm_vec_size * sizeof(int32_t);
auto perm_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(perm_vec_data.data()), raw_perm_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(perm_vec_data.data()), raw_perm_vec_size);
auto perm_buffer_offset = CreateBuffer(builder, perm_vec_offset);
@@ -755,7 +761,7 @@ void exportAsTranspose(loco::Node *node, FlatBufferBuilder &builder,
auto name_offset = builder.CreateString("t_" + std::to_string(perm_tensor_id));
auto perm_tensor_offset =
- CreateTensor(builder, perm_vec_shape_offset, TensorType_INT32, perm_buffer_id, name_offset);
+ CreateTensor(builder, perm_vec_shape_offset, TensorType_INT32, perm_buffer_id, name_offset);
gd._tensors.push_back(perm_tensor_offset);
// Create permutation node
@@ -769,7 +775,7 @@ void exportAsTranspose(loco::Node *node, FlatBufferBuilder &builder,
constexpr auto options_type = tflite::BuiltinOptions::BuiltinOptions_TransposeOptions;
auto transpose_offset =
- CreateOperator(builder, op_idx, inputs, outputs, options_type, options.Union());
+ CreateOperator(builder, op_idx, inputs, outputs, options_type, options.Union());
gd._operators.push_back(transpose_offset);
}
@@ -854,11 +860,11 @@ void exportAsReshape(loco::Node *node, FlatBufferBuilder &builder,
// but also by input.
auto input_shape_shape_vec_offset =
- builder.CreateVector(std::vector<int32_t>{(int32_t)new_shape_vec.size()});
+ builder.CreateVector(std::vector<int32_t>{(int32_t)new_shape_vec.size()});
size_t input_shape_vec_size = new_shape_vec.size() * sizeof(int32_t);
auto input_shape_input_vec_offset =
- builder.CreateVector(reinterpret_cast<uint8_t *>(new_shape_vec.data()), input_shape_vec_size);
+ builder.CreateVector(reinterpret_cast<uint8_t *>(new_shape_vec.data()), input_shape_vec_size);
auto input_shape_buffer_offset = CreateBuffer(builder, input_shape_input_vec_offset);
const auto input_shape_buffer_id = static_cast<uint32_t>(gd._buffers.size());
@@ -867,7 +873,7 @@ void exportAsReshape(loco::Node *node, FlatBufferBuilder &builder,
auto input_shape_tensor_id = static_cast<int32_t>(gd._tensors.size());
auto name_offset = builder.CreateString("t_" + std::to_string(input_shape_tensor_id));
auto input_shape_tensor_offset = CreateTensor(
- builder, input_shape_shape_vec_offset, TensorType_INT32, input_shape_buffer_id, name_offset);
+ builder, input_shape_shape_vec_offset, TensorType_INT32, input_shape_buffer_id, name_offset);
gd._tensors.push_back(input_shape_tensor_offset);
uint32_t op_idx = gd.registerBuiltinOpcode(tflite::BuiltinOperator_RESHAPE);
@@ -1069,7 +1075,7 @@ void OperationExporter::visit(loco::TensorConstantPad *node)
auto padding_shape_vec_ptr = builder.CreateVector(std::vector<int32_t>{padding_vec_size, 2});
// create tensor
auto padding_tensor_ptr =
- CreateTensor(builder, padding_shape_vec_ptr, TensorType_INT32, padding_buffer_id);
+ CreateTensor(builder, padding_shape_vec_ptr, TensorType_INT32, padding_buffer_id);
// get tensor id
const auto padding_tensor_id = static_cast<int32_t>(gd._tensors.size());
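The depth multiplier passed to CreateDepthwiseConv2DOptions in the hunk above divides the bias vector length (which equals the output channel count, per the comment) by the input feature-map channel count. A minimal sketch of that arithmetic; the helper name and the example sizes are illustrative, not from the source:

// Hedged sketch of the multiplier computation used above.
#include <cassert>
#include <cstdint>

int32_t depth_multiplier(int32_t bias_vec_size, int32_t ifm_channel_size)
{
  // bias_vec_size is the output channel count; for a depthwise convolution
  // it must be a whole multiple of the input channel count.
  assert(ifm_channel_size > 0 && bias_vec_size % ifm_channel_size == 0);
  return bias_vec_size / ifm_channel_size; // e.g. 16 / 8 == 2
}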
diff --git a/compiler/exo/src/TFLite/TFLTensorExporter.cpp b/compiler/exo/src/TFLite/TFLTensorExporter.cpp
index 23c810ed5..2fb6f0c13 100644
--- a/compiler/exo/src/TFLite/TFLTensorExporter.cpp
+++ b/compiler/exo/src/TFLite/TFLTensorExporter.cpp
@@ -89,7 +89,7 @@ struct NoOpDetector final : public loco::CanonicalNodeMutableVisitor<bool>
bool visit(loco::FeatureEncode *node) final
{
auto encoder =
- loco::must_cast<loco::PermutingEncoder<loco::Domain::Feature> *>(node->encoder());
+ loco::must_cast<loco::PermutingEncoder<loco::Domain::Feature> *>(node->encoder());
auto perm = encoder->perm();
return isNHWC(perm);
}
@@ -97,7 +97,7 @@ struct NoOpDetector final : public loco::CanonicalNodeMutableVisitor<bool>
bool visit(loco::FeatureDecode *node) final
{
auto decoder =
- loco::must_cast<loco::PermutingDecoder<loco::Domain::Feature> *>(node->decoder());
+ loco::must_cast<loco::PermutingDecoder<loco::Domain::Feature> *>(node->decoder());
auto perm = decoder->perm();
return isNHWC(perm);
}
diff --git a/compiler/exo/src/TFLite/TFLTypeInference.cpp b/compiler/exo/src/TFLite/TFLTypeInference.cpp
index 8d6bb8d8c..56817ee3b 100644
--- a/compiler/exo/src/TFLite/TFLTypeInference.cpp
+++ b/compiler/exo/src/TFLite/TFLTypeInference.cpp
@@ -31,8 +31,6 @@
#include <oops/InternalExn.h>
-#include <stdex/Memory.h>
-
#include <stdexcept>
#include <type_traits>
diff --git a/compiler/exo/src/TFLite/TFLTypeInference.test.cpp b/compiler/exo/src/TFLite/TFLTypeInference.test.cpp
index 8a3a08da9..054dad1f1 100644
--- a/compiler/exo/src/TFLite/TFLTypeInference.test.cpp
+++ b/compiler/exo/src/TFLite/TFLTypeInference.test.cpp
@@ -18,12 +18,9 @@
#include "Pass/TypeInferencePass.h"
#include <loco/IR/PermutingCodec.h>
-#include <stdex/Memory.h>
#include <gtest/gtest.h>
-using stdex::make_unique;
-
namespace
{
diff --git a/compiler/exo/src/TestGraph.h b/compiler/exo/src/TestGraph.h
index f919cc9ae..46c2264ab 100644
--- a/compiler/exo/src/TestGraph.h
+++ b/compiler/exo/src/TestGraph.h
@@ -23,8 +23,6 @@
#include <loco.h>
-#include <stdex/Memory.h>
-
#include <cassert>
namespace exo
@@ -284,7 +282,7 @@ public:
{
filterEncode = exo::make_filter_encode<exo::FilterLayout::HWIO>(pull); // from Tensorflow
filterDecode =
- exo::make_filter_decode<exo::FilterLayout::OHWI>(filterEncode); // to Tensorflow Lite
+ exo::make_filter_decode<exo::FilterLayout::OHWI>(filterEncode); // to Tensorflow Lite
complete(filterDecode);
}
};
diff --git a/compiler/exo/src/TestHelper.h b/compiler/exo/src/TestHelper.h
index 1a3de50f5..bacaa3e5e 100644
--- a/compiler/exo/src/TestHelper.h
+++ b/compiler/exo/src/TestHelper.h
@@ -26,7 +26,7 @@
#include <loco.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
@@ -54,11 +54,11 @@ public:
TypeShapeReadyPhase()
{
// Type and Shape inference is a prerequisite for running other tests
- _phase.emplace_back(stdex::make_unique<::exo::TypeInferencePass>());
- _phase.emplace_back(stdex::make_unique<::exo::ShapeInferencePass>());
+ _phase.emplace_back(std::make_unique<::exo::TypeInferencePass>());
+ _phase.emplace_back(std::make_unique<::exo::ShapeInferencePass>());
}
- template <typename PassT> void add_pass() { _phase.emplace_back(stdex::make_unique<PassT>()); }
+ template <typename PassT> void add_pass() { _phase.emplace_back(std::make_unique<PassT>()); }
void run(loco::Graph *g)
{
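The recurring change in this commit replaces the repository-local stdex::make_unique with C++14 std::make_unique from <memory>. A minimal sketch of the equivalence, using a hypothetical Pass type:

// Before: #include <stdex/Memory.h>; auto p = stdex::make_unique<Pass>(2);
// After : the standard facility, same semantics.
#include <memory>

struct Pass
{
  explicit Pass(int level) : _level(level) {}
  int _level;
};

int main()
{
  auto p = std::make_unique<Pass>(2); // allocates and constructs in one step
  return p->_level == 2 ? 0 : 1;
}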
diff --git a/compiler/foder/CMakeLists.txt b/compiler/foder/CMakeLists.txt
index 6a413c61e..2e44eefa6 100644
--- a/compiler/foder/CMakeLists.txt
+++ b/compiler/foder/CMakeLists.txt
@@ -1,2 +1,3 @@
add_library(foder INTERFACE)
target_include_directories(foder INTERFACE include)
+target_link_libraries(foder INTERFACE nncc_coverage)
diff --git a/compiler/foder/include/foder/FileLoader.h b/compiler/foder/include/foder/FileLoader.h
index e2143ecf6..f0b052a63 100644
--- a/compiler/foder/include/foder/FileLoader.h
+++ b/compiler/foder/include/foder/FileLoader.h
@@ -14,6 +14,9 @@
* limitations under the License.
*/
+#ifndef __FODER_FILE_LOADER_H__
+#define __FODER_FILE_LOADER_H__
+
#include <fstream>
#include <vector>
@@ -67,3 +70,5 @@ private:
};
} // namespace foder
+
+#endif // __FODER_FILE_LOADER_H__
diff --git a/compiler/hermes-std/CMakeLists.txt b/compiler/hermes-std/CMakeLists.txt
index c7b02e14c..8fce31953 100644
--- a/compiler/hermes-std/CMakeLists.txt
+++ b/compiler/hermes-std/CMakeLists.txt
@@ -6,7 +6,6 @@ add_library(hermes_std STATIC ${SOURCES})
set_target_properties(hermes_std PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(hermes_std PUBLIC include)
target_link_libraries(hermes_std PUBLIC hermes)
-target_link_libraries(hermes_std PRIVATE stdex)
target_link_libraries(hermes_std PRIVATE pepper_strcast)
# Let's apply nncc common compile options
#
@@ -23,5 +22,4 @@ endif(NOT ENABLE_TEST)
nnas_find_package(GTest REQUIRED)
GTest_AddTest(hermes_std_test ${TESTS})
-target_link_libraries(hermes_std_test stdex)
target_link_libraries(hermes_std_test hermes_std)
diff --git a/compiler/hermes-std/src/ConsoleReporter.test.cpp b/compiler/hermes-std/src/ConsoleReporter.test.cpp
index c2e1f1c85..a65585a6a 100644
--- a/compiler/hermes-std/src/ConsoleReporter.test.cpp
+++ b/compiler/hermes-std/src/ConsoleReporter.test.cpp
@@ -16,8 +16,7 @@
#include "hermes/ConsoleReporter.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <sstream>
#include <gtest/gtest.h>
@@ -37,7 +36,7 @@ TEST(ConsoleReporterTest, notify)
ss << "Hello" << std::endl;
- m.text(stdex::make_unique<hermes::MessageText>(ss));
+ m.text(std::make_unique<hermes::MessageText>(ss));
}
hermes::ConsoleReporter r;
diff --git a/compiler/hermes-std/src/EnvConfig.test.cpp b/compiler/hermes-std/src/EnvConfig.test.cpp
new file mode 100644
index 000000000..e4b39c167
--- /dev/null
+++ b/compiler/hermes-std/src/EnvConfig.test.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "hermes/EnvConfig.h"
+
+#include <hermes/core/SourceSetting.h>
+
+#include <gtest/gtest.h>
+
+#include <stdlib.h>
+
+namespace
+{
+
+class Logger final : public hermes::Source
+{
+public:
+ Logger() = default;
+ ~Logger() = default;
+};
+
+std::string env_name("TEST_CONFIG");
+
+} // namespace
+
+TEST(EnvConfigTest, constructor)
+{
+ hermes::EnvConfig<hermes::EnvFormat::BooleanNumber> ec(env_name);
+
+ SUCCEED();
+}
+
+TEST(EnvConfigTest, configure)
+{
+ Logger logger;
+ hermes::SourceSetting ss;
+ hermes::EnvConfig<hermes::EnvFormat::BooleanNumber> ec(env_name);
+
+ ec.configure(&logger, ss);
+
+ SUCCEED();
+}
+
+TEST(EnvConfigTest, configure_enabled)
+{
+ setenv(env_name.c_str(), "1", 0);
+
+ Logger logger;
+ hermes::SourceSetting ss;
+ hermes::EnvConfig<hermes::EnvFormat::BooleanNumber> ec(env_name);
+
+ ec.configure(&logger, ss);
+
+ SUCCEED();
+}
diff --git a/compiler/hermes/CMakeLists.txt b/compiler/hermes/CMakeLists.txt
index 5debfbca0..e1a71c2b4 100644
--- a/compiler/hermes/CMakeLists.txt
+++ b/compiler/hermes/CMakeLists.txt
@@ -5,7 +5,6 @@ list(REMOVE_ITEM SOURCES ${TESTS})
add_library(hermes STATIC ${SOURCES})
set_target_properties(hermes PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(hermes PUBLIC include)
-target_link_libraries(hermes PRIVATE stdex)
# Let's apply nncc common compile options
#
# NOTE This will enable strict compilation (warnings as error).
@@ -22,7 +21,6 @@ nnas_find_package(GTest REQUIRED)
add_executable(hermes_test ${TESTS})
target_link_libraries(hermes_test gtest_main)
-target_link_libraries(hermes_test stdex)
target_link_libraries(hermes_test hermes)
add_test(hermes_test hermes_test)
diff --git a/compiler/hermes/requires.cmake b/compiler/hermes/requires.cmake
index a4855289c..e69de29bb 100644
--- a/compiler/hermes/requires.cmake
+++ b/compiler/hermes/requires.cmake
@@ -1 +0,0 @@
-require("stdex")
diff --git a/compiler/hermes/src/core/MessageBuffer.cpp b/compiler/hermes/src/core/MessageBuffer.cpp
index 175a45d3f..a4ff4eeff 100644
--- a/compiler/hermes/src/core/MessageBuffer.cpp
+++ b/compiler/hermes/src/core/MessageBuffer.cpp
@@ -16,7 +16,7 @@
#include "hermes/core/MessageBuffer.h"
-#include <stdex/Memory.h>
+#include <memory>
namespace hermes
{
@@ -30,9 +30,9 @@ MessageBuffer::~MessageBuffer()
{
// NOTE The current implementation is unsafe as it may throw an exception.
// TODO Find a better safe implementation.
- auto msg = stdex::make_unique<Message>();
+ auto msg = std::make_unique<Message>();
- msg->text(stdex::make_unique<MessageText>(_ss));
+ msg->text(std::make_unique<MessageText>(_ss));
_bus->post(std::move(msg));
}
diff --git a/compiler/hermes/src/core/Source.cpp b/compiler/hermes/src/core/Source.cpp
index 33f8b0570..d124f4430 100644
--- a/compiler/hermes/src/core/Source.cpp
+++ b/compiler/hermes/src/core/Source.cpp
@@ -16,8 +16,7 @@
#include "hermes/core/Source.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace hermes
@@ -64,7 +63,7 @@ void Source::reload(const Config *c) { c->configure(this, _setting); }
std::unique_ptr<MessageBuffer> Source::buffer(const Severity &) const
{
// TODO Pass Severity
- return stdex::make_unique<MessageBuffer>(_bus);
+ return std::make_unique<MessageBuffer>(_bus);
}
} // namespace hermes
diff --git a/compiler/loco/CMakeLists.txt b/compiler/loco/CMakeLists.txt
index f94052840..b1f61ade0 100644
--- a/compiler/loco/CMakeLists.txt
+++ b/compiler/loco/CMakeLists.txt
@@ -6,7 +6,6 @@ add_library(loco SHARED ${SOURCES})
target_include_directories(loco PUBLIC include)
# TODO Remove dependencies on angkor library
target_link_libraries(loco PUBLIC angkor)
-target_link_libraries(loco PRIVATE stdex)
# Let's apply nncc common compile options
#
# NOTE This will enable strict compilation (warnings as error).
@@ -24,5 +23,4 @@ endif(NOT ENABLE_TEST)
nnas_find_package(GTest REQUIRED)
GTest_AddTest(loco_test ${TESTS})
-target_link_libraries(loco_test stdex)
target_link_libraries(loco_test loco)
diff --git a/compiler/loco/include/loco/IR/DataTypeTraits.h b/compiler/loco/include/loco/IR/DataTypeTraits.h
index c186300de..3713ac992 100644
--- a/compiler/loco/include/loco/IR/DataTypeTraits.h
+++ b/compiler/loco/include/loco/IR/DataTypeTraits.h
@@ -52,6 +52,12 @@ template <> struct DataTypeImpl<DataType::S16>
using Type = int16_t;
};
+template <> struct DataTypeImpl<DataType::U16>
+{
+ // Use C++ uint16_t type for unsigned 16bit integer
+ using Type = uint16_t;
+};
+
template <> struct DataTypeImpl<DataType::S32>
{
// Use C++ int32_t type for 32bit integer
@@ -70,12 +76,24 @@ template <> struct DataTypeImpl<DataType::S64>
using Type = int64_t;
};
+template <> struct DataTypeImpl<DataType::U64>
+{
+ // Use C++ uint64_t type for unsigned 64bit integer
+ using Type = uint64_t;
+};
+
template <> struct DataTypeImpl<DataType::FLOAT32>
{
// Use C++ float type for IEEE 32-bit floating-point numbers
using Type = float;
};
+template <> struct DataTypeImpl<DataType::FLOAT64>
+{
+ // Use C++ double type for IEEE 64-bit floating-point numbers
+ using Type = double;
+};
+
// NOTE DataTypeImpl for BOOL is subject to change
template <> struct DataTypeImpl<DataType::BOOL>
{
@@ -97,14 +115,20 @@ inline uint32_t size(DataType data_type)
return sizeof(DataTypeImpl<DataType::U8>::Type);
case DataType::S16:
return sizeof(DataTypeImpl<DataType::S16>::Type);
+ case DataType::U16:
+ return sizeof(DataTypeImpl<DataType::U16>::Type);
case DataType::S32:
return sizeof(DataTypeImpl<DataType::S32>::Type);
case DataType::U32:
return sizeof(DataTypeImpl<DataType::U32>::Type);
case DataType::S64:
return sizeof(DataTypeImpl<DataType::S64>::Type);
+ case DataType::U64:
+ return sizeof(DataTypeImpl<DataType::U64>::Type);
case DataType::FLOAT32:
return sizeof(DataTypeImpl<DataType::FLOAT32>::Type);
+ case DataType::FLOAT64:
+ return sizeof(DataTypeImpl<DataType::FLOAT64>::Type);
case DataType::BOOL:
return sizeof(DataTypeImpl<DataType::BOOL>::Type);
default:
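The new specializations extend the compile-time mapping from loco::DataType to C++ types, and size() now covers the added enumerators. A hedged sketch of the resulting behavior, assuming these templates live in namespace loco as the rest of the header does:

#include <loco/IR/DataTypeTraits.h>

#include <cstdint>
#include <type_traits>

static_assert(std::is_same<loco::DataTypeImpl<loco::DataType::U16>::Type, uint16_t>::value,
              "U16 maps to uint16_t");
static_assert(std::is_same<loco::DataTypeImpl<loco::DataType::FLOAT64>::Type, double>::value,
              "FLOAT64 maps to double");

// loco::size(loco::DataType::U64) now returns sizeof(uint64_t), i.e. 8.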
diff --git a/compiler/loco/include/loco/IR/Nodes.h b/compiler/loco/include/loco/IR/Nodes.h
index fecfad28d..63b1181bb 100644
--- a/compiler/loco/include/loco/IR/Nodes.h
+++ b/compiler/loco/include/loco/IR/Nodes.h
@@ -49,7 +49,7 @@ class GraphOutput;
* @brief Make a value visible to user
*/
class Push /* to user */ final
- : public CanonicalNodeDef<CanonicalOpcode::Push, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::Push, FixedArity<1>::Mixin>
{
public:
Push() = default;
@@ -91,8 +91,8 @@ Push *push_node(Graph *g, const GraphOutputIndex &index);
* @brief Create a value from user data
*/
class Pull /* from user */ final
- : public CanonicalNodeDef<CanonicalOpcode::Pull, FixedArity<0>::Mixin,
- With<NodeTrait::TensorShape>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::Pull, FixedArity<0>::Mixin,
+ With<NodeTrait::TensorShape>::Mixin>
{
public:
Pull() = default;
@@ -213,8 +213,8 @@ public:
* }
*/
class ConstGen final
- : public CanonicalNodeDef<CanonicalOpcode::ConstGen, FixedArity<0>::Mixin,
- With<NodeTrait::DataType>::Mixin, With<NodeTrait::TensorShape>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::ConstGen, FixedArity<0>::Mixin,
+ With<NodeTrait::DataType>::Mixin, With<NodeTrait::TensorShape>::Mixin>
{
public:
ConstGen() = default;
@@ -376,7 +376,7 @@ private:
* @brief Create a feature map from a tensor
*/
class FeatureEncode final
- : public CanonicalNodeDef<CanonicalOpcode::FeatureEncode, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::FeatureEncode, FixedArity<1>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -395,7 +395,7 @@ private:
* @brief Create a tensor from a feature map
*/
class FeatureDecode final
- : public CanonicalNodeDef<CanonicalOpcode::FeatureDecode, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::FeatureDecode, FixedArity<1>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -414,7 +414,7 @@ private:
* @brief Create a filter from a tensor
*/
class FilterEncode final
- : public CanonicalNodeDef<CanonicalOpcode::FilterEncode, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::FilterEncode, FixedArity<1>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -433,7 +433,7 @@ private:
* @brief Create a tensor from a filter
*/
class FilterDecode final
- : public CanonicalNodeDef<CanonicalOpcode::FilterDecode, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::FilterDecode, FixedArity<1>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -452,7 +452,7 @@ private:
* @brief Create a depthwise filter from a tensor
*/
class DepthwiseFilterEncode final
- : public CanonicalNodeDef<CanonicalOpcode::DepthwiseFilterEncode, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::DepthwiseFilterEncode, FixedArity<1>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -471,7 +471,7 @@ private:
* @brief Create a tensor from a depthwise filter
*/
class DepthwiseFilterDecode final
- : public CanonicalNodeDef<CanonicalOpcode::DepthwiseFilterDecode, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::DepthwiseFilterDecode, FixedArity<1>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -512,8 +512,8 @@ template <ReshapeType RT> class Reshape;
*/
template <>
class Reshape<ReshapeType::Fixed> final
- : public CanonicalNodeDef<CanonicalOpcode::FixedReshape, FixedArity<1>::Mixin,
- With<NodeTrait::TensorShape>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::FixedReshape, FixedArity<1>::Mixin,
+ With<NodeTrait::TensorShape>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -529,7 +529,7 @@ using FixedReshape = Reshape<ReshapeType::Fixed>;
* concatenated along the given axis.
*/
class TensorConcat final
- : public CanonicalNodeDef<CanonicalOpcode::TensorConcat, FixedArity<2>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::TensorConcat, FixedArity<2>::Mixin>
{
public:
Node *lhs(void) const { return at(0)->node(); }
@@ -578,7 +578,7 @@ private:
* @brief Depthwise 2D Convolution
*/
class DepthwiseConv2D final
- : public CanonicalNodeDef<CanonicalOpcode::DepthwiseConv2D, FixedArity<2>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::DepthwiseConv2D, FixedArity<2>::Mixin>
{
public:
Node *ifm(void) const { return at(0)->node(); }
@@ -616,7 +616,7 @@ enum class ReduceFunc
* @note All the reduce functions always keep dimensions
*/
class TensorReduce final
- : public CanonicalNodeDef<CanonicalOpcode::TensorReduce, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::TensorReduce, FixedArity<1>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -684,7 +684,7 @@ private:
* With this, output shape is uniquely determined by all inputs and attributes.
*/
class TransposedConv2D final
- : public CanonicalNodeDef<CanonicalOpcode::TransposedConv2D, FixedArity<2>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::TransposedConv2D, FixedArity<2>::Mixin>
{
public:
Node *ifm(void) const { return at(0)->node(); }
@@ -714,11 +714,11 @@ private:
template <Domain D> class Softmax;
/**
-* @brief Computes softmax activations for Tensor domain
-*/
+ * @brief Computes softmax activations for Tensor domain
+ */
template <>
class Softmax<Domain::Tensor> final
- : public CanonicalNodeDef<CanonicalOpcode::TensorSoftmax, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::TensorSoftmax, FixedArity<1>::Mixin>
{
public:
Softmax() = default;
@@ -777,7 +777,7 @@ template <Domain D> class BiasAdd;
*/
template <>
class BiasAdd<Domain::Tensor> final
- : public CanonicalNodeDef<CanonicalOpcode::TensorBiasAdd, FixedArity<2>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::TensorBiasAdd, FixedArity<2>::Mixin>
{
public:
BiasAdd() = default;
@@ -813,7 +813,7 @@ using TensorBiasAdd = BiasAdd<Domain::Tensor>;
*/
template <>
class BiasAdd<Domain::Feature> final
- : public CanonicalNodeDef<CanonicalOpcode::FeatureBiasAdd, FixedArity<2>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::FeatureBiasAdd, FixedArity<2>::Mixin>
{
public:
BiasAdd() = default;
@@ -848,7 +848,7 @@ using FeatureBiasAdd = BiasAdd<Domain::Feature>;
* [padding.front(0) + 1 + padding.back(0), padding.front(1) + 2 + padding.back(1)] = [4,9].
*/
class TensorConstantPad final
- : public CanonicalNodeDef<CanonicalOpcode::TensorConstantPad, FixedArity<2>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::TensorConstantPad, FixedArity<2>::Mixin>
{
public:
Node *input(void) const { return at(0)->node(); }
@@ -951,7 +951,7 @@ public:
* @brief Elementwise Sqrt of input
*/
class EltwiseSqrt final
- : public CanonicalNodeDef<CanonicalOpcode::EltwiseSqrt, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::EltwiseSqrt, FixedArity<1>::Mixin>
{
public:
EltwiseSqrt() = default;
@@ -976,7 +976,7 @@ public:
* TODO Explain the operation semantics
*/
class TensorBroadcast final
- : public CanonicalNodeDef<CanonicalOpcode::TensorBroadcast, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::TensorBroadcast, FixedArity<1>::Mixin>
{
public:
TensorBroadcast() = default;
@@ -1014,7 +1014,7 @@ private:
* MatrixEncode currently requires a rank-2 Tensor as its input.
*/
class MatrixEncode final
- : public CanonicalNodeDef<CanonicalOpcode::MatrixEncode, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::MatrixEncode, FixedArity<1>::Mixin>
{
public:
MatrixEncode() = default;
@@ -1038,7 +1038,7 @@ private:
* MatrixDecode currently requires a Matrix as its input.
*/
class MatrixDecode final
- : public CanonicalNodeDef<CanonicalOpcode::MatrixDecode, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::MatrixDecode, FixedArity<1>::Mixin>
{
public:
MatrixDecode() = default;
@@ -1086,7 +1086,7 @@ public:
* Input and output belong to tensor domain.
*/
class TensorTranspose final
- : public CanonicalNodeDef<CanonicalOpcode::TensorTranspose, FixedArity<1>::Mixin>
+ : public CanonicalNodeDef<CanonicalOpcode::TensorTranspose, FixedArity<1>::Mixin>
{
public:
TensorTranspose() = default;
diff --git a/compiler/loco/include/loco/IR/Padding2D.h b/compiler/loco/include/loco/IR/Padding2D.h
index 30557a891..b50a8045f 100644
--- a/compiler/loco/include/loco/IR/Padding2D.h
+++ b/compiler/loco/include/loco/IR/Padding2D.h
@@ -32,7 +32,7 @@ public:
public:
Padding2D(uint32_t top, uint32_t bottom, uint32_t left, uint32_t right)
- : _top{top}, _bottom{bottom}, _left{left}, _right{right}
+ : _top{top}, _bottom{bottom}, _left{left}, _right{right}
{
// DO NOTHING
}
diff --git a/compiler/loco/requires.cmake b/compiler/loco/requires.cmake
new file mode 100644
index 000000000..654db88c3
--- /dev/null
+++ b/compiler/loco/requires.cmake
@@ -0,0 +1 @@
+require("angkor")
diff --git a/compiler/loco/src/ADT/AnnotatedItem.test.cpp b/compiler/loco/src/ADT/AnnotatedItem.test.cpp
index 45ca87d75..87e597f5c 100644
--- a/compiler/loco/src/ADT/AnnotatedItem.test.cpp
+++ b/compiler/loco/src/ADT/AnnotatedItem.test.cpp
@@ -17,7 +17,8 @@
#include "loco/ADT/AnnotatedItem.h"
#include <gtest/gtest.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -31,7 +32,7 @@ template <int N> struct DerivedAnnotation final : public Annotation
{
static std::unique_ptr<DerivedAnnotation<N>> make(void)
{
- return stdex::make_unique<DerivedAnnotation<N>>();
+ return std::make_unique<DerivedAnnotation<N>>();
}
};
diff --git a/compiler/loco/src/IR/CanonicalDialect.cpp b/compiler/loco/src/IR/CanonicalDialect.cpp
index ea956b80e..9438956f8 100644
--- a/compiler/loco/src/IR/CanonicalDialect.cpp
+++ b/compiler/loco/src/IR/CanonicalDialect.cpp
@@ -18,8 +18,7 @@
#include "loco/IR/Graph.h"
#include "loco/IR/Nodes.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
#include <stdexcept>
@@ -55,7 +54,7 @@ namespace loco
CanonicalDialect::CanonicalDialect()
{
- service<GraphOutputIndexQueryService>(stdex::make_unique<GraphOutputIndexQueryServiceImpl>());
+ service<GraphOutputIndexQueryService>(std::make_unique<GraphOutputIndexQueryServiceImpl>());
}
Dialect *CanonicalDialect::get(void)
diff --git a/compiler/loco/src/IR/Dialect.test.cpp b/compiler/loco/src/IR/Dialect.test.cpp
index 3af303375..447f443f2 100644
--- a/compiler/loco/src/IR/Dialect.test.cpp
+++ b/compiler/loco/src/IR/Dialect.test.cpp
@@ -16,7 +16,7 @@
#include "loco/IR/Dialect.h"
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
@@ -31,7 +31,7 @@ TEST(DialectTest, service)
struct MockDialect final : public loco::Dialect
{
- MockDialect() { service<S1>(stdex::make_unique<S1>()); }
+ MockDialect() { service<S1>(std::make_unique<S1>()); }
};
MockDialect dialect;
diff --git a/compiler/loco/src/IR/Graph.cpp b/compiler/loco/src/IR/Graph.cpp
index 8073d4545..98b22c3b6 100644
--- a/compiler/loco/src/IR/Graph.cpp
+++ b/compiler/loco/src/IR/Graph.cpp
@@ -16,8 +16,7 @@
#include "loco/IR/Graph.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace
@@ -25,7 +24,7 @@ namespace
std::unique_ptr<loco::TensorShape> make_tensor_shape(std::initializer_list<loco::Dimension> dims)
{
- auto tensor_shape = stdex::make_unique<loco::TensorShape>();
+ auto tensor_shape = std::make_unique<loco::TensorShape>();
tensor_shape->rank(dims.size());
{
@@ -50,14 +49,11 @@ void Mixin<Trait::TensorShaped>::shape(std::initializer_list<Dimension> dims)
shape(make_tensor_shape(dims));
}
-GraphInput *Graph::InputContext::create(void)
-{
- return take(stdex::make_unique<GraphInput>(size()));
-}
+GraphInput *Graph::InputContext::create(void) { return take(std::make_unique<GraphInput>(size())); }
GraphOutput *Graph::OutputContext::create(void)
{
- return take(stdex::make_unique<GraphOutput>(size()));
+ return take(std::make_unique<GraphOutput>(size()));
}
std::set<loco::Node *> all_nodes(loco::Graph *g)
diff --git a/compiler/loco/src/IR/Graph.test.cpp b/compiler/loco/src/IR/Graph.test.cpp
index ad6894f30..837d29326 100644
--- a/compiler/loco/src/IR/Graph.test.cpp
+++ b/compiler/loco/src/IR/Graph.test.cpp
@@ -108,7 +108,7 @@ namespace
{
// temp node with multiple params for ctor. loco::CanonicalOpcode::ReLU is used for simplicity
class ParamCtorNode
- : public loco::CanonicalNodeDef<loco::CanonicalOpcode::ReLU, loco::FixedArity<0>::Mixin>
+ : public loco::CanonicalNodeDef<loco::CanonicalOpcode::ReLU, loco::FixedArity<0>::Mixin>
{
public:
ParamCtorNode(int i, float f)
diff --git a/compiler/loco/src/IR/PermutingCodec.cpp b/compiler/loco/src/IR/PermutingCodec.cpp
index 2857e5e28..e9fd1fb12 100644
--- a/compiler/loco/src/IR/PermutingCodec.cpp
+++ b/compiler/loco/src/IR/PermutingCodec.cpp
@@ -16,8 +16,7 @@
#include "loco/IR/PermutingCodec.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
#include <set>
#include <stdexcept>
@@ -139,7 +138,7 @@ TensorIndex PermutingEncoder<Domain::Feature>::value(const FeatureIndex &in) con
std::unique_ptr<FeatureEncoder> PermutingEncoder<Domain::Feature>::clone(void) const
{
- return stdex::make_unique<PermutingEncoder<Domain::Feature>>(_perm);
+ return std::make_unique<PermutingEncoder<Domain::Feature>>(_perm);
}
bool PermutingEncoder<Domain::Feature>::valid(void) const { return ::valid(_perm); }
@@ -179,7 +178,7 @@ FeatureIndex PermutingDecoder<Domain::Feature>::value(const TensorIndex &in) con
std::unique_ptr<FeatureDecoder> PermutingDecoder<Domain::Feature>::clone(void) const
{
- return stdex::make_unique<PermutingDecoder<Domain::Feature>>(_perm);
+ return std::make_unique<PermutingDecoder<Domain::Feature>>(_perm);
}
bool PermutingDecoder<Domain::Feature>::valid(void) const { return ::valid(_perm); }
diff --git a/compiler/loco/src/IR/Verifier.test.cpp b/compiler/loco/src/IR/Verifier.test.cpp
index 8c40a5058..8a92a35f0 100644
--- a/compiler/loco/src/IR/Verifier.test.cpp
+++ b/compiler/loco/src/IR/Verifier.test.cpp
@@ -18,10 +18,10 @@
#include <gtest/gtest.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <vector>
-using stdex::make_unique;
+using std::make_unique;
TEST(VerifierTest, valid_minimal)
{
diff --git a/compiler/loco/src/Service/CanonicalShapeInferenceRule.cpp b/compiler/loco/src/Service/CanonicalShapeInferenceRule.cpp
index 6d5adc525..a0f0e892a 100644
--- a/compiler/loco/src/Service/CanonicalShapeInferenceRule.cpp
+++ b/compiler/loco/src/Service/CanonicalShapeInferenceRule.cpp
@@ -674,7 +674,7 @@ public:
for (uint32_t axis = 0; axis < out_shape.rank(); ++axis)
{
out_shape.dim(axis) =
- tensor_shape.dim(axis).value() + padding->front(axis) + padding->back(axis);
+ tensor_shape.dim(axis).value() + padding->front(axis) + padding->back(axis);
}
return loco::NodeShape{out_shape};
diff --git a/compiler/loco/src/Service/CanonicalShapeInferenceRule.test.cpp b/compiler/loco/src/Service/CanonicalShapeInferenceRule.test.cpp
index e88872b5d..0e0dec1a5 100644
--- a/compiler/loco/src/Service/CanonicalShapeInferenceRule.test.cpp
+++ b/compiler/loco/src/Service/CanonicalShapeInferenceRule.test.cpp
@@ -122,7 +122,7 @@ TEST(CanonicalShapeInferenceRuleTest, avgpool2d)
testcase.pull_node->shape({1, 8, 4, 3});
- testcase.encode_node->encoder(stdex::make_unique<PermutingEncoder<Domain::Feature>>(perm));
+ testcase.encode_node->encoder(std::make_unique<PermutingEncoder<Domain::Feature>>(perm));
testcase.avgpool2d_node->window()->vertical(2);
testcase.avgpool2d_node->window()->horizontal(2);
@@ -130,7 +130,7 @@ TEST(CanonicalShapeInferenceRuleTest, avgpool2d)
testcase.avgpool2d_node->stride()->vertical(2);
testcase.avgpool2d_node->stride()->horizontal(2);
- testcase.decode_node->decoder(stdex::make_unique<PermutingDecoder<Domain::Feature>>(perm));
+ testcase.decode_node->decoder(std::make_unique<PermutingDecoder<Domain::Feature>>(perm));
// Run Inference
loco::CanonicalShapeInferenceRule rule;
@@ -224,7 +224,7 @@ TEST(CanonicalShapeInferenceRuleTest, maxpool2d)
testcase.pull_node->shape({1, 8, 4, 3});
- testcase.encode_node->encoder(stdex::make_unique<PermutingEncoder<Domain::Feature>>(perm));
+ testcase.encode_node->encoder(std::make_unique<PermutingEncoder<Domain::Feature>>(perm));
testcase.maxpool2d_node->window()->vertical(2);
testcase.maxpool2d_node->window()->horizontal(2);
@@ -232,7 +232,7 @@ TEST(CanonicalShapeInferenceRuleTest, maxpool2d)
testcase.maxpool2d_node->stride()->vertical(2);
testcase.maxpool2d_node->stride()->horizontal(2);
- testcase.decode_node->decoder(stdex::make_unique<PermutingDecoder<Domain::Feature>>(perm));
+ testcase.decode_node->decoder(std::make_unique<PermutingDecoder<Domain::Feature>>(perm));
// Run Inference
loco::CanonicalShapeInferenceRule rule;
diff --git a/compiler/loco/src/Service/GraphBuilder.h b/compiler/loco/src/Service/GraphBuilder.h
index 71084673c..74eed2af8 100644
--- a/compiler/loco/src/Service/GraphBuilder.h
+++ b/compiler/loco/src/Service/GraphBuilder.h
@@ -20,10 +20,8 @@
// loco-internal headers
#include "loco/IR/Graph.h"
-// repo-internal headers
-#include <stdex/Memory.h>
-
// C++ standard headers
+#include <memory>
#include <stack>
//
@@ -90,7 +88,7 @@ public:
// "Layer" is in theory a subgraph builder.
template <typename Layer, typename... Args>
auto push(Args &&... args)
- -> decltype(static_cast<Layer *>(nullptr)->operator()(static_cast<Context *>(nullptr)))
+ -> decltype(static_cast<Layer *>(nullptr)->operator()(static_cast<Context *>(nullptr)))
{
Layer layer{std::forward<Args>(args)...};
return layer(ctx());
@@ -108,7 +106,7 @@ private:
static inline std::unique_ptr<GraphBuilder> make_graph_builder(loco::Graph *g)
{
- return stdex::make_unique<GraphBuilder>(g);
+ return std::make_unique<GraphBuilder>(g);
}
// "InputLayer" creates both GraphInput and Pull node at once
@@ -159,7 +157,7 @@ struct InputLayer final
ctx->stack()->push(pull_node);
- return stdex::make_unique<Return>(graph_input, pull_node);
+ return std::make_unique<Return>(graph_input, pull_node);
}
};
@@ -205,7 +203,7 @@ struct OutputLayer final
ctx->stack()->push(push_node);
- return stdex::make_unique<Return>(graph_output, push_node);
+ return std::make_unique<Return>(graph_output, push_node);
}
};
@@ -236,7 +234,7 @@ struct ReLULayer final
ctx->stack()->push(relu_node);
- return stdex::make_unique<Return>(relu_node);
+ return std::make_unique<Return>(relu_node);
}
};
@@ -263,7 +261,7 @@ struct ConstGenLayer final
ctx->stack()->push(const_node);
- return stdex::make_unique<Return>(const_node);
+ return std::make_unique<Return>(const_node);
}
};
@@ -283,7 +281,7 @@ struct FeatureEncodeLayer final
Return *perm(const loco::Permutation<loco::Domain::Feature> &perm)
{
using namespace loco;
- _node->encoder(stdex::make_unique<PermutingEncoder<Domain::Feature>>(perm));
+ _node->encoder(std::make_unique<PermutingEncoder<Domain::Feature>>(perm));
return this;
}
@@ -302,7 +300,7 @@ struct FeatureEncodeLayer final
ctx->stack()->push(encode_node);
- return stdex::make_unique<Return>(encode_node);
+ return std::make_unique<Return>(encode_node);
}
};
@@ -320,7 +318,7 @@ struct FeatureDecodeLayer final
Return *perm(const loco::Permutation<loco::Domain::Feature> &perm)
{
using namespace loco;
- _node->decoder(stdex::make_unique<PermutingDecoder<Domain::Feature>>(perm));
+ _node->decoder(std::make_unique<PermutingDecoder<Domain::Feature>>(perm));
return this;
}
@@ -341,7 +339,7 @@ struct FeatureDecodeLayer final
ctx->stack()->push(decode_node);
- return stdex::make_unique<Return>(decode_node);
+ return std::make_unique<Return>(decode_node);
}
};
@@ -358,7 +356,7 @@ struct FilterEncodeLayer final
public:
Return *perm(const loco::Permutation<loco::Domain::Filter> &perm)
{
- auto encoder = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
+ auto encoder = std::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
encoder->perm(perm);
_node->encoder(std::move(encoder));
return this;
@@ -379,7 +377,7 @@ struct FilterEncodeLayer final
ctx->stack()->push(encode_node);
- return stdex::make_unique<Return>(encode_node);
+ return std::make_unique<Return>(encode_node);
}
};
@@ -397,7 +395,7 @@ struct DepthwiseFilterEncodeLayer final
Return *perm(const loco::Permutation<loco::Domain::DepthwiseFilter> &perm)
{
using namespace loco;
- _node->encoder(stdex::make_unique<PermutingEncoder<Domain::DepthwiseFilter>>(perm));
+ _node->encoder(std::make_unique<PermutingEncoder<Domain::DepthwiseFilter>>(perm));
return this;
}
@@ -416,7 +414,7 @@ struct DepthwiseFilterEncodeLayer final
ctx->stack()->push(encode_node);
- return stdex::make_unique<Return>(encode_node);
+ return std::make_unique<Return>(encode_node);
}
};
@@ -446,7 +444,7 @@ struct DepthwiseConv2DLayer final
ctx->stack()->push(depthwiseconv2d_node);
- return stdex::make_unique<Return>(depthwiseconv2d_node);
+ return std::make_unique<Return>(depthwiseconv2d_node);
}
};
@@ -476,7 +474,7 @@ struct TransposedConv2DLayer final
ctx->stack()->push(tr_conv2d_node);
- return stdex::make_unique<Return>(tr_conv2d_node);
+ return std::make_unique<Return>(tr_conv2d_node);
}
};
@@ -512,7 +510,7 @@ struct FixedReshapeLayer final
ctx->stack()->push(reshape_node);
- return stdex::make_unique<Return>(reshape_node);
+ return std::make_unique<Return>(reshape_node);
}
};
@@ -540,7 +538,7 @@ struct TensorBroadcastLayer final
broadcast_node->input(ctx->stack()->pop());
ctx->stack()->push(broadcast_node);
- return stdex::make_unique<Return>(broadcast_node);
+ return std::make_unique<Return>(broadcast_node);
}
};
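GraphBuilder drives graph construction through a stack: each push<Layer>() builds a node, pops its inputs off the stack, and pushes the result. A minimal usage sketch under the API shown above; the node() accessor on the returned object follows the usage in GraphTestcase.h below, and loco::make_graph() is assumed to be the usual loco graph factory:

// Hedged sketch of the stack-based builder API.
auto g = loco::make_graph();
auto graph_builder = make_graph_builder(g.get());

auto pull_node = graph_builder->push<InputLayer>()->node();  // GraphInput + Pull, Pull pushed
graph_builder->push<ReLULayer>();                            // pops Pull, pushes ReLU
auto push_node = graph_builder->push<OutputLayer>()->node(); // pops ReLU, wires GraphOutput + Push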
diff --git a/compiler/loco/src/Service/GraphTestcase.h b/compiler/loco/src/Service/GraphTestcase.h
index 27b011f8d..06801e0aa 100644
--- a/compiler/loco/src/Service/GraphTestcase.h
+++ b/compiler/loco/src/Service/GraphTestcase.h
@@ -22,8 +22,6 @@
#include "GraphBuilder.h"
-#include <stdex/Memory.h>
-
enum class GraphCode
{
Identity,
@@ -278,7 +276,7 @@ public:
const_node = graph_builder->push<ConstGenLayer>()->node();
filter_encode_node =
- graph_builder->push<DepthwiseFilterEncodeLayer>()->perm(filter_perm)->node();
+ graph_builder->push<DepthwiseFilterEncodeLayer>()->perm(filter_perm)->node();
depthwiseconv2d_node = graph_builder->push<DepthwiseConv2DLayer>()->node();
diff --git a/compiler/loco/src/Service/MultiDialectShapeInferenceRule.test.cpp b/compiler/loco/src/Service/MultiDialectShapeInferenceRule.test.cpp
index 3d5a11ae4..7be41f7ee 100644
--- a/compiler/loco/src/Service/MultiDialectShapeInferenceRule.test.cpp
+++ b/compiler/loco/src/Service/MultiDialectShapeInferenceRule.test.cpp
@@ -112,8 +112,8 @@ TEST(MultiDialectShapeInferenceRuleTest, test1)
loco::MultiDialectShapeInferenceRule rules;
rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(TestDialect<2, 3>::get(), &t23_rule)
- .bind(TestDialect<4, 5>::get(), &t45_rule);
+ .bind(TestDialect<2, 3>::get(), &t23_rule)
+ .bind(TestDialect<4, 5>::get(), &t45_rule);
loco::apply(&rules).to(g.get());
diff --git a/compiler/loco/src/Service/ShapeInference.cpp b/compiler/loco/src/Service/ShapeInference.cpp
index 84eb10963..d177a4869 100644
--- a/compiler/loco/src/Service/ShapeInference.cpp
+++ b/compiler/loco/src/Service/ShapeInference.cpp
@@ -18,8 +18,7 @@
#include "loco/IR/Algorithm.h"
#include <cassert>
-
-#include <stdex/Memory.h>
+#include <memory>
namespace
{
@@ -82,7 +81,7 @@ bool ShapeInferenceSession::to(Graph *g) const
{
if (_rule->infer(node, shape))
{
- node->annot(stdex::make_unique<ShapeAnnotation>(shape));
+ node->annot(std::make_unique<ShapeAnnotation>(shape));
changed = true;
}
}
diff --git a/compiler/loco/src/Service/TypeInference.cpp b/compiler/loco/src/Service/TypeInference.cpp
index 27d7d9a29..df038efe7 100644
--- a/compiler/loco/src/Service/TypeInference.cpp
+++ b/compiler/loco/src/Service/TypeInference.cpp
@@ -19,8 +19,7 @@
#include "loco/IR/Algorithm.h"
#include <cassert>
-
-#include <stdex/Memory.h>
+#include <memory>
namespace
{
@@ -73,7 +72,7 @@ bool TypeInferenceSession::to(Graph *g) const
{
if (_rule->infer(node, dtype))
{
- node->annot(stdex::make_unique<DataTypeAnnotation>(dtype));
+ node->annot(std::make_unique<DataTypeAnnotation>(dtype));
changed = true;
}
}
diff --git a/compiler/loco/src/Service/TypeInference.test.cpp b/compiler/loco/src/Service/TypeInference.test.cpp
index 13bcfa52b..0d2cc8864 100644
--- a/compiler/loco/src/Service/TypeInference.test.cpp
+++ b/compiler/loco/src/Service/TypeInference.test.cpp
@@ -268,8 +268,8 @@ TEST(MultiDialectTypeInferenceRuleTest, test1)
loco::MultiDialectTypeInferenceRule rules;
rules.bind(TestDialect<loco::DataType::S8>::get(), &s8_rule)
- .bind(TestDialect<loco::DataType::U8>::get(), &u8_rule)
- .bind(loco::CanonicalDialect::get(), &canon_rule);
+ .bind(TestDialect<loco::DataType::U8>::get(), &u8_rule)
+ .bind(loco::CanonicalDialect::get(), &canon_rule);
loco::apply(&rules).to(g.get());
diff --git a/compiler/loco/src/tensorflow.test.cpp b/compiler/loco/src/tensorflow.test.cpp
index f534aee7b..d905429f5 100644
--- a/compiler/loco/src/tensorflow.test.cpp
+++ b/compiler/loco/src/tensorflow.test.cpp
@@ -23,9 +23,9 @@
#include <gtest/gtest.h>
-#include <stdex/Memory.h>
+#include <memory>
-using stdex::make_unique;
+using std::make_unique;
namespace
{
@@ -65,7 +65,7 @@ loco::Permutation<loco::Domain::Filter> make_HWIO_permutation(void)
return HWIO;
}
-} // nemaspace
+} // namespace
#if 0
>>> MaxPool_Float_000 testcase
diff --git a/compiler/locoex-customop/CMakeLists.txt b/compiler/locoex-customop/CMakeLists.txt
index df1e01526..12356c81b 100644
--- a/compiler/locoex-customop/CMakeLists.txt
+++ b/compiler/locoex-customop/CMakeLists.txt
@@ -5,7 +5,7 @@ list(REMOVE_ITEM SOURCES ${TESTS})
add_library(locoex_customop SHARED ${SOURCES})
target_include_directories(locoex_customop PUBLIC include)
target_link_libraries(locoex_customop PUBLIC loco)
-target_link_libraries(locoex_customop PRIVATE stdex locop pepper_str)
+target_link_libraries(locoex_customop PRIVATE locop pepper_str)
install(TARGETS locoex_customop DESTINATION lib)
if(NOT ENABLE_TEST)
@@ -15,4 +15,4 @@ endif(NOT ENABLE_TEST)
nnas_find_package(GTest REQUIRED)
GTest_AddTest(locoex_customop_test ${TESTS})
-target_link_libraries(locoex_customop_test loco locoex_customop stdex)
+target_link_libraries(locoex_customop_test loco locoex_customop)
diff --git a/compiler/locoex-customop/requires.cmake b/compiler/locoex-customop/requires.cmake
index 9127144f2..c4240bc09 100644
--- a/compiler/locoex-customop/requires.cmake
+++ b/compiler/locoex-customop/requires.cmake
@@ -1,4 +1,3 @@
require("loco")
-require("stdex")
require("locop")
require("pepper-str")
diff --git a/compiler/locoex-customop/src/COpCall.cpp b/compiler/locoex-customop/src/COpCall.cpp
index 029914758..e86ad5c5b 100644
--- a/compiler/locoex-customop/src/COpCall.cpp
+++ b/compiler/locoex-customop/src/COpCall.cpp
@@ -57,7 +57,7 @@ std::vector<std::string> COpCall::attr_names() const
#define INSTANTIATE(AT) \
template const typename AttrTypeTrait<AT>::Type *COpCall::attr<AT>(const std::string &attr_name) \
- const;
+ const;
INSTANTIATE(COpAttrType::Float)
INSTANTIATE(COpAttrType::Int)
diff --git a/compiler/locoex-customop/src/COpCall.test.cpp b/compiler/locoex-customop/src/COpCall.test.cpp
index d5f01d22d..7bc4186e5 100644
--- a/compiler/locoex-customop/src/COpCall.test.cpp
+++ b/compiler/locoex-customop/src/COpCall.test.cpp
@@ -20,7 +20,7 @@
#include <loco/IR/Graph.h>
#include <loco/IR/Nodes.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
@@ -51,8 +51,8 @@ TEST(CallTest, Test_01)
custom->input(0, inp);
custom->input(1, inp);
- custom->attr(int_attr, stdex::make_unique<COpAttrInt>(int_val));
- custom->attr(float_attr, stdex::make_unique<COpAttrFloat>(float_val));
+ custom->attr(int_attr, std::make_unique<COpAttrInt>(int_val));
+ custom->attr(float_attr, std::make_unique<COpAttrFloat>(float_val));
}
// access custom op input
diff --git a/compiler/locoex-customop/src/VariadicArityNode.test.cpp b/compiler/locoex-customop/src/VariadicArityNode.test.cpp
index a618824e5..86a9de5cd 100644
--- a/compiler/locoex-customop/src/VariadicArityNode.test.cpp
+++ b/compiler/locoex-customop/src/VariadicArityNode.test.cpp
@@ -47,7 +47,7 @@ class BinaryInputNode : public TestNode
public:
BinaryInputNode() : TestNode(2) {}
};
-}
+} // namespace
TEST(CustomOpTest, VariadicArityNode_arity_0)
{
diff --git a/compiler/locomotiv/CMakeLists.txt b/compiler/locomotiv/CMakeLists.txt
index 5c0156b78..308f48619 100644
--- a/compiler/locomotiv/CMakeLists.txt
+++ b/compiler/locomotiv/CMakeLists.txt
@@ -8,7 +8,6 @@ target_include_directories(locomotiv PUBLIC include)
target_include_directories(locomotiv PRIVATE src)
target_link_libraries(locomotiv PUBLIC loco)
target_link_libraries(locomotiv PUBLIC angkor)
-target_link_libraries(locomotiv PRIVATE stdex)
# Let's apply nncc common compile options
#
# NOTE This will enable strict compilation (warnings as error).
diff --git a/compiler/locomotiv/include/locomotiv/Session.h b/compiler/locomotiv/include/locomotiv/Session.h
index 3268d60b3..85c26c09c 100644
--- a/compiler/locomotiv/include/locomotiv/Session.h
+++ b/compiler/locomotiv/include/locomotiv/Session.h
@@ -51,7 +51,7 @@ public:
* @warn This approach may fail in case of a graph with control flow
*/
Session(loco::Graph *g, const std::vector<loco::Node *> &custom_outputs)
- : _graph(g), _outputs(custom_outputs)
+ : _graph(g), _outputs(custom_outputs)
{
// DO NOTHING
}
diff --git a/compiler/locomotiv/requires.cmake b/compiler/locomotiv/requires.cmake
index 1c09aa13d..654db88c3 100644
--- a/compiler/locomotiv/requires.cmake
+++ b/compiler/locomotiv/requires.cmake
@@ -1,2 +1 @@
require("angkor")
-require("stdex")
diff --git a/compiler/locomotiv/src/Node/AvgPool2D.cpp b/compiler/locomotiv/src/Node/AvgPool2D.cpp
index 5fdf1e725..0adabd49a 100644
--- a/compiler/locomotiv/src/Node/AvgPool2D.cpp
+++ b/compiler/locomotiv/src/Node/AvgPool2D.cpp
@@ -78,9 +78,9 @@ nncc::core::ADT::tensor::Buffer<T> avgPool2D(const loco::AvgPool2D *avgpool2d,
const uint32_t pad_right = avgpool2d->pad()->right();
const uint32_t output_height =
- compute_out_size(ifm_height, pad_top + pad_bottom, window_height, stride_height);
+ compute_out_size(ifm_height, pad_top + pad_bottom, window_height, stride_height);
const uint32_t output_width =
- compute_out_size(ifm_width, pad_left + pad_right, window_width, stride_width);
+ compute_out_size(ifm_width, pad_left + pad_right, window_width, stride_width);
// prepare output buffer
Shape output_shape{batches, output_height, output_width, depth};
diff --git a/compiler/locomotiv/src/Node/AvgPool2D.test.cpp b/compiler/locomotiv/src/Node/AvgPool2D.test.cpp
index f9863b47d..ec5f3cd82 100644
--- a/compiler/locomotiv/src/Node/AvgPool2D.test.cpp
+++ b/compiler/locomotiv/src/Node/AvgPool2D.test.cpp
@@ -84,7 +84,7 @@ void run_test(const float *ifm, const float *expected_ofm, const Shape &ifm_shap
ASSERT_TRUE(*(avgpool2d_data->shape()) == ofm_shape);
auto ofm_overlay =
- make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
+ make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
for (nncc::core::ADT::tensor::IndexEnumerator e{ofm_shape}; e.valid(); e.advance())
{
const auto &ind = e.current();
diff --git a/compiler/locomotiv/src/Node/BiasAdd.cpp b/compiler/locomotiv/src/Node/BiasAdd.cpp
index b84fa7e3c..0c45cc12f 100644
--- a/compiler/locomotiv/src/Node/BiasAdd.cpp
+++ b/compiler/locomotiv/src/Node/BiasAdd.cpp
@@ -55,7 +55,7 @@ void execute_node(loco::BiasAdd<loco::Domain::Tensor> *bias_add)
validate(input_data && bias_data, "Input not ready");
validate(locomotiv::annot_domain(bias_add->value()) == loco::Domain::Tensor &&
- locomotiv::annot_domain(bias_add->bias()) == loco::Domain::Bias,
+ locomotiv::annot_domain(bias_add->bias()) == loco::Domain::Bias,
"Wrong input domain");
std::unique_ptr<NodeData> bias_add_data = calc(input_data, bias_data, bias_add->axis());
@@ -74,7 +74,7 @@ void execute_node(loco::BiasAdd<loco::Domain::Feature> *bias_add)
validate(input_data && bias_data, "Input not ready");
validate(locomotiv::annot_domain(bias_add->value()) == loco::Domain::Feature &&
- locomotiv::annot_domain(bias_add->bias()) == loco::Domain::Bias,
+ locomotiv::annot_domain(bias_add->bias()) == loco::Domain::Bias,
"Wrong input domain");
std::unique_ptr<NodeData> bias_add_data = calc(input_data, bias_data, 3);
diff --git a/compiler/locomotiv/src/Node/Conv2D.cpp b/compiler/locomotiv/src/Node/Conv2D.cpp
index cdf0dfd56..2f9ca5a7e 100644
--- a/compiler/locomotiv/src/Node/Conv2D.cpp
+++ b/compiler/locomotiv/src/Node/Conv2D.cpp
@@ -82,9 +82,9 @@ Buffer<RET_T> calc_conv2D(const loco::Conv2D *conv2d, const Buffer<IFM_T> *input
const uint32_t pad_right = conv2d->pad()->right();
const uint32_t output_height =
- compute_out_size(input_height + pad_top + pad_bottom, filter_height, stride_height);
+ compute_out_size(input_height + pad_top + pad_bottom, filter_height, stride_height);
const uint32_t output_width =
- compute_out_size(input_width + pad_left + pad_right, filter_width, stride_width);
+ compute_out_size(input_width + pad_left + pad_right, filter_width, stride_width);
const uint32_t batches = input_shape.dim(0);
const uint32_t input_depth = input_shape.dim(3);
@@ -121,9 +121,9 @@ Buffer<RET_T> calc_conv2D(const loco::Conv2D *conv2d, const Buffer<IFM_T> *input
((unsigned)in_y < input_height))
{
auto input_value =
- input_buf->at(Index({batch, (unsigned)in_y, (unsigned)in_x, in_channel}));
+ input_buf->at(Index({batch, (unsigned)in_y, (unsigned)in_x, in_channel}));
auto filter_value =
- filter_buf->at(Index({out_channel, filter_y, filter_x, in_channel}));
+ filter_buf->at(Index({out_channel, filter_y, filter_x, in_channel}));
total += (input_value * filter_value);
}
}
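The output extents above come from a local compute_out_size helper; the values follow the standard sliding-window relation. A hedged sketch, assuming the conventional definition (the actual helper is not shown in this hunk):

// Assumed definition for illustration: output = (padded input - filter) / stride + 1.
#include <cstdint>

uint32_t compute_out_size(uint32_t image_size, uint32_t filter_size, uint32_t stride)
{
  return (image_size - filter_size) / stride + 1;
}

// Example: a 32-wide input with pad 1 on each side, 3-wide filter, stride 1:
//   compute_out_size(32 + 1 + 1, 3, 1) == 32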
diff --git a/compiler/locomotiv/src/Node/Conv2D.test.cpp b/compiler/locomotiv/src/Node/Conv2D.test.cpp
index 66e947acc..93afa79b7 100644
--- a/compiler/locomotiv/src/Node/Conv2D.test.cpp
+++ b/compiler/locomotiv/src/Node/Conv2D.test.cpp
@@ -97,7 +97,7 @@ void run_test(const float *ifm, const float *ker, const float *expected_ofm, con
ASSERT_TRUE(*(conv2d_result->shape()) == ofm_shape);
auto ofm_overlay =
- make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
+ make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
for (nncc::core::ADT::tensor::IndexEnumerator e{ofm_shape}; e.valid(); e.advance())
{
const auto &ind = e.current();
diff --git a/compiler/locomotiv/src/Node/DepthwiseConv2D.cpp b/compiler/locomotiv/src/Node/DepthwiseConv2D.cpp
index f39cd177e..a1a8e506f 100644
--- a/compiler/locomotiv/src/Node/DepthwiseConv2D.cpp
+++ b/compiler/locomotiv/src/Node/DepthwiseConv2D.cpp
@@ -89,9 +89,9 @@ Buffer<RET_T> calc_dw_conv2d(const loco::DepthwiseConv2D *dw_conv2d, const Buffe
const uint32_t pad_right = dw_conv2d->pad()->right();
const uint32_t ofm_height =
- compute_out_size(ifm_height, pad_top + pad_bottom, ker_height, stride_height);
+ compute_out_size(ifm_height, pad_top + pad_bottom, ker_height, stride_height);
const uint32_t ofm_width =
- compute_out_size(ifm_width, pad_left + pad_right, ker_width, stride_width);
+ compute_out_size(ifm_width, pad_left + pad_right, ker_width, stride_width);
const uint32_t batches = ifm_shape.dim(0);
const uint32_t ifm_depth = ifm_shape.dim(3);
diff --git a/compiler/locomotiv/src/Node/DepthwiseConv2D.test.cpp b/compiler/locomotiv/src/Node/DepthwiseConv2D.test.cpp
index 1ff333be0..8a435b6ab 100644
--- a/compiler/locomotiv/src/Node/DepthwiseConv2D.test.cpp
+++ b/compiler/locomotiv/src/Node/DepthwiseConv2D.test.cpp
@@ -97,7 +97,7 @@ void run_test(const float *ifm, const float *ker, const float *expected_ofm, con
ASSERT_TRUE(*(dw_conv2d_result->shape()) == ofm_shape);
auto ofm_overlay =
- make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
+ make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
for (nncc::core::ADT::tensor::IndexEnumerator e{ofm_shape}; e.valid(); e.advance())
{
const auto &ind = e.current();
diff --git a/compiler/locomotiv/src/Node/DepthwiseFilterEncode.cpp b/compiler/locomotiv/src/Node/DepthwiseFilterEncode.cpp
index 03f5bf833..e161287ea 100644
--- a/compiler/locomotiv/src/Node/DepthwiseFilterEncode.cpp
+++ b/compiler/locomotiv/src/Node/DepthwiseFilterEncode.cpp
@@ -59,8 +59,8 @@ std::unique_ptr<locomotiv::NodeData> dw_filter_encode(const loco::DepthwiseFilte
// Make HWCM (i.e. height, width, depth, multiplier) buffer from DepthwiseFilterShape
Buffer<T> node_buf = make_buffer<T, LexicalLayout>(
- Shape{node_shape.height().value(), node_shape.width().value(), node_shape.depth().value(),
- node_shape.multiplier().value()});
+ Shape{node_shape.height().value(), node_shape.width().value(), node_shape.depth().value(),
+ node_shape.multiplier().value()});
// Copy buffer in an order arranged by encoder
for (IndexEnumerator e{node_buf.shape()}; e.valid(); e.advance())
diff --git a/compiler/locomotiv/src/Node/DepthwiseFilterEncode.test.cpp b/compiler/locomotiv/src/Node/DepthwiseFilterEncode.test.cpp
index 5b2ec9326..44364723c 100644
--- a/compiler/locomotiv/src/Node/DepthwiseFilterEncode.test.cpp
+++ b/compiler/locomotiv/src/Node/DepthwiseFilterEncode.test.cpp
@@ -62,7 +62,7 @@ TEST(NodeExecution_DepthwiseFilterEncode, f32)
// Encoder to correctly read input tensor as MHWC
auto encoder = std::unique_ptr<loco::PermutingEncoder<loco::Domain::DepthwiseFilter>>(
- new loco::PermutingEncoder<loco::Domain::DepthwiseFilter>);
+ new loco::PermutingEncoder<loco::Domain::DepthwiseFilter>);
encoder->perm()->axis(loco::DepthwiseFilterAxis::Multiplier) = 0;
encoder->perm()->axis(loco::DepthwiseFilterAxis::Height) = 1;
encoder->perm()->axis(loco::DepthwiseFilterAxis::Width) = 2;
diff --git a/compiler/locomotiv/src/Node/FeatureCodec.test.cpp b/compiler/locomotiv/src/Node/FeatureCodec.test.cpp
index 1b6b06c13..dacd0170c 100644
--- a/compiler/locomotiv/src/Node/FeatureCodec.test.cpp
+++ b/compiler/locomotiv/src/Node/FeatureCodec.test.cpp
@@ -64,7 +64,7 @@ protected:
const loco::Permutation<loco::Domain::Feature> &perm)
{
auto encoder = std::unique_ptr<loco::PermutingEncoder<loco::Domain::Feature>>(
- new loco::PermutingEncoder<loco::Domain::Feature>);
+ new loco::PermutingEncoder<loco::Domain::Feature>);
encoder->perm(perm);
@@ -80,7 +80,7 @@ protected:
const loco::Permutation<loco::Domain::Feature> &perm)
{
auto decoder = std::unique_ptr<loco::PermutingDecoder<loco::Domain::Feature>>(
- new loco::PermutingDecoder<loco::Domain::Feature>);
+ new loco::PermutingDecoder<loco::Domain::Feature>);
decoder->perm(perm);
diff --git a/compiler/locomotiv/src/Node/FeatureDecode.cpp b/compiler/locomotiv/src/Node/FeatureDecode.cpp
index 8776e1b42..2877906f9 100644
--- a/compiler/locomotiv/src/Node/FeatureDecode.cpp
+++ b/compiler/locomotiv/src/Node/FeatureDecode.cpp
@@ -54,8 +54,8 @@ std::unique_ptr<locomotiv::NodeData> feature_decode(const loco::FeatureDecode *n
// Make tensor buffer from TensorShape
Buffer<T> node_buf =
- make_buffer<T, LexicalLayout>(Shape{node_shape.dim(0).value(), node_shape.dim(1).value(),
- node_shape.dim(2).value(), node_shape.dim(3).value()});
+ make_buffer<T, LexicalLayout>(Shape{node_shape.dim(0).value(), node_shape.dim(1).value(),
+ node_shape.dim(2).value(), node_shape.dim(3).value()});
// Copy buffer in an order arranged by decoder
for (IndexEnumerator e{node_buf.shape()}; e.valid(); e.advance())
diff --git a/compiler/locomotiv/src/Node/FeatureEncode.cpp b/compiler/locomotiv/src/Node/FeatureEncode.cpp
index 406de76ff..c3570b981 100644
--- a/compiler/locomotiv/src/Node/FeatureEncode.cpp
+++ b/compiler/locomotiv/src/Node/FeatureEncode.cpp
@@ -54,8 +54,8 @@ std::unique_ptr<locomotiv::NodeData> feature_encode(const loco::FeatureEncode *n
// Make NHWC buffer from FeatureShape
Buffer<T> node_buf =
- make_buffer<T, LexicalLayout>(Shape{node_shape.count().value(), node_shape.height().value(),
- node_shape.width().value(), node_shape.depth().value()});
+ make_buffer<T, LexicalLayout>(Shape{node_shape.count().value(), node_shape.height().value(),
+ node_shape.width().value(), node_shape.depth().value()});
// Copy buffer in an order arranged by encoder
for (IndexEnumerator e{node_buf.shape()}; e.valid(); e.advance())
diff --git a/compiler/locomotiv/src/Node/FilterEncode.cpp b/compiler/locomotiv/src/Node/FilterEncode.cpp
index 0e2ac918f..84ba681ba 100644
--- a/compiler/locomotiv/src/Node/FilterEncode.cpp
+++ b/compiler/locomotiv/src/Node/FilterEncode.cpp
@@ -54,8 +54,8 @@ std::unique_ptr<locomotiv::NodeData> filter_encode(const loco::FilterEncode *nod
// Make NHWC buffer from FilterShape
Buffer<T> node_buf =
- make_buffer<T, LexicalLayout>(Shape{node_shape.count().value(), node_shape.height().value(),
- node_shape.width().value(), node_shape.depth().value()});
+ make_buffer<T, LexicalLayout>(Shape{node_shape.count().value(), node_shape.height().value(),
+ node_shape.width().value(), node_shape.depth().value()});
// Copy buffer in an order arranged by encoder
for (IndexEnumerator e{node_buf.shape()}; e.valid(); e.advance())
diff --git a/compiler/locomotiv/src/Node/FilterEncode.test.cpp b/compiler/locomotiv/src/Node/FilterEncode.test.cpp
index dcca94993..80d108ece 100644
--- a/compiler/locomotiv/src/Node/FilterEncode.test.cpp
+++ b/compiler/locomotiv/src/Node/FilterEncode.test.cpp
@@ -62,7 +62,7 @@ TEST(NodeExecution_FilterEncode, s32)
// Encoder to correctly read input tensor as NCHW
auto encoder = std::unique_ptr<loco::PermutingEncoder<loco::Domain::Filter>>(
- new loco::PermutingEncoder<loco::Domain::Filter>);
+ new loco::PermutingEncoder<loco::Domain::Filter>);
encoder->perm()->axis(loco::FilterAxis::Count) = 0;
encoder->perm()->axis(loco::FilterAxis::Depth) = 1;
encoder->perm()->axis(loco::FilterAxis::Height) = 2;
@@ -116,7 +116,7 @@ TEST(NodeExecution_FilterEncode, f32)
// Encoder to correctly read input tensor as CHNW
auto encoder = std::unique_ptr<loco::PermutingEncoder<loco::Domain::Filter>>(
- new loco::PermutingEncoder<loco::Domain::Filter>);
+ new loco::PermutingEncoder<loco::Domain::Filter>);
encoder->perm()->axis(loco::FilterAxis::Depth) = 0;
encoder->perm()->axis(loco::FilterAxis::Height) = 1;
encoder->perm()->axis(loco::FilterAxis::Count) = 2;
diff --git a/compiler/locomotiv/src/Node/MatrixCodec.test.cpp b/compiler/locomotiv/src/Node/MatrixCodec.test.cpp
index da4afeded..7f684e41f 100644
--- a/compiler/locomotiv/src/Node/MatrixCodec.test.cpp
+++ b/compiler/locomotiv/src/Node/MatrixCodec.test.cpp
@@ -64,7 +64,7 @@ protected:
const loco::Permutation<loco::Domain::Matrix> &perm)
{
auto encoder = std::unique_ptr<loco::PermutingEncoder<loco::Domain::Matrix>>(
- new loco::PermutingEncoder<loco::Domain::Matrix>);
+ new loco::PermutingEncoder<loco::Domain::Matrix>);
encoder->perm(perm);
@@ -80,7 +80,7 @@ protected:
const loco::Permutation<loco::Domain::Matrix> &perm)
{
auto decoder = std::unique_ptr<loco::PermutingDecoder<loco::Domain::Matrix>>(
- new loco::PermutingDecoder<loco::Domain::Matrix>);
+ new loco::PermutingDecoder<loco::Domain::Matrix>);
decoder->perm(perm);
diff --git a/compiler/locomotiv/src/Node/MatrixDecode.cpp b/compiler/locomotiv/src/Node/MatrixDecode.cpp
index 0310015f1..2a65a7b74 100644
--- a/compiler/locomotiv/src/Node/MatrixDecode.cpp
+++ b/compiler/locomotiv/src/Node/MatrixDecode.cpp
@@ -52,7 +52,7 @@ std::unique_ptr<locomotiv::NodeData> matrix_decode(const loco::MatrixDecode *nod
// Make tensor buffer from TensorShape
Buffer<T> node_buf =
- make_buffer<T, LexicalLayout>(Shape{node_shape.dim(0).value(), node_shape.dim(1).value()});
+ make_buffer<T, LexicalLayout>(Shape{node_shape.dim(0).value(), node_shape.dim(1).value()});
// Copy buffer in an order arranged by decoder
for (IndexEnumerator e{node_buf.shape()}; e.valid(); e.advance())
diff --git a/compiler/locomotiv/src/Node/MatrixEncode.cpp b/compiler/locomotiv/src/Node/MatrixEncode.cpp
index e3554e15a..ac51e4256 100644
--- a/compiler/locomotiv/src/Node/MatrixEncode.cpp
+++ b/compiler/locomotiv/src/Node/MatrixEncode.cpp
@@ -54,7 +54,7 @@ std::unique_ptr<locomotiv::NodeData> matrix_encode(const loco::MatrixEncode *nod
// Make HW buffer from MatrixShape
Buffer<T> node_buf =
- make_buffer<T, LexicalLayout>(Shape{node_shape.height().value(), node_shape.width().value()});
+ make_buffer<T, LexicalLayout>(Shape{node_shape.height().value(), node_shape.width().value()});
// Copy buffer in an order arranged by encoder
for (IndexEnumerator e{node_buf.shape()}; e.valid(); e.advance())
diff --git a/compiler/locomotiv/src/Node/MaxPool2D.cpp b/compiler/locomotiv/src/Node/MaxPool2D.cpp
index 8dce1cb1e..dc626387b 100644
--- a/compiler/locomotiv/src/Node/MaxPool2D.cpp
+++ b/compiler/locomotiv/src/Node/MaxPool2D.cpp
@@ -79,9 +79,9 @@ nncc::core::ADT::tensor::Buffer<T> maxPool2D(const loco::MaxPool2D *maxpool2d,
const uint32_t pad_right = maxpool2d->pad()->right();
const uint32_t output_height =
- compute_out_size(ifm_height, pad_top + pad_bottom, window_height, stride_height);
+ compute_out_size(ifm_height, pad_top + pad_bottom, window_height, stride_height);
const uint32_t output_width =
- compute_out_size(ifm_width, pad_left + pad_right, window_width, stride_width);
+ compute_out_size(ifm_width, pad_left + pad_right, window_width, stride_width);
// prepare output buffer
Shape output_shape{batches, output_height, output_width, depth};
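The two reflowed compute_out_size calls above change only indentation. As a reference, a minimal sketch of the conventional windowed output-size rule follows; the helper's exact rounding and validation in locomotiv are assumptions here, not quoted from its source.

    // Sketch (assumption): the usual output-size rule for pooled/convolved
    // windows. locomotiv's compute_out_size may validate divisibility
    // or handle edge cases differently.
    #include <cstdint>

    inline uint32_t compute_out_size_sketch(uint32_t image, uint32_t total_padding,
                                            uint32_t window, uint32_t stride)
    {
      // e.g. image = 4, total_padding = 0, window = 2, stride = 2
      //      -> (4 + 0 - 2) / 2 + 1 = 2
      return (image + total_padding - window) / stride + 1;
    }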
diff --git a/compiler/locomotiv/src/Node/MaxPool2D.test.cpp b/compiler/locomotiv/src/Node/MaxPool2D.test.cpp
index 5046d4a6e..d00282dd7 100644
--- a/compiler/locomotiv/src/Node/MaxPool2D.test.cpp
+++ b/compiler/locomotiv/src/Node/MaxPool2D.test.cpp
@@ -82,7 +82,7 @@ void run_test(const float *ifm, const float *expected_ofm, const Shape &ifm_shap
ASSERT_TRUE(*(maxpool2d_data->shape()) == ofm_shape);
auto ofm_overlay =
- make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
+ make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
for (nncc::core::ADT::tensor::IndexEnumerator e{ofm_shape}; e.valid(); e.advance())
{
const auto &ind = e.current();
diff --git a/compiler/locomotiv/src/Node/TensorConcat.cpp b/compiler/locomotiv/src/Node/TensorConcat.cpp
index 188bb635b..84da3a3e5 100644
--- a/compiler/locomotiv/src/Node/TensorConcat.cpp
+++ b/compiler/locomotiv/src/Node/TensorConcat.cpp
@@ -52,7 +52,7 @@ void execute_node(loco::TensorConcat *tensor_concat)
validate(lhs_data->dtype() == rhs_data->dtype(), "lhs and rhs of Concat should have same dtype");
validate(annot_domain(tensor_concat->lhs()) == loco::Domain::Tensor &&
- annot_domain(tensor_concat->rhs()) == loco::Domain::Tensor,
+ annot_domain(tensor_concat->rhs()) == loco::Domain::Tensor,
"Some ingredients of TensorConcat is not Tensor");
// Calculate output shape
diff --git a/compiler/locomotiv/src/Node/TransposedConv2D.cpp b/compiler/locomotiv/src/Node/TransposedConv2D.cpp
index bec15a5df..2f3c3d089 100644
--- a/compiler/locomotiv/src/Node/TransposedConv2D.cpp
+++ b/compiler/locomotiv/src/Node/TransposedConv2D.cpp
@@ -65,7 +65,7 @@ Buffer<RET_T> calc_tr_conv2D(const loco::TransposedConv2D *tr_conv2d,
locomotiv::validate(input_shape.rank() == 4, "ifm rank must be 4");
locomotiv::validate(filter_shape.rank() == 4, "filter rank must be 4");
locomotiv::validate(input_shape.dim(3) /* depth of input */ ==
- filter_shape.dim(3) /* depth of filter */,
+ filter_shape.dim(3) /* depth of filter */,
"channel value mismatch");
const uint32_t input_height = input_shape.dim(1);
@@ -86,9 +86,9 @@ Buffer<RET_T> calc_tr_conv2D(const loco::TransposedConv2D *tr_conv2d,
// TODO Support dilations
const uint32_t output_height =
- compute_transposed_out_size(input_height, pad_top + pad_bottom, filter_height, stride_height);
+ compute_transposed_out_size(input_height, pad_top + pad_bottom, filter_height, stride_height);
const uint32_t output_width =
- compute_transposed_out_size(input_width, pad_left + pad_right, filter_width, stride_width);
+ compute_transposed_out_size(input_width, pad_left + pad_right, filter_width, stride_width);
const uint32_t batches = input_shape.dim(0);
const uint32_t input_depth = input_shape.dim(3);
@@ -131,9 +131,9 @@ Buffer<RET_T> calc_tr_conv2D(const loco::TransposedConv2D *tr_conv2d,
{
auto input_value = input_buf->at(Index({batch, in_y, in_x, in_channel}));
auto filter_value =
- filter_buf->at(Index({out_channel, filter_y, filter_x, in_channel}));
+ filter_buf->at(Index({out_channel, filter_y, filter_x, in_channel}));
output_buf.at(Index({batch, (unsigned)out_y, (unsigned)out_x, out_channel})) +=
- input_value * filter_value;
+ input_value * filter_value;
}
}
}
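As with MaxPool2D above, the compute_transposed_out_size reflow leaves the arithmetic intact. A hedged sketch of the common transposed-convolution output-extent rule, assuming locomotiv follows the standard definition:

    // Sketch (assumption): standard transposed-convolution output extent.
    // The real compute_transposed_out_size may clamp or validate differently.
    #include <cstdint>

    inline uint32_t transposed_out_size_sketch(uint32_t input, uint32_t total_padding,
                                               uint32_t filter, uint32_t stride)
    {
      // e.g. input = 2, total_padding = 0, filter = 3, stride = 2
      //      -> 2 * (2 - 1) + 3 - 0 = 5
      return stride * (input - 1) + filter - total_padding;
    }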
diff --git a/compiler/locomotiv/src/Node/TransposedConv2D.test.cpp b/compiler/locomotiv/src/Node/TransposedConv2D.test.cpp
index ef759f51b..a516ef9f2 100644
--- a/compiler/locomotiv/src/Node/TransposedConv2D.test.cpp
+++ b/compiler/locomotiv/src/Node/TransposedConv2D.test.cpp
@@ -97,7 +97,7 @@ void run_test(const float *ifm, const float *ker, const float *expected_ofm, con
ASSERT_TRUE(*(conv2d_result->shape()) == ofm_shape);
auto ofm_overlay =
- make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
+ make_overlay<float, LexicalLayout>(ofm_shape, const_cast<float *>(expected_ofm));
for (nncc::core::ADT::tensor::IndexEnumerator e{ofm_shape}; e.valid(); e.advance())
{
const auto &ind = e.current();
diff --git a/compiler/locomotiv/src/NodeDataImpl.cpp b/compiler/locomotiv/src/NodeDataImpl.cpp
index 2efebe5a9..9373b8dd2 100644
--- a/compiler/locomotiv/src/NodeDataImpl.cpp
+++ b/compiler/locomotiv/src/NodeDataImpl.cpp
@@ -16,8 +16,7 @@
#include "NodeDataImpl.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace
@@ -59,7 +58,7 @@ template <> NodeDataImpl::NodeDataImpl(const Buffer<float> &buf)
void annot_data(loco::Node *node, std::unique_ptr<NodeData> &&data)
{
- node->annot(stdex::make_unique<NodeDataAnnotation>(std::move(data)));
+ node->annot(std::make_unique<NodeDataAnnotation>(std::move(data)));
}
const NodeData *annot_data(const loco::Node *node)
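This hunk is representative of a change applied across the commit: the project-local stdex::make_unique is replaced by C++14 std::make_unique, and <stdex/Memory.h> by <memory>. A self-contained sketch of the pattern, with a hypothetical Annotation type standing in for loco's annotation classes:

    #include <memory>

    struct Annotation
    {
      int payload = 0;
    };

    std::unique_ptr<Annotation> make_annotation(int v)
    {
      // was: stdex::make_unique<Annotation>() -- identical semantics
      auto a = std::make_unique<Annotation>();
      a->payload = v;
      return a;
    }

The two helpers behave identically; the migration only drops the stdex dependency.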
diff --git a/compiler/locomotiv/src/NodeExecution.h b/compiler/locomotiv/src/NodeExecution.h
index 363188d38..eb0608d2b 100644
--- a/compiler/locomotiv/src/NodeExecution.h
+++ b/compiler/locomotiv/src/NodeExecution.h
@@ -62,7 +62,7 @@ private:
return dynamic_cast<Derived *>(node);
}
-// clang-format off
+ // clang-format off
/**
* @brief Calculate for one specified node and update its result as NodeData.
* Abort program when its ingredients are not ready or not supported.
diff --git a/compiler/locomotiv/src/UserData.cpp b/compiler/locomotiv/src/UserData.cpp
index b658ada9b..98f761efd 100644
--- a/compiler/locomotiv/src/UserData.cpp
+++ b/compiler/locomotiv/src/UserData.cpp
@@ -16,8 +16,7 @@
#include "UserData.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace
@@ -55,7 +54,7 @@ const NodeData *user_data(const loco::Node *node)
void user_data(loco::Node *node, std::unique_ptr<NodeData> &&data)
{
- node->annot(stdex::make_unique<UserDataAnnotation>(std::move(data)));
+ node->annot(std::make_unique<UserDataAnnotation>(std::move(data)));
}
void erase_user_data(loco::Node *node) { node->annot<UserDataAnnotation>(nullptr); }
diff --git a/compiler/locop/CMakeLists.txt b/compiler/locop/CMakeLists.txt
index 107ee8be8..f02fb1a72 100644
--- a/compiler/locop/CMakeLists.txt
+++ b/compiler/locop/CMakeLists.txt
@@ -13,7 +13,6 @@ target_link_libraries(locop PUBLIC loco)
target_link_libraries(locop PRIVATE nncc_common)
target_link_libraries(locop PUBLIC nncc_coverage)
target_link_libraries(locop PRIVATE pp)
-target_link_libraries(locop PRIVATE stdex)
if(NOT ENABLE_TEST)
return()
@@ -23,5 +22,4 @@ endif(NOT ENABLE_TEST)
nnas_find_package(GTest REQUIRED)
GTest_AddTest(locop_test ${TESTS})
-target_link_libraries(locop_test stdex)
target_link_libraries(locop_test locop)
diff --git a/compiler/locop/src/CanonicalNodeSummaryBuilder.cpp b/compiler/locop/src/CanonicalNodeSummaryBuilder.cpp
index 61d9e8ae7..75dd39f36 100644
--- a/compiler/locop/src/CanonicalNodeSummaryBuilder.cpp
+++ b/compiler/locop/src/CanonicalNodeSummaryBuilder.cpp
@@ -25,8 +25,6 @@
#include <pp/Format.h>
-#include <stdex/Memory.h>
-
#include <map>
#include <set>
diff --git a/compiler/locop/src/ExampleGraph.h b/compiler/locop/src/ExampleGraph.h
index 76813bcd8..84010f75b 100644
--- a/compiler/locop/src/ExampleGraph.h
+++ b/compiler/locop/src/ExampleGraph.h
@@ -19,7 +19,7 @@
#include <loco.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace
{
@@ -55,7 +55,7 @@ template <> std::unique_ptr<Bundle<PullPush>> make_bundle(void)
push->from(pull);
- auto res = stdex::make_unique<Bundle<PullPush>>();
+ auto res = std::make_unique<Bundle<PullPush>>();
res->g = std::move(g);
res->pull = pull;
diff --git a/compiler/locop/src/FormattedGraph.cpp b/compiler/locop/src/FormattedGraph.cpp
index bf4175768..94bfbd2f8 100644
--- a/compiler/locop/src/FormattedGraph.cpp
+++ b/compiler/locop/src/FormattedGraph.cpp
@@ -23,8 +23,7 @@
#include <pp/Format.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <map>
#include <set>
@@ -300,7 +299,7 @@ void FormattedGraphImpl<Formatter::LinearV1>::dump(std::ostream &os) const
else
{
// Use Built-in NodeSummaryBuilder otherwise
- node_summary_builder = stdex::make_unique<GenericNodeSummaryBuilder>(&symbols);
+ node_summary_builder = std::make_unique<GenericNodeSummaryBuilder>(&symbols);
}
// Print Graph Input(s)
diff --git a/compiler/locop/src/FormattedGraph.test.cpp b/compiler/locop/src/FormattedGraph.test.cpp
index aff9ebe5f..9f11a4e5d 100644
--- a/compiler/locop/src/FormattedGraph.test.cpp
+++ b/compiler/locop/src/FormattedGraph.test.cpp
@@ -17,7 +17,7 @@
#include "locop/FormattedGraph.h"
#include "ExampleGraph.h"
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
@@ -42,7 +42,7 @@ TEST(LinearV1FormatterTest, user_defined_node_summary_builder)
auto bundle = make_bundle<PullPush>();
auto g = bundle->graph();
{
- bundle->push->annot(stdex::make_unique<MyAnnotation>());
+ bundle->push->annot(std::make_unique<MyAnnotation>());
}
struct MyBuilder final : public locop::NodeSummaryBuilder
@@ -63,11 +63,11 @@ TEST(LinearV1FormatterTest, user_defined_node_summary_builder)
{
std::unique_ptr<locop::NodeSummaryBuilder> create(const locop::SymbolTable *) const final
{
- return stdex::make_unique<MyBuilder>();
+ return std::make_unique<MyBuilder>();
}
};
- std::cout << locop::fmt<locop::LinearV1>(g).with(stdex::make_unique<MyFactory>()) << std::endl;
+ std::cout << locop::fmt<locop::LinearV1>(g).with(std::make_unique<MyFactory>()) << std::endl;
// TODO Check whether MyBuilder actually sees all the nodes in a graph
SUCCEED();
@@ -134,11 +134,11 @@ TEST(LinearV1FormatterTest, node_summary_builder_composition)
{
std::unique_ptr<locop::NodeSummaryBuilder> create(const locop::SymbolTable *tbl) const final
{
- return stdex::make_unique<CompositeBuilder>(tbl);
+ return std::make_unique<CompositeBuilder>(tbl);
}
};
- std::cout << locop::fmt<locop::LinearV1>(g).with(stdex::make_unique<MyFactory>()) << std::endl;
+ std::cout << locop::fmt<locop::LinearV1>(g).with(std::make_unique<MyFactory>()) << std::endl;
// TODO Check whether MyBuilder actually sees all the nodes in a graph
SUCCEED();
diff --git a/compiler/locop/src/FormattedTensorShape.cpp b/compiler/locop/src/FormattedTensorShape.cpp
index b2b6ea074..bc6310313 100644
--- a/compiler/locop/src/FormattedTensorShape.cpp
+++ b/compiler/locop/src/FormattedTensorShape.cpp
@@ -25,7 +25,7 @@ std::ostream &operator<<(std::ostream &os, const loco::Dimension &d)
return os;
}
-} // namespace
+} // namespace loco
namespace locop
{
diff --git a/compiler/locop/src/FormattedTensorShape.test.cpp b/compiler/locop/src/FormattedTensorShape.test.cpp
index fc85df3a6..626b6cc23 100644
--- a/compiler/locop/src/FormattedTensorShape.test.cpp
+++ b/compiler/locop/src/FormattedTensorShape.test.cpp
@@ -16,7 +16,7 @@
#include "locop/FormattedTensorShape.h"
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
@@ -24,12 +24,26 @@ using namespace locop;
TEST(FormattedTensorShapeTest, BracketFormat)
{
- auto tensor_shape = stdex::make_unique<loco::TensorShape>();
+ auto tensor_shape = std::make_unique<loco::TensorShape>();
tensor_shape->rank(2);
tensor_shape->dim(0) = 4;
+ tensor_shape->dim(1) = 8;
std::cout << fmt<TensorShapeFormat::Bracket>(tensor_shape.get()) << std::endl;
SUCCEED();
}
+
+TEST(FormattedTensorShapeTest, PlainFormat)
+{
+ auto tensor_shape = std::make_unique<loco::TensorShape>();
+
+ tensor_shape->rank(2);
+ tensor_shape->dim(0) = 4;
+ tensor_shape->dim(1) = 8;
+
+ std::cout << fmt<TensorShapeFormat::Plain>(tensor_shape.get()) << std::endl;
+
+ SUCCEED();
+}
diff --git a/compiler/locop/src/GenericNodeSummaryBuilder.test.cpp b/compiler/locop/src/GenericNodeSummaryBuilder.test.cpp
index d688b5490..cfa82c2a2 100644
--- a/compiler/locop/src/GenericNodeSummaryBuilder.test.cpp
+++ b/compiler/locop/src/GenericNodeSummaryBuilder.test.cpp
@@ -17,8 +17,7 @@
#include "locop/GenericNodeSummaryBuilder.h"
#include "locop/FormattedGraph.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <stdexcept>
#include <gtest/gtest.h>
@@ -44,7 +43,7 @@ TEST(GenericNodeSummaryBuilderTest, simple)
{
std::unique_ptr<locop::NodeSummaryBuilder> create(const locop::SymbolTable *tbl) const final
{
- return stdex::make_unique<locop::GenericNodeSummaryBuilder>(tbl);
+ return std::make_unique<locop::GenericNodeSummaryBuilder>(tbl);
}
};
@@ -52,7 +51,7 @@ TEST(GenericNodeSummaryBuilderTest, simple)
g->nodes()->create<MockNode>();
- std::cout << locop::fmt<locop::LinearV1>(g).with(stdex::make_unique<MockFactory>()) << std::endl;
+ std::cout << locop::fmt<locop::LinearV1>(g).with(std::make_unique<MockFactory>()) << std::endl;
SUCCEED();
}
diff --git a/compiler/locop/src/NodeSummary.cpp b/compiler/locop/src/NodeSummary.cpp
index 3f8856997..20250a90f 100644
--- a/compiler/locop/src/NodeSummary.cpp
+++ b/compiler/locop/src/NodeSummary.cpp
@@ -16,8 +16,7 @@
#include "locop/NodeSummary.h"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace locop
@@ -36,6 +35,6 @@ const std::string &NodeDesc::opname(void) const
return *_name;
}
-void NodeDesc::opname(const std::string &v) { _name = stdex::make_unique<std::string>(v); }
+void NodeDesc::opname(const std::string &v) { _name = std::make_unique<std::string>(v); }
-} // namespace loco
+} // namespace locop
diff --git a/compiler/logo-core/src/Phase.test.cpp b/compiler/logo-core/src/Phase.test.cpp
new file mode 100644
index 000000000..2ee09101b
--- /dev/null
+++ b/compiler/logo-core/src/Phase.test.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <logo/Phase.h>
+
+#include <loco.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+struct Bumblebee final : public logo::Pass
+{
+ const char *name(void) const final { return "Bee"; }
+ bool run(loco::Graph *) final { return false; }
+};
+
+} // namespace
+
+TEST(LogoPhaseSaturateTests, simple)
+{
+ loco::Graph g;
+ logo::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{&g};
+ logo::Phase phase;
+
+ phase.emplace_back(std::make_unique<Bumblebee>());
+ phase_runner.run(phase);
+
+ SUCCEED();
+}
+
+TEST(LogoPhaseRestartTests, simple)
+{
+ loco::Graph g;
+ logo::PhaseRunner<logo::PhaseStrategy::Restart> phase_runner{&g};
+ logo::Phase phase;
+
+ phase.emplace_back(std::make_unique<Bumblebee>());
+ phase_runner.run(phase);
+
+ SUCCEED();
+}
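If the strategies follow the usual logo semantics, Saturate repeats the whole pass list until no pass reports a change, while Restart starts over from the first pass whenever any pass does. Since Bumblebee::run always returns false, both runners here finish after a single sweep.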
diff --git a/compiler/logo/CMakeLists.txt b/compiler/logo/CMakeLists.txt
index 399cb7586..a8efd9b03 100644
--- a/compiler/logo/CMakeLists.txt
+++ b/compiler/logo/CMakeLists.txt
@@ -9,7 +9,6 @@ target_include_directories(logo PUBLIC include)
target_link_libraries(logo PUBLIC loco)
target_link_libraries(logo PUBLIC logo_core)
target_link_libraries(logo PRIVATE locomotiv)
-target_link_libraries(logo PRIVATE stdex)
if(NOT ENABLE_TEST)
return()
@@ -20,4 +19,3 @@ nnas_find_package(GTest REQUIRED)
GTest_AddTest(logo_test ${TESTS})
target_include_directories(logo_test PRIVATE src)
target_link_libraries(logo_test logo)
-target_link_libraries(logo_test stdex)
diff --git a/compiler/logo/requires.cmake b/compiler/logo/requires.cmake
index 9a7d14788..c76183353 100644
--- a/compiler/logo/requires.cmake
+++ b/compiler/logo/requires.cmake
@@ -1,4 +1,3 @@
require("loco")
require("logo-core")
require("locomotiv")
-require("stdex")
diff --git a/compiler/logo/src/Passes/ConstantFoldingPass.cpp b/compiler/logo/src/Passes/ConstantFoldingPass.cpp
index e038e7140..2bd4759ca 100644
--- a/compiler/logo/src/Passes/ConstantFoldingPass.cpp
+++ b/compiler/logo/src/Passes/ConstantFoldingPass.cpp
@@ -19,8 +19,6 @@
#include <loco.h>
#include <loco/IR/CanonicalDialect.h>
-#include <stdex/Memory.h>
-
#include <locomotiv/Session.h>
#include <cassert>
@@ -52,19 +50,19 @@ uint64_t num_elements(const loco::NodeMixin<loco::NodeTrait::TensorShape> &shape
bool skip(const loco::Node *node)
{
static std::set<uint32_t> skip_op = {
- // TODO Current implementation works for 'Tensor' domain only. Support other domains such as
- // `Feature`, `Filter`, `Bias`, etc.
- static_cast<uint32_t>(loco::CanonicalOpcode::FilterEncode),
- static_cast<uint32_t>(loco::CanonicalOpcode::FeatureEncode),
- static_cast<uint32_t>(loco::CanonicalOpcode::BiasEncode),
- static_cast<uint32_t>(loco::CanonicalOpcode::DepthwiseFilterEncode),
-
- // We don't perform constant folding for Push
- static_cast<uint32_t>(loco::CanonicalOpcode::Push),
-
- // TensorBroadcast is a good hint for optimization
- // TODO Let this option be controlled by driver using logo
- static_cast<uint32_t>(loco::CanonicalOpcode::TensorBroadcast),
+ // TODO Current implementation works for 'Tensor' domain only. Support other domains such as
+ // `Feature`, `Filter`, `Bias`, etc.
+ static_cast<uint32_t>(loco::CanonicalOpcode::FilterEncode),
+ static_cast<uint32_t>(loco::CanonicalOpcode::FeatureEncode),
+ static_cast<uint32_t>(loco::CanonicalOpcode::BiasEncode),
+ static_cast<uint32_t>(loco::CanonicalOpcode::DepthwiseFilterEncode),
+
+ // We don't perform constant folding for Push
+ static_cast<uint32_t>(loco::CanonicalOpcode::Push),
+
+ // TensorBroadcast is a good hint for optimization
+ // TODO Let this option be controlled by driver using logo
+ static_cast<uint32_t>(loco::CanonicalOpcode::TensorBroadcast),
};
if (node->dialect() == loco::CanonicalDialect::get())
diff --git a/compiler/logo/src/Passes/ConstantFoldingPass.test.cpp b/compiler/logo/src/Passes/ConstantFoldingPass.test.cpp
index b9c4942c4..5d222eb00 100644
--- a/compiler/logo/src/Passes/ConstantFoldingPass.test.cpp
+++ b/compiler/logo/src/Passes/ConstantFoldingPass.test.cpp
@@ -24,6 +24,21 @@
using namespace logo::test;
+TEST(ConstantFoldingTest, name)
+{
+ logo::ConstantFoldingPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(ConstantFoldingTest, run_NEG)
+{
+ loco::Graph g;
+ logo::ConstantFoldingPass pass;
+
+ ASSERT_FALSE(pass.run(&g));
+}
+
namespace
{
diff --git a/compiler/logo/src/Passes/EmptyTestGraph.h b/compiler/logo/src/Passes/EmptyTestGraph.h
new file mode 100644
index 000000000..67f2c8a11
--- /dev/null
+++ b/compiler/logo/src/Passes/EmptyTestGraph.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LOGO_EMPTY_TEST_GRAPH_H__
+#define __LOGO_EMPTY_TEST_GRAPH_H__
+
+#include <loco.h>
+
+namespace logo
+{
+
+void create_empty_test_net(loco::Graph *graph);
+
+} // namespace logo
+
+#endif // __LOGO_EMPTY_TEST_GRAPH_H__
diff --git a/compiler/logo/src/Passes/EmptyTestGraph.test.cpp b/compiler/logo/src/Passes/EmptyTestGraph.test.cpp
new file mode 100644
index 000000000..46750b79c
--- /dev/null
+++ b/compiler/logo/src/Passes/EmptyTestGraph.test.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <loco.h>
+
+#include <gtest/gtest.h>
+
+namespace logo
+{
+
+void create_empty_test_net(loco::Graph *graph)
+{
+ assert(graph);
+
+ auto const_node = graph->nodes()->create<loco::ConstGen>();
+ {
+ const_node->dtype(loco::DataType::FLOAT32);
+ const_node->rank(1);
+ const_node->dim(0) = 1;
+ const_node->size<loco::DataType::FLOAT32>(1);
+ const_node->at<loco::DataType::FLOAT32>(0) = 1.0f;
+ }
+
+ auto push_node = graph->nodes()->create<loco::Push>();
+ {
+ push_node->from(const_node);
+ }
+
+ auto graph_output = graph->outputs()->create();
+ {
+ graph_output->name("output");
+ graph_output->dtype(loco::DataType::FLOAT32);
+ loco::link(graph_output, push_node);
+ }
+}
+
+} // namespace logo
diff --git a/compiler/logo/src/Passes/RemoveDeadNodePass.test.cpp b/compiler/logo/src/Passes/RemoveDeadNodePass.test.cpp
new file mode 100644
index 000000000..c0ecbdaa9
--- /dev/null
+++ b/compiler/logo/src/Passes/RemoveDeadNodePass.test.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <logo/RemoveDeadNodePass.h>
+
+#include "EmptyTestGraph.h"
+
+#include <gtest/gtest.h>
+
+TEST(RemoveDeadNodePassTest, name)
+{
+ logo::RemoveDeadNodePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(RemoveDeadNodePassTest, run_NEG)
+{
+ loco::Graph g;
+ logo::RemoveDeadNodePass pass;
+
+ logo::create_empty_test_net(&g);
+
+ ASSERT_FALSE(pass.run(&g));
+}
diff --git a/compiler/stdex/src/Set.test.cpp b/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.test.cpp
index 90361936f..f14bfc30d 100644
--- a/compiler/stdex/src/Set.test.cpp
+++ b/compiler/logo/src/Passes/RemoveDeadNodeWithQueryPass.test.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,24 +14,25 @@
* limitations under the License.
*/
-#include "stdex/Set.h"
+#include <logo/RemoveDeadNodeWithQueryPass.h>
+
+#include "EmptyTestGraph.h"
#include <gtest/gtest.h>
-TEST(SET, operator_eq)
+TEST(RemoveDeadNodeWithQueryPassTest, name)
{
- ASSERT_TRUE(std::set<int>({1, 2, 3}) == std::set<int>({1, 2, 3}));
- ASSERT_FALSE(std::set<int>({1, 3}) == std::set<int>({1, 2, 3}));
+ logo::RemoveDeadNodeWithQueryPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
}
-TEST(SET, operator_diff)
+TEST(RemoveDeadNodeWithQueryPassTest, run_NEG)
{
- const std::set<int> lhs{1, 2, 3};
- const std::set<int> rhs{2, 4};
+ loco::Graph g;
+ logo::RemoveDeadNodeWithQueryPass pass;
- auto res = lhs - rhs;
+ logo::create_empty_test_net(&g);
- ASSERT_EQ(res.size(), 2);
- ASSERT_NE(res.find(1), res.end());
- ASSERT_NE(res.find(3), res.end());
+ ASSERT_FALSE(pass.run(&g));
}
diff --git a/compiler/logo/src/Passes/RemoveForwardNodePass.test.cpp b/compiler/logo/src/Passes/RemoveForwardNodePass.test.cpp
new file mode 100644
index 000000000..bb905aec5
--- /dev/null
+++ b/compiler/logo/src/Passes/RemoveForwardNodePass.test.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <logo/RemoveForwardNodePass.h>
+
+#include "EmptyTestGraph.h"
+
+#include <gtest/gtest.h>
+
+TEST(RemoveForwardNodePassTest, name)
+{
+ logo::RemoveForwardNodePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(RemoveForwardNodePassTest, run_NEG)
+{
+ loco::Graph g;
+ logo::RemoveForwardNodePass pass;
+
+ logo::create_empty_test_net(&g);
+
+ ASSERT_FALSE(pass.run(&g));
+}
diff --git a/compiler/logo/src/Passes/ReorderDecodePass.test.cpp b/compiler/logo/src/Passes/ReorderDecodePass.test.cpp
new file mode 100644
index 000000000..f8e158d3a
--- /dev/null
+++ b/compiler/logo/src/Passes/ReorderDecodePass.test.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <logo/ReorderDecodePass.h>
+
+#include "EmptyTestGraph.h"
+
+#include <gtest/gtest.h>
+
+TEST(ReorderDecodePassTest, TensorBiasAdd_name)
+{
+ logo::ReorderDecodePass<loco::TensorBiasAdd> pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(ReorderDecodePassTest, ReLU_name)
+{
+ logo::ReorderDecodePass<loco::ReLU> pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(ReorderDecodePassTest, TensorBiasAdd_run_NEG)
+{
+ loco::Graph g;
+ logo::ReorderDecodePass<loco::TensorBiasAdd> pass;
+
+ logo::create_empty_test_net(&g);
+
+ ASSERT_FALSE(pass.run(&g));
+}
+
+TEST(ReorderDecodePassTest, ReLU_run_NEG)
+{
+ loco::Graph g;
+ logo::ReorderDecodePass<loco::ReLU> pass;
+
+ logo::create_empty_test_net(&g);
+
+ ASSERT_FALSE(pass.run(&g));
+}
diff --git a/compiler/logo/src/Passes/ResolveDuplicateReshapePass.test.cpp b/compiler/logo/src/Passes/ResolveDuplicateReshapePass.test.cpp
new file mode 100644
index 000000000..de2df6fd5
--- /dev/null
+++ b/compiler/logo/src/Passes/ResolveDuplicateReshapePass.test.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <logo/ResolveDuplicateReshapePass.h>
+
+#include "EmptyTestGraph.h"
+
+#include <gtest/gtest.h>
+
+TEST(ResolveDuplicateReshapePassTest, name)
+{
+ logo::ResolveDuplicateReshapePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(ResolveDuplicateReshapePassTest, run_NEG)
+{
+ loco::Graph g;
+ logo::ResolveDuplicateReshapePass pass;
+
+ logo::create_empty_test_net(&g);
+
+ ASSERT_FALSE(pass.run(&g));
+}
diff --git a/compiler/logo/src/Passes/ResolveRedundantReshapePass.test.cpp b/compiler/logo/src/Passes/ResolveRedundantReshapePass.test.cpp
new file mode 100644
index 000000000..9a7e95846
--- /dev/null
+++ b/compiler/logo/src/Passes/ResolveRedundantReshapePass.test.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <logo/ResolveRedundantReshapePass.h>
+
+#include "EmptyTestGraph.h"
+
+#include <gtest/gtest.h>
+
+TEST(ResolveRedundantReshapePassTest, name)
+{
+ logo::ResolveRedundantReshapePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(ResolveRedundantReshapePassTest, run_NEG)
+{
+ loco::Graph g;
+ logo::ResolveRedundantReshapePass pass;
+
+ logo::create_empty_test_net(&g);
+
+ ASSERT_FALSE(pass.run(&g));
+}
diff --git a/compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp b/compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp
index 0bda85b6f..500f08623 100644
--- a/compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp
+++ b/compiler/logo/src/Passes/SimplifyDomainConversionPass.cpp
@@ -20,8 +20,7 @@
#include <loco/IR/CanonicalDialect.h>
#include <loco/IR/CanonicalNode.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <set>
#include <vector>
#include <cassert>
@@ -231,8 +230,8 @@ bool SimplifyDomainConversionPass::run(loco::Graph *g)
perm_vec[to] = from;
}
- transposeCandidates.insert(stdex::make_unique<TransposeCtx>(
- encode_node, decode_node, encode_node->input(), perm_vec));
+ transposeCandidates.insert(
+ std::make_unique<TransposeCtx>(encode_node, decode_node, encode_node->input(), perm_vec));
}
}
@@ -293,8 +292,8 @@ bool SimplifyDomainConversionPass::run(loco::Graph *g)
perm_vec[to] = from;
}
- transposeCandidates.insert(stdex::make_unique<TransposeCtx>(
- encode_node, decode_node, encode_node->input(), perm_vec));
+ transposeCandidates.insert(
+ std::make_unique<TransposeCtx>(encode_node, decode_node, encode_node->input(), perm_vec));
}
}
@@ -377,8 +376,8 @@ bool SimplifyDomainConversionPass::run(loco::Graph *g)
perm_vec[to] = from;
}
- transposeCandidates.insert(stdex::make_unique<TransposeCtx>(
- encode_node, decode_node, encode_node->input(), perm_vec));
+ transposeCandidates.insert(
+ std::make_unique<TransposeCtx>(encode_node, decode_node, encode_node->input(), perm_vec));
}
}
@@ -397,7 +396,7 @@ bool SimplifyDomainConversionPass::run(loco::Graph *g)
TransposeCtx(loco::Node *first, loco::Node *last, loco::Node *input,
std::vector<loco::TensorAxis> perm)
- : first_node(first), last_node(last), input_node(input), perm_vec(perm)
+ : first_node(first), last_node(last), input_node(input), perm_vec(perm)
{ /* empty */
}
};
diff --git a/compiler/logo/src/Passes/SimplifyDomainConversionPass.test.cpp b/compiler/logo/src/Passes/SimplifyDomainConversionPass.test.cpp
index 9a05763b4..75a288089 100644
--- a/compiler/logo/src/Passes/SimplifyDomainConversionPass.test.cpp
+++ b/compiler/logo/src/Passes/SimplifyDomainConversionPass.test.cpp
@@ -19,10 +19,26 @@
#include "TestHelper.h"
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
#include <gtest/gtest.h>
+TEST(SimplifyDomainConversionPassTest, name)
+{
+ logo::SimplifyDomainConversionPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(SimplifyDomainConversionPassTest, run_NEG)
+{
+ loco::Graph g;
+ logo::SimplifyDomainConversionPass pass;
+
+ ASSERT_FALSE(pass.run(&g));
+}
+
namespace
{
@@ -65,7 +81,7 @@ template <FilterLayout T> loco::FilterDecode *make_filter_decode(loco::Node *inp
{
loco::Graph *g = input_for_decode->graph();
- auto decoder = stdex::make_unique<loco::PermutingDecoder<loco::Domain::Filter>>();
+ auto decoder = std::make_unique<loco::PermutingDecoder<loco::Domain::Filter>>();
decoder->perm(perm<T>());
@@ -80,7 +96,7 @@ template <FilterLayout T> loco::FilterEncode *make_filter_encode(loco::Node *inp
{
loco::Graph *g = input_for_encode->graph();
- auto encoder = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
+ auto encoder = std::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
encoder->perm(perm<T>());
diff --git a/compiler/luci-eval-driver/CMakeLists.txt b/compiler/luci-eval-driver/CMakeLists.txt
new file mode 100644
index 000000000..990f9d1a9
--- /dev/null
+++ b/compiler/luci-eval-driver/CMakeLists.txt
@@ -0,0 +1,12 @@
+set(SRCS_EVAL_TESTER
+ src/EvalDriver.cpp
+ )
+
+add_executable(luci_eval_driver ${SRCS_EVAL_TESTER})
+target_link_libraries(luci_eval_driver PRIVATE oops)
+target_link_libraries(luci_eval_driver PRIVATE loco)
+target_link_libraries(luci_eval_driver PRIVATE luci_import)
+target_link_libraries(luci_eval_driver PRIVATE luci_export)
+target_link_libraries(luci_eval_driver PRIVATE luci_lang)
+target_link_libraries(luci_eval_driver PRIVATE luci_interpreter)
+target_link_libraries(luci_eval_driver PRIVATE safemain)
diff --git a/compiler/luci-eval-driver/requires.cmake b/compiler/luci-eval-driver/requires.cmake
new file mode 100644
index 000000000..2904d9d3c
--- /dev/null
+++ b/compiler/luci-eval-driver/requires.cmake
@@ -0,0 +1,5 @@
+require("oops")
+require("loco")
+require("luci")
+require("luci-interpreter")
+require("safemain")
diff --git a/compiler/luci-value-test/tester/src/EvalTester.cpp b/compiler/luci-eval-driver/src/EvalDriver.cpp
index b49602e5e..4762cffe7 100644
--- a/compiler/luci-value-test/tester/src/EvalTester.cpp
+++ b/compiler/luci-eval-driver/src/EvalDriver.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,11 +21,8 @@
#include <cstdlib>
#include <fstream>
-#include <iostream>
#include <vector>
-#include <map>
#include <string>
-#include <random>
namespace
{
@@ -73,7 +70,7 @@ template <typename NodeT> size_t getTensorSize(const NodeT *node)
} // namespace
/*
- * @brief EvalTester main
+ * @brief EvalDriver main
*
 * Driver for testing luci-interpreter
*
@@ -83,8 +80,8 @@ int entry(int argc, char **argv)
if (argc != 5)
{
std::cerr
- << "Usage: " << argv[0]
- << " <path/to/circle/model> <num_inputs> <path/to/input/prefix> <path/to/output/file>\n";
+ << "Usage: " << argv[0]
+ << " <path/to/circle/model> <num_inputs> <path/to/input/prefix> <path/to/output/file>\n";
return EXIT_FAILURE;
}
@@ -92,32 +89,12 @@ int entry(int argc, char **argv)
const int32_t num_inputs = atoi(argv[2]);
const char *input_prefix = argv[3];
const char *output_file = argv[4];
- const std::string intermediate_filename = std::string(filename) + ".inter.circle";
// Load model from the file
- std::unique_ptr<luci::Module> initial_module = importModel(filename);
- if (initial_module == nullptr)
- {
- std::cerr << "ERROR: Failed to load '" << filename << "'" << std::endl;
- return EXIT_FAILURE;
- }
-
- // Export to a Circle file
- luci::CircleExporter exporter;
-
- luci::CircleFileExpContract contract(initial_module.get(), intermediate_filename);
-
- if (!exporter.invoke(&contract))
- {
- std::cerr << "ERROR: Failed to export '" << intermediate_filename << "'" << std::endl;
- return EXIT_FAILURE;
- }
-
- // Import model again
- std::unique_ptr<luci::Module> module = importModel(intermediate_filename);
+ std::unique_ptr<luci::Module> module = importModel(filename);
if (module == nullptr)
{
- std::cerr << "ERROR: Failed to load '" << intermediate_filename << "'" << std::endl;
+ std::cerr << "ERROR: Failed to load '" << filename << "'" << std::endl;
return EXIT_FAILURE;
}
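Behaviorally this is more than a rename: the driver previously exported the imported module to an intermediate "<model>.inter.circle" file and re-imported it before evaluation, exercising the exporter as a side effect. After this change it evaluates the module imported directly from the input file, so the export round-trip is no longer part of the evaluation path.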
diff --git a/compiler/luci-interpreter/src/Interpreter.cpp b/compiler/luci-interpreter/src/Interpreter.cpp
index 639ffc1f0..b57b691d0 100644
--- a/compiler/luci-interpreter/src/Interpreter.cpp
+++ b/compiler/luci-interpreter/src/Interpreter.cpp
@@ -31,7 +31,7 @@ class EventNotifierImpl final : public EventNotifier
public:
EventNotifierImpl(const RuntimeToIR &runtime_to_ir,
const std::vector<ExecutionObserver *> &observers)
- : _runtime_to_ir(runtime_to_ir), _observers(observers)
+ : _runtime_to_ir(runtime_to_ir), _observers(observers)
{
}
diff --git a/compiler/luci-interpreter/src/core/Kernel.h b/compiler/luci-interpreter/src/core/Kernel.h
index 5f5efb219..5cdb2e360 100644
--- a/compiler/luci-interpreter/src/core/Kernel.h
+++ b/compiler/luci-interpreter/src/core/Kernel.h
@@ -29,7 +29,7 @@ class Kernel
{
protected:
Kernel(std::vector<const Tensor *> inputs, std::vector<Tensor *> outputs)
- : _inputs(std::move(inputs)), _outputs(std::move(outputs))
+ : _inputs(std::move(inputs)), _outputs(std::move(outputs))
{
}
@@ -59,7 +59,7 @@ template <typename Params> class KernelWithParams : public Kernel
protected:
KernelWithParams(std::vector<const Tensor *> inputs, std::vector<Tensor *> outputs,
const Params &params)
- : Kernel(std::move(inputs), std::move(outputs)), _params(params)
+ : Kernel(std::move(inputs), std::move(outputs)), _params(params)
{
}
diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h
index b74be797b..dab6ba25f 100644
--- a/compiler/luci-interpreter/src/core/KernelParams.h
+++ b/compiler/luci-interpreter/src/core/KernelParams.h
@@ -44,6 +44,7 @@ struct ArgMaxParams
struct ConcatenationParams
{
int axis;
+ Activation activation;
};
struct Conv2DParams
@@ -111,6 +112,12 @@ struct MulParams
Activation activation;
};
+struct PackParams
+{
+ int32_t values_count;
+ int32_t axis;
+};
+
struct Pool2DParams
{
Padding padding;
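PackParams joins the existing parameter structs here. Judging from the KernelWithParams pattern shown earlier in this commit, a Pack kernel would consume it roughly as below; the class name, method set, and constructor shape are assumptions, not the actual luci-interpreter Pack kernel.

    // Hypothetical skeleton only: shows how a kernel receives its params
    // struct via KernelWithParams (see Kernel.h above). Tensor and
    // KernelWithParams are the types from this commit, not redefined here.
    class PackSketch : public KernelWithParams<PackParams>
    {
    public:
      PackSketch(std::vector<const Tensor *> inputs, Tensor *output, const PackParams &params)
        : KernelWithParams<PackParams>(std::move(inputs), {output}, params)
      {
      }

      void configure() { /* validate values_count inputs against axis */ }
      void execute() const { /* stack inputs along the new axis */ }
    };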
diff --git a/compiler/luci-interpreter/src/core/RuntimeGraph.cpp b/compiler/luci-interpreter/src/core/RuntimeGraph.cpp
index 57f6fed44..fb0ad304b 100644
--- a/compiler/luci-interpreter/src/core/RuntimeGraph.cpp
+++ b/compiler/luci-interpreter/src/core/RuntimeGraph.cpp
@@ -94,7 +94,7 @@ void RuntimeGraph::TensorAllocPlan::deallocate(size_t kernel_index) const
}
RuntimeGraph::RuntimeGraph(RuntimeModule *owning_module)
- : _owning_module(owning_module), _tensor_alloc_plan(std::make_unique<TensorAllocPlan>())
+ : _owning_module(owning_module), _tensor_alloc_plan(std::make_unique<TensorAllocPlan>())
{
}
diff --git a/compiler/luci-interpreter/src/core/Tensor.cpp b/compiler/luci-interpreter/src/core/Tensor.cpp
index 6e0424ffa..a9e7be0a9 100644
--- a/compiler/luci-interpreter/src/core/Tensor.cpp
+++ b/compiler/luci-interpreter/src/core/Tensor.cpp
@@ -24,8 +24,8 @@ namespace luci_interpreter
Tensor::Tensor(DataType element_type, Shape shape, AffineQuantization quantization,
std::string name)
- : _element_type(element_type), _shape(std::move(shape)), _quantization(std::move(quantization)),
- _name(std::move(name)), _data_allocated(false)
+ : _element_type(element_type), _shape(std::move(shape)), _quantization(std::move(quantization)),
+ _name(std::move(name)), _data_allocated(false)
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Add.cpp b/compiler/luci-interpreter/src/kernels/Add.cpp
index 8d119d516..7381c3849 100644
--- a/compiler/luci-interpreter/src/kernels/Add.cpp
+++ b/compiler/luci-interpreter/src/kernels/Add.cpp
@@ -31,7 +31,7 @@ namespace kernels
{
Add::Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddParams &params)
- : KernelWithParams<AddParams>({input1, input2}, {output}, params)
+ : KernelWithParams<AddParams>({input1, input2}, {output}, params)
{
}
@@ -76,13 +76,13 @@ void Add::evalFloat() const
params.float_activation_max = activation_max;
const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), &params);
+ getTensorShape(input1()), getTensorShape(input2()), &params);
if (need_broadcast)
{
tflite::reference_ops::BroadcastAdd4DSlow(
- params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
- getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
+ params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
+ getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
}
else
{
@@ -130,14 +130,13 @@ void Add::evalQuantized() const
params.quantized_activation_max = activation_max;
const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), &params);
+ getTensorShape(input1()), getTensorShape(input2()), &params);
if (need_broadcast)
{
tflite::reference_ops::BroadcastAdd4DSlow(
- params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
- getTensorShape(input2()), getTensorData<uint8_t>(input2()), getTensorShape(output()),
- getTensorData<uint8_t>(output()));
+ params, getTensorShape(input1()), getTensorData<uint8_t>(input1()), getTensorShape(input2()),
+ getTensorData<uint8_t>(input2()), getTensorShape(output()), getTensorData<uint8_t>(output()));
}
else
{
@@ -176,12 +175,12 @@ void Add::evalQuantizedS16() const
const int32_t shifted_input1_val = static_cast<int32_t>(input1_val) << left_shift;
const int32_t shifted_input2_val = static_cast<int32_t>(input2_val) << left_shift;
const int32_t scaled_input1_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input1_val, input1_multiplier, input1_shift);
+ shifted_input1_val, input1_multiplier, input1_shift);
const int32_t scaled_input2_val = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
- shifted_input2_val, input2_multiplier, input2_shift);
+ shifted_input2_val, input2_multiplier, input2_shift);
const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
const int32_t raw_output = tflite::MultiplyByQuantizedMultiplierSmallerThanOneExp(
- raw_sum, output_multiplier, output_shift);
+ raw_sum, output_multiplier, output_shift);
const int32_t clamped_output = std::min(activation_max, std::max(activation_min, raw_output));
return static_cast<int16_t>(clamped_output);
};
diff --git a/compiler/luci-interpreter/src/kernels/Add.test.cpp b/compiler/luci-interpreter/src/kernels/Add.test.cpp
index de8a3bbb0..5ad9beb30 100644
--- a/compiler/luci-interpreter/src/kernels/Add.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Add.test.cpp
@@ -40,29 +40,29 @@ TEST(AddTest, Uint8)
std::initializer_list<float> base_data = {-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f,
1.2f, 2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
std::initializer_list<int32_t> test_shapes[] = {
- {1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
+ {1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
std::initializer_list<float> test_data = {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
std::initializer_list<int32_t> output_shapes[] = {
- {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+ {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
std::vector<std::vector<float>> output_data = {
- {-0.1f, 2.6f, -0.7f, 2.8f, 0.7f, 3.0f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
- 1.0f, -0.8f, 0.4f, -0.6f, 1.8f, -0.2f, 1.4f, 3.0f, 0.8f, 3.0f, 2.2f, 3.0f,
- -1.4f, 0.3f, -2.0f, 0.5f, -0.6f, 0.9f, 0.9f, -1.9f, 0.3f, -1.7f, 1.7f, -1.3f},
- {-0.1f, 2.6f, 0.5f, 1.0f, 1.8f, -0.2f, 1.4f, 3.0f, -2.0f, 0.5f, 1.7f, -1.3f},
- {-0.1f, 2.5f, 0.0f, 2.6f, -0.7f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
- 1.0f, -0.9f, 1.1f, -0.8f, 0.4f, -1.5f, 1.7f, 3.0f, 2.2f, 3.0f, 2.1f, 3.0f,
- -1.1f, 0.5f, -0.6f, 1.0f, -0.7f, 0.9f, 1.2f, -1.7f, 1.7f, -1.2f, 1.6f, -1.3f},
- {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f, -1.3f}};
+ {-0.1f, 2.6f, -0.7f, 2.8f, 0.7f, 3.0f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
+ 1.0f, -0.8f, 0.4f, -0.6f, 1.8f, -0.2f, 1.4f, 3.0f, 0.8f, 3.0f, 2.2f, 3.0f,
+ -1.4f, 0.3f, -2.0f, 0.5f, -0.6f, 0.9f, 0.9f, -1.9f, 0.3f, -1.7f, 1.7f, -1.3f},
+ {-0.1f, 2.6f, 0.5f, 1.0f, 1.8f, -0.2f, 1.4f, 3.0f, -2.0f, 0.5f, 1.7f, -1.3f},
+ {-0.1f, 2.5f, 0.0f, 2.6f, -0.7f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
+ 1.0f, -0.9f, 1.1f, -0.8f, 0.4f, -1.5f, 1.7f, 3.0f, 2.2f, 3.0f, 2.1f, 3.0f,
+ -1.1f, 0.5f, -0.6f, 1.0f, -0.7f, 0.9f, 1.2f, -1.7f, 1.7f, -1.2f, 1.6f, -1.3f},
+ {-0.1f, 2.5f, 1.2f, 0.8f, 0.4f, -1.5f, 1.7f, 3.0f, -0.6f, 1.0f, 1.6f, -1.3f}};
float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-3.f, 3.f);
for (int i = 0; i < output_data.size(); i++)
{
Tensor input1_tensor =
- makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+ makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
Tensor input2_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
quant_param.second, test_data);
Tensor output_tensor =
- makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+ makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
AddParams params{};
params.activation = Activation::NONE;
@@ -81,9 +81,9 @@ TEST(AddTest, Uint8)
Tensor input1_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
quant_param.second, test_data);
Tensor input2_tensor =
- makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+ makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
Tensor output_tensor =
- makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+ makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
AddParams params{};
params.activation = Activation::NONE;
@@ -103,14 +103,14 @@ TEST(AddTest, Float)
Shape base_shape = {2, 3, 1, 2};
std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
std::vector<std::vector<float>> test_outputs = {
- {0.0f, 2.6f, 0.0f, 2.8f, 0.7f, 3.2f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
- 1.0f, 0.0f, 0.4f, 0.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.8f, 3.3f, 2.2f, 3.7f,
- 0.0f, 0.3f, 0.0f, 0.5f, 0.0f, 0.9f, 0.9f, 0.0f, 0.3f, 0.0f, 1.7f, 0.0f},
- {0.0f, 2.6f, 0.5f, 1.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.0f, 0.5f, 1.7f, 0.0f},
- {0.0f, 2.5f, 0.0f, 2.6f, 0.0f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
- 1.0f, 0.0f, 1.1f, 0.0f, 0.4f, 0.0f, 1.7f, 3.3f, 2.2f, 3.8f, 2.1f, 3.7f,
- 0.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.9f, 1.2f, 0.0f, 1.7f, 0.0f, 1.6f, 0.0f},
- {0.0f, 2.5f, 1.2f, 0.8f, 0.4f, 0.0f, 1.7f, 3.3f, 0.0f, 1.0f, 1.6f, 0.0f}};
+ {0.0f, 2.6f, 0.0f, 2.8f, 0.7f, 3.2f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
+ 1.0f, 0.0f, 0.4f, 0.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.8f, 3.3f, 2.2f, 3.7f,
+ 0.0f, 0.3f, 0.0f, 0.5f, 0.0f, 0.9f, 0.9f, 0.0f, 0.3f, 0.0f, 1.7f, 0.0f},
+ {0.0f, 2.6f, 0.5f, 1.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.0f, 0.5f, 1.7f, 0.0f},
+ {0.0f, 2.5f, 0.0f, 2.6f, 0.0f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
+ 1.0f, 0.0f, 1.1f, 0.0f, 0.4f, 0.0f, 1.7f, 3.3f, 2.2f, 3.8f, 2.1f, 3.7f,
+ 0.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.9f, 1.2f, 0.0f, 1.7f, 0.0f, 1.6f, 0.0f},
+ {0.0f, 2.5f, 1.2f, 0.8f, 0.4f, 0.0f, 1.7f, 3.3f, 0.0f, 1.0f, 1.6f, 0.0f}};
std::vector<float> input1_data{-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f,
1.2f, 2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
@@ -128,7 +128,7 @@ TEST(AddTest, Float)
kernel.execute();
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
- << "With shape number " << i;
+ << "With shape number " << i;
}
// Re-run with exchanged inputs.
for (size_t i = 0; i < test_shapes.size(); ++i)
@@ -145,7 +145,7 @@ TEST(AddTest, Float)
kernel.execute();
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
- << "With shape number " << i;
+ << "With shape number " << i;
}
}
@@ -154,26 +154,26 @@ TEST(AddTest, SInt16)
Shape base_shape = {2, 3, 1, 2};
std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
std::vector<std::vector<int32_t>> ref_output_shapes{
- {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+ {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
std::vector<float> input1_data{-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f,
1.2f, 2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
std::vector<std::vector<float>> ref_outputs = {
- {0.0f, 2.6f, 0.0f, 2.8f, 0.7f, 3.2f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
- 1.0f, 0.0f, 0.4f, 0.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.8f, 3.3f, 2.2f, 3.7f,
- 0.0f, 0.3f, 0.0f, 0.5f, 0.0f, 0.9f, 0.9f, 0.0f, 0.3f, 0.0f, 1.7f, 0.0f},
- {0.0f, 2.6f, 0.5f, 1.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.0f, 0.5f, 1.7f, 0.0f},
- {0.0f, 2.5f, 0.0f, 2.6f, 0.0f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
- 1.0f, 0.0f, 1.1f, 0.0f, 0.4f, 0.0f, 1.7f, 3.3f, 2.2f, 3.8f, 2.1f, 3.7f,
- 0.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.9f, 1.2f, 0.0f, 1.7f, 0.0f, 1.6f, 0.0f},
- {0.0f, 2.5f, 1.2f, 0.8f, 0.4f, 0.0f, 1.7f, 3.3f, 0.0f, 1.0f, 1.6f, 0.0f}};
+ {0.0f, 2.6f, 0.0f, 2.8f, 0.7f, 3.2f, 1.1f, 0.8f, 0.5f, 1.0f, 1.9f, 1.4f,
+ 1.0f, 0.0f, 0.4f, 0.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.8f, 3.3f, 2.2f, 3.7f,
+ 0.0f, 0.3f, 0.0f, 0.5f, 0.0f, 0.9f, 0.9f, 0.0f, 0.3f, 0.0f, 1.7f, 0.0f},
+ {0.0f, 2.6f, 0.5f, 1.0f, 1.8f, 0.0f, 1.4f, 3.1f, 0.0f, 0.5f, 1.7f, 0.0f},
+ {0.0f, 2.5f, 0.0f, 2.6f, 0.0f, 1.9f, 1.1f, 0.7f, 1.2f, 0.8f, 0.5f, 0.1f,
+ 1.0f, 0.0f, 1.1f, 0.0f, 0.4f, 0.0f, 1.7f, 3.3f, 2.2f, 3.8f, 2.1f, 3.7f,
+ 0.0f, 0.5f, 0.0f, 1.0f, 0.0f, 0.9f, 1.2f, 0.0f, 1.7f, 0.0f, 1.6f, 0.0f},
+ {0.0f, 2.5f, 1.2f, 0.8f, 0.4f, 0.0f, 1.7f, 3.3f, 0.0f, 1.0f, 1.6f, 0.0f}};
for (size_t i = 0; i < test_shapes.size(); ++i)
{
Tensor input1_tensor = makeInputTensor<DataType::S16>(base_shape, 3.0 / 32767, 0, input1_data);
Tensor input2_tensor =
- makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0, input2_data);
+ makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0, input2_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, 4.0 / 32767, 0);
const float tolerance = output_tensor.scale();
@@ -186,15 +186,15 @@ TEST(AddTest, SInt16)
EXPECT_THAT(extractTensorShape(output_tensor),
::testing::ElementsAreArray(ref_output_shapes[i]))
- << "With shape number " << i;
+ << "With shape number " << i;
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_outputs[i], tolerance))
- << "With shape number " << i;
+ << "With shape number " << i;
}
// Re-run with exchanged inputs and different scales.
for (size_t i = 0; i < test_shapes.size(); ++i)
{
Tensor input1_tensor =
- makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0, input2_data);
+ makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0, input2_data);
Tensor input2_tensor = makeInputTensor<DataType::S16>(base_shape, 4.0 / 32767, 0, input1_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, 5.0 / 32767, 0);
const float tolerance = output_tensor.scale();
@@ -208,9 +208,9 @@ TEST(AddTest, SInt16)
EXPECT_THAT(extractTensorShape(output_tensor),
::testing::ElementsAreArray(ref_output_shapes[i]))
- << "With shape number " << i;
+ << "With shape number " << i;
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_outputs[i], tolerance))
- << "With shape number " << i;
+ << "With shape number " << i;
}
}
diff --git a/compiler/luci-interpreter/src/kernels/ArgMax.cpp b/compiler/luci-interpreter/src/kernels/ArgMax.cpp
index 5c464ed09..2437d5762 100644
--- a/compiler/luci-interpreter/src/kernels/ArgMax.cpp
+++ b/compiler/luci-interpreter/src/kernels/ArgMax.cpp
@@ -24,7 +24,7 @@ namespace kernels
{
ArgMax::ArgMax(const Tensor *input, const Tensor *axis, Tensor *output, const ArgMaxParams &params)
- : KernelWithParams<ArgMaxParams>({input, axis}, {output}, params)
+ : KernelWithParams<ArgMaxParams>({input, axis}, {output}, params)
{
}
@@ -60,11 +60,10 @@ void ArgMax::configure()
void ArgMax::execute() const
{
-#define TF_LITE_ARG_MAX(data_type, axis_type, output_type) \
- tflite::optimized_ops::ArgMinMax(getTensorShape(input()), getTensorData<data_type>(input()), \
- getTensorData<axis_type>(axis()), getTensorShape(output()), \
- getTensorData<output_type>(output()), \
- std::greater<data_type>())
+#define TF_LITE_ARG_MAX(data_type, axis_type, output_type) \
+ tflite::optimized_ops::ArgMinMax( \
+ getTensorShape(input()), getTensorData<data_type>(input()), getTensorData<axis_type>(axis()), \
+ getTensorShape(output()), getTensorData<output_type>(output()), std::greater<data_type>())
if (axis()->element_type() == DataType::S32)
{
switch (_params.output_type)
diff --git a/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp b/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
index c6734a114..3362edbf6 100644
--- a/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/ArgMax.test.cpp
@@ -60,14 +60,14 @@ TYPED_TEST(ArgMaxTest, Simple)
/*output_shape=*/{1, 1, 1},
/*input_data=*/
{
- 1, 9, 7, 3,
+ 1, 9, 7, 3, //
},
/*dimension_data=*/{3}, /*output_data=*/{1});
Check<TypeParam, int64_t>(/*input_shape=*/{1, 1, 1, 4}, /*dimension_shape=*/{},
/*output_shape=*/{1, 1, 1},
/*input_data=*/
{
- 1, 9, 7, 3,
+ 1, 9, 7, 3, //
},
/*dimension_data=*/{3}, /*output_data=*/{1});
}
@@ -78,14 +78,16 @@ TYPED_TEST(ArgMaxTest, MultiDimensions)
/*output_shape=*/{1, 1, 2},
/*input_data=*/
{
- 1, 2, 7, 8, 1, 9, 7, 3,
+ 1, 2, 7, 8, //
+ 1, 9, 7, 3, //
},
/*dimension_data=*/{3}, /*output_data=*/{3, 1});
Check<TypeParam, int64_t>(/*input_shape=*/{1, 1, 2, 4}, /*dimension_shape=*/{},
/*output_shape=*/{1, 1, 2},
/*input_data=*/
{
- 1, 2, 7, 8, 1, 9, 7, 3,
+ 1, 2, 7, 8, //
+ 1, 9, 7, 3, //
},
/*dimension_data=*/{3}, /*output_data=*/{3, 1});
}
@@ -93,7 +95,8 @@ TYPED_TEST(ArgMaxTest, MultiDimensions)
TEST(ArgMaxTest, UnsupportedType_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({1, 1, 2, 4}, {
- 1, 2, 7, 8, 1, 9, 7, 3,
+ 1, 2, 7, 8, //
+ 1, 9, 7, 3, //
});
Tensor dimension_tensor = makeInputTensor<DataType::S32>({}, {3});
Tensor output_tensor = makeOutputTensor(DataType::U8);
diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp b/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
index df54f9786..65ea4c09e 100644
--- a/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.cpp
@@ -30,7 +30,7 @@ namespace kernels
{
AveragePool2D::AveragePool2D(const Tensor *input, Tensor *output, const Pool2DParams &params)
- : KernelWithParams<Pool2DParams>({input}, {output}, params)
+ : KernelWithParams<Pool2DParams>({input}, {output}, params)
{
}
@@ -51,15 +51,15 @@ void AveragePool2D::configure()
const int32_t input_width = input_shape.dim(2);
const int32_t depth = input_shape.dim(3);
- const int32_t output_height = computeOutputSize(_params.padding, input_height,
- _params.filter_height, _params.stride_height);
+ const int32_t output_height =
+ computeOutputSize(_params.padding, input_height, _params.filter_height, _params.stride_height);
const int32_t output_width =
- computeOutputSize(_params.padding, input_width, _params.filter_width, _params.stride_width);
+ computeOutputSize(_params.padding, input_width, _params.filter_width, _params.stride_width);
_padding_height =
- computePadding(_params.stride_height, 1, input_height, _params.filter_height, output_height);
+ computePadding(_params.stride_height, 1, input_height, _params.filter_height, output_height);
_padding_width =
- computePadding(_params.stride_width, 1, input_width, _params.filter_width, output_width);
+ computePadding(_params.stride_width, 1, input_width, _params.filter_width, output_width);
if (input()->element_type() == DataType::U8)
{
LUCI_INTERPRETER_CHECK(std::abs(output()->scale() - input()->scale()) <= 1.0e-6);
@@ -149,8 +149,8 @@ void AveragePool2D::evalSInt16() const
params.quantized_activation_max = activation_max;
tflite::reference_integer_ops::AveragePool(
- params, getTensorShape(input()), getTensorData<int16_t>(input()), //
- getTensorShape(output()), getTensorData<int16_t>(output()));
+ params, getTensorShape(input()), getTensorData<int16_t>(input()), //
+ getTensorShape(output()), getTensorData<int16_t>(output()));
}
} // namespace kernels
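
Note: the reflowed configure() math above follows the usual TFLite SAME/VALID conventions. A self-contained sketch, assuming those formulas (the free functions here are illustrative stand-ins, not the kernel's Utils.h signatures):

#include <algorithm>
#include <iostream>

enum class Padding { SAME, VALID };

int computeOutputSize(Padding padding, int image, int filter, int stride)
{
  return padding == Padding::SAME ? (image + stride - 1) / stride
                                  : (image - filter + stride) / stride;
}

int computePadding(int stride, int dilation, int in_size, int filter, int out_size)
{
  const int effective_filter = (filter - 1) * dilation + 1;
  return std::max(0, ((out_size - 1) * stride + effective_filter - in_size) / 2);
}

int main()
{
  // As in the Float test below: 3x5 input, 2x3 filter, stride 1x2, VALID -> 2x2.
  std::cout << computeOutputSize(Padding::VALID, 3, 2, 1) << 'x'
            << computeOutputSize(Padding::VALID, 5, 3, 2) << '\n';
  std::cout << computePadding(1, 1, 3, 2, 2) << ',' << computePadding(2, 1, 5, 3, 2) << '\n';
}
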
diff --git a/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp b/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
index 83e48c89d..4d7dab86a 100644
--- a/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/AveragePool2D.test.cpp
@@ -30,9 +30,9 @@ TEST(AveragePool2DTest, Float)
{
Shape input_shape{1, 3, 5, 1};
std::vector<float> input_data{
- -4, -3, -2, -1, 0, //
- 1, 2, 3, 4, 5, //
- 6, 7, 8, 9, 10, //
+ -4, -3, -2, -1, 0, //
+ 1, 2, 3, 4, 5, //
+ 6, 7, 8, 9, 10, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -50,8 +50,8 @@ TEST(AveragePool2DTest, Float)
kernel.execute();
std::vector<float> ref_output_data{
- 0, 1.5, //
- 4.5, 6, //
+ 0, 1.5, //
+ 4.5, 6, //
};
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 2, 1}));
@@ -60,12 +60,12 @@ TEST(AveragePool2DTest, Float)
TEST(AveragePool2DTest, Uint8_0)
{
std::vector<float> input_data{
- 0, -6, 12, 4, //
- -3, -2, 10, 7, //
+ 0, -6, 12, 4, //
+ -3, -2, 10, 7, //
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
- quant_param.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
Pool2DParams params{};
@@ -87,13 +87,13 @@ TEST(AveragePool2DTest, Uint8_0)
TEST(AveragePool2DTest, Uint8_1)
{
std::vector<float> input_data{
- 0, 6, 12, 4, //
- 3, 2, 10, 7, //
+ 0, 6, 12, 4, //
+ 3, 2, 10, 7, //
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
- quant_param.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
Pool2DParams params{};
@@ -117,13 +117,13 @@ TEST(AveragePool2DTest, SInt16)
Shape input_shape{1, 3, 5, 1};
std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
std::vector<float> input_data{
- -4, -3, -2, -1, 0, //
- 1, 2, 3, 4, 5, //
- 6, 7, 8, 9, 10, //
+ -4, -3, -2, -1, 0, //
+ 1, 2, 3, 4, 5, //
+ 6, 7, 8, 9, 10, //
};
std::vector<float> ref_output_data{
- 0, 1.5, //
- 4.5, 6, //
+ 0, 1.5, //
+ 4.5, 6, //
};
Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.5, 0, input_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
@@ -148,9 +148,9 @@ TEST(AveragePool2DTest, Invalid_Input_Shape_NEG)
{
Shape input_shape{1, 3, 5};
std::vector<float> input_data{
- -4, -3, -2, -1, 0, //
- 1, 2, 3, 4, 5, //
- 6, 7, 8, 9, 10, //
+ -4, -3, -2, -1, 0, //
+ 1, 2, 3, 4, 5, //
+ 6, 7, 8, 9, 10, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -171,9 +171,9 @@ TEST(AveragePool2DTest, In_Out_Type_NEG)
{
Shape input_shape{1, 3, 5, 1};
std::vector<float> input_data{
- -4, -3, -2, -1, 0, //
- 1, 2, 3, 4, 5, //
- 6, 7, 8, 9, 10, //
+ -4, -3, -2, -1, 0, //
+ 1, 2, 3, 4, 5, //
+ 6, 7, 8, 9, 10, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8);
@@ -193,8 +193,8 @@ TEST(AveragePool2DTest, In_Out_Type_NEG)
TEST(AveragePool2DTest, Quant_Param_NEG)
{
std::vector<float> input_data{
- 0, -6, 12, 4, //
- -3, -2, 10, 7, //
+ 0, -6, 12, 4, //
+ -3, -2, 10, 7, //
};
std::pair<float, int32_t> quant_param1 = quantizationParams<uint8_t>(-15.9375f, 15.9375f);
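
Note: the quantizationParams<uint8_t>(-15.9375f, 15.9375f) calls in these tests derive an affine scale/zero-point pair from a float range. A sketch under the usual uint8 affine formula (the helper's exact rounding behavior is an assumption here):

#include <cmath>
#include <cstdint>
#include <iostream>
#include <utility>

std::pair<float, int32_t> quantizationParamsU8(float f_min, float f_max)
{
  const int32_t q_min = 0, q_max = 255; // uint8 range
  const float scale = (f_max - f_min) / static_cast<float>(q_max - q_min);
  const int32_t zero_point = static_cast<int32_t>(std::round(q_min - f_min / scale));
  return {scale, zero_point};
}

int main()
{
  auto param = quantizationParamsU8(-15.9375f, 15.9375f);
  // (15.9375 - -15.9375) / 255 = 0.125, so each uint8 step covers 0.125;
  // the zero point lands mid-range (about 128).
  std::cout << param.first << ' ' << param.second << '\n';
}
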
diff --git a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.cpp b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.cpp
new file mode 100644
index 000000000..591fcc00a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.cpp
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/BatchToSpaceND.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+namespace
+{
+const int kInputMinDimensionNum = 3;
+const int kInputMaxDimensionNum = 4;
+} // namespace
+
+BatchToSpaceND::BatchToSpaceND(const Tensor *input, const Tensor *block_shape, const Tensor *crops,
+ Tensor *output)
+ : Kernel({input, block_shape, crops}, {output})
+{
+}
+
+void BatchToSpaceND::configure()
+{
+ const auto *block_shape_data = block_shape()->data<int32_t>();
+ const auto *crops_data = crops()->data<int32_t>();
+ LUCI_INTERPRETER_CHECK(input()->shape().num_dims() >= kInputMinDimensionNum);
+ LUCI_INTERPRETER_CHECK(input()->shape().num_dims() <= kInputMaxDimensionNum);
+ LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+ int spatial_dims_num = input()->shape().num_dims() - 2;
+
+ LUCI_INTERPRETER_CHECK(block_shape()->shape().num_dims() == 1);
+ LUCI_INTERPRETER_CHECK(block_shape()->shape().dim(0) == spatial_dims_num);
+
+ LUCI_INTERPRETER_CHECK(crops()->shape().num_dims() == 2);
+ LUCI_INTERPRETER_CHECK(crops()->shape().dim(0) == spatial_dims_num);
+ LUCI_INTERPRETER_CHECK(crops()->shape().dim(1) == 2);
+ for (int i = 0; i < spatial_dims_num * 2; ++i)
+ {
+ LUCI_INTERPRETER_CHECK(crops_data[i] >= 0);
+ }
+
+ Shape output_shape = Shape(input()->shape().num_dims());
+ int output_batch_size = input()->shape().dim(0);
+ for (int i = 0; i < spatial_dims_num; ++i)
+ {
+ LUCI_INTERPRETER_CHECK(output_batch_size % block_shape_data[i] == 0);
+ output_batch_size = output_batch_size / block_shape_data[i];
+ output_shape.dim(i + 1) =
+ input()->shape().dim(i + 1) * block_shape_data[i] - crops_data[i * 2] - crops_data[i * 2 + 1];
+ }
+
+ output_shape.dim(0) = output_batch_size;
+ output_shape.dim(input()->shape().num_dims() - 1) =
+ input()->shape().dim(input()->shape().num_dims() - 1);
+ output()->resize(output_shape);
+}
+
+void BatchToSpaceND::execute() const
+{
+ switch (input()->element_type())
+ {
+ case DataType::FLOAT32:
+ tflite::optimized_ops::BatchToSpaceND(
+ getTensorShape(input()), getTensorData<float>(input()), getTensorShape(block_shape()),
+ getTensorData<int32_t>(block_shape()), getTensorShape(crops()),
+ getTensorData<int32_t>(crops()), getTensorShape(output()), getTensorData<float>(output()));
+ break;
+ case DataType::U8:
+ tflite::optimized_ops::BatchToSpaceND(
+ getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(block_shape()),
+ getTensorData<int32_t>(block_shape()), getTensorShape(crops()),
+ getTensorData<int32_t>(crops()), getTensorShape(output()),
+ getTensorData<uint8_t>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
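
Note: configure() above derives the output shape by dividing the batch by the block product and scaling each spatial dimension up by its block factor, minus the crops on both ends. A standalone sketch of just that rule (plain vectors instead of the kernel's Shape type):

#include <cassert>
#include <iostream>
#include <vector>

std::vector<int> batchToSpaceShape(const std::vector<int> &in, const std::vector<int> &block,
                                   const std::vector<int> &crops /* begin0,end0,begin1,end1 */)
{
  std::vector<int> out(in.size());
  int batch = in[0];
  for (std::size_t i = 0; i < block.size(); ++i)
  {
    assert(batch % block[i] == 0); // mirrors the LUCI_INTERPRETER_CHECK above
    batch /= block[i];
    out[i + 1] = in[i + 1] * block[i] - crops[2 * i] - crops[2 * i + 1];
  }
  out[0] = batch;
  out.back() = in.back(); // the depth dimension is untouched
  return out;
}

int main()
{
  // Matches the Simple test below: {4,2,2,1} with block {2,2}, no crops -> {1,4,4,1}.
  for (int d : batchToSpaceShape({4, 2, 2, 1}, {2, 2}, {0, 0, 0, 0}))
    std::cout << d << ' ';
  std::cout << '\n';
}
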
diff --git a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.h b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.h
new file mode 100644
index 000000000..57703ea5d
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_BATCHTOSPACEND_H
+#define LUCI_INTERPRETER_KERNELS_BATCHTOSPACEND_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class BatchToSpaceND : public Kernel
+{
+public:
+ BatchToSpaceND(const Tensor *input, const Tensor *block_shape, const Tensor *crops,
+ Tensor *output);
+
+ const Tensor *input() const { return _inputs[0]; }
+ const Tensor *block_shape() const { return _inputs[1]; }
+ const Tensor *crops() const { return _inputs[2]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_BATCHTOSPACEND_H
diff --git a/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp
new file mode 100644
index 000000000..a29981d17
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/BatchToSpaceND.test.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/BatchToSpaceND.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape,
+ std::initializer_list<int32_t> block_shape_shape,
+ std::initializer_list<int32_t> crops_shape, std::initializer_list<int32_t> output_shape,
+ std::initializer_list<T> input_data, std::initializer_list<int32_t> block_shape_data,
+ std::initializer_list<int32_t> crops_data, std::initializer_list<T> output_data)
+{
+ constexpr DataType element_type = getElementType<T>();
+ Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+ Tensor block_shape_tensor = makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data);
+ Tensor crops_tensor = makeInputTensor<DataType::S32>(crops_shape, crops_data);
+ Tensor output_tensor = makeOutputTensor(element_type);
+
+ BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+  EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <typename T> class BatchToSpaceNDTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(BatchToSpaceNDTest, DataTypes);
+
+TYPED_TEST(BatchToSpaceNDTest, Simple)
+{
+ Check<TypeParam>(/*input_shape=*/{4, 2, 2, 1}, /*block_shape_shape=*/{2}, /*crops_shape=*/{2, 2},
+ /*output_shape=*/{1, 4, 4, 1},
+ /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+ /*block_shape_data=*/{2, 2}, /*crops_data=*/{0, 0, 0, 0},
+ /*output_data=*/{1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16});
+}
+
+TEST(BatchToSpaceNDTest, Invalid_Shape_NEG)
+{
+ Tensor input_tensor =
+ makeInputTensor<DataType::FLOAT32>({3, 2, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+ Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2});
+ Tensor crops_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, 0, 0});
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+ BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(BatchToSpaceNDTest, Invalid_Crops_NEG)
+{
+ Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(
+ {4, 2, 2, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+ Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2});
+ Tensor crops_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, -1, 0});
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+ BatchToSpaceND kernel(&input_tensor, &block_shape_tensor, &crops_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
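
Note: the Simple test's expected ordering follows from mapping batch b into the spatial offsets (b / block, b % block) inside each block cell. A tiny element-level sketch of that rearrangement (assumed NHWC layout, square block, no crops):

#include <iostream>
#include <vector>

int main()
{
  const int in_batch = 4, in_h = 2, in_w = 2, block = 2;
  std::vector<int> in(in_batch * in_h * in_w);
  for (int i = 0; i < static_cast<int>(in.size()); ++i)
    in[i] = i + 1; // 1..16, as in the test
  const int out_h = in_h * block, out_w = in_w * block;
  std::vector<int> out(out_h * out_w);
  for (int b = 0; b < in_batch; ++b)
    for (int h = 0; h < in_h; ++h)
      for (int w = 0; w < in_w; ++w)
      {
        const int oh = h * block + b / block; // batch index becomes a row offset...
        const int ow = w * block + b % block; // ...and a column offset within the block
        out[oh * out_w + ow] = in[(b * in_h + h) * in_w + w];
      }
  for (int v : out)
    std::cout << v << ' '; // 1 5 2 6 9 13 10 14 3 7 4 8 11 15 12 16
  std::cout << '\n';
}
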
diff --git a/compiler/luci-interpreter/src/kernels/BinaryOpCommon.h b/compiler/luci-interpreter/src/kernels/BinaryOpCommon.h
index 62bd4158e..2d2842a9e 100644
--- a/compiler/luci-interpreter/src/kernels/BinaryOpCommon.h
+++ b/compiler/luci-interpreter/src/kernels/BinaryOpCommon.h
@@ -38,7 +38,7 @@ void BinaryOpBroadcastSlow(const tflite::RuntimeShape &unextended_input1_shape,
if (unextended_input1_shape == unextended_input2_shape)
{
const int flat_size = tflite::MatchingElementsSize(
- unextended_input1_shape, unextended_input2_shape, unextended_output_shape);
+ unextended_input1_shape, unextended_input2_shape, unextended_output_shape);
for (int i = 0; i < flat_size; ++i)
{
output_data[i] = op(input1_data[i], input2_data[i]);
@@ -60,8 +60,8 @@ void BinaryOpBroadcastSlow(const tflite::RuntimeShape &unextended_input1_shape,
auto fn = [&](int indexes[N]) {
output_data[SubscriptToIndex(output_desc, indexes)] =
- op(input1_data[SubscriptToIndex(desc1, indexes)],
- input2_data[SubscriptToIndex(desc2, indexes)]);
+ op(input1_data[SubscriptToIndex(desc1, indexes)],
+ input2_data[SubscriptToIndex(desc2, indexes)]);
};
tflite::NDOpsHelper<N>(output_desc, fn);
}
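
Note: BinaryOpBroadcastSlow above takes a flat loop when the two shapes match exactly and otherwise maps every output index back through the NdArrayDesc descriptors, where size-1 input dimensions repeat. A rough rank-2 sketch of the same idea (modulo indexing stands in for the descriptor machinery):

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

template <typename T, typename Op>
std::vector<T> broadcastBinaryOp2D(const std::vector<T> &a, int ah, int aw,
                                   const std::vector<T> &b, int bh, int bw, Op op)
{
  const int oh = std::max(ah, bh), ow = std::max(aw, bw);
  std::vector<T> out(oh * ow);
  for (int y = 0; y < oh; ++y)
    for (int x = 0; x < ow; ++x)
      out[y * ow + x] = op(a[(y % ah) * aw + (x % aw)], // size-1 dims wrap around
                           b[(y % bh) * bw + (x % bw)]);
  return out;
}

int main()
{
  // {1,2,3} as 1x3 plus {10,20} as 2x1 broadcasts to 2x3.
  auto r = broadcastBinaryOp2D<int>({1, 2, 3}, 1, 3, {10, 20}, 2, 1, std::plus<int>());
  for (int v : r)
    std::cout << v << ' '; // 11 12 13 21 22 23
  std::cout << '\n';
}
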
diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
index a07589dca..d7ab76374 100644
--- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
@@ -1,5 +1,4 @@
find_package(Threads REQUIRED)
-nnas_find_package(GTest REQUIRED)
set(SOURCES
Add.h
@@ -8,6 +7,8 @@ set(SOURCES
ArgMax.cpp
AveragePool2D.h
AveragePool2D.cpp
+ BatchToSpaceND.h
+ BatchToSpaceND.cpp
Concatenation.h
Concatenation.cpp
Conv2D.h
@@ -70,8 +71,12 @@ set(SOURCES
Minimum.cpp
Mul.h
Mul.cpp
+ Neg.h
+ Neg.cpp
NotEqual.h
NotEqual.cpp
+ Pack.h
+ Pack.cpp
Pad.h
Pad.cpp
Pow.h
@@ -96,6 +101,8 @@ set(SOURCES
Slice.cpp
Softmax.h
Softmax.cpp
+ SpaceToBatchND.h
+ SpaceToBatchND.cpp
SpaceToDepth.h
SpaceToDepth.cpp
Split.h
@@ -104,6 +111,8 @@ set(SOURCES
StridedSlice.cpp
Sqrt.h
Sqrt.cpp
+ SquaredDifference.h
+ SquaredDifference.cpp
Squeeze.h
Squeeze.cpp
Sub.h
@@ -135,11 +144,17 @@ target_link_libraries(luci_interpreter_kernels
PUBLIC luci_interpreter_core
PRIVATE nncc_common Threads::Threads)
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
set(TEST_SOURCES
Add.test.cpp
ArgMax.test.cpp
AveragePool2D.test.cpp
+ BatchToSpaceND.test.cpp
Concatenation.test.cpp
Conv2D.test.cpp
DepthToSpace.test.cpp
@@ -171,7 +186,9 @@ set(TEST_SOURCES
Mean.test.cpp
Minimum.test.cpp
Mul.test.cpp
+ Neg.test.cpp
NotEqual.test.cpp
+ Pack.test.cpp
Pad.test.cpp
Pow.test.cpp
Prelu.test.cpp
@@ -184,10 +201,12 @@ set(TEST_SOURCES
Rsqrt.test.cpp
Slice.test.cpp
Softmax.test.cpp
+ SpaceToBatchND.test.cpp
SpaceToDepth.test.cpp
Split.test.cpp
StridedSlice.test.cpp
Sqrt.test.cpp
+ SquaredDifference.test.cpp
Squeeze.test.cpp
Sub.test.cpp
Tanh.test.cpp
diff --git a/compiler/luci-interpreter/src/kernels/Concatenation.cpp b/compiler/luci-interpreter/src/kernels/Concatenation.cpp
index 6f8820446..e3376c13d 100644
--- a/compiler/luci-interpreter/src/kernels/Concatenation.cpp
+++ b/compiler/luci-interpreter/src/kernels/Concatenation.cpp
@@ -29,7 +29,7 @@ namespace kernels
Concatenation::Concatenation(std::vector<const Tensor *> inputs, Tensor *output,
const ConcatenationParams &params)
- : KernelWithParams<ConcatenationParams>(std::move(inputs), {output}, params)
+ : KernelWithParams<ConcatenationParams>(std::move(inputs), {output}, params)
{
}
@@ -39,6 +39,9 @@ void Concatenation::configure()
LUCI_INTERPRETER_CHECK(num_inputs > 0);
const Tensor *t0 = _inputs[0];
+ // TODO: Support concat with fused activation function
+ LUCI_INTERPRETER_CHECK(params().activation == luci::FusedActFunc::NONE);
+
int axis = _params.axis;
if (axis < 0)
axis += t0->shape().num_dims();
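
Note: the new check above rejects any fused activation on Concatenation at configure() time, alongside the existing negative-axis normalization. A compact sketch of that guard, with a hypothetical enum standing in for luci::FusedActFunc:

#include <stdexcept>

enum class FusedActFunc { NONE, RELU, RELU6 };

void checkConcatParams(FusedActFunc activation, int axis, int num_dims)
{
  if (activation != FusedActFunc::NONE)
    throw std::runtime_error("Concatenation with fused activation is not supported yet");
  if (axis < 0)
    axis += num_dims; // e.g. axis -1 on a 2-D input becomes 1
  if (axis < 0 || axis >= num_dims)
    throw std::runtime_error("Invalid concatenation axis");
}

int main()
{
  checkConcatParams(FusedActFunc::NONE, -1, 2); // ok: axis normalizes to 1
  try
  {
    checkConcatParams(FusedActFunc::RELU, 1, 2); // throws, like the NEG test below
  }
  catch (const std::runtime_error &)
  {
    // expected until fused activations are supported
  }
}
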
diff --git a/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp b/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
index 91707a256..ee9b7d0d3 100644
--- a/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Concatenation.test.cpp
@@ -38,6 +38,7 @@ TEST(ConcatenationTest, Float)
// Try different 'axis' and expect different results.
{
params.axis = 0;
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
kernel.configure();
@@ -48,6 +49,7 @@ TEST(ConcatenationTest, Float)
}
{
params.axis = -2; // Same as '0'.
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
kernel.configure();
@@ -58,6 +60,7 @@ TEST(ConcatenationTest, Float)
}
{
params.axis = 1;
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
kernel.configure();
@@ -68,6 +71,7 @@ TEST(ConcatenationTest, Float)
}
{
params.axis = -1; // Same as '1'.
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
kernel.configure();
@@ -84,6 +88,7 @@ TEST(ConcatenationTest, Input_Number_Check_NEG)
ConcatenationParams params{};
params.axis = -1;
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({}, &output_tensor, params);
EXPECT_ANY_THROW(kernel.configure());
@@ -99,6 +104,7 @@ TEST(ConcatenationTest, Invalid_Axis_NEG)
ConcatenationParams params{};
params.axis = -3;
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
EXPECT_ANY_THROW(kernel.configure());
@@ -114,6 +120,7 @@ TEST(ConcatenationTest, Mismatching_Input_Type_NEG)
ConcatenationParams params{};
params.axis = -1;
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
EXPECT_ANY_THROW(kernel.configure());
@@ -129,6 +136,7 @@ TEST(ConcatenationTest, Mismatching_Input_Dimension_Num_NEG)
ConcatenationParams params{};
params.axis = -1;
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
EXPECT_ANY_THROW(kernel.configure());
@@ -144,6 +152,7 @@ TEST(ConcatenationTest, Mismatching_Input_Dimension_NEG)
ConcatenationParams params{};
params.axis = -1;
+ params.activation = luci::FusedActFunc::NONE;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
EXPECT_ANY_THROW(kernel.configure());
@@ -159,6 +168,24 @@ TEST(ConcatenationTest, Unsupported_Configure_Type_NEG)
ConcatenationParams params{};
params.axis = -1;
+ params.activation = luci::FusedActFunc::NONE;
+
+ Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+// TODO: Remove this test when concat w/ fused_activation is supported
+TEST(ConcatenationTest, With_Fused_Activation_NEG)
+{
+ std::vector<float> input1_data{1, 2, 3, 4, 5, 6};
+ std::vector<float> input2_data{7, 8, 9, 10, 11, 12};
+ Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input1_data);
+ Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input2_data);
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+ ConcatenationParams params{};
+
+ params.axis = 1;
+ params.activation = luci::FusedActFunc::RELU;
Concatenation kernel({&input1_tensor, &input2_tensor}, &output_tensor, params);
EXPECT_ANY_THROW(kernel.configure());
diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.cpp
index c5069e403..56ca96a34 100644
--- a/compiler/luci-interpreter/src/kernels/Conv2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/Conv2D.cpp
@@ -31,7 +31,7 @@ namespace kernels
Conv2D::Conv2D(const Tensor *input, const Tensor *filter, const Tensor *bias, Tensor *output,
const Conv2DParams &params)
- : KernelWithParams<Conv2DParams>({input, filter, bias}, {output}, params)
+ : KernelWithParams<Conv2DParams>({input, filter, bias}, {output}, params)
{
}
@@ -84,11 +84,11 @@ void Conv2D::configure()
bias()->shape().dim(0) == output_depth));
const int32_t output_height =
- computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
- _params.dilation_height_factor);
+ computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
+ _params.dilation_height_factor);
const int32_t output_width =
- computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
- _params.dilation_width_factor);
+ computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
+ _params.dilation_width_factor);
_padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
input_height, filter_height, output_height);
@@ -100,11 +100,11 @@ void Conv2D::configure()
// Allocate tensor for Im2Col, if needed.
// The checks here should be aligned with the actual implementation.
const bool need_dilated_im2col =
- _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
+ _params.dilation_height_factor != 1 || _params.dilation_width_factor != 1;
const bool need_non_dilated_im2col = _params.stride_height != 1 || _params.stride_width != 1 ||
filter_height != 1 || filter_width != 1;
const bool need_im2col =
- input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
+ input()->element_type() != DataType::S16 && (need_dilated_im2col || need_non_dilated_im2col);
if (need_im2col)
{
const int input_depth = input_shape.dim(3);
@@ -113,7 +113,7 @@ void Conv2D::configure()
try
{
_im2col =
- std::make_unique<Tensor>(input()->element_type(), im2col_shape, AffineQuantization{}, "");
+ std::make_unique<Tensor>(input()->element_type(), im2col_shape, AffineQuantization{}, "");
}
catch (std::bad_alloc &ba)
{
@@ -174,16 +174,31 @@ void Conv2D::evalFloat() const
params.float_activation_max = activation_max;
if (_im2col)
- tflite::optimized_ops::Conv(params, getTensorShape(input()), getTensorData<float>(input()),
- getTensorShape(filter()), getTensorData<float>(filter()),
- getTensorShape(bias()), getTensorData<float>(bias()),
- getTensorShape(output()), getTensorData<float>(output()),
- getTensorShape(_im2col.get()), getTensorData<float>(_im2col.get()));
- else
- tflite::reference_ops::Conv(
+ {
+ try
+ {
+ tflite::optimized_ops::Conv(
+ params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
+ getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
+ getTensorShape(output()), getTensorData<float>(output()), getTensorShape(_im2col.get()),
+ getTensorData<float>(_im2col.get()));
+ }
+ catch (std::bad_alloc &ba)
+ {
+ // Failed memory allocation
+ _im2col->deallocate();
+
+ tflite::reference_ops::Conv(
params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
+ }
+ }
+ else
+ tflite::reference_ops::Conv(
+ params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
+ getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
+ getTensorShape(output()), getTensorData<float>(output()), tflite::RuntimeShape(), nullptr);
}
void Conv2D::evalQuantized() const
@@ -223,10 +238,10 @@ void Conv2D::evalQuantized() const
gemmlowp_context->set_max_num_threads(static_cast<int>(std::thread::hardware_concurrency()));
tflite::optimized_ops::Conv(
- params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
- getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
- getTensorShape(output()), getTensorData<uint8_t>(output()), getTensorShape(_im2col.get()),
- getTensorData<uint8_t>(_im2col.get()), gemmlowp_context.get());
+ params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
+ getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
+ getTensorShape(output()), getTensorData<uint8_t>(output()), getTensorShape(_im2col.get()),
+ getTensorData<uint8_t>(_im2col.get()), gemmlowp_context.get());
}
void Conv2D::evalQuantizedPerChannel() const
@@ -260,10 +275,10 @@ void Conv2D::evalQuantizedPerChannel() const
calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
const std::vector<double> effective_output_scale =
- getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+ getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
const std::vector<ChannelQuantMultipliers> multipliers_raw =
- quantizeMultipliers(effective_output_scale);
+ quantizeMultipliers(effective_output_scale);
BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(multipliers_raw);
for (int32_t batch = 0; batch < batches; ++batch)
@@ -288,9 +303,9 @@ void Conv2D::evalQuantizedPerChannel() const
for (int32_t in_c = 0; in_c < input_depth; ++in_c)
{
const uint8_t input_val =
- input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+ input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
const uint8_t filter_val =
- filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
+ filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
acc += static_cast<int32_t>(input_val - input()->zero_point()) *
static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
}
@@ -303,7 +318,7 @@ void Conv2D::evalQuantizedPerChannel() const
}
int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
- acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
+ acc, quant_multipliers[out_c].multiplier, quant_multipliers[out_c].shift);
scaled_acc += output()->zero_point();
scaled_acc = std::max(scaled_acc, activation_min);
@@ -346,10 +361,10 @@ void Conv2D::evalQuantizedS16() const
calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
const std::vector<double> effective_output_scale =
- getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+ getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
const std::vector<ChannelQuantMultipliers> multipliers_raw =
- quantizeMultipliers(effective_output_scale);
+ quantizeMultipliers(effective_output_scale);
BroadcastableWrapper<ChannelQuantMultipliers> multipliers(multipliers_raw);
for (int32_t batch = 0; batch < batches; ++batch)
@@ -374,9 +389,9 @@ void Conv2D::evalQuantizedS16() const
for (int32_t in_c = 0; in_c < input_depth; ++in_c)
{
const int16_t input_val =
- input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+ input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
const int16_t filter_val =
- filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
+ filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
}
}
@@ -388,7 +403,7 @@ void Conv2D::evalQuantizedS16() const
}
int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
- acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
+ acc, multipliers[out_c].multiplier, multipliers[out_c].shift);
scaled_acc = std::max(scaled_acc, activation_min);
scaled_acc = std::min(scaled_acc, activation_max);
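
Note: the evalFloat() change above wraps the im2col-backed optimized path in a try block: if allocating the scratch tensor throws std::bad_alloc, the buffer is released and the reference path, which needs no im2col, runs instead. A minimal sketch of the pattern, with illustrative stand-in functions:

#include <iostream>
#include <new>
#include <vector>

void convOptimized(std::vector<float> &scratch)
{
  scratch.assign(1u << 20, 0.0f); // scratch allocation; may throw std::bad_alloc
}

void convReference()
{
  // slower path that needs no scratch memory
}

void evalFloatWithFallback()
{
  std::vector<float> im2col;
  try
  {
    convOptimized(im2col);
  }
  catch (const std::bad_alloc &)
  {
    im2col.clear();
    im2col.shrink_to_fit(); // analogous to _im2col->deallocate()
    convReference();
  }
}

int main()
{
  evalFloatWithFallback();
  std::cout << "done\n";
}
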
diff --git a/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp b/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
index 35a0c5491..8610a4fe6 100644
--- a/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Conv2D.test.cpp
@@ -32,16 +32,16 @@ TEST(Conv2DTest, Float)
Shape filter_shape{2, 2, 2, 2};
Shape bias_shape{2};
std::vector<float> input_data{
- 1, 2, 3, 4, 5, 6, // row = 0
- 7, 8, 9, 10, 11, 12, // row = 1
- 13, 14, 15, 16, 17, 18, // row = 2
- 19, 20, 21, 22, 23, 24, // row = 3
+ 1, 2, 3, 4, 5, 6, // row = 0
+ 7, 8, 9, 10, 11, 12, // row = 1
+ 13, 14, 15, 16, 17, 18, // row = 2
+ 19, 20, 21, 22, 23, 24, // row = 3
};
std::vector<float> filter_data{
- 1, 2, -3, -4, // out = 0, row = 0
- -5, 6, -7, 8, // out = 1, row = 0
- 4, -2, 3, -1, // out = 0, row = 1
- -8, -6, 7, 5, // out = 1, row = 1
+ 1, 2, -3, -4, // out = 0, row = 0
+ -5, 6, -7, 8, // out = 1, row = 0
+ 4, -2, 3, -1, // out = 0, row = 1
+ -8, -6, 7, 5, // out = 1, row = 1
};
std::vector<float> bias_data{1, 2};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -62,8 +62,8 @@ TEST(Conv2DTest, Float)
kernel.execute();
std::vector<float> ref_output_data{
- 11, 16, 7, 20, // row = 0
- 0, 40, 0, 44, // row = 1
+ 11, 16, 7, 20, // row = 0
+ 0, 40, 0, 44, // row = 1
};
std::vector<int32_t> ref_output_shape{1, 2, 2, 2};
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
@@ -76,17 +76,17 @@ TEST(Conv2DTest, FloatCheck)
Shape filter_shape{3, 2, 2, 1};
Shape bias_shape{3};
std::vector<float> input_data{
- // First batch
- 1, 1, 1, 1, // row = 1
- 2, 2, 2, 2, // row = 2
- // Second batch
- 1, 2, 3, 4, // row = 1
- 1, 2, 3, 4, // row = 2
+ // First batch
+ 1, 1, 1, 1, // row = 1
+ 2, 2, 2, 2, // row = 2
+ // Second batch
+ 1, 2, 3, 4, // row = 1
+ 1, 2, 3, 4, // row = 2
};
std::vector<float> filter_data{
- 1, 2, 3, 4, // first 2x2 filter
- -1, 1, -1, 1, // second 2x2 filter
- -1, -1, 1, 1, // third 2x2 filter
+ 1, 2, 3, 4, // first 2x2 filter
+ -1, 1, -1, 1, // second 2x2 filter
+ -1, -1, 1, 1, // third 2x2 filter
};
std::vector<float> bias_data{1, 2, 3};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -107,10 +107,10 @@ TEST(Conv2DTest, FloatCheck)
kernel.execute();
std::vector<float> ref_output_data{
- 18, 2, 5, // first batch, left
- 18, 2, 5, // first batch, right
- 17, 4, 3, // second batch, left
- 37, 4, 3, // second batch, right
+ 18, 2, 5, // first batch, left
+ 18, 2, 5, // first batch, right
+ 17, 4, 3, // second batch, left
+ 37, 4, 3, // second batch, right
};
std::vector<int32_t> ref_output_shape{2, 1, 2, 3};
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
@@ -120,17 +120,17 @@ TEST(Conv2DTest, FloatCheck)
TEST(Conv2DTest, Uint8)
{
std::vector<float> input_data{
- // First batch
- 1, 1, 1, 1, // row = 1
- 2, 2, 2, 2, // row = 2
- // Second batch
- 1, 2, 3, 4, // row = 1
- 1, 2, 3, 4, // row = 2
+ // First batch
+ 1, 1, 1, 1, // row = 1
+ 2, 2, 2, 2, // row = 2
+ // Second batch
+ 1, 2, 3, 4, // row = 1
+ 1, 2, 3, 4, // row = 2
};
std::vector<float> filter_data{
- 1, 2, 3, 4, // first 2x2 filter
- -1, 1, -1, 1, // second 2x2 filter
- -1, -1, 1, 1, // third 2x2 filter
+ 1, 2, 3, 4, // first 2x2 filter
+ -1, 1, -1, 1, // second 2x2 filter
+ -1, -1, 1, 1, // third 2x2 filter
};
std::vector<float> bias_data{1, 2, 3};
@@ -142,9 +142,9 @@ TEST(Conv2DTest, Uint8)
Tensor filter_tensor = makeInputTensor<DataType::U8>({3, 2, 2, 1}, input_quant_param.first,
input_quant_param.second, filter_data);
Tensor bias_tensor = makeInputTensor<DataType::S32>(
- {3}, input_quant_param.first * input_quant_param.first, 0, bias_data);
+ {3}, input_quant_param.first * input_quant_param.first, 0, bias_data);
Tensor output_tensor =
- makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+ makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
Conv2DParams params{};
params.padding = Padding::VALID;
@@ -159,10 +159,10 @@ TEST(Conv2DTest, Uint8)
kernel.execute();
std::vector<float> ref_output_data{
- 18, 2, 5, // first batch, left
- 18, 2, 5, // first batch, right
- 17, 4, 3, // second batch, left
- 37, 4, 3, // second batch, right
+ 18, 2, 5, // first batch, left
+ 18, 2, 5, // first batch, right
+ 17, 4, 3, // second batch, left
+ 37, 4, 3, // second batch, right
};
std::vector<int32_t> ref_output_shape{2, 1, 2, 3};
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
@@ -173,17 +173,17 @@ TEST(Conv2DTest, Uint8_CWQ)
{
const int output_channels = 3;
std::vector<float> input_data{
- // First batch
- 1, 1, 1, 1, // row = 1
- 2, 2, 2, 2, // row = 2
- // Second batch
- 1, 2, 3, 4, // row = 1
- 1, 2, 3, 4, // row = 2
+ // First batch
+ 1, 1, 1, 1, // row = 1
+ 2, 2, 2, 2, // row = 2
+ // Second batch
+ 1, 2, 3, 4, // row = 1
+ 1, 2, 3, 4, // row = 2
};
std::vector<float> filter_data{
- 1, 2, 3, 4, // first 2x2 filter
- -1, 1, -1, 1, // second 2x2 filter
- -1, -1, 1, 1, // third 2x2 filter
+ 1, 2, 3, 4, // first 2x2 filter
+ -1, 1, -1, 1, // second 2x2 filter
+ -1, -1, 1, 1, // third 2x2 filter
};
std::vector<float> bias_data{1, 2, 3};
Shape filter_shape{output_channels, 2, 2, 1};
@@ -212,11 +212,11 @@ TEST(Conv2DTest, Uint8_CWQ)
Tensor input_tensor = makeInputTensor<DataType::U8>({2, 2, 4, 1}, input_quant_param.first,
input_quant_param.second, input_data);
Tensor filter_tensor =
- makeInputTensor<DataType::U8>(filter_shape, filter_scales, filter_zerops, 0, filter_data);
+ makeInputTensor<DataType::U8>(filter_shape, filter_scales, filter_zerops, 0, filter_data);
Tensor bias_tensor =
- makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0, bias_data);
+ makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0, bias_data);
Tensor output_tensor =
- makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+ makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
Conv2DParams params{};
params.padding = Padding::VALID;
@@ -231,10 +231,10 @@ TEST(Conv2DTest, Uint8_CWQ)
kernel.execute();
std::vector<float> ref_output_data{
- 18, 2, 5, // first batch, left
- 18, 2, 5, // first batch, right
- 17, 4, 3, // second batch, left
- 37, 4, 3, // second batch, right
+ 18, 2, 5, // first batch, left
+ 18, 2, 5, // first batch, right
+ 17, 4, 3, // second batch, left
+ 37, 4, 3, // second batch, right
};
std::vector<int32_t> ref_output_shape{2, 1, 2, 3};
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
@@ -249,21 +249,21 @@ TEST(Conv2DTest, SInt16)
std::vector<int32_t> ref_output_shape{1, 2, 2, 2};
std::vector<float> input_data{
- 1, 2, 3, 4, 5, 6, // row = 0
- 7, 8, 9, 10, 11, 12, // row = 1
- 13, 14, 15, 16, 17, 18, // row = 2
- 19, 20, 21, 22, 23, 24, // row = 3
+ 1, 2, 3, 4, 5, 6, // row = 0
+ 7, 8, 9, 10, 11, 12, // row = 1
+ 13, 14, 15, 16, 17, 18, // row = 2
+ 19, 20, 21, 22, 23, 24, // row = 3
};
std::vector<float> filter_data{
- 1, 2, -3, -4, // out = 0, row = 0
- -5, 6, -7, 8, // out = 1, row = 0
- 4, -2, 3, -1, // out = 0, row = 1
- -8, -6, 7, 5, // out = 1, row = 1
+ 1, 2, -3, -4, // out = 0, row = 0
+ -5, 6, -7, 8, // out = 1, row = 0
+ 4, -2, 3, -1, // out = 0, row = 1
+ -8, -6, 7, 5, // out = 1, row = 1
};
std::vector<float> bias_data{1, 2};
std::vector<float> ref_output_data{
- 11, 16, 7, 20, // row = 0
- 0, 40, 0, 44, // row = 1
+ 11, 16, 7, 20, // row = 0
+ 0, 40, 0, 44, // row = 1
};
Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.25, 0, input_data);
@@ -295,22 +295,22 @@ TEST(Conv2DTest, SInt16_CWQ_weights)
std::vector<int32_t> ref_output_shape{1, 2, 2, 3};
std::vector<float> input_data{
- 1, 2, // row = 0, col 0
- 3, 4, // row = 0, col 1
- 5, 6, // row = 1, col 0
- 7, 8, // row = 1, col 1
+ 1, 2, // row = 0, col 0
+ 3, 4, // row = 0, col 1
+ 5, 6, // row = 1, col 0
+ 7, 8, // row = 1, col 1
};
std::vector<float> filter_data{
- 4, -3, // out = 0
- 1, -3, // out = 1
- 5, -3, // out = 2
+ 4, -3, // out = 0
+ 1, -3, // out = 1
+ 5, -3, // out = 2
};
std::vector<float> bias_data{1, 10, 5};
std::vector<float> ref_output_data{
- 0, 5, 4, // row 0, col 0
- 1, 1, 8, // row 0, col 1
- 3, 0, 12, // row 1, col 0
- 5, 0, 16, // row 1, col 1
+ 0, 5, 4, // row 0, col 0
+ 1, 1, 8, // row 0, col 1
+ 3, 0, 12, // row 1, col 0
+ 5, 0, 16, // row 1, col 1
};
float input_scale = 0.25f;
@@ -323,7 +323,7 @@ TEST(Conv2DTest, SInt16_CWQ_weights)
Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data);
Tensor filter_tensor =
- makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0, filter_data);
+ makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0, filter_data);
Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, output_scale, 0);
@@ -349,16 +349,16 @@ TEST(Conv2DTest, Unsupported_Type_Configure_NEG)
Shape filter_shape{2, 2, 2, 2};
Shape bias_shape{2};
std::vector<int32_t> input_data{
- 1, 2, 3, 4, 5, 6, // row = 0
- 7, 8, 9, 10, 11, 12, // row = 1
- 13, 14, 15, 16, 17, 18, // row = 2
- 19, 20, 21, 22, 23, 24, // row = 3
+ 1, 2, 3, 4, 5, 6, // row = 0
+ 7, 8, 9, 10, 11, 12, // row = 1
+ 13, 14, 15, 16, 17, 18, // row = 2
+ 19, 20, 21, 22, 23, 24, // row = 3
};
std::vector<float> filter_data{
- 1, 2, -3, -4, // out = 0, row = 0
- -5, 6, -7, 8, // out = 1, row = 0
- 4, -2, 3, -1, // out = 0, row = 1
- -8, -6, 7, 5, // out = 1, row = 1
+ 1, 2, -3, -4, // out = 0, row = 0
+ -5, 6, -7, 8, // out = 1, row = 0
+ 4, -2, 3, -1, // out = 0, row = 1
+ -8, -6, 7, 5, // out = 1, row = 1
};
std::vector<float> bias_data{1, 2};
Tensor input_tensor = makeInputTensor<DataType::S32>(input_shape, input_data);
@@ -384,16 +384,16 @@ TEST(Conv2DTest, Invalid_Bias_Type_NEG)
Shape filter_shape{2, 2, 2, 2};
Shape bias_shape{2};
std::vector<float> input_data{
- 1, 2, 3, 4, 5, 6, // row = 0
- 7, 8, 9, 10, 11, 12, // row = 1
- 13, 14, 15, 16, 17, 18, // row = 2
- 19, 20, 21, 22, 23, 24, // row = 3
+ 1, 2, 3, 4, 5, 6, // row = 0
+ 7, 8, 9, 10, 11, 12, // row = 1
+ 13, 14, 15, 16, 17, 18, // row = 2
+ 19, 20, 21, 22, 23, 24, // row = 3
};
std::vector<float> filter_data{
- 1, 2, -3, -4, // out = 0, row = 0
- -5, 6, -7, 8, // out = 1, row = 0
- 4, -2, 3, -1, // out = 0, row = 1
- -8, -6, 7, 5, // out = 1, row = 1
+ 1, 2, -3, -4, // out = 0, row = 0
+ -5, 6, -7, 8, // out = 1, row = 0
+ 4, -2, 3, -1, // out = 0, row = 1
+ -8, -6, 7, 5, // out = 1, row = 1
};
std::vector<uint8_t> bias_data{1, 2};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -419,16 +419,16 @@ TEST(Conv2DTest, Invalid_Bias_Data_NEG)
Shape filter_shape{2, 2, 2, 2};
Shape bias_shape{3};
std::vector<float> input_data{
- 1, 2, 3, 4, 5, 6, // row = 0
- 7, 8, 9, 10, 11, 12, // row = 1
- 13, 14, 15, 16, 17, 18, // row = 2
- 19, 20, 21, 22, 23, 24, // row = 3
+ 1, 2, 3, 4, 5, 6, // row = 0
+ 7, 8, 9, 10, 11, 12, // row = 1
+ 13, 14, 15, 16, 17, 18, // row = 2
+ 19, 20, 21, 22, 23, 24, // row = 3
};
std::vector<float> filter_data{
- 1, 2, -3, -4, // out = 0, row = 0
- -5, 6, -7, 8, // out = 1, row = 0
- 4, -2, 3, -1, // out = 0, row = 1
- -8, -6, 7, 5, // out = 1, row = 1
+ 1, 2, -3, -4, // out = 0, row = 0
+ -5, 6, -7, 8, // out = 1, row = 0
+ 4, -2, 3, -1, // out = 0, row = 1
+ -8, -6, 7, 5, // out = 1, row = 1
};
std::vector<float> bias_data{1, 2, 3};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -454,16 +454,16 @@ TEST(Conv2DTest, Invalid_Input_Shape_NEG)
Shape filter_shape{2, 2, 2, 2};
Shape bias_shape{2};
std::vector<float> input_data{
- 1, 2, 3, 4, 5, 6, // row = 0
- 7, 8, 9, 10, 11, 12, // row = 1
- 13, 14, 15, 16, 17, 18, // row = 2
- 19, 20, 21, 22, 23, 24, // row = 3
+ 1, 2, 3, 4, 5, 6, // row = 0
+ 7, 8, 9, 10, 11, 12, // row = 1
+ 13, 14, 15, 16, 17, 18, // row = 2
+ 19, 20, 21, 22, 23, 24, // row = 3
};
std::vector<float> filter_data{
- 1, 2, -3, -4, // out = 0, row = 0
- -5, 6, -7, 8, // out = 1, row = 0
- 4, -2, 3, -1, // out = 0, row = 1
- -8, -6, 7, 5, // out = 1, row = 1
+ 1, 2, -3, -4, // out = 0, row = 0
+ -5, 6, -7, 8, // out = 1, row = 0
+ 4, -2, 3, -1, // out = 0, row = 1
+ -8, -6, 7, 5, // out = 1, row = 1
};
std::vector<float> bias_data{1, 2};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
index 57238313c..f2b9e4ccc 100644
--- a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
@@ -24,7 +24,7 @@ namespace kernels
{
DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params)
- : KernelWithParams<DepthToSpaceParams>({input}, {output}, params)
+ : KernelWithParams<DepthToSpaceParams>({input}, {output}, params)
{
}
diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
index 921133191..1452f4421 100644
--- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.cpp
@@ -30,7 +30,7 @@ namespace kernels
DepthwiseConv2D::DepthwiseConv2D(const Tensor *input, const Tensor *filter, const Tensor *bias,
Tensor *output, const DepthwiseConv2DParams &params)
- : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output}, params)
+ : KernelWithParams<DepthwiseConv2DParams>({input, filter, bias}, {output}, params)
{
}
@@ -85,11 +85,11 @@ void DepthwiseConv2D::configure()
bias()->shape().dim(0) == channels_out));
const int32_t output_height =
- computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
- _params.dilation_height_factor);
+ computeOutputSize(_params.padding, input_height, filter_height, _params.stride_height,
+ _params.dilation_height_factor);
const int32_t output_width =
- computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
- _params.dilation_width_factor);
+ computeOutputSize(_params.padding, input_width, filter_width, _params.stride_width,
+ _params.dilation_width_factor);
_padding_height = computePadding(_params.stride_height, _params.dilation_height_factor,
input_height, filter_height, output_height);
@@ -149,9 +149,9 @@ void DepthwiseConv2D::evalFloat() const
params.float_activation_max = activation_max;
tflite::reference_ops::DepthwiseConv(
- params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
- getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
- getTensorShape(output()), getTensorData<float>(output()));
+ params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(filter()),
+ getTensorData<float>(filter()), getTensorShape(bias()), getTensorData<float>(bias()),
+ getTensorShape(output()), getTensorData<float>(output()));
}
void DepthwiseConv2D::evalQuantizedPerChannel() const
@@ -185,10 +185,10 @@ void DepthwiseConv2D::evalQuantizedPerChannel() const
calculateActivationRangeQuantized(_params.activation, output(), &activation_min, &activation_max);
const std::vector<double> effective_output_scales =
- getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+ getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
- quantizeMultipliers(effective_output_scales);
+ quantizeMultipliers(effective_output_scales);
BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
for (int batch = 0; batch < batches; ++batch)
@@ -213,13 +213,13 @@ void DepthwiseConv2D::evalQuantizedPerChannel() const
const int in_y = in_y_origin + dilation_height_factor * filter_y;
// Zero padding by omitting the areas outside the image.
const bool is_point_inside_image =
- (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
+ (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
if (is_point_inside_image)
{
int32 input_val =
- input_data[calcOffset(input_shape, batch, in_y, in_x, in_channel)];
+ input_data[calcOffset(input_shape, batch, in_y, in_x, in_channel)];
int32 filter_val =
- filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, output_channel)];
+ filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, output_channel)];
acc += (filter_val - filter()->zero_points()[output_channel]) *
(input_val - input()->zero_point());
}
@@ -232,12 +232,12 @@ void DepthwiseConv2D::evalQuantizedPerChannel() const
int32_t output_multiplier = quant_multipliers[output_channel].multiplier;
int output_shift = quant_multipliers[output_channel].shift;
int32_t scaled_acc =
- tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
scaled_acc += output()->zero_point();
scaled_acc = std::max(scaled_acc, activation_min);
scaled_acc = std::min(scaled_acc, activation_max);
output_data[calcOffset(output_shape, batch, out_y, out_x, output_channel)] =
- static_cast<uint8_t>(scaled_acc);
+ static_cast<uint8_t>(scaled_acc);
}
}
}
@@ -278,9 +278,9 @@ void DepthwiseConv2D::evalQuantized() const
params.quantized_activation_max = activation_max;
tflite::reference_ops::DepthwiseConv(
- params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
- getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
- getTensorShape(output()), getTensorData<uint8_t>(output()));
+ params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(filter()),
+ getTensorData<uint8_t>(filter()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
+ getTensorShape(output()), getTensorData<uint8_t>(output()));
}
void DepthwiseConv2D::evalQuantizedS16() const
@@ -310,10 +310,10 @@ void DepthwiseConv2D::evalQuantizedS16() const
const int32_t depth_multiplier = _params.depth_multiplier;
const std::vector<double> effective_output_scales =
- getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+ getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
std::vector<ChannelQuantMultipliers> quant_multipliers_raw =
- quantizeMultipliers(effective_output_scales);
+ quantizeMultipliers(effective_output_scales);
BroadcastableWrapper<ChannelQuantMultipliers> quant_multipliers(quant_multipliers_raw);
@@ -344,9 +344,9 @@ void DepthwiseConv2D::evalQuantizedS16() const
if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
{
const int16_t input_val =
- input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+ input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
const int16_t filter_val =
- filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, out_c)];
+ filter_data[calcOffset(filter_shape, 0, filter_y, filter_x, out_c)];
acc += static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
}
}
@@ -359,7 +359,7 @@ void DepthwiseConv2D::evalQuantizedS16() const
int32_t output_multiplier = quant_multipliers[out_c].multiplier;
int output_shift = quant_multipliers[out_c].shift;
int32_t scaled_acc =
- tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
scaled_acc = std::max(scaled_acc, activation_min);
scaled_acc = std::min(scaled_acc, activation_max);
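
Note: both per-channel eval paths above rescale each channel's accumulator by input_scale * filter_scales[c] / output_scale; quantizeMultipliers turns that ratio into the fixed-point multiplier/shift pair consumed by MultiplyByQuantizedMultiplier. A numeric sketch using plain doubles instead of the fixed-point form:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
  const double input_scale = 0.25, output_scale = 0.5;
  const std::vector<double> filter_scales{0.2, 0.5}; // one scale per output channel
  const std::vector<int32_t> acc{100, -40};          // raw integer accumulators
  const int32_t act_min = -128, act_max = 127;

  for (std::size_t c = 0; c < acc.size(); ++c)
  {
    const double effective = input_scale * filter_scales[c] / output_scale;
    int32_t scaled = static_cast<int32_t>(std::lround(acc[c] * effective));
    scaled = std::max(act_min, std::min(act_max, scaled)); // fused activation clamp
    std::cout << "channel " << c << ": " << scaled << '\n'; // 10, then -10
  }
}
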
diff --git a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
index f79e888a1..3e2f434dd 100644
--- a/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/DepthwiseConv2D.test.cpp
@@ -32,16 +32,16 @@ TEST(DepthwiseConv2DTest, Float)
Shape filter_shape{1, 2, 2, 4};
Shape bias_shape{4};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
- 13, 14, 15, 16, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
+ 13, 14, 15, 16, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -63,8 +63,8 @@ TEST(DepthwiseConv2DTest, Float)
kernel.execute();
std::vector<float> ref_output_data{
- 71, 0, 99, 0, //
- 167, 0, 227, 28, //
+ 71, 0, 99, 0, //
+ 167, 0, 227, 28, //
};
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 1, 4}));
@@ -73,15 +73,15 @@ TEST(DepthwiseConv2DTest, Float)
TEST(DepthwiseConv2DTest, Uint8)
{
std::vector<float> input_data{
- 1, 2, 7, 8, // column 1
- 3, 4, 9, 10, // column 2
- 5, 6, 11, 12, // column 3
+ 1, 2, 7, 8, // column 1
+ 3, 4, 9, 10, // column 2
+ 5, 6, 11, 12, // column 3
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
@@ -93,9 +93,9 @@ TEST(DepthwiseConv2DTest, Uint8)
Tensor filter_tensor = makeInputTensor<DataType::U8>({1, 2, 2, 4}, input_quant_param.first,
input_quant_param.second, filter_data);
Tensor bias_tensor = makeInputTensor<DataType::S32>(
- {4}, input_quant_param.first * input_quant_param.first, 0, bias_data);
+ {4}, input_quant_param.first * input_quant_param.first, 0, bias_data);
Tensor output_tensor =
- makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+ makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
DepthwiseConv2DParams params{};
params.padding = Padding::VALID;
@@ -111,8 +111,8 @@ TEST(DepthwiseConv2DTest, Uint8)
kernel.execute();
std::vector<float> ref_output_data{
- 71, -34, 99, -20, //
- 91, -26, 127, -4, //
+ 71, -34, 99, -20, //
+ 91, -26, 127, -4, //
};
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 1, 4}));
@@ -126,21 +126,21 @@ TEST(DepthwiseConv2DTest, SInt16)
std::vector<int32_t> ref_output_shape{1, 2, 1, 4};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
- 13, 14, 15, 16, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
+ 13, 14, 15, 16, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
std::vector<float> ref_output_data{
- 71, 0, 99, 0, //
- 167, 0, 227, 28, //
+ 71, 0, 99, 0, //
+ 167, 0, 227, 28, //
};
Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.25, 0, input_data);
@@ -174,21 +174,21 @@ TEST(DepthwiseConv2DTest, SInt16_CWQ_weights)
std::vector<int32_t> ref_output_shape{1, 2, 1, output_channels};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
- 13, 14, 15, 16, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
+ 13, 14, 15, 16, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
std::vector<float> ref_output_data{
- 71, 0, 99, 0, //
- 167, 0, 227, 28, //
+ 71, 0, 99, 0, //
+ 167, 0, 227, 28, //
};
float input_scale = 0.25;
@@ -199,7 +199,7 @@ TEST(DepthwiseConv2DTest, SInt16_CWQ_weights)
std::vector<int32_t> zerop(4, 0);
Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data);
Tensor filter_tensor =
- makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 3, filter_data);
+ makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 3, filter_data);
Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, 0.5, 0);
@@ -229,20 +229,20 @@ TEST(DepthwiseConv2DTest, Uint8_CWQ_weights)
std::vector<int32_t> ref_output_shape{1, 2, 1, output_channels};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
std::vector<float> ref_output_data{
- 71, -34, 99, -20, //
- 91, -26, 127, -4, //
+ 71, -34, 99, -20, //
+ 91, -26, 127, -4, //
};
std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(0, 16);
@@ -270,10 +270,10 @@ TEST(DepthwiseConv2DTest, Uint8_CWQ_weights)
Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
input_quant_param.second, input_data);
Tensor filter_tensor =
- makeInputTensor<DataType::U8>(filter_shape, filter_scales, filter_zerops, 3, filter_data);
+ makeInputTensor<DataType::U8>(filter_shape, filter_scales, filter_zerops, 3, filter_data);
Tensor bias_tensor = makeInputTensor<DataType::S32>(bias_shape, bias_scales, zerop, 0, bias_data);
Tensor output_tensor =
- makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+ makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
DepthwiseConv2DParams params{};
params.padding = Padding::VALID;
@@ -299,16 +299,16 @@ TEST(DepthwiseConv2DTest, InvalidBiasType_NEG)
Shape filter_shape{1, 2, 2, 4};
Shape bias_shape{4};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
- 13, 14, 15, 16, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
+ 13, 14, 15, 16, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<int32_t> bias_data{1, 2, 3, 4};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -335,16 +335,16 @@ TEST(DepthwiseConv2DTest, InOutTypeMismatch_NEG)
Shape filter_shape{1, 2, 2, 4};
Shape bias_shape{4};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
- 13, 14, 15, 16, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
+ 13, 14, 15, 16, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -371,16 +371,16 @@ TEST(DepthwiseConv2DTest, InvalidInputShape_NEG)
Shape filter_shape{2, 2, 4};
Shape bias_shape{4};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
- 13, 14, 15, 16, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
+ 13, 14, 15, 16, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -407,16 +407,16 @@ TEST(DepthwiseConv2DTest, InvalidFilterShape_NEG)
Shape filter_shape{2, 1, 2, 4};
Shape bias_shape{4};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
- 13, 14, 15, 16, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
+ 13, 14, 15, 16, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
@@ -443,16 +443,16 @@ TEST(DepthwiseConv2DTest, InvalidBiasDim_NEG)
Shape filter_shape{1, 2, 4, 2};
Shape bias_shape{4};
std::vector<float> input_data{
- 1, 2, 7, 8, //
- 3, 4, 9, 10, //
- 5, 6, 11, 12, //
- 13, 14, 15, 16, //
+ 1, 2, 7, 8, //
+ 3, 4, 9, 10, //
+ 5, 6, 11, 12, //
+ 13, 14, 15, 16, //
};
std::vector<float> filter_data{
- 1, 2, 3, 4, //
- -9, 10, -11, 12, //
- 5, 6, 7, 8, //
- 13, -14, 15, -16, //
+ 1, 2, 3, 4, //
+ -9, 10, -11, 12, //
+ 5, 6, 7, 8, //
+ 13, -14, 15, -16, //
};
std::vector<float> bias_data{1, 2, 3, 4};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
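Note on the _CWQ_weights tests above: "CWQ" is channel-wise quantization — the filter carries one scale per output channel (quantized along dimension 3, zero points all 0) instead of a single tensor-wide scale. A minimal sketch, assuming the usual affine-quantization convention (names here are illustrative, not the luci-interpreter API), of how the per-channel rescale factor is derived:

    #include <cstdint>
    #include <vector>

    // For channel-wise quantized weights, the accumulator for output channel c
    // is rescaled by input_scale * filter_scale[c] / output_scale before the
    // activation clamp, so each channel gets its own effective multiplier.
    std::vector<double> effectiveScales(double input_scale,
                                        const std::vector<double> &filter_scales,
                                        double output_scale)
    {
      std::vector<double> eff(filter_scales.size());
      for (std::size_t c = 0; c < filter_scales.size(); ++c)
        eff[c] = input_scale * filter_scales[c] / output_scale;
      return eff;
    }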
diff --git a/compiler/luci-interpreter/src/kernels/Div.cpp b/compiler/luci-interpreter/src/kernels/Div.cpp
index e75876b3a..db1496d37 100644
--- a/compiler/luci-interpreter/src/kernels/Div.cpp
+++ b/compiler/luci-interpreter/src/kernels/Div.cpp
@@ -26,7 +26,7 @@ namespace kernels
{
Div::Div(const Tensor *input1, const Tensor *input2, Tensor *output, const DivParams &params)
- : KernelWithParams<DivParams>({input1, input2}, {output}, params)
+ : KernelWithParams<DivParams>({input1, input2}, {output}, params)
{
}
@@ -63,13 +63,13 @@ void Div::evalFloat() const
params.float_activation_min = activation_min;
params.float_activation_max = activation_max;
const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), &params);
+ getTensorShape(input1()), getTensorShape(input2()), &params);
if (need_broadcast)
{
tflite::reference_ops::BroadcastDivSlow(
- params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
- getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
+ params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
+ getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
}
else
{
@@ -107,14 +107,13 @@ void Div::evalQuantized() const
params.quantized_activation_max = activation_max;
const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), &params);
+ getTensorShape(input1()), getTensorShape(input2()), &params);
if (need_broadcast)
{
tflite::reference_ops::BroadcastDivSlow(
- params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
- getTensorShape(input2()), getTensorData<uint8_t>(input2()), getTensorShape(output()),
- getTensorData<uint8_t>(output()));
+ params, getTensorShape(input1()), getTensorData<uint8_t>(input1()), getTensorShape(input2()),
+ getTensorData<uint8_t>(input2()), getTensorShape(output()), getTensorData<uint8_t>(output()));
}
else
{
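Both eval paths above follow the same pattern: ProcessBroadcastShapes decides whether the operand shapes need broadcasting, then the kernel dispatches to a slow broadcasting loop or a fast elementwise one. A self-contained sketch of the elementwise branch with the fused activation clamp (illustrative only; the real kernel delegates to tflite::reference_ops):

    #include <algorithm>
    #include <cstddef>

    // Elementwise float division with fused activation clamping, as in
    // Div::evalFloat when no broadcast is needed. act_min/act_max come from
    // the Activation enum (e.g. RELU -> [0, +inf)).
    void divElementwise(const float *x, const float *y, float *out,
                        std::size_t n, float act_min, float act_max)
    {
      for (std::size_t i = 0; i < n; ++i)
        out[i] = std::min(std::max(x[i] / y[i], act_min), act_max);
    }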
diff --git a/compiler/luci-interpreter/src/kernels/Div.test.cpp b/compiler/luci-interpreter/src/kernels/Div.test.cpp
index 77eb2e9c1..1a0c4af15 100644
--- a/compiler/luci-interpreter/src/kernels/Div.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Div.test.cpp
@@ -99,12 +99,12 @@ TEST(DivTest, Uint8)
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.f, 1.f);
Tensor input1_tensor =
- makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, input1_data);
+ makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, input1_data);
Tensor input2_tensor =
- makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, input2_data);
+ makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, input2_data);
Tensor output_tensor =
- makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+ makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
DivParams params{};
params.activation = Activation::RELU;
diff --git a/compiler/luci-interpreter/src/kernels/Elu.test.cpp b/compiler/luci-interpreter/src/kernels/Elu.test.cpp
index 0235d6552..e26eed03e 100644
--- a/compiler/luci-interpreter/src/kernels/Elu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Elu.test.cpp
@@ -43,25 +43,25 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
TEST(EluTest, SimpleElu)
{
Check(
- /*input_shape=*/{1, 2, 4, 1}, /*output_shape=*/{1, 2, 4, 1},
- /*input_data=*/
- {
- 0, -6, 2, -4, //
- 3, -2, 10, -0.1, //
- },
- /*output_data=*/
- {
- 0.0, -0.997521, 2.0, -0.981684, //
- 3.0, -0.864665, 10.0, -0.0951626, //
- });
+ /*input_shape=*/{1, 2, 4, 1}, /*output_shape=*/{1, 2, 4, 1},
+ /*input_data=*/
+ {
+ 0, -6, 2, -4, //
+ 3, -2, 10, -0.1, //
+ },
+ /*output_data=*/
+ {
+ 0.0, -0.997521, 2.0, -0.981684, //
+ 3.0, -0.864665, 10.0, -0.0951626, //
+ });
}
TEST(EluTest, InOutTypeMismatch_NEG)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- 0, -6, 2, -4, //
- 3, -2, 10, -0.1, //
+ 0, -6, 2, -4, //
+ 3, -2, 10, -0.1, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8);
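For reference, the expected values in SimpleElu follow the standard ELU definition, f(x) = x for x > 0 and exp(x) - 1 otherwise; e.g. f(-4) = e^-4 - 1 ≈ -0.981684, matching the table above. A one-line sketch:

    #include <cmath>

    // Standard ELU with alpha = 1; reproduces the SimpleElu expectations
    // (elu(-6) ≈ -0.997521, elu(-0.1) ≈ -0.0951626).
    float elu(float x) { return x > 0.0f ? x : std::exp(x) - 1.0f; }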
diff --git a/compiler/luci-interpreter/src/kernels/Equal.test.cpp b/compiler/luci-interpreter/src/kernels/Equal.test.cpp
index fb0de8bbf..ba2827ba9 100644
--- a/compiler/luci-interpreter/src/kernels/Equal.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Equal.test.cpp
@@ -30,18 +30,18 @@ using namespace testing;
TEST(EqualTest, FloatSimple)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
- -1, 0, 1, // Row 2
+ 0.9, 0.7, 0.5, // Row 1
+ -1, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- false, true, false, // Row 1
- false, true, false, // Row 2
+ false, true, false, // Row 1
+ false, true, false, // Row 2
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
@@ -59,21 +59,21 @@ TEST(EqualTest, FloatSimple)
TEST(EqualTest, FloatBroardcast)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
- -1, 0, 1, // Row 3
- 0.9, 0.7, 0.5, // Row 4
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
+ -1, 0, 1, // Row 3
+ 0.9, 0.7, 0.5, // Row 4
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
+ 0.9, 0.7, 0.5, // Row 1
};
std::vector<bool> ref_output_data{
- false, true, false, // Row 1
- false, false, false, // Row 2
- false, false, false, // Row 3
- true, true, true, // Row 4
+ false, true, false, // Row 1
+ false, false, false, // Row 2
+ false, false, false, // Row 3
+ true, true, true, // Row 4
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data);
@@ -95,27 +95,27 @@ const float F_MAX = 127.0 / 128.0;
TEST(EqualTest, Uint8Quantized)
{
std::vector<float> x_data{
- 0.5, 0.5, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.5, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.5, 0.55, 0.5, // Row 1
- -1, 0, 0.05, 1, // Row 2
+ 0.9, 0.5, 0.55, 0.5, // Row 1
+ -1, 0, 0.05, 1, // Row 2
};
std::vector<bool> ref_output_data{
- false, true, false, false, // Row 1
- false, true, true, false, // Row 2
+ false, true, false, false, // Row 1
+ false, true, true, false, // Row 2
};
std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
- Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
- x_quant_param.second, x_data);
+ Tensor x_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 2);
- Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
- y_quant_param.second, y_data);
+ Tensor y_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
@@ -130,28 +130,28 @@ TEST(EqualTest, Uint8Quantized)
TEST(EqualTest, Uint8QuantizedBroadcast)
{
std::vector<float> x_data{
- 0.4, -0.8, 0.7, 0.3, // Row 1
- -0.5, 0.1, 0, 0.5, // Row 2
- 1, 0, 0.05, -1, // Row 3
- -1, 0.05, 0, 1, // Row 4
+ 0.4, -0.8, 0.7, 0.3, // Row 1
+ -0.5, 0.1, 0, 0.5, // Row 2
+ 1, 0, 0.05, -1, // Row 3
+ -1, 0.05, 0, 1, // Row 4
};
std::vector<float> y_data{
- -1, 0.05, 0, 1, // Row 1
+ -1, 0.05, 0, 1, // Row 1
};
std::vector<bool> ref_output_data{
- false, false, false, false, // Row 1
- false, false, true, false, // Row 2
- false, false, false, false, // Row 3
- true, true, true, true, // Row 4
+ false, false, false, false, // Row 1
+ false, false, true, false, // Row 2
+ false, false, false, false, // Row 3
+ true, true, true, true, // Row 4
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 4, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 4, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
Equal kernel(&x_tensor, &y_tensor, &output_tensor);
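The quantized comparison tests above all derive a (scale, zero_point) pair from a float range via quantizationParams<uint8_t>(F_MIN, F_MAX). A minimal sketch of the usual asymmetric derivation such helpers implement (assumed behavior, not the test harness itself; assumes a non-degenerate range):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <utility>

    // Map a float range [rmin, rmax] (forced to contain 0 so that zero is
    // exactly representable) onto uint8 codes [0, 255].
    std::pair<float, int32_t> quantParamsU8(float rmin, float rmax)
    {
      rmin = std::min(rmin, 0.0f);
      rmax = std::max(rmax, 0.0f);
      const float scale = (rmax - rmin) / 255.0f;
      const int32_t zero_point = static_cast<int32_t>(std::round(-rmin / scale));
      return {scale, zero_point};
    }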
diff --git a/compiler/luci-interpreter/src/kernels/Floor.test.cpp b/compiler/luci-interpreter/src/kernels/Floor.test.cpp
index 3e1ab6f3a..d90d611d9 100644
--- a/compiler/luci-interpreter/src/kernels/Floor.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Floor.test.cpp
@@ -30,14 +30,14 @@ TEST(FloorTest, SimpleFloat)
{
std::initializer_list<int32_t> input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- 0.2, 8.6, 2.4, 4.3, // Row 1
- 3, 7.1, 10.5, -0.9, // Row 2
+ 0.2, 8.6, 2.4, 4.3, // Row 1
+ 3, 7.1, 10.5, -0.9, // Row 2
};
std::initializer_list<int32_t> ref_output_shape{1, 2, 4, 1};
std::vector<float> ref_output_data{
- 0, 8, 2, 4, // Row 1
- 3, 7, 10, -1, // Row 2
+ 0, 8, 2, 4, // Row 1
+ 3, 7, 10, -1, // Row 2
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
diff --git a/compiler/luci-interpreter/src/kernels/FloorDiv.cpp b/compiler/luci-interpreter/src/kernels/FloorDiv.cpp
index b6f36cea3..a7a10a336 100644
--- a/compiler/luci-interpreter/src/kernels/FloorDiv.cpp
+++ b/compiler/luci-interpreter/src/kernels/FloorDiv.cpp
@@ -28,7 +28,7 @@ namespace kernels
{
FloorDiv::FloorDiv(const Tensor *input, const Tensor *alpha, Tensor *output)
- : Kernel({input, alpha}, {output})
+ : Kernel({input, alpha}, {output})
{
}
@@ -70,14 +70,14 @@ void FloorDiv::evalFloat() const
if (x()->shape() != y()->shape())
{
tflite::reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
- getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
- getTensorData<float>(output()), FloorDivFunc);
+ getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+ getTensorData<float>(output()), FloorDivFunc);
}
else
{
tflite::reference_ops::BinaryFunction<float, float, float>(
- getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
- getTensorData<float>(output()), FloorDivFunc);
+ getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+ getTensorData<float>(output()), FloorDivFunc);
}
}
diff --git a/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp b/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp
index a5bc700f7..16831ca80 100644
--- a/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/FloorDiv.test.cpp
@@ -31,20 +31,20 @@ TEST(FloorDivTest, FloatSimple)
{
Shape x_shape{2, 3};
std::vector<float> x_data{
- 0.5, 2.4, 3.1, // Row 1
- 1.9, -1.9, -2.8, // Row 2
+ 0.5, 2.4, 3.1, // Row 1
+ 1.9, -1.9, -2.8, // Row 2
};
Shape y_shape = x_shape;
std::vector<float> y_data{
- 2.0, 0.5, 3.0, // Row 1
- 1.0, -1.0, -2.0, // Row 2
+ 2.0, 0.5, 3.0, // Row 1
+ 1.0, -1.0, -2.0, // Row 2
};
std::vector<int32_t> ref_output_shape{2, 3};
std::vector<float> ref_output_data{
- 0, 4, 1, // Row 1
- 1, 1, 1, // Row 2
+ 0, 4, 1, // Row 1
+ 1, 1, 1, // Row 2
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_shape, x_data);
@@ -64,21 +64,21 @@ TEST(FloorDivTest, FloatBroadcast)
{
Shape x_shape{1, 3};
std::vector<float> x_data{
- 0.5, 2.4, -3.1, // Row 1
+ 0.5, 2.4, -3.1, // Row 1
};
Shape y_shape{3, 3};
std::vector<float> y_data{
- 1.0, 1.0, 1.0, // Row 1
- 2.0, -0.5, -2.0, // Row 2
- 0.3, 0.7, 0.9, // Row 3
+ 1.0, 1.0, 1.0, // Row 1
+ 2.0, -0.5, -2.0, // Row 2
+ 0.3, 0.7, 0.9, // Row 3
};
std::vector<int32_t> ref_output_shape{3, 3};
std::vector<float> ref_output_data{
- 0, 2, -4, // Row 1
- 0, -5, 1, // Row 2
- 1, 3, -4, // Row 3
+ 0, 2, -4, // Row 1
+ 0, -5, 1, // Row 2
+ 1, 3, -4, // Row 3
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>(x_shape, x_data);
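The FloorDivFunc passed to the binary-function helpers earlier is presumably floor(x / y); that definition reproduces every entry of ref_output_data above, e.g. floor(2.4 / 0.5) = 4 and floor(-1.9 / -1.0) = floor(1.9) = 1. A sketch:

    #include <cmath>

    // Reference floor-division used by the FloorDiv kernel: floor(x / y).
    float floorDiv(float x, float y) { return std::floor(x / y); }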
diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.cpp b/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
index 7fa76d5e7..48433b42d 100644
--- a/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
+++ b/compiler/luci-interpreter/src/kernels/FullyConnected.cpp
@@ -30,7 +30,7 @@ namespace kernels
FullyConnected::FullyConnected(const Tensor *input, const Tensor *weights, const Tensor *bias,
Tensor *output, const FullyConnectedParams &params)
- : KernelWithParams<FullyConnectedParams>({input, weights, bias}, {output}, params)
+ : KernelWithParams<FullyConnectedParams>({input, weights, bias}, {output}, params)
{
}
@@ -97,9 +97,9 @@ void FullyConnected::evalFloat() const
params.weights_format = tflite::FullyConnectedWeightsFormat::kDefault;
tflite::reference_ops::FullyConnected(
- params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(weights()),
- getTensorData<float>(weights()), getTensorShape(bias()), getTensorData<float>(bias()),
- getTensorShape(output()), getTensorData<float>(output()));
+ params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(weights()),
+ getTensorData<float>(weights()), getTensorShape(bias()), getTensorData<float>(bias()),
+ getTensorShape(output()), getTensorData<float>(output()));
}
void FullyConnected::evalQuantized() const
@@ -110,7 +110,7 @@ void FullyConnected::evalQuantized() const
int32_t output_activation_max;
int32_t output_multiplier;
real_multiplier =
- getQuantizedConvolutionMultipler(input()->scale(), weights()->scale(), output()->scale());
+ getQuantizedConvolutionMultipler(input()->scale(), weights()->scale(), output()->scale());
quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
calculateActivationRangeQuantized(params().activation, output(), &output_activation_min,
&output_activation_max);
@@ -130,9 +130,9 @@ void FullyConnected::evalQuantized() const
op_params.lhs_cacheable = false;
op_params.rhs_cacheable = false;
tflite::reference_ops::FullyConnected(
- op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
- getTensorShape(weights()), getTensorData<uint8_t>(weights()), getTensorShape(bias()),
- getTensorData<int32_t>(bias()), getTensorShape(output()), getTensorData<uint8_t>(output()));
+ op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(weights()),
+ getTensorData<uint8_t>(weights()), getTensorShape(bias()), getTensorData<int32_t>(bias()),
+ getTensorShape(output()), getTensorData<uint8_t>(output()));
}
} // namespace kernels
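evalQuantized above folds the three scales into one real multiplier, input_scale * weights_scale / output_scale, and quantizeMultiplier then splits it into a Q31 fixed-point mantissa plus a power-of-two shift. A sketch of that decomposition, assuming the usual TFLite scheme (treat the exact rounding behavior as illustrative):

    #include <cmath>
    #include <cstdint>

    // Decompose real_multiplier into quantized_multiplier * 2^shift, where
    // quantized_multiplier is a Q31 value in [2^30, 2^31).
    void quantizeMultiplierSketch(double real_multiplier,
                                  int32_t *quantized_multiplier, int *shift)
    {
      if (real_multiplier == 0.0)
      {
        *quantized_multiplier = 0;
        *shift = 0;
        return;
      }
      const double q = std::frexp(real_multiplier, shift); // q in [0.5, 1)
      int64_t q31 = static_cast<int64_t>(std::round(q * (1ll << 31)));
      if (q31 == (1ll << 31)) // rounding overflowed; renormalize
      {
        q31 /= 2;
        ++*shift;
      }
      *quantized_multiplier = static_cast<int32_t>(q31);
    }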
diff --git a/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp b/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
index d194ce1a0..0259d3e1d 100644
--- a/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/FullyConnected.test.cpp
@@ -50,10 +50,10 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
template <>
void Check<uint8_t>(
- std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> weights_shape,
- std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
- std::initializer_list<float> input_data, std::initializer_list<float> weights_data,
- std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
+ std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> weights_shape,
+ std::initializer_list<int32_t> bias_shape, std::initializer_list<int32_t> output_shape,
+ std::initializer_list<float> input_data, std::initializer_list<float> weights_data,
+ std::initializer_list<float> bias_data, std::initializer_list<float> output_data)
{
const float quantized_tolerance = getTolerance(-127, 128, 255);
std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(-63.5, 64);
@@ -63,9 +63,9 @@ void Check<uint8_t>(
Tensor weights_tensor = makeInputTensor<DataType::U8>(weights_shape, input_quant_param.first,
input_quant_param.second, weights_data);
Tensor bias_tensor = makeInputTensor<DataType::S32>(
- bias_shape, input_quant_param.first * input_quant_param.first, 0, bias_data);
+ bias_shape, input_quant_param.first * input_quant_param.first, 0, bias_data);
Tensor output_tensor =
- makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+ makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
FullyConnectedParams params{};
params.activation = Activation::RELU;
@@ -90,32 +90,33 @@ TYPED_TEST(FullyConnectedTest, Simple)
{
Check<TypeParam>({3, 2, 2, 1}, {3, 6}, {3}, {2, 3},
{
- -3, -5, 5, 4, 9, -2, // batch = 0
- -3, -2, -4, 9, -8, 1, // batch = 1
+ -3, -5, 5, 4, 9, -2, // batch = 0
+ -3, -2, -4, 9, -8, 1, // batch = 1
},
{
- -3, -7, 4, -4, -6, 4, // unit = 0
- 3, 5, 2, 3, -3, -8, // unit = 1
- -3, 7, 4, 9, 0, -5, // unit = 2
+ -3, -7, 4, -4, -6, 4, // unit = 0
+ 3, 5, 2, 3, -3, -8, // unit = 1
+ -3, 7, 4, 9, 0, -5, // unit = 2
},
- {-1, -5, -8}, {
- 0, 0, 32, // batch = 0
- 22, 11, 47, // batch = 1
- });
+ {-1, -5, -8},
+ {
+ 0, 0, 32, // batch = 0
+ 22, 11, 47, // batch = 1
+ });
}
TEST(FullyConnectedTest, InvalidBiasType_NEG)
{
Shape input_shape{3, 2, 2, 1};
std::vector<float> input_data{
- -3, -5, 5, 4, 9, -2, // batch = 0
- -3, -2, -4, 9, -8, 1, // batch = 1
+ -3, -5, 5, 4, 9, -2, // batch = 0
+ -3, -2, -4, 9, -8, 1, // batch = 1
};
Shape weights_shape{3, 6};
std::vector<float> weights_data{
- -3, -7, 4, -4, -6, 4, // unit = 0
- 3, 5, 2, 3, -3, -8, // unit = 1
- -3, 7, 4, 9, 0, -5, // unit = 2
+ -3, -7, 4, -4, -6, 4, // unit = 0
+ 3, 5, 2, 3, -3, -8, // unit = 1
+ -3, 7, 4, 9, 0, -5, // unit = 2
};
Shape bias_shape{3};
std::vector<int32_t> bias_data{-1, -5, -8};
@@ -136,14 +137,14 @@ TEST(FullyConnectedTest, InvalidWeightShapeDim_NEG)
{
Shape input_shape{3, 2, 2, 1};
std::vector<float> input_data{
- -3, -5, 5, 4, 9, -2, // batch = 0
- -3, -2, -4, 9, -8, 1, // batch = 1
+ -3, -5, 5, 4, 9, -2, // batch = 0
+ -3, -2, -4, 9, -8, 1, // batch = 1
};
Shape weights_shape{1, 3, 6};
std::vector<float> weights_data{
- -3, -7, 4, -4, -6, 4, // unit = 0
- 3, 5, 2, 3, -3, -8, // unit = 1
- -3, 7, 4, 9, 0, -5, // unit = 2
+ -3, -7, 4, -4, -6, 4, // unit = 0
+ 3, 5, 2, 3, -3, -8, // unit = 1
+ -3, 7, 4, 9, 0, -5, // unit = 2
};
Shape bias_shape{3};
std::vector<float> bias_data{-1, -5, -8};
@@ -164,17 +165,17 @@ TEST(FullyConnectedTest, BiasElementNumWeightDimMismatch_NEG)
{
Shape input_shape{3, 2, 2, 1};
std::vector<float> input_data{
- -3, -5, 5, 4, 9, -2, // batch = 0
- -3, -2, -4, 9, -8, 1, // batch = 1
+ -3, -5, 5, 4, 9, -2, // batch = 0
+ -3, -2, -4, 9, -8, 1, // batch = 1
};
Shape weights_shape{6, 3};
std::vector<float> weights_data{
- -3, -7, 4, // unit = 0
- -4, -6, 4, // unit = 1
- 3, 5, 2, // unit = 2
- 3, -3, -8, // unit = 3
- -3, 7, 4, // unit = 4
- 9, 0, -5, // unit = 5
+ -3, -7, 4, // unit = 0
+ -4, -6, 4, // unit = 1
+ 3, 5, 2, // unit = 2
+ 3, -3, -8, // unit = 3
+ -3, 7, 4, // unit = 4
+ 9, 0, -5, // unit = 5
};
Shape bias_shape{3};
std::vector<float> bias_data{-1, -5, -8};
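The float reference behind these tests is a plain per-batch GEMV: output[b][u] = act(Σ_i input[b][i] * weights[u][i] + bias[u]). With the Simple data above, batch 0 against unit 2 gives 40 - 8 = 32, and the full sketch below reproduces the expected {0, 0, 32, 22, 11, 47} after RELU:

    #include <algorithm>
    #include <vector>

    // Float fully-connected reference: weights are stored unit-major (one row
    // of `depth` values per output unit); activation here is RELU.
    std::vector<float> fullyConnectedRef(const std::vector<float> &input,
                                         const std::vector<float> &weights,
                                         const std::vector<float> &bias,
                                         int batches, int depth, int units)
    {
      std::vector<float> out(batches * units);
      for (int b = 0; b < batches; ++b)
        for (int u = 0; u < units; ++u)
        {
          float acc = bias[u];
          for (int i = 0; i < depth; ++i)
            acc += input[b * depth + i] * weights[u * depth + i];
          out[b * units + u] = std::max(acc, 0.0f); // RELU
        }
      return out;
    }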
diff --git a/compiler/luci-interpreter/src/kernels/Greater.test.cpp b/compiler/luci-interpreter/src/kernels/Greater.test.cpp
index 3122fa840..3fcc86603 100644
--- a/compiler/luci-interpreter/src/kernels/Greater.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Greater.test.cpp
@@ -30,18 +30,18 @@ using namespace testing;
TEST(GreaterTest, FloatSimple)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
- -1, 0, 1, // Row 2
+ 0.9, 0.7, 0.5, // Row 1
+ -1, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- false, false, true, // Row 1
- true, false, false, // Row 2
+ false, false, true, // Row 1
+ true, false, false, // Row 2
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
@@ -59,19 +59,19 @@ TEST(GreaterTest, FloatSimple)
TEST(GreaterTest, FloatBroardcast)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
- -1, 0, 1, // Row 3
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
+ -1, 0, 1, // Row 3
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
+ 0.9, 0.7, 0.5, // Row 1
};
std::vector<bool> ref_output_data{
- false, false, true, // Row 1
- true, false, false, // Row 2
- false, false, true, // Row 3
+ false, false, true, // Row 1
+ true, false, false, // Row 2
+ false, false, true, // Row 3
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
@@ -93,25 +93,25 @@ const float F_MAX = 127.0 / 128.0;
TEST(GreaterTest, Uint8Quantized)
{
std::vector<float> x_data{
- 0.5, 0.6, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.6, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.6, 0.6, 0.5, // Row 1
- -1, 0.05, 0, 1, // Row 2
+ 0.9, 0.6, 0.6, 0.5, // Row 1
+ -1, 0.05, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- false, false, true, true, // Row 1
- true, false, true, false, // Row 2
+ false, false, true, true, // Row 1
+ true, false, true, false, // Row 2
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
Greater kernel(&x_tensor, &y_tensor, &output_tensor);
@@ -125,27 +125,27 @@ TEST(GreaterTest, Uint8Quantized)
TEST(GreaterTest, Uint8QuantizedRescale)
{
std::vector<float> x_data{
- 0.5, 0.6, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.6, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.6, 0.6, 0.5, // Row 1
- -1, 0.05, 0, 1, // Row 2
+ 0.9, 0.6, 0.6, 0.5, // Row 1
+ -1, 0.05, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- false, false, true, true, // Row 1
- true, false, true, false, // Row 2
+ false, false, true, true, // Row 1
+ true, false, true, false, // Row 2
};
std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 3);
- Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
- x_quant_param.second, x_data);
- Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
- y_quant_param.second, y_data);
+ Tensor x_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
+ Tensor y_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
Greater kernel(&x_tensor, &y_tensor, &output_tensor);
@@ -159,26 +159,26 @@ TEST(GreaterTest, Uint8QuantizedRescale)
TEST(GreaterTest, Uint8QuantizedBroadcast)
{
std::vector<float> x_data{
- 0.4, -0.8, 0.7, 0.3, // Row 1
- -0.5, 0.1, 0, 0.5, // Row 2
- 1, 0, 0.05, -1, // Row 3
+ 0.4, -0.8, 0.7, 0.3, // Row 1
+ -0.5, 0.1, 0, 0.5, // Row 2
+ 1, 0, 0.05, -1, // Row 3
};
std::vector<float> y_data{
- -1, 0.05, 0, 1, // Row 1
+ -1, 0.05, 0, 1, // Row 1
};
std::vector<bool> ref_output_data{
- true, false, true, false, // Row 1
- true, true, false, false, // Row 2
- true, false, true, false, // Row 3
+ true, false, true, false, // Row 1
+ true, true, false, false, // Row 2
+ true, false, true, false, // Row 3
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
Greater kernel(&x_tensor, &y_tensor, &output_tensor);
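The quantized comparison kernels never materialize float buffers; tflite rescales both operands to a shared fixed-point representation and compares integers. Up to rounding, that is equivalent to comparing dequantized values, which this sketch does directly (illustrative, not the kernel's actual code path):

    #include <cstdint>

    // Dequantize-and-compare; numerically equivalent (up to rounding) to the
    // scaled integer comparison used by the quantized Greater/Less/Equal kernels.
    inline float dequant(uint8_t q, float scale, int32_t zero_point)
    {
      return scale * (static_cast<int32_t>(q) - zero_point);
    }

    bool greaterQ(uint8_t x, float xs, int32_t xz,
                  uint8_t y, float ys, int32_t yz)
    {
      return dequant(x, xs, xz) > dequant(y, ys, yz);
    }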
diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp b/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp
index 68135e27c..e7c1b4afe 100644
--- a/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp
+++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.cpp
@@ -28,7 +28,7 @@ namespace kernels
{
GreaterEqual::GreaterEqual(const Tensor *x, const Tensor *y, Tensor *output)
- : Kernel({x, y}, {output})
+ : Kernel({x, y}, {output})
{
}
@@ -101,8 +101,8 @@ void GreaterEqual::evalQuantized() const
if (op_params.is_broadcast)
{
tflite::reference_ops::Broadcast4DSlowGreaterEqualWithScaling(
- op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data,
- getTensorShape(output()), output_data);
+ op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+ output_data);
}
else
{
diff --git a/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp b/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp
index 11e62644c..7c79d8abc 100644
--- a/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/GreaterEqual.test.cpp
@@ -30,18 +30,18 @@ using namespace testing;
TEST(GreaterEqualTest, FloatSimple)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
- -1, 0, 1, // Row 2
+ 0.9, 0.7, 0.5, // Row 1
+ -1, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- false, true, true, // Row 1
- true, true, false, // Row 2
+ false, true, true, // Row 1
+ true, true, false, // Row 2
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
@@ -59,19 +59,19 @@ TEST(GreaterEqualTest, FloatSimple)
TEST(GreaterEqualTest, FloatBroardcast)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
- -1, 0, 1, // Row 3
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
+ -1, 0, 1, // Row 3
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
+ 0.9, 0.7, 0.5, // Row 1
};
std::vector<bool> ref_output_data{
- false, true, true, // Row 1
- true, false, false, // Row 2
- false, false, true, // Row 3
+ false, true, true, // Row 1
+ true, false, false, // Row 2
+ false, false, true, // Row 3
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
@@ -93,25 +93,25 @@ const float F_MAX = 127.0 / 128.0;
TEST(GreaterEqualTest, Uint8Quantized)
{
std::vector<float> x_data{
- 0.5, 0.6, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.6, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.6, 0.55, 0.5, // Row 1
- -1, 0.05, 0, 1, // Row 2
+ 0.9, 0.6, 0.55, 0.5, // Row 1
+ -1, 0.05, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- false, true, true, true, // Row 1
- true, false, true, false, // Row 2
+ false, true, true, true, // Row 1
+ true, false, true, false, // Row 2
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
@@ -125,27 +125,27 @@ TEST(GreaterEqualTest, Uint8Quantized)
TEST(GreaterEqualTest, Uint8QuantizedRescale)
{
std::vector<float> x_data{
- 0.5, 0.5, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.5, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.5, 0.6, 0.5, // Row 1
- -1, 0.05, 0, 1, // Row 2
+ 0.9, 0.5, 0.6, 0.5, // Row 1
+ -1, 0.05, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- false, true, true, true, // Row 1
- true, false, true, false, // Row 2
+ false, true, true, true, // Row 1
+ true, false, true, false, // Row 2
};
std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
- Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
- x_quant_param.second, x_data);
- Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
- y_quant_param.second, y_data);
+ Tensor x_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
+ Tensor y_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
@@ -159,26 +159,26 @@ TEST(GreaterEqualTest, Uint8QuantizedRescale)
TEST(GreaterEqualTest, Uint8QuantizedBroadcast)
{
std::vector<float> x_data{
- 0.4, -0.8, 0.7, 0.3, // Row 1
- -0.5, 0.1, 0, 0.5, // Row 2
- 1, 0, 0.05, -1, // Row 3
+ 0.4, -0.8, 0.7, 0.3, // Row 1
+ -0.5, 0.1, 0, 0.5, // Row 2
+ 1, 0, 0.05, -1, // Row 3
};
std::vector<float> y_data{
- -1, 0.05, 0, 1, // Row 1
+ -1, 0.05, 0, 1, // Row 1
};
std::vector<bool> ref_output_data{
- true, false, true, false, // Row 1
- true, true, true, false, // Row 2
- true, false, true, false, // Row 3
+ true, false, true, false, // Row 1
+ true, true, true, false, // Row 2
+ true, false, true, false, // Row 3
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
GreaterEqual kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/If.cpp b/compiler/luci-interpreter/src/kernels/If.cpp
index ca982d591..a267f6267 100644
--- a/compiler/luci-interpreter/src/kernels/If.cpp
+++ b/compiler/luci-interpreter/src/kernels/If.cpp
@@ -34,8 +34,8 @@ static std::vector<const Tensor *> joinInputs(const Tensor *cond,
If::If(const Tensor *cond, const std::vector<const Tensor *> &inputs, std::vector<Tensor *> outputs,
RuntimeGraph *then_graph, RuntimeGraph *else_graph)
- : Kernel(joinInputs(cond, inputs), std::move(outputs)), _then_graph(then_graph),
- _else_graph(else_graph)
+ : Kernel(joinInputs(cond, inputs), std::move(outputs)), _then_graph(then_graph),
+ _else_graph(else_graph)
{
}
diff --git a/compiler/luci-interpreter/src/kernels/If.test.cpp b/compiler/luci-interpreter/src/kernels/If.test.cpp
index 6967407fb..0dba310d9 100644
--- a/compiler/luci-interpreter/src/kernels/If.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/If.test.cpp
@@ -34,11 +34,11 @@ RuntimeGraph *buildAddSubgraph(RuntimeModule *module)
{
RuntimeGraph *graph = module->addGraph();
Tensor *input1 = graph->addTensor(
- std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
+ std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
Tensor *input2 = graph->addTensor(
- std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
+ std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
Tensor *output = graph->addTensor(
- std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
+ std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
graph->setInputTensors({input1, input2});
graph->setOutputTensors({output});
@@ -54,11 +54,11 @@ RuntimeGraph *buildMulSubgraph(RuntimeModule *module)
{
RuntimeGraph *graph = module->addGraph();
Tensor *input1 = graph->addTensor(
- std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
+ std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
Tensor *input2 = graph->addTensor(
- std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
+ std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
Tensor *output = graph->addTensor(
- std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
+ std::make_unique<Tensor>(DataType::FLOAT32, Shape{}, AffineQuantization{}, ""));
graph->setInputTensors({input1, input2});
graph->setOutputTensors({output});
diff --git a/compiler/luci-interpreter/src/kernels/InstanceNorm.cpp b/compiler/luci-interpreter/src/kernels/InstanceNorm.cpp
index 8e8241a28..b8317e2f2 100644
--- a/compiler/luci-interpreter/src/kernels/InstanceNorm.cpp
+++ b/compiler/luci-interpreter/src/kernels/InstanceNorm.cpp
@@ -28,7 +28,7 @@ namespace kernels
InstanceNorm::InstanceNorm(const Tensor *input, const Tensor *gamma, const Tensor *beta,
Tensor *output, const InstanceNormParams &params)
- : KernelWithParams<InstanceNormParams>({input, gamma, beta}, {output}, params)
+ : KernelWithParams<InstanceNormParams>({input, gamma, beta}, {output}, params)
{
}
@@ -96,11 +96,11 @@ void InstanceNorm::evalFloat() const
for (int32_t width = 0; width < widths; width++)
{
double input_value =
- input_data[tflite::Offset(output_shape, batch, height, width, channel)];
+ input_data[tflite::Offset(output_shape, batch, height, width, channel)];
double output_value = input_value * a + b;
output_data[tflite::Offset(output_shape, batch, height, width, channel)] =
- tflite::ActivationFunctionWithMinMax((float)output_value, activation_min,
- activation_max);
+ tflite::ActivationFunctionWithMinMax((float)output_value, activation_min,
+ activation_max);
}
}
}
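The `input_value * a + b` form above comes from folding instance normalization into a per-channel affine transform: with mean and variance computed over H×W for each (batch, channel), a = gamma / sqrt(var + eps) and b = beta - mean * a. A sketch of that fold:

    #include <cmath>

    // Fold instance-norm statistics and the learned gamma/beta into y = x*a + b,
    // the per-channel affine applied in InstanceNorm::evalFloat.
    void instanceNormAffine(float mean, float var, float epsilon, float gamma,
                            float beta, float *a, float *b)
    {
      *a = gamma / std::sqrt(var + epsilon);
      *b = beta - mean * *a;
    }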
diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
index 0bf133d9c..2eaf5404e 100644
--- a/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Normalize.cpp
@@ -28,7 +28,7 @@ namespace kernels
{
L2Normalize::L2Normalize(const Tensor *input, Tensor *output, const L2NormParams &params)
- : KernelWithParams<L2NormParams>({input}, {output}, params)
+ : KernelWithParams<L2NormParams>({input}, {output}, params)
{
}
diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
index 8f9431182..6281b451b 100644
--- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
@@ -51,11 +51,11 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
std::initializer_list<float> output_data)
{
std::pair<float, int32_t> quant_param =
- quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
- std::max(input_data) > 0 ? std::max(input_data) : 0.f);
+ quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
+ std::max(input_data) > 0 ? std::max(input_data) : 0.f);
Tensor input_tensor =
- makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
+ makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 128., 128);
L2NormParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp b/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
index 979364a7f..5bf3ba5a8 100644
--- a/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Pool2D.cpp
@@ -30,7 +30,7 @@ namespace kernels
{
L2Pool2D::L2Pool2D(const Tensor *input, Tensor *output, const Pool2DParams &params)
- : KernelWithParams<Pool2DParams>({input}, {output}, params)
+ : KernelWithParams<Pool2DParams>({input}, {output}, params)
{
}
@@ -49,11 +49,11 @@ void L2Pool2D::configure()
int out_width, out_height;
out_width = computeOutputSize(padding, width, params().filter_width, params().stride_width, 1);
out_height =
- computeOutputSize(padding, height, params().filter_height, params().stride_height, 1);
+ computeOutputSize(padding, height, params().filter_height, params().stride_height, 1);
_padding_width =
- computePadding(params().stride_width, 1, width, params().filter_width, out_width);
+ computePadding(params().stride_width, 1, width, params().filter_width, out_width);
_padding_height =
- computePadding(params().stride_height, 1, height, params().filter_height, out_height);
+ computePadding(params().stride_height, 1, height, params().filter_height, out_height);
LUCI_INTERPRETER_CHECK(input()->element_type() == DataType::FLOAT32);
output()->resize({batches, out_height, out_width, channels_out});
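computeOutputSize and computePadding above implement the usual TensorFlow padding arithmetic (dilation is fixed at 1 here): SAME yields ceil(in / stride) with symmetric padding, VALID yields floor((in - filter) / stride) + 1 with none. A hedged sketch of both (the real helpers also take a dilation factor):

    #include <algorithm>

    enum class Padding { SAME, VALID };

    // TensorFlow-style output size for dilation 1.
    int computeOutputSizeSketch(Padding padding, int in, int filter, int stride)
    {
      return padding == Padding::SAME ? (in + stride - 1) / stride
                                      : (in - filter) / stride + 1;
    }

    // Leading (left/top) padding: half of the total padding needed so that
    // `out` windows fit.
    int computePaddingSketch(int stride, int in, int filter, int out)
    {
      return std::max(0, (out - 1) * stride + filter - in) / 2;
    }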
diff --git a/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp b/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
index 5f834e3c1..52f426a08 100644
--- a/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Pool2D.test.cpp
@@ -31,8 +31,8 @@ TEST(L2Pool2DTest, FloatNone)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- 0, 6, 2, 4, //
- 3, 2, 10, 7, //
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -58,8 +58,8 @@ TEST(L2Pool2DTest, FloatRelu)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- -1, -6, 2, 4, //
- -3, -2, 10, 7, //
+ -1, -6, 2, 4, //
+ -3, -2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -85,8 +85,8 @@ TEST(L2Pool2DTest, FloatRelu1)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- -0.1, -0.6, 2, 4, //
- -0.3, -0.2, 10, 7, //
+ -0.1, -0.6, 2, 4, //
+ -0.3, -0.2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -112,8 +112,8 @@ TEST(L2Pool2DTest, FloatRelu6)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- -0.1, -0.6, 2, 4, //
- -0.3, -0.2, 10, 7, //
+ -0.1, -0.6, 2, 4, //
+ -0.3, -0.2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -139,8 +139,8 @@ TEST(L2Pool2DTest, FloatPaddingSame)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- 0, 6, 2, 4, //
- 3, 2, 10, 7, //
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -166,8 +166,8 @@ TEST(L2Pool2DTest, FloatPaddingSameStride)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- 0, 6, 2, 4, //
- 3, 2, 10, 7, //
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -193,8 +193,8 @@ TEST(L2Pool2DTest, FloatPaddingValidStride)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- 0, 6, 2, 4, //
- 3, 2, 10, 7, //
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -220,8 +220,8 @@ TEST(L2Pool2DTest, InvalidInputShape_NEG)
{
Shape input_shape{1, 2, 4};
std::vector<float> input_data{
- 0, 6, 2, 4, //
- 3, 2, 10, 7, //
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -242,8 +242,8 @@ TEST(L2Pool2DTest, InvalidInputOutputType_NEG)
{
Shape input_shape{1, 2, 4};
std::vector<float> input_data{
- 0, 6, 2, 4, //
- 3, 2, 10, 7, //
+ 0, 6, 2, 4, //
+ 3, 2, 10, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8);
diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
index 919b12792..f468da5d3 100644
--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
+++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.cpp
@@ -30,7 +30,7 @@ namespace kernels
{
LeakyRelu::LeakyRelu(const Tensor *input, Tensor *output, const LeakyReluParams &params)
- : KernelWithParams<LeakyReluParams>({input}, {output}, params)
+ : KernelWithParams<LeakyReluParams>({input}, {output}, params)
{
}
@@ -82,8 +82,8 @@ void LeakyRelu::evalQuantized() const
op_params.output_shift_identity = _output_shift_identity;
tflite::reference_ops::QuantizeLeakyRelu(
- op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(output()),
- getTensorData<uint8_t>(output()));
+ op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(output()),
+ getTensorData<uint8_t>(output()));
}
} // namespace kernels
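The float path of LeakyRelu is simply f(x) = x for x ≥ 0 and alpha * x otherwise; with alpha = 0.5 this reproduces the Simple expectations in the test below (-1 → -0.5, -2 → -1). The quantized path above precomputes two multiplier/shift pairs, one for the identity slope and one for alpha:

    // Float leaky ReLU; alpha is the negative-axis slope (0.5 in the Simple test).
    float leakyRelu(float x, float alpha) { return x >= 0.0f ? x : alpha * x; }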
diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
index 2778549ed..b5cc3e7fc 100644
--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
@@ -56,7 +56,7 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
const float quantized_tolerance = getTolerance(-8, 127.f / 16.f, 255);
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-8, 127.f / 16.f);
Tensor input_tensor =
- makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
+ makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
LeakyReluParams params{};
@@ -84,13 +84,13 @@ TYPED_TEST(LeakReluTest, Simple)
Check<TypeParam>(/*input_shape=*/{2, 3}, /*output_shape=*/{2, 3},
/*input_data=*/
{
- 0.0f, 1.0f, 3.0f, // Row 1
- 1.0f, -1.0f, -2.0f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 1.0f, -1.0f, -2.0f, // Row 2
},
/*output_data=*/
{
- 0.0f, 1.0f, 3.0f, // Row 1
- 1.0f, -0.5f, -1.0f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 1.0f, -0.5f, -1.0f, // Row 2
},
/*alpha=*/0.5f);
@@ -100,8 +100,8 @@ TYPED_TEST(LeakReluTest, Simple)
TEST(LeakReluTest, IvalidInputOutputType_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, {
- 0.0f, 1.0f, 3.0f, // Row 1
- 1.0f, -1.0f, -2.0f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 1.0f, -1.0f, -2.0f, // Row 2
});
Tensor output_tensor = makeOutputTensor(DataType::U8);
diff --git a/compiler/luci-interpreter/src/kernels/Less.test.cpp b/compiler/luci-interpreter/src/kernels/Less.test.cpp
index 73aa30b36..2972bd559 100644
--- a/compiler/luci-interpreter/src/kernels/Less.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Less.test.cpp
@@ -30,18 +30,18 @@ using namespace testing;
TEST(LessTest, FloatSimple)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
- -1, 0, 1, // Row 2
+ 0.9, 0.7, 0.5, // Row 1
+ -1, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- true, false, false, // Row 1
- false, false, true, // Row 2
+ true, false, false, // Row 1
+ false, false, true, // Row 2
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
@@ -59,19 +59,19 @@ TEST(LessTest, FloatSimple)
TEST(LessTest, FloatBroardcast)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
- -1, 0, 1, // Row 3
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
+ -1, 0, 1, // Row 3
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
+ 0.9, 0.7, 0.5, // Row 1
};
std::vector<bool> ref_output_data{
- true, false, false, // Row 1
- false, true, true, // Row 2
- true, true, false, // Row 3
+ true, false, false, // Row 1
+ false, true, true, // Row 2
+ true, true, false, // Row 3
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
@@ -93,25 +93,25 @@ const float F_MAX = 127.0 / 128.0;
TEST(LessTest, Uint8Quantized)
{
std::vector<float> x_data{
- 0.5, 0.6, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.6, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.6, 0.55, 0.5, // Row 1
- -1, 0.05, 0, 1, // Row 2
+ 0.9, 0.6, 0.55, 0.5, // Row 1
+ -1, 0.05, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- true, false, false, false, // Row 1
- false, true, false, true, // Row 2
+ true, false, false, false, // Row 1
+ false, true, false, true, // Row 2
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
Less kernel(&x_tensor, &y_tensor, &output_tensor);
@@ -125,27 +125,27 @@ TEST(LessTest, Uint8Quantized)
TEST(LessTest, Uint8QuantizedRescale)
{
std::vector<float> x_data{
- 0.5, 0.6, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.6, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.6, 0.6, 0.5, // Row 1
- -1, 0.05, 0, 1, // Row 2
+ 0.9, 0.6, 0.6, 0.5, // Row 1
+ -1, 0.05, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- true, false, false, false, // Row 1
- false, true, false, true, // Row 2
+ true, false, false, false, // Row 1
+ false, true, false, true, // Row 2
};
std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
- Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
- x_quant_param.second, x_data);
- Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
- y_quant_param.second, y_data);
+ Tensor x_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
+ Tensor y_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
Less kernel(&x_tensor, &y_tensor, &output_tensor);
@@ -159,26 +159,26 @@ TEST(LessTest, Uint8QuantizedRescale)
TEST(LessTest, Uint8QuantizedBroadcast)
{
std::vector<float> x_data{
- 0.4, -0.8, 0.7, 0.3, // Row 1
- -0.5, 0.1, 0, 0.5, // Row 2
- 1, 0, 0.05, -1, // Row 3
+ 0.4, -0.8, 0.7, 0.3, // Row 1
+ -0.5, 0.1, 0, 0.5, // Row 2
+ 1, 0, 0.05, -1, // Row 3
};
std::vector<float> y_data{
- -1, 0.05, 0, 1, // Row 1
+ -1, 0.05, 0, 1, // Row 1
};
std::vector<bool> ref_output_data{
- false, true, false, true, // Row 1
- false, false, false, true, // Row 2
- false, true, false, true, // Row 3
+ false, true, false, true, // Row 1
+ false, false, false, true, // Row 2
+ false, true, false, true, // Row 3
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
Less kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.cpp b/compiler/luci-interpreter/src/kernels/LessEqual.cpp
index b8aaba178..5f4c7f7aa 100644
--- a/compiler/luci-interpreter/src/kernels/LessEqual.cpp
+++ b/compiler/luci-interpreter/src/kernels/LessEqual.cpp
@@ -98,8 +98,8 @@ void LessEqual::evalQuantized() const
if (op_params.is_broadcast)
{
tflite::reference_ops::Broadcast4DSlowLessEqualWithScaling(
- op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data,
- getTensorShape(output()), output_data);
+ op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+ output_data);
}
else
{
diff --git a/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp b/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp
index 9184c061f..db65815a6 100644
--- a/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LessEqual.test.cpp
@@ -30,18 +30,18 @@ using namespace testing;
TEST(LessEqualTest, FloatSimple)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
- -1, 0, 1, // Row 2
+ 0.9, 0.7, 0.5, // Row 1
+ -1, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- true, true, false, // Row 1
- false, true, true, // Row 2
+ true, true, false, // Row 1
+ false, true, true, // Row 2
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
@@ -59,19 +59,19 @@ TEST(LessEqualTest, FloatSimple)
TEST(LessEqualTest, FloatBroardcast)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
- -1, 0, 1, // Row 3
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
+ -1, 0, 1, // Row 3
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
+ 0.9, 0.7, 0.5, // Row 1
};
std::vector<bool> ref_output_data{
- true, true, false, // Row 1
- false, true, true, // Row 2
- true, true, false, // Row 3
+ true, true, false, // Row 1
+ false, true, true, // Row 2
+ true, true, false, // Row 3
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({3, 3}, x_data);
@@ -93,25 +93,25 @@ const float F_MAX = 127.0 / 128.0;
TEST(LessEqualTest, Uint8Quantized)
{
std::vector<float> x_data{
- 0.5, 0.6, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.6, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.6, 0.55, 0.5, // Row 1
- -1, 0.05, 0, 1, // Row 2
+ 0.9, 0.6, 0.55, 0.5, // Row 1
+ -1, 0.05, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- true, true, false, false, // Row 1
- false, true, false, true, // Row 2
+ true, true, false, false, // Row 1
+ false, true, false, true, // Row 2
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
@@ -125,27 +125,27 @@ TEST(LessEqualTest, Uint8Quantized)
TEST(LessEqualTest, Uint8QuantizedRescale)
{
std::vector<float> x_data{
- 0.5, 0.6, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.6, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.6, 0.6, 0.5, // Row 1
- -1, 0.05, 0, 1, // Row 2
+ 0.9, 0.6, 0.6, 0.5, // Row 1
+ -1, 0.05, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- true, true, false, false, // Row 1
- false, true, false, true, // Row 2
+ true, true, false, false, // Row 1
+ false, true, false, true, // Row 2
};
std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 1.2, F_MAX * 1.5);
- Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
- x_quant_param.second, x_data);
- Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
- y_quant_param.second, y_data);
+ Tensor x_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
+ Tensor y_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
@@ -159,26 +159,26 @@ TEST(LessEqualTest, Uint8QuantizedRescale)
TEST(LessEqualTest, Uint8QuantizedBroadcast)
{
std::vector<float> x_data{
- 0.4, -0.8, 0.7, 0.3, // Row 1
- -0.5, 0.1, 0, 0.5, // Row 2
- 1, 0, 0.05, -1, // Row 3
+ 0.4, -0.8, 0.7, 0.3, // Row 1
+ -0.5, 0.1, 0, 0.5, // Row 2
+ 1, 0, 0.05, -1, // Row 3
};
std::vector<float> y_data{
- -1, 0.05, 0, 1, // Row 1
+ -1, 0.05, 0, 1, // Row 1
};
std::vector<bool> ref_output_data{
- false, true, false, true, // Row 1
- false, false, true, true, // Row 2
- false, true, false, true, // Row 3
+ false, true, false, true, // Row 1
+ false, false, true, true, // Row 2
+ false, true, false, true, // Row 3
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 3, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
LessEqual kernel(&x_tensor, &y_tensor, &output_tensor);
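
The Uint8QuantizedRescale case above compares tensors quantized with different scales, which only works because the comparison is defined on the real values, real = scale * (q - zero_point), not on the raw codes. A minimal float-level sketch of that semantics follows; the helper names are illustrative only, and the actual kernel rescales in the integer domain rather than dequantizing:

    #include <cstdint>

    // Illustrative only: dequantize both operands, then compare. The real
    // kernel avoids floats by folding the two scales into integer multipliers.
    inline float dequantize(uint8_t q, float scale, int32_t zero_point)
    {
      return scale * (static_cast<int32_t>(q) - zero_point);
    }

    inline bool lessEqualQuantized(uint8_t x, float x_scale, int32_t x_zp, //
                                   uint8_t y, float y_scale, int32_t y_zp)
    {
      return dequantize(x, x_scale, x_zp) <= dequantize(y, y_scale, y_zp);
    }
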
diff --git a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
index b78e27128..fd2ec41a1 100644
--- a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
+++ b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.cpp
@@ -29,8 +29,8 @@ namespace kernels
{
LocalResponseNormalization::LocalResponseNormalization(
- const Tensor *input, Tensor *output, const LocalResponseNormalizationParams &params)
- : KernelWithParams<LocalResponseNormalizationParams>({input}, {output}, params)
+ const Tensor *input, Tensor *output, const LocalResponseNormalizationParams &params)
+ : KernelWithParams<LocalResponseNormalizationParams>({input}, {output}, params)
{
}
@@ -53,8 +53,8 @@ void LocalResponseNormalization::execute() const
op_params.alpha = params().alpha;
op_params.beta = params().beta;
tflite::optimized_ops::LocalResponseNormalization(
- op_params, getTensorShape(input()), getTensorData<float>(input()),
- getTensorShape(output()), getTensorData<float>(output()));
+ op_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(output()),
+ getTensorData<float>(output()));
break;
default:
throw std::runtime_error("Unsupported type.");
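
For reference, local response normalization computes output[c] = input[c] * (bias + alpha * sum of input[k]^2 over a window of radius neighbors of c)^(-beta) along the depth axis. A self-contained 1-D sketch of that formula, with the caveat that the kernel itself delegates to tflite::optimized_ops over 4-D tensors:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Hedged sketch of LRN over a 1-D slice (the kernel applies this along
    // the innermost axis of a 4-D tensor).
    std::vector<float> localResponseNormalization1D(const std::vector<float> &in, int radius,
                                                    float bias, float alpha, float beta)
    {
      const int n = static_cast<int>(in.size());
      std::vector<float> out(n);
      for (int c = 0; c < n; ++c)
      {
        float accum = 0.0f;
        const int begin = std::max(0, c - radius);
        const int end = std::min(n - 1, c + radius);
        for (int k = begin; k <= end; ++k)
          accum += in[k] * in[k];
        out[c] = in[c] * std::pow(bias + alpha * accum, -beta);
      }
      return out;
    }
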
diff --git a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
index d98305c1a..6a4331d34 100644
--- a/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LocalResponseNormalization.test.cpp
@@ -30,7 +30,7 @@ using namespace testing;
TEST(LocalResponseNormalizationTest, SameAsL2Norm)
{
Tensor input_tensor =
- makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
LocalResponseNormalizationParams params{};
@@ -50,7 +50,7 @@ TEST(LocalResponseNormalizationTest, SameAsL2Norm)
TEST(LocalResponseNormalizationTest, WithAlpha)
{
Tensor input_tensor =
- makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
LocalResponseNormalizationParams params{};
@@ -70,7 +70,7 @@ TEST(LocalResponseNormalizationTest, WithAlpha)
TEST(LocalResponseNormalizationTest, WithBias)
{
Tensor input_tensor =
- makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
LocalResponseNormalizationParams params{};
@@ -90,7 +90,7 @@ TEST(LocalResponseNormalizationTest, WithBias)
TEST(LocalResponseNormalizationTest, SmallRadius)
{
Tensor input_tensor =
- makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
LocalResponseNormalizationParams params{};
@@ -110,7 +110,7 @@ TEST(LocalResponseNormalizationTest, SmallRadius)
TEST(LocalResponseNormalizationTest, InvalidInputDimension_NEG)
{
Tensor input_tensor =
- makeInputTensor<DataType::FLOAT32>({1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ makeInputTensor<DataType::FLOAT32>({1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
LocalResponseNormalizationParams params{};
@@ -126,7 +126,7 @@ TEST(LocalResponseNormalizationTest, InvalidInputDimension_NEG)
TEST(LocalResponseNormalizationTest, InvalidInputOutputType_NEG)
{
Tensor input_tensor =
- makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
+ makeInputTensor<DataType::FLOAT32>({1, 1, 1, 6}, {-1.1, 0.6, 0.7, 1.2, -0.7, 0.1});
Tensor output_tensor = makeOutputTensor(DataType::U8);
LocalResponseNormalizationParams params{};
diff --git a/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp b/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp
index d3b331dfe..8a90c1dd0 100644
--- a/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogSoftmax.test.cpp
@@ -31,8 +31,8 @@ TEST(LogSoftmaxTest, Float)
{
Shape input_shape{2, 4};
std::vector<float> input_data{
- 0, -6, 2, 4, //
- 3, -2, 10, 1, //
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -42,8 +42,8 @@ TEST(LogSoftmaxTest, Float)
kernel.execute();
std::vector<float> ref_output_data{
- -4.14297, -10.14297, -2.14297, -.142971, //
- -7.00104, -12.00104, -.00104087, -9.00104, //
+ -4.14297, -10.14297, -2.14297, -.142971, //
+ -7.00104, -12.00104, -.00104087, -9.00104, //
};
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
}
@@ -55,11 +55,11 @@ TEST(LogSoftmaxTest, Uint8)
float kLogSoftmaxQuantizedTolerance = 16. / 256;
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(kMin, kMax);
std::vector<float> input_data{
- 0, -6, 2, 4, //
- 3, -2, 10, 1, //
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
};
Tensor input_tensor =
- makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second, input_data);
+ makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, 16. / 256, 255);
LogSoftmax kernel(&input_tensor, &output_tensor);
@@ -67,8 +67,8 @@ TEST(LogSoftmaxTest, Uint8)
kernel.execute();
std::vector<float> ref_output_data{
- -4.14297, -10.14297, -2.14297, -.142971, //
- -7.00104, -12.00104, -.00104087, -9.00104, //
+ -4.14297, -10.14297, -2.14297, -.142971, //
+ -7.00104, -12.00104, -.00104087, -9.00104, //
};
std::vector<int32_t> ref_output_shape{2, 4};
EXPECT_THAT(dequantizeTensorData(output_tensor),
@@ -81,8 +81,8 @@ TEST(LogSoftmaxTest, Uint8)
TEST(LogSoftmaxTest, InvalidInputOutputType_NEG)
{
std::vector<float> input_data{
- 0, -6, 2, 4, //
- 3, -2, 10, 1, //
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 4}, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, 16. / 256, 255);
@@ -95,11 +95,11 @@ TEST(LogSoftmaxTest, InvalidOutputQuantParam_NEG)
{
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-10, 10);
std::vector<float> input_data{
- 0, -6, 2, 4, //
- 3, -2, 10, 1, //
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
};
Tensor input_tensor =
- makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second, input_data);
+ makeInputTensor<DataType::U8>({2, 4}, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, 20. / 256, 255);
LogSoftmax kernel(&input_tensor, &output_tensor);
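
These tests repeatedly derive a (scale, zero_point) pair from a float range via quantizationParams<uint8_t>(min, max). A plausible sketch of that helper under the usual affine scheme real = scale * (q - zero_point); the actual test utility may additionally nudge the range so it contains zero:

    #include <cmath>
    #include <cstdint>
    #include <limits>
    #include <utility>

    // Hedged sketch, not the TestUtils implementation: map [f_min, f_max]
    // onto the full integer range of T.
    template <typename T>
    std::pair<float, int32_t> quantizationParamsSketch(float f_min, float f_max)
    {
      constexpr int32_t q_min = std::numeric_limits<T>::min();
      constexpr int32_t q_max = std::numeric_limits<T>::max();
      const float scale = (f_max - f_min) / (q_max - q_min);
      const int32_t zero_point = static_cast<int32_t>(std::round(q_min - f_min / scale));
      return {scale, zero_point};
    }
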
diff --git a/compiler/luci-interpreter/src/kernels/LogicalAnd.cpp b/compiler/luci-interpreter/src/kernels/LogicalAnd.cpp
index d50d50472..8e7263231 100644
--- a/compiler/luci-interpreter/src/kernels/LogicalAnd.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogicalAnd.cpp
@@ -27,7 +27,7 @@ namespace kernels
{
LogicalAnd::LogicalAnd(const Tensor *input1, const Tensor *input2, Tensor *output)
- : Kernel({input1, input2}, {output})
+ : Kernel({input1, input2}, {output})
{
}
diff --git a/compiler/luci-interpreter/src/kernels/LogicalOr.cpp b/compiler/luci-interpreter/src/kernels/LogicalOr.cpp
index bd2208a4b..7027a2a8b 100644
--- a/compiler/luci-interpreter/src/kernels/LogicalOr.cpp
+++ b/compiler/luci-interpreter/src/kernels/LogicalOr.cpp
@@ -28,7 +28,7 @@ namespace kernels
{
LogicalOr::LogicalOr(const Tensor *input1, const Tensor *input2, Tensor *output)
- : Kernel({input1, input2}, {output})
+ : Kernel({input1, input2}, {output})
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
index d3bbb330d..41369a417 100644
--- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
@@ -48,7 +48,7 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
std::initializer_list<float> output_data)
{
std::pair<float, int32_t> input_quant_param =
- quantizationParams<uint8_t>(std::min(input_data), std::max(input_data));
+ quantizationParams<uint8_t>(std::min(input_data), std::max(input_data));
Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
input_quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, 1. / 256, 0);
@@ -72,37 +72,37 @@ TYPED_TEST_CASE(LogisticTest, DataTypes);
TYPED_TEST(LogisticTest, Simple)
{
Check<TypeParam>(
- {89}, {89},
- {-10.0000000000, -9.7727272727, -9.5454545455, -9.3181818182, -9.0909090909, -8.8636363636,
- -8.6363636364, -8.4090909091, -8.1818181818, -7.9545454545, -7.7272727273, -7.5000000000,
- -7.2727272727, -7.0454545455, -6.8181818182, -6.5909090909, -6.3636363636, -6.1363636364,
- -5.9090909091, -5.6818181818, -5.4545454545, -5.2272727273, -5.0000000000, -4.7727272727,
- -4.5454545455, -4.3181818182, -4.0909090909, -3.8636363636, -3.6363636364, -3.4090909091,
- -3.1818181818, -2.9545454545, -2.7272727273, -2.5000000000, -2.2727272727, -2.0454545455,
- -1.8181818182, -1.5909090909, -1.3636363636, -1.1363636364, -0.9090909091, -0.6818181818,
- -0.4545454545, -0.2272727273, 0.0000000000, 0.2272727273, 0.4545454545, 0.6818181818,
- 0.9090909091, 1.1363636364, 1.3636363636, 1.5909090909, 1.8181818182, 2.0454545455,
- 2.2727272727, 2.5000000000, 2.7272727273, 2.9545454545, 3.1818181818, 3.4090909091,
- 3.6363636364, 3.8636363636, 4.0909090909, 4.3181818182, 4.5454545455, 4.7727272727,
- 5.0000000000, 5.2272727273, 5.4545454545, 5.6818181818, 5.9090909091, 6.1363636364,
- 6.3636363636, 6.5909090909, 6.8181818182, 7.0454545455, 7.2727272727, 7.5000000000,
- 7.7272727273, 7.9545454545, 8.1818181818, 8.4090909091, 8.6363636364, 8.8636363636,
- 9.0909090909, 9.3181818182, 9.5454545455, 9.7727272727, 10.0000000000},
- {0.0000453979, 0.0000569815, 0.0000715205, 0.0000897689, 0.0001126729, 0.0001414198,
- 0.0001774998, 0.0002227827, 0.0002796147, 0.0003509396, 0.0004404502, 0.0005527786,
- 0.0006937345, 0.0008706021, 0.0010925128, 0.0013709094, 0.0017201256, 0.0021581065,
- 0.0027073042, 0.0033957870, 0.0042586071, 0.0053394826, 0.0066928509, 0.0083863576,
- 0.0105038445, 0.0131488902, 0.0164489307, 0.0205599431, 0.0256715863, 0.0320125562,
- 0.0398556989, 0.0495221198, 0.0613831074, 0.0758581800, 0.0934070047, 0.1145124805,
- 0.1396521834, 0.1692560327, 0.2036499335, 0.2429886272, 0.2871859014, 0.3358556241,
- 0.3882805886, 0.4434251301, 0.5000000000, 0.5565748699, 0.6117194114, 0.6641443759,
- 0.7128140986, 0.7570113728, 0.7963500665, 0.8307439673, 0.8603478166, 0.8854875195,
- 0.9065929953, 0.9241418200, 0.9386168926, 0.9504778802, 0.9601443011, 0.9679874438,
- 0.9743284137, 0.9794400569, 0.9835510693, 0.9868511098, 0.9894961555, 0.9916136424,
- 0.9933071491, 0.9946605174, 0.9957413929, 0.9966042130, 0.9972926958, 0.9978418935,
- 0.9982798744, 0.9986290906, 0.9989074872, 0.9991293979, 0.9993062655, 0.9994472214,
- 0.9995595498, 0.9996490604, 0.9997203853, 0.9997772173, 0.9998225002, 0.9998585802,
- 0.9998873271, 0.9999102311, 0.9999284795, 0.9999430185, 0.9999546021});
+ {89}, {89},
+ {-10.0000000000, -9.7727272727, -9.5454545455, -9.3181818182, -9.0909090909, -8.8636363636,
+ -8.6363636364, -8.4090909091, -8.1818181818, -7.9545454545, -7.7272727273, -7.5000000000,
+ -7.2727272727, -7.0454545455, -6.8181818182, -6.5909090909, -6.3636363636, -6.1363636364,
+ -5.9090909091, -5.6818181818, -5.4545454545, -5.2272727273, -5.0000000000, -4.7727272727,
+ -4.5454545455, -4.3181818182, -4.0909090909, -3.8636363636, -3.6363636364, -3.4090909091,
+ -3.1818181818, -2.9545454545, -2.7272727273, -2.5000000000, -2.2727272727, -2.0454545455,
+ -1.8181818182, -1.5909090909, -1.3636363636, -1.1363636364, -0.9090909091, -0.6818181818,
+ -0.4545454545, -0.2272727273, 0.0000000000, 0.2272727273, 0.4545454545, 0.6818181818,
+ 0.9090909091, 1.1363636364, 1.3636363636, 1.5909090909, 1.8181818182, 2.0454545455,
+ 2.2727272727, 2.5000000000, 2.7272727273, 2.9545454545, 3.1818181818, 3.4090909091,
+ 3.6363636364, 3.8636363636, 4.0909090909, 4.3181818182, 4.5454545455, 4.7727272727,
+ 5.0000000000, 5.2272727273, 5.4545454545, 5.6818181818, 5.9090909091, 6.1363636364,
+ 6.3636363636, 6.5909090909, 6.8181818182, 7.0454545455, 7.2727272727, 7.5000000000,
+ 7.7272727273, 7.9545454545, 8.1818181818, 8.4090909091, 8.6363636364, 8.8636363636,
+ 9.0909090909, 9.3181818182, 9.5454545455, 9.7727272727, 10.0000000000},
+ {0.0000453979, 0.0000569815, 0.0000715205, 0.0000897689, 0.0001126729, 0.0001414198,
+ 0.0001774998, 0.0002227827, 0.0002796147, 0.0003509396, 0.0004404502, 0.0005527786,
+ 0.0006937345, 0.0008706021, 0.0010925128, 0.0013709094, 0.0017201256, 0.0021581065,
+ 0.0027073042, 0.0033957870, 0.0042586071, 0.0053394826, 0.0066928509, 0.0083863576,
+ 0.0105038445, 0.0131488902, 0.0164489307, 0.0205599431, 0.0256715863, 0.0320125562,
+ 0.0398556989, 0.0495221198, 0.0613831074, 0.0758581800, 0.0934070047, 0.1145124805,
+ 0.1396521834, 0.1692560327, 0.2036499335, 0.2429886272, 0.2871859014, 0.3358556241,
+ 0.3882805886, 0.4434251301, 0.5000000000, 0.5565748699, 0.6117194114, 0.6641443759,
+ 0.7128140986, 0.7570113728, 0.7963500665, 0.8307439673, 0.8603478166, 0.8854875195,
+ 0.9065929953, 0.9241418200, 0.9386168926, 0.9504778802, 0.9601443011, 0.9679874438,
+ 0.9743284137, 0.9794400569, 0.9835510693, 0.9868511098, 0.9894961555, 0.9916136424,
+ 0.9933071491, 0.9946605174, 0.9957413929, 0.9966042130, 0.9972926958, 0.9978418935,
+ 0.9982798744, 0.9986290906, 0.9989074872, 0.9991293979, 0.9993062655, 0.9994472214,
+ 0.9995595498, 0.9996490604, 0.9997203853, 0.9997772173, 0.9998225002, 0.9998585802,
+ 0.9998873271, 0.9999102311, 0.9999284795, 0.9999430185, 0.9999546021});
}
TEST(LogisticTest, InvalidInputOutputType_NEG)
diff --git a/compiler/luci-interpreter/src/kernels/MaxPool2D.cpp b/compiler/luci-interpreter/src/kernels/MaxPool2D.cpp
index 123e6e1a2..8d9760ff2 100644
--- a/compiler/luci-interpreter/src/kernels/MaxPool2D.cpp
+++ b/compiler/luci-interpreter/src/kernels/MaxPool2D.cpp
@@ -30,7 +30,7 @@ namespace kernels
{
MaxPool2D::MaxPool2D(const Tensor *input, Tensor *output, const Pool2DParams &params)
- : KernelWithParams<Pool2DParams>({input}, {output}, params)
+ : KernelWithParams<Pool2DParams>({input}, {output}, params)
{
}
@@ -44,15 +44,15 @@ void MaxPool2D::configure()
const int32_t input_width = input_shape.dim(2);
const int32_t depth = input_shape.dim(3);
- const int32_t output_height = computeOutputSize(_params.padding, input_height,
- _params.filter_height, _params.stride_height);
+ const int32_t output_height =
+ computeOutputSize(_params.padding, input_height, _params.filter_height, _params.stride_height);
const int32_t output_width =
- computeOutputSize(_params.padding, input_width, _params.filter_width, _params.stride_width);
+ computeOutputSize(_params.padding, input_width, _params.filter_width, _params.stride_width);
_padding_height =
- computePadding(_params.stride_height, 1, input_height, _params.filter_height, output_height);
+ computePadding(_params.stride_height, 1, input_height, _params.filter_height, output_height);
_padding_width =
- computePadding(_params.stride_width, 1, input_width, _params.filter_width, output_width);
+ computePadding(_params.stride_width, 1, input_width, _params.filter_width, output_width);
output()->resize({batches, output_height, output_width, depth});
if (input()->element_type() == DataType::U8)
@@ -142,8 +142,8 @@ void MaxPool2D::evalSInt16() const
params.quantized_activation_max = activation_max;
tflite::reference_integer_ops::MaxPool(
- params, getTensorShape(input()), getTensorData<int16_t>(input()), //
- getTensorShape(output()), getTensorData<int16_t>(output()));
+ params, getTensorShape(input()), getTensorData<int16_t>(input()), //
+ getTensorShape(output()), getTensorData<int16_t>(output()));
}
} // namespace kernels
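
The reformatted configure() above leans on two helpers whose semantics are worth spelling out. For SAME/VALID padding they typically reduce to the standard formulas below; this is a hedged sketch mirroring common TFLite conventions, not a verbatim copy of the Utils.h code:

    #include <algorithm>
    #include <cstdint>

    enum class Padding { SAME, VALID };

    // SAME keeps ceil(in / stride) positions; VALID only keeps windows that
    // fit entirely inside the input.
    int32_t computeOutputSizeSketch(Padding padding, int32_t in, int32_t filter, int32_t stride)
    {
      return padding == Padding::SAME ? (in + stride - 1) / stride
                                      : (in - filter + stride) / stride;
    }

    // Half of the total padding needed to produce `out` positions.
    int32_t computePaddingSketch(int32_t stride, int32_t dilation, int32_t in, int32_t filter,
                                 int32_t out)
    {
      const int32_t effective_filter = (filter - 1) * dilation + 1;
      const int32_t total = (out - 1) * stride + effective_filter - in;
      return std::max(total / 2, 0);
    }
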
diff --git a/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp b/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
index 1d7fe06c4..b9991f7ec 100644
--- a/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/MaxPool2D.test.cpp
@@ -30,9 +30,9 @@ TEST(MaxPool2DTest, Float)
{
Shape input_shape{1, 3, 5, 1};
std::vector<float> input_data{
- 1, -1, 0, -2, 2, //
- -7, -6, -5, -4, -3, //
- 5, 4, 3, 6, 7, //
+ 1, -1, 0, -2, 2, //
+ -7, -6, -5, -4, -3, //
+ 5, 4, 3, 6, 7, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -50,8 +50,8 @@ TEST(MaxPool2DTest, Float)
kernel.execute();
std::vector<float> ref_output_data{
- 1, 2, //
- 5, 6, //
+ 1, 2, //
+ 5, 6, //
};
std::initializer_list<int32_t> ref_output_shape{1, 2, 2, 1};
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
@@ -62,11 +62,11 @@ TEST(MaxPool2DTest, Uint8)
{
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-15.9375, 15.9375);
std::vector<float> input_data{
- 0, -6, 12, 4, //
- -3, -2, 10, 7, //
+ 0, -6, 12, 4, //
+ -3, -2, 10, 7, //
};
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
- quant_param.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
Pool2DParams params{};
@@ -92,13 +92,13 @@ TEST(MaxPool2DTest, SInt16)
Shape input_shape{1, 3, 5, 1};
std::vector<int32_t> ref_output_shape{1, 2, 2, 1};
std::vector<float> input_data{
- 1, -1, 0, -2, 2, //
- -7, -6, -5, -4, -3, //
- 5, 4, 3, 6, 7, //
+ 1, -1, 0, -2, 2, //
+ -7, -6, -5, -4, -3, //
+ 5, 4, 3, 6, 7, //
};
std::vector<float> ref_output_data{
- 1, 2, //
- 5, 6, //
+ 1, 2, //
+ 5, 6, //
};
Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, 0.2, 0, input_data);
diff --git a/compiler/luci-interpreter/src/kernels/Maximum.cpp b/compiler/luci-interpreter/src/kernels/Maximum.cpp
index c522b0706..b102b5e27 100644
--- a/compiler/luci-interpreter/src/kernels/Maximum.cpp
+++ b/compiler/luci-interpreter/src/kernels/Maximum.cpp
@@ -27,7 +27,7 @@ namespace kernels
{
Maximum::Maximum(const Tensor *input1, const Tensor *input2, Tensor *output)
- : Kernel({input1, input2}, {output})
+ : Kernel({input1, input2}, {output})
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Mean.cpp b/compiler/luci-interpreter/src/kernels/Mean.cpp
index f20cf7d89..421632812 100644
--- a/compiler/luci-interpreter/src/kernels/Mean.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mean.cpp
@@ -124,7 +124,7 @@ static Shape getOutputShape(const Shape &input_shape, const int *axes_data, int
}
Mean::Mean(const Tensor *input, const Tensor *axes, Tensor *output, const ReducerParams &params)
- : KernelWithParams<ReducerParams>({input, axes}, {output}, params)
+ : KernelWithParams<ReducerParams>({input, axes}, {output}, params)
{
}
@@ -149,16 +149,15 @@ void Mean::configure()
tflite::MeanParams params{};
resolveAxes(axes_data, num_axes, &params);
- const bool need_temporaries =
- !(_params.keep_dims && input_num_dims == 4 && params.axis_count == 2 &&
- ((params.axis[0] == 1 && params.axis[1] == 2) ||
- (params.axis[0] == 2 && params.axis[1] == 1)));
+ const bool need_temporaries = !(
+ _params.keep_dims && input_num_dims == 4 && params.axis_count == 2 &&
+ ((params.axis[0] == 1 && params.axis[1] == 2) || (params.axis[0] == 2 && params.axis[1] == 1)));
if (need_temporaries)
{
_temp_index =
- std::make_unique<Tensor>(DataType::S32, Shape(input_num_dims), AffineQuantization{}, "");
+ std::make_unique<Tensor>(DataType::S32, Shape(input_num_dims), AffineQuantization{}, "");
_resolved_axes =
- std::make_unique<Tensor>(DataType::S32, Shape(num_axes), AffineQuantization{}, "");
+ std::make_unique<Tensor>(DataType::S32, Shape(num_axes), AffineQuantization{}, "");
_temp_sum = std::make_unique<Tensor>(input()->element_type(), output()->shape(),
AffineQuantization{}, "");
}
@@ -209,11 +208,11 @@ void Mean::evalFloat() const
else
{
tflite::reference_ops::Mean(
- getTensorData<float>(input()), getTensorShape(input()).DimsData(),
- input()->shape().num_dims(), getTensorData<float>(output()),
- getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
- _params.keep_dims, getTensorData<int>(_temp_index.get()),
- getTensorData<int>(_resolved_axes.get()), getTensorData<float>(_temp_sum.get()));
+ getTensorData<float>(input()), getTensorShape(input()).DimsData(),
+ input()->shape().num_dims(), getTensorData<float>(output()),
+ getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
+ _params.keep_dims, getTensorData<int>(_temp_index.get()),
+ getTensorData<int>(_resolved_axes.get()), getTensorData<float>(_temp_sum.get()));
}
}
@@ -240,22 +239,22 @@ void Mean::evalQuantized() const
else if (input()->zero_point() == output()->zero_point() && input()->scale() == output()->scale())
{
tflite::reference_ops::Mean(
- getTensorData<uint8_t>(input()), getTensorShape(input()).DimsData(),
- input()->shape().num_dims(), getTensorData<uint8_t>(output()),
- getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
- _params.keep_dims, getTensorData<int>(_temp_index.get()),
- getTensorData<int>(_resolved_axes.get()), getTensorData<int>(_temp_sum.get()));
+ getTensorData<uint8_t>(input()), getTensorShape(input()).DimsData(),
+ input()->shape().num_dims(), getTensorData<uint8_t>(output()),
+ getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
+ _params.keep_dims, getTensorData<int>(_temp_index.get()),
+ getTensorData<int>(_resolved_axes.get()), getTensorData<int>(_temp_sum.get()));
}
else
{
tflite::reference_ops::QuantizedMeanOrSum<>(
- getTensorData<uint8_t>(input()), input()->zero_point(), input()->scale(),
- getTensorShape(input()).DimsData(), input()->shape().num_dims(),
- getTensorData<uint8_t>(output()), output()->zero_point(), output()->scale(),
- getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
- _params.keep_dims, getTensorData<int>(_temp_index.get()),
- getTensorData<int>(_resolved_axes.get()), getTensorData<int>(_temp_sum.get()),
- /*compute_sum=*/false);
+ getTensorData<uint8_t>(input()), input()->zero_point(), input()->scale(),
+ getTensorShape(input()).DimsData(), input()->shape().num_dims(),
+ getTensorData<uint8_t>(output()), output()->zero_point(), output()->scale(),
+ getTensorShape(output()).DimsData(), output()->shape().num_dims(), axes_data, num_axes,
+ _params.keep_dims, getTensorData<int>(_temp_index.get()),
+ getTensorData<int>(_resolved_axes.get()), getTensorData<int>(_temp_sum.get()),
+ /*compute_sum=*/false);
}
}
@@ -288,7 +287,7 @@ void Mean::evalQuantizedS16() const
assert(output_shape.dim(3) == depth);
const double real_multiplier =
- static_cast<double>(input()->scale()) / static_cast<double>(output()->scale());
+ static_cast<double>(input()->scale()) / static_cast<double>(output()->scale());
int32_t output_multiplier{};
int output_shift{};
@@ -309,11 +308,11 @@ void Mean::evalQuantizedS16() const
}
}
int32_t scaled_acc =
- tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
+ tflite::MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
// Divide by the number of elements rounding to the nearest integer.
scaled_acc = scaled_acc > 0
- ? (scaled_acc + num_elements_in_axes / 2) / num_elements_in_axes
- : (scaled_acc - num_elements_in_axes / 2) / num_elements_in_axes;
+ ? (scaled_acc + num_elements_in_axes / 2) / num_elements_in_axes
+ : (scaled_acc - num_elements_in_axes / 2) / num_elements_in_axes;
scaled_acc = std::max(scaled_acc, output_min);
scaled_acc = std::min(scaled_acc, output_max);
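
The S16 path divides the integer accumulator by the element count with round-to-nearest rather than C++'s truncation toward zero; adding (for positive values) or subtracting (for negative values) half the divisor before dividing achieves this. A tiny standalone illustration:

    #include <cstdint>

    // Round-to-nearest integer division, matching the scaled_acc computation
    // in Mean::evalQuantizedS16(). Plain division would truncate toward zero.
    inline int32_t divRoundNearest(int32_t acc, int32_t n)
    {
      return acc > 0 ? (acc + n / 2) / n : (acc - n / 2) / n;
    }

    // divRoundNearest(7, 2) == 4 and divRoundNearest(-7, 2) == -4,
    // whereas 7 / 2 == 3 and -7 / 2 == -3.
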
diff --git a/compiler/luci-interpreter/src/kernels/Mean.test.cpp b/compiler/luci-interpreter/src/kernels/Mean.test.cpp
index e81d2ad5f..fa0ba2169 100644
--- a/compiler/luci-interpreter/src/kernels/Mean.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mean.test.cpp
@@ -107,7 +107,7 @@ TEST(MeanTest, Uint8KeepDims)
std::vector<int32_t> axis_data{1};
Tensor input_tensor =
- makeInputTensor<DataType::U8>({3, 2}, quant_param.first, quant_param.second, input_data);
+ makeInputTensor<DataType::U8>({3, 2}, quant_param.first, quant_param.second, input_data);
Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
@@ -133,7 +133,7 @@ TEST(MeanTest, Uint8NotKeepDims)
std::vector<int32_t> axis_data{1};
Tensor input_tensor =
- makeInputTensor<DataType::U8>({1, 3, 2}, quant_param.first, quant_param.second, input_data);
+ makeInputTensor<DataType::U8>({1, 3, 2}, quant_param.first, quant_param.second, input_data);
Tensor axis_tensor = makeInputTensor<DataType::S32>({1}, axis_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
diff --git a/compiler/luci-interpreter/src/kernels/Minimum.cpp b/compiler/luci-interpreter/src/kernels/Minimum.cpp
index 5eb13455e..5d3dcde72 100644
--- a/compiler/luci-interpreter/src/kernels/Minimum.cpp
+++ b/compiler/luci-interpreter/src/kernels/Minimum.cpp
@@ -27,7 +27,7 @@ namespace kernels
{
Minimum::Minimum(const Tensor *input1, const Tensor *input2, Tensor *output)
- : Kernel({input1, input2}, {output})
+ : Kernel({input1, input2}, {output})
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Mul.cpp b/compiler/luci-interpreter/src/kernels/Mul.cpp
index 513d147a3..4e6e3f75a 100644
--- a/compiler/luci-interpreter/src/kernels/Mul.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mul.cpp
@@ -30,7 +30,7 @@ namespace kernels
{
Mul::Mul(const Tensor *input1, const Tensor *input2, Tensor *output, const MulParams &params)
- : KernelWithParams<MulParams>({input1, input2}, {output}, params)
+ : KernelWithParams<MulParams>({input1, input2}, {output}, params)
{
}
@@ -73,13 +73,13 @@ void Mul::evalFloat() const
params.float_activation_max = activation_max;
const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), &params);
+ getTensorShape(input1()), getTensorShape(input2()), &params);
if (need_broadcast)
{
tflite::optimized_ops::BroadcastMul4DSlow(
- params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
- getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
+ params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
+ getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
}
else
{
diff --git a/compiler/luci-interpreter/src/kernels/Mul.test.cpp b/compiler/luci-interpreter/src/kernels/Mul.test.cpp
index 1409b3fae..fc7ffb5a1 100644
--- a/compiler/luci-interpreter/src/kernels/Mul.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Mul.test.cpp
@@ -32,14 +32,14 @@ TEST(MulTest, Float)
Shape base_shape = {2, 3, 1, 2};
std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
std::vector<std::vector<float>> test_outputs = {
- {0.00f, 0.69f, 0.12f, 1.15f, 0.00f, 2.07f, 0.18f, 0.15f, 0.00f, 0.25f, 0.90f, 0.45f,
- 0.16f, 0.00f, 0.00f, 0.00f, 0.80f, 0.00f, 0.24f, 0.84f, 0.00f, 1.40f, 1.20f, 2.52f,
- 0.00f, 0.00f, 0.64f, 0.00f, 0.00f, 0.00f, 0.14f, 0.00f, 0.00f, 0.00f, 0.70f, 0.00f},
- {0.00f, 0.69f, 0.00f, 0.25f, 0.80f, 0.00f, 0.24f, 0.84f, 0.64f, 0.00f, 0.70f, 0.00f},
- {0.00f, 0.46f, 0.00f, 0.69f, 0.12f, 0.00f, 0.18f, 0.10f, 0.27f, 0.15f, 0.00f, 0.00f,
- 0.16f, 0.00f, 0.24f, 0.00f, 0.00f, 0.44f, 0.60f, 1.40f, 1.20f, 2.80f, 1.08f, 2.52f,
- 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.35f, 0.00f, 0.70f, 0.00f, 0.63f, 0.00f},
- {0.00f, 0.46f, 0.27f, 0.15f, 0.00f, 0.44f, 0.60f, 1.40f, 0.00f, 0.00f, 0.63f, 0.00f}};
+ {0.00f, 0.69f, 0.12f, 1.15f, 0.00f, 2.07f, 0.18f, 0.15f, 0.00f, 0.25f, 0.90f, 0.45f,
+ 0.16f, 0.00f, 0.00f, 0.00f, 0.80f, 0.00f, 0.24f, 0.84f, 0.00f, 1.40f, 1.20f, 2.52f,
+ 0.00f, 0.00f, 0.64f, 0.00f, 0.00f, 0.00f, 0.14f, 0.00f, 0.00f, 0.00f, 0.70f, 0.00f},
+ {0.00f, 0.69f, 0.00f, 0.25f, 0.80f, 0.00f, 0.24f, 0.84f, 0.64f, 0.00f, 0.70f, 0.00f},
+ {0.00f, 0.46f, 0.00f, 0.69f, 0.12f, 0.00f, 0.18f, 0.10f, 0.27f, 0.15f, 0.00f, 0.00f,
+ 0.16f, 0.00f, 0.24f, 0.00f, 0.00f, 0.44f, 0.60f, 1.40f, 1.20f, 2.80f, 1.08f, 2.52f,
+ 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.35f, 0.00f, 0.70f, 0.00f, 0.63f, 0.00f},
+ {0.00f, 0.46f, 0.27f, 0.15f, 0.00f, 0.44f, 0.60f, 1.40f, 0.00f, 0.00f, 0.63f, 0.00f}};
std::vector<float> input1_data{-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f,
1.2f, 2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
@@ -57,7 +57,7 @@ TEST(MulTest, Float)
kernel.execute();
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
- << "With shape number " << i;
+ << "With shape number " << i;
}
// Re-run with exchanged inputs.
for (size_t i = 0; i < test_shapes.size(); ++i)
@@ -74,7 +74,7 @@ TEST(MulTest, Float)
kernel.execute();
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
- << "With shape number " << i;
+ << "With shape number " << i;
}
}
@@ -83,25 +83,25 @@ TEST(MulTest, SInt16)
Shape base_shape = {2, 3, 1, 2};
std::vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
std::vector<std::vector<int32_t>> ref_output_shapes{
- {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
+ {2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
std::vector<float> input1_data{-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f,
1.2f, 2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
std::vector<float> input2_data{0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
std::vector<std::vector<float>> ref_outputs = {
- {0.00f, 0.69f, 0.12f, 1.15f, 0.00f, 2.07f, 0.18f, 0.15f, 0.00f, 0.25f, 0.90f, 0.45f,
- 0.16f, 0.00f, 0.00f, 0.00f, 0.80f, 0.00f, 0.24f, 0.84f, 0.00f, 1.40f, 1.20f, 2.52f,
- 0.00f, 0.00f, 0.64f, 0.00f, 0.00f, 0.00f, 0.14f, 0.00f, 0.00f, 0.00f, 0.70f, 0.00f},
- {0.00f, 0.69f, 0.00f, 0.25f, 0.80f, 0.00f, 0.24f, 0.84f, 0.64f, 0.00f, 0.70f, 0.00f},
- {0.00f, 0.46f, 0.00f, 0.69f, 0.12f, 0.00f, 0.18f, 0.10f, 0.27f, 0.15f, 0.00f, 0.00f,
- 0.16f, 0.00f, 0.24f, 0.00f, 0.00f, 0.44f, 0.60f, 1.40f, 1.20f, 2.80f, 1.08f, 2.52f,
- 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.35f, 0.00f, 0.70f, 0.00f, 0.63f, 0.00f},
- {0.00f, 0.46f, 0.27f, 0.15f, 0.00f, 0.44f, 0.60f, 1.40f, 0.00f, 0.00f, 0.63f, 0.00f}};
+ {0.00f, 0.69f, 0.12f, 1.15f, 0.00f, 2.07f, 0.18f, 0.15f, 0.00f, 0.25f, 0.90f, 0.45f,
+ 0.16f, 0.00f, 0.00f, 0.00f, 0.80f, 0.00f, 0.24f, 0.84f, 0.00f, 1.40f, 1.20f, 2.52f,
+ 0.00f, 0.00f, 0.64f, 0.00f, 0.00f, 0.00f, 0.14f, 0.00f, 0.00f, 0.00f, 0.70f, 0.00f},
+ {0.00f, 0.69f, 0.00f, 0.25f, 0.80f, 0.00f, 0.24f, 0.84f, 0.64f, 0.00f, 0.70f, 0.00f},
+ {0.00f, 0.46f, 0.00f, 0.69f, 0.12f, 0.00f, 0.18f, 0.10f, 0.27f, 0.15f, 0.00f, 0.00f,
+ 0.16f, 0.00f, 0.24f, 0.00f, 0.00f, 0.44f, 0.60f, 1.40f, 1.20f, 2.80f, 1.08f, 2.52f,
+ 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.00f, 0.35f, 0.00f, 0.70f, 0.00f, 0.63f, 0.00f},
+ {0.00f, 0.46f, 0.27f, 0.15f, 0.00f, 0.44f, 0.60f, 1.40f, 0.00f, 0.00f, 0.63f, 0.00f}};
for (size_t i = 0; i < test_shapes.size(); ++i)
{
Tensor input1_tensor = makeInputTensor<DataType::S16>(base_shape, 3.0 / 32767, 0, input1_data);
Tensor input2_tensor =
- makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0, input2_data);
+ makeInputTensor<DataType::S16>(test_shapes[i], 1.0 / 32767, 0, input2_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, 4.0 / 32767, 0);
const float tolerance = output_tensor.scale() * 2;
@@ -114,15 +114,15 @@ TEST(MulTest, SInt16)
EXPECT_THAT(extractTensorShape(output_tensor),
::testing::ElementsAreArray(ref_output_shapes[i]))
- << "With shape number " << i;
+ << "With shape number " << i;
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_outputs[i], tolerance))
- << "With shape number " << i;
+ << "With shape number " << i;
}
// Re-run with exchanged inputs and different scales.
for (size_t i = 0; i < test_shapes.size(); ++i)
{
Tensor input1_tensor =
- makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0, input2_data);
+ makeInputTensor<DataType::S16>(test_shapes[i], 2.0 / 32767, 0, input2_data);
Tensor input2_tensor = makeInputTensor<DataType::S16>(base_shape, 4.0 / 32767, 0, input1_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, 3.0 / 32767, 0);
const float tolerance = output_tensor.scale() * 2;
@@ -136,9 +136,9 @@ TEST(MulTest, SInt16)
EXPECT_THAT(extractTensorShape(output_tensor),
::testing::ElementsAreArray(ref_output_shapes[i]))
- << "With shape number " << i;
+ << "With shape number " << i;
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_outputs[i], tolerance))
- << "With shape number " << i;
+ << "With shape number " << i;
}
}
diff --git a/compiler/luci-interpreter/src/kernels/Neg.cpp b/compiler/luci-interpreter/src/kernels/Neg.cpp
new file mode 100644
index 000000000..99f4d4a21
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Neg.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Neg.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Neg::Neg(const Tensor *input, Tensor *output) : Kernel({input}, {output}) {}
+
+void Neg::configure()
+{
+ LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+ output()->resize(input()->shape());
+}
+
+void Neg::execute() const
+{
+ switch (input()->element_type())
+ {
+ case DataType::FLOAT32:
+ evalFloat();
+ break;
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+void Neg::evalFloat() const
+{
+ tflite::reference_ops::Negate(getTensorShape(input()), getTensorData<float>(input()),
+ getTensorShape(output()), getTensorData<float>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Neg.h b/compiler/luci-interpreter/src/kernels/Neg.h
new file mode 100644
index 000000000..69fa1a18e
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Neg.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_NEG_H
+#define LUCI_INTERPRETER_KERNELS_NEG_H
+
+#include "core/Kernel.h"
+#include <vector>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Neg : public Kernel
+{
+public:
+ Neg(const Tensor *input, Tensor *output);
+
+ const Tensor *input() const { return _inputs[0]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+
+private:
+ void evalFloat() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_NEG_H
diff --git a/compiler/luci-interpreter/src/kernels/Neg.test.cpp b/compiler/luci-interpreter/src/kernels/Neg.test.cpp
new file mode 100644
index 000000000..33256e1c6
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Neg.test.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Neg.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> output_shape,
+ std::initializer_list<T> input_data, std::initializer_list<T> output_data)
+{
+ constexpr DataType element_type = getElementType<T>();
+ Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+ Tensor output_tensor = makeOutputTensor(element_type);
+
+ Neg kernel(&input_tensor, &output_tensor);
+
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+TEST(NegTest, FloatSimple)
+{
+ Check<float>(/*input_shape=*/{2, 3},
+ /*output_shape=*/{2, 3},
+ /*input_data=*/
+ {
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 1.0f, -1.0f, -2.0f, // Row 2
+ },
+ /*output_data=*/
+ {
+ 0.0f, -1.0f, -3.0f, // Row 1
+ -1.0f, 1.0f, 2.0f, // Row 2
+ });
+
+ SUCCEED();
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.cpp b/compiler/luci-interpreter/src/kernels/NotEqual.cpp
index cd2f6c2c1..99d5e0fa0 100644
--- a/compiler/luci-interpreter/src/kernels/NotEqual.cpp
+++ b/compiler/luci-interpreter/src/kernels/NotEqual.cpp
@@ -98,8 +98,8 @@ void NotEqual::evalQuantized() const
if (op_params.is_broadcast)
{
tflite::reference_ops::Broadcast4DSlowNotEqualWithScaling(
- op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data,
- getTensorShape(output()), output_data);
+ op_params, getTensorShape(x()), x_data, getTensorShape(y()), y_data, getTensorShape(output()),
+ output_data);
}
else
{
diff --git a/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp b/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp
index 8c8712371..f9dc7781b 100644
--- a/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/NotEqual.test.cpp
@@ -30,18 +30,18 @@ using namespace testing;
TEST(NotEqualTest, FloatSimple)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
- -1, 0, 1, // Row 2
+ 0.9, 0.7, 0.5, // Row 1
+ -1, 0, 1, // Row 2
};
std::vector<bool> ref_output_data{
- true, false, true, // Row 1
- true, false, true, // Row 2
+ true, false, true, // Row 1
+ true, false, true, // Row 2
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, x_data);
@@ -59,21 +59,21 @@ TEST(NotEqualTest, FloatSimple)
TEST(NotEqualTest, FloatBroadcast)
{
std::vector<float> x_data{
- 0.5, 0.7, 0.9, // Row 1
- 1, 0, -1, // Row 2
- -1, 0, 1, // Row 3
- 0.9, 0.7, 0.5, // Row 4
+ 0.5, 0.7, 0.9, // Row 1
+ 1, 0, -1, // Row 2
+ -1, 0, 1, // Row 3
+ 0.9, 0.7, 0.5, // Row 4
};
std::vector<float> y_data{
- 0.9, 0.7, 0.5, // Row 1
+ 0.9, 0.7, 0.5, // Row 1
};
std::vector<bool> ref_output_data{
- true, false, true, // Row 1
- true, true, true, // Row 2
- true, true, true, // Row 3
- false, false, false, // Row 4
+ true, false, true, // Row 1
+ true, true, true, // Row 2
+ true, true, true, // Row 3
+ false, false, false, // Row 4
};
Tensor x_tensor = makeInputTensor<DataType::FLOAT32>({4, 3}, x_data);
@@ -95,27 +95,27 @@ const float F_MAX = 127.0 / 128.0;
TEST(NotEqualTest, Uint8Quantized)
{
std::vector<float> x_data{
- 0.5, 0.5, 0.7, 0.9, // Row 1
- 1, 0, 0.05, -1, // Row 2
+ 0.5, 0.5, 0.7, 0.9, // Row 1
+ 1, 0, 0.05, -1, // Row 2
};
std::vector<float> y_data{
- 0.9, 0.5, 0.55, 0.5, // Row 1
- -1, 0, 0.05, 1, // Row 2
+ 0.9, 0.5, 0.55, 0.5, // Row 1
+ -1, 0, 0.05, 1, // Row 2
};
std::vector<bool> ref_output_data{
- true, false, true, true, // Row 1
- true, false, false, true, // Row 2
+ true, false, true, true, // Row 1
+ true, false, false, true, // Row 2
};
std::pair<float, int32_t> x_quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
- Tensor x_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first,
- x_quant_param.second, x_data);
+ Tensor x_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, x_quant_param.first, x_quant_param.second, x_data);
std::pair<float, int32_t> y_quant_param = quantizationParams<uint8_t>(F_MIN * 2, F_MAX * 2);
- Tensor y_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first,
- y_quant_param.second, y_data);
+ Tensor y_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, y_quant_param.first, y_quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
@@ -130,28 +130,28 @@ TEST(NotEqualTest, Uint8Quantized)
TEST(NotEqualTest, Uint8QuantizedBroadcast)
{
std::vector<float> x_data{
- 0.4, -0.8, 0.7, 0.3, // Row 1
- -0.5, 0.1, 0, 0.5, // Row 2
- 1, 0, 0.05, -1, // Row 3
- -1, 0.05, 0, 1, // Row 4
+ 0.4, -0.8, 0.7, 0.3, // Row 1
+ -0.5, 0.1, 0, 0.5, // Row 2
+ 1, 0, 0.05, -1, // Row 3
+ -1, 0.05, 0, 1, // Row 4
};
std::vector<float> y_data{
- -1, 0.05, 0, 1, // Row 1
+ -1, 0.05, 0, 1, // Row 1
};
std::vector<bool> ref_output_data{
- true, true, true, true, // Row 1
- true, true, false, true, // Row 2
- true, true, true, true, // Row 3
- false, false, false, false, // Row 4
+ true, true, true, true, // Row 1
+ true, true, false, true, // Row 2
+ true, true, true, true, // Row 3
+ false, false, false, false, // Row 4
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(F_MIN, F_MAX);
Tensor x_tensor =
- makeInputTensor<DataType::U8>({1, 4, 4, 1}, quant_param.first, quant_param.second, x_data);
+ makeInputTensor<DataType::U8>({1, 4, 4, 1}, quant_param.first, quant_param.second, x_data);
Tensor y_tensor =
- makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
+ makeInputTensor<DataType::U8>({1, 1, 4, 1}, quant_param.first, quant_param.second, y_data);
Tensor output_tensor = makeOutputTensor(DataType::BOOL);
NotEqual kernel(&x_tensor, &y_tensor, &output_tensor);
diff --git a/compiler/luci-interpreter/src/kernels/Pack.cpp b/compiler/luci-interpreter/src/kernels/Pack.cpp
new file mode 100644
index 000000000..6fee93890
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Pack.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pack.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+Pack::Pack(std::vector<const Tensor *> inputs, Tensor *output, const PackParams &params)
+ : KernelWithParams<PackParams>(std::move(inputs), {output}, params)
+{
+}
+
+void Pack::configure()
+{
+ LUCI_INTERPRETER_CHECK(_inputs.size() == static_cast<uint32_t>(params().values_count));
+ const Tensor *t0 = _inputs[0];
+ const int dimension_size = t0->shape().num_dims() + 1;
+ int axis = params().axis;
+ if (axis < 0)
+ {
+ axis += dimension_size;
+ }
+ LUCI_INTERPRETER_CHECK(axis >= 0 && axis <= t0->shape().num_dims());
+
+ if (t0->element_type() != DataType::S32 && t0->element_type() != DataType::FLOAT32 &&
+ t0->element_type() != DataType::U8 && t0->element_type() != DataType::S8 &&
+ t0->element_type() != DataType::S16 && t0->element_type() != DataType::S64)
+ {
+ throw std::runtime_error("Unsupported type.");
+ }
+
+ for (uint32_t i = 1; i < _inputs.size(); ++i)
+ {
+ const Tensor *tensor = _inputs[i];
+ LUCI_INTERPRETER_CHECK(tensor->element_type() == t0->element_type());
+ LUCI_INTERPRETER_CHECK(tensor->shape().num_dims() == t0->shape().num_dims());
+ for (int d = 0; d < t0->shape().num_dims(); ++d)
+ {
+ LUCI_INTERPRETER_CHECK(tensor->shape().dim(d) == t0->shape().dim(d));
+ }
+ }
+
+ Shape output_shape(dimension_size);
+ int i = 0;
+ for (int index = 0; index < dimension_size; ++index)
+ {
+ if (index == axis)
+ {
+ output_shape.dim(index) = params().values_count;
+ }
+ else
+ {
+ output_shape.dim(index) = t0->shape().dim(i++);
+ }
+ }
+
+ if (t0->element_type() == DataType::S32 || t0->element_type() == DataType::U8 ||
+ t0->element_type() == DataType::S8 || t0->element_type() == DataType::S16 ||
+ t0->element_type() == DataType::S64)
+ {
+ LUCI_INTERPRETER_CHECK(output()->zero_point() == t0->zero_point());
+ LUCI_INTERPRETER_CHECK(output()->scale() == t0->scale());
+  // Guarantee that input/output quantization params match, as we do not
+  // support requantizing while packing quantized tensors.
+ for (int i = 0; i < params().values_count; i++)
+ {
+ LUCI_INTERPRETER_CHECK(_inputs[i]->zero_point() == t0->zero_point());
+ LUCI_INTERPRETER_CHECK(_inputs[i]->scale() == t0->scale());
+ }
+ }
+
+ output()->resize(output_shape);
+}
+
+void Pack::execute() const
+{
+ switch (_inputs[0]->element_type())
+ {
+ case DataType::FLOAT32:
+ evalGeneric<float>();
+ break;
+ case DataType::U8:
+ evalGeneric<uint8_t>();
+ break;
+ case DataType::S8:
+ evalGeneric<int8_t>();
+ break;
+ case DataType::S16:
+ evalGeneric<int16_t>();
+ break;
+ case DataType::S32:
+ evalGeneric<int32_t>();
+ break;
+ case DataType::S64:
+ evalGeneric<int64_t>();
+ break;
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+template <typename T> void Pack::evalGeneric() const
+{
+ const Tensor *t0 = _inputs[0];
+ const int dimension_size = t0->shape().num_dims() + 1;
+ int axis = params().axis;
+ if (axis < 0)
+ {
+ axis += dimension_size;
+ }
+
+ VectorOfTensors<T, true> inputs(_inputs);
+ tflite::PackParams params{};
+ params.axis = axis;
+ params.inputs_count = _inputs.size();
+ tflite::reference_ops::Pack<T>(params, inputs.shapes(), inputs.data(), getTensorShape(output()),
+ getTensorData<T>(output()));
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
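
Pack::configure() above builds the output shape by inserting a new dimension of size values_count at the (normalized) axis, so packing three {2}-shaped tensors yields {3, 2} at axis 0 and {2, 3} at axis -1. A shape-only sketch of that logic:

    #include <cstdint>
    #include <vector>

    // Shape-only sketch of Pack::configure(): insert values_count at axis.
    std::vector<int32_t> packOutputShape(const std::vector<int32_t> &input_shape,
                                         int32_t values_count, int axis)
    {
      const int dims = static_cast<int>(input_shape.size()) + 1;
      if (axis < 0)
        axis += dims; // negative axes count from the back, as in the kernel
      std::vector<int32_t> output(dims);
      int i = 0;
      for (int d = 0; d < dims; ++d)
        output[d] = (d == axis) ? values_count : input_shape[i++];
      return output;
    }
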
diff --git a/compiler/luci-interpreter/src/kernels/Pack.h b/compiler/luci-interpreter/src/kernels/Pack.h
new file mode 100644
index 000000000..4a2fcfd80
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Pack.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_PACK_H
+#define LUCI_INTERPRETER_KERNELS_PACK_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Pack : public KernelWithParams<PackParams>
+{
+public:
+ Pack(std::vector<const Tensor *> inputs, Tensor *output, const PackParams &params);
+
+ const Tensor *input(int index) const { return _inputs[index]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+
+private:
+ template <typename T> void evalGeneric() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_PACK_H
diff --git a/compiler/luci-interpreter/src/kernels/Pack.test.cpp b/compiler/luci-interpreter/src/kernels/Pack.test.cpp
new file mode 100644
index 000000000..092bd449a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Pack.test.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Pack.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::vector<std::initializer_list<int32_t>> input_shapes,
+ std::initializer_list<int32_t> output_shape, std::vector<std::vector<T>> input_datas,
+ std::initializer_list<T> output_data, int32_t axis)
+{
+ constexpr DataType element_type = getElementType<T>();
+ std::vector<const Tensor *> inputs(input_datas.size());
+ std::vector<Tensor> tmp_inputs;
+ for (int i = 0; i < input_datas.size(); i++)
+ {
+ if (std::is_same<T, float>::value)
+ {
+ tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {}, ""));
+ tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
+ }
+ else
+ {
+ tmp_inputs.push_back(Tensor(element_type, input_shapes[i], {{1.0f / 255}, {128}}, ""));
+ tmp_inputs[i].writeData(input_datas[i].data(), input_datas[i].size() * sizeof(T));
+ }
+ }
+ for (int i = 0; i < input_datas.size(); i++)
+ {
+ inputs[i] = &tmp_inputs[i];
+ }
+
+ Tensor output_tensor = makeOutputTensor(element_type);
+ if (!std::is_same<T, float>::value)
+ {
+ output_tensor = makeOutputTensor(element_type, 1.0f / 255, 128);
+ }
+
+ PackParams params{};
+ params.axis = axis;
+ params.values_count = input_datas.size();
+ Pack kernel(inputs, &output_tensor, params);
+
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+template <typename T> class PackTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<uint8_t, float>;
+TYPED_TEST_CASE(PackTest, DataTypes);
+
+TYPED_TEST(PackTest, ThreeInputs)
+{
+ Check<TypeParam>(/*input_shapes=*/{{2}, {2}, {2}},
+ /*output_shape=*/{3, 2},
+ /*input_datas=*/
+ {{1, 4}, {2, 5}, {3, 6}},
+ /*output_data=*/
+ {1, 4, 2, 5, 3, 6}, /*axis=*/0);
+
+ SUCCEED();
+}
+
+TYPED_TEST(PackTest, NegAxis)
+{
+ Check<TypeParam>(/*input_shapes=*/{{2}, {2}, {2}},
+ /*output_shape=*/{2, 3},
+ /*input_datas=*/
+ {{1, 4}, {2, 5}, {3, 6}},
+ /*output_data=*/
+ {1, 2, 3, 4, 5, 6}, /*axis=*/-1);
+
+ SUCCEED();
+}
+
+TEST(Pack, MismatchingInputValuesCount_NEG)
+{
+ std::vector<float> input1_data{1, 4};
+ std::vector<float> input2_data{2, 5};
+ std::vector<float> input3_data{3, 6};
+ Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2}, input1_data);
+ Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2}, input2_data);
+ Tensor input3_tensor = makeInputTensor<DataType::FLOAT32>({2}, input3_data);
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+ PackParams params{};
+ {
+ params.axis = 0;
+ params.values_count = 2;
+
+ Pack kernel({&input1_tensor, &input2_tensor, &input3_tensor}, &output_tensor, params);
+ EXPECT_ANY_THROW(kernel.configure());
+ }
+}
+
+TEST(Pack, InvalidInputAxis_NEG)
+{
+ std::vector<float> input1_data{1, 4};
+ std::vector<float> input2_data{2, 5};
+ std::vector<float> input3_data{3, 6};
+ Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({2}, input1_data);
+ Tensor input2_tensor = makeInputTensor<DataType::FLOAT32>({2}, input2_data);
+ Tensor input3_tensor = makeInputTensor<DataType::FLOAT32>({2}, input3_data);
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+ PackParams params{};
+ {
+ params.axis = 2;
+ params.values_count = 3;
+
+ Pack kernel({&input1_tensor, &input2_tensor, &input3_tensor}, &output_tensor, params);
+ EXPECT_ANY_THROW(kernel.configure());
+ }
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Pad.cpp b/compiler/luci-interpreter/src/kernels/Pad.cpp
index bdf3a2a95..3e76080a9 100644
--- a/compiler/luci-interpreter/src/kernels/Pad.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pad.cpp
@@ -26,7 +26,7 @@ namespace kernels
{
Pad::Pad(const Tensor *input, const Tensor *paddings, Tensor *output)
- : Kernel({input, paddings}, {output})
+ : Kernel({input, paddings}, {output})
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Pad.test.cpp b/compiler/luci-interpreter/src/kernels/Pad.test.cpp
index 4bee07629..75b2e560e 100644
--- a/compiler/luci-interpreter/src/kernels/Pad.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pad.test.cpp
@@ -34,8 +34,8 @@ TEST(Pad, Uint8)
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
std::vector<float> input_data{-0.8, 0.2, 0.9, 0.7, 0.1, -0.3};
std::vector<int32_t> paddings_data{0, 0, 0, 2, 1, 3, 0, 0};
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first,
- quant_param.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first, quant_param.second, input_data);
Tensor paddings_tensor = makeInputTensor<DataType::S32>({4, 2}, paddings_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
diff --git a/compiler/luci-interpreter/src/kernels/Pow.cpp b/compiler/luci-interpreter/src/kernels/Pow.cpp
index a0c092d33..722c64024 100644
--- a/compiler/luci-interpreter/src/kernels/Pow.cpp
+++ b/compiler/luci-interpreter/src/kernels/Pow.cpp
@@ -27,7 +27,7 @@ namespace kernels
{
Pow::Pow(const Tensor *input1, const Tensor *input2, Tensor *output)
- : Kernel({input1, input2}, {output})
+ : Kernel({input1, input2}, {output})
{
}
@@ -59,7 +59,7 @@ template <typename T> void Pow::eval() const
tflite::ArithmeticParams params{};
const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), &params);
+ getTensorShape(input1()), getTensorShape(input2()), &params);
if (need_broadcast)
{
diff --git a/compiler/luci-interpreter/src/kernels/Prelu.cpp b/compiler/luci-interpreter/src/kernels/Prelu.cpp
index e658d87b5..c4b288f1b 100644
--- a/compiler/luci-interpreter/src/kernels/Prelu.cpp
+++ b/compiler/luci-interpreter/src/kernels/Prelu.cpp
@@ -30,24 +30,58 @@ namespace kernels
{
Prelu::Prelu(const Tensor *input, const Tensor *alpha, Tensor *output)
- : Kernel({input, alpha}, {output})
+ : Kernel({input, alpha}, {output})
{
}
+Prelu::~Prelu()
+{
+  // Defined out of line so the vector of forward-declared ChannelQuantMultipliers is destroyed properly
+}
+
void Prelu::configure()
{
LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
LUCI_INTERPRETER_CHECK(alpha()->element_type() == output()->element_type());
+ LUCI_INTERPRETER_CHECK(input()->scales().size() <= 1);
+ LUCI_INTERPRETER_CHECK(output()->scales().size() <= 1);
- if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
+ if (input()->element_type() == DataType::U8)
{
- if (input()->element_type() == DataType::S16)
+ LUCI_INTERPRETER_CHECK(alpha()->scales().size() <= 1); // remove when CWQ kernel arrives
+ _alpha_multipliers.resize(1);
+ double alpha_multiplier = input()->scale() * alpha()->scale() / output()->scale();
+ quantizeMultiplier(alpha_multiplier, &_alpha_multipliers[0].multiplier,
+ &_alpha_multipliers[0].shift);
+ double identity_multiplier = input()->scale() / output()->scale();
+ quantizeMultiplier(identity_multiplier, &_output_multiplier_identity, &_output_shift_identity);
+ }
+ else if (input()->element_type() == DataType::S16)
+ {
+ // Common check for correctness of quant params
+ LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && output()->zero_point() == 0);
+ for (size_t channel = 0; channel < alpha()->zero_points().size(); ++channel)
{
- LUCI_INTERPRETER_CHECK(input()->zero_point() == 0 && alpha()->zero_point() == 0 &&
- output()->zero_point() == 0);
+ LUCI_INTERPRETER_CHECK(alpha()->zero_points()[channel] == 0);
}
- double alpha_multiplier = input()->scale() * alpha()->scale() / output()->scale();
- quantizeMultiplier(alpha_multiplier, &_output_multiplier_alpha, &_output_shift_alpha);
+ // Prelu specific checks for CWQ
+ LUCI_INTERPRETER_CHECK(alpha()->quantized_dimension() == alpha()->shape().num_dims() - 1);
+ LUCI_INTERPRETER_CHECK(static_cast<int32_t>(alpha()->scales().size()) ==
+ alpha()->shape().dim(alpha()->quantized_dimension()));
+ LUCI_INTERPRETER_CHECK(alpha()->shape().num_elements() ==
+ input()->shape().dim(input()->shape().num_dims() - 1));
+
+  // All dimensions of alpha except the last one must have size 1
+ for (int dim = 0; dim < alpha()->shape().num_dims() - 1; ++dim)
+ {
+ LUCI_INTERPRETER_CHECK(alpha()->shape().dim(dim) == 1);
+ }
+
+ std::vector<double> real_multipliers =
+ getQuantizedConvolutionMultiplers(input()->scale(), alpha()->scales(), output()->scale());
+
+ _alpha_multipliers = quantizeMultipliers(real_multipliers);
+
double identity_multiplier = input()->scale() / output()->scale();
quantizeMultiplier(identity_multiplier, &_output_multiplier_identity, &_output_shift_identity);
}
@@ -84,9 +118,9 @@ void Prelu::evalFloat() const
if (input()->shape() != alpha()->shape())
{
tflite::reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
- getTensorShape(input()), getTensorData<float>(input()), getTensorShape(alpha()),
- getTensorData<float>(alpha()), getTensorShape(output()), getTensorData<float>(output()),
- PreluFunc);
+ getTensorShape(input()), getTensorData<float>(input()), getTensorShape(alpha()),
+ getTensorData<float>(alpha()), getTensorShape(output()), getTensorData<float>(output()),
+ PreluFunc);
}
else
{
@@ -109,44 +143,66 @@ void Prelu::evalQuantized() const
op_params.output_offset = output()->zero_point();
op_params.output_shift_1 = _output_shift_identity;
op_params.output_multiplier_1 = _output_multiplier_identity;
- op_params.output_shift_2 = _output_shift_alpha;
- op_params.output_multiplier_2 = _output_multiplier_alpha;
+ op_params.output_shift_2 = _alpha_multipliers[0].shift;
+ op_params.output_multiplier_2 = _alpha_multipliers[0].multiplier;
if (input()->shape() != alpha()->shape())
{
tflite::reference_ops::BroadcastPrelu4DSlow(
- op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
- getTensorShape(alpha()), getTensorData<uint8_t>(alpha()), getTensorShape(output()),
- getTensorData<uint8_t>(output()));
+ op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(alpha()),
+ getTensorData<uint8_t>(alpha()), getTensorShape(output()), getTensorData<uint8_t>(output()));
}
else
{
- tflite::reference_ops::Prelu<uint8_t>(op_params, getTensorShape(input()),
- getTensorData<uint8_t>(input()), getTensorShape(alpha()),
- getTensorData<uint8_t>(alpha()), getTensorShape(output()),
- getTensorData<uint8_t>(output()));
+ tflite::reference_ops::Prelu<uint8_t>(
+ op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(alpha()),
+ getTensorData<uint8_t>(alpha()), getTensorShape(output()), getTensorData<uint8_t>(output()));
}
}
-void Prelu::evalQuantizedS16() const
+static inline int16_t evalElemS16Prelu(int16_t input_val, int16_t alpha_val,
+ const ChannelQuantMultipliers &identity_mult,
+ const ChannelQuantMultipliers &alpha_mult)
{
constexpr int32_t quantized_min = std::numeric_limits<int16_t>::min();
constexpr int32_t quantized_max = std::numeric_limits<int16_t>::max();
- auto fn = [this, quantized_min, quantized_max](int16_t input_val, int16_t alpha_val) {
- const int32_t output_val =
- input_val >= 0
- ? tflite::MultiplyByQuantizedMultiplier(input_val, _output_multiplier_identity,
- _output_shift_identity)
- : tflite::MultiplyByQuantizedMultiplier(input_val * alpha_val, _output_multiplier_alpha,
- _output_shift_alpha);
- const int32_t clamped_output = std::min(quantized_max, std::max(quantized_min, output_val));
- return static_cast<int16_t>(clamped_output);
- };
-
- BinaryOpBroadcastSlow(getTensorShape(input()), getTensorData<int16_t>(input()),
- getTensorShape(alpha()), getTensorData<int16_t>(alpha()),
- getTensorShape(output()), getTensorData<int16_t>(output()), fn);
+ const int32_t output_val =
+ input_val >= 0 ? tflite::MultiplyByQuantizedMultiplier(input_val, identity_mult.multiplier,
+ identity_mult.shift)
+ : tflite::MultiplyByQuantizedMultiplier(input_val * alpha_val,
+ alpha_mult.multiplier, alpha_mult.shift);
+ const int32_t clamped_output = std::min(quantized_max, std::max(quantized_min, output_val));
+  return static_cast<int16_t>(clamped_output);
+}
+
+void Prelu::evalQuantizedS16() const
+{
+ // Note that this kernel assumes alpha is CWQ
+ tflite::RuntimeShape input_shape = getTensorShape(input());
+ const int16_t *input_data = input()->data<int16_t>();
+ const int16_t *alpha_data = alpha()->data<int16_t>();
+ int16_t *output_data = output()->data<int16_t>();
+
+ const ChannelQuantMultipliers pos_mult{_output_shift_identity, _output_multiplier_identity};
+
+ const int last_dim = input()->shape().num_dims() - 1;
+
+ int32_t outer_dims_size = 1;
+ for (int i = 0; i < last_dim; ++i)
+ outer_dims_size *= input_shape.Dims(i);
+ int32_t quant_dim_size = input_shape.Dims(last_dim);
+
+ for (int32_t outer_dims = 0; outer_dims < outer_dims_size; ++outer_dims)
+ for (int32_t quant_channel = 0; quant_channel < quant_dim_size; ++quant_channel)
+ {
+ const ChannelQuantMultipliers &neg_mult = _alpha_multipliers[quant_channel];
+ size_t offset = static_cast<size_t>(outer_dims) * static_cast<size_t>(quant_dim_size);
+ offset += quant_channel;
+
+ output_data[offset] =
+ evalElemS16Prelu(input_data[offset], alpha_data[quant_channel], pos_mult, neg_mult);
+ }
}
} // namespace kernels
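The rescale factors precomputed in configure() above are worth making concrete: for
S16 CWQ PReLU, the negative branch uses input_scale * alpha_scale[c] / output_scale
per channel, while the positive branch reuses one identity factor
input_scale / output_scale; quantizeMultiplier() then encodes each real factor as a
fixed-point multiplier/shift pair. A small sketch using the scales of the
SInt16_CWQ_Simple test further below:

#include <cstdio>
#include <vector>

int main()
{
  const double input_scale = 0.1;
  const std::vector<double> alpha_scales{0.05, 0.025}; // one scale per channel
  const double output_scale = 0.025;

  // Positive inputs: plain rescale from input units to output units.
  std::printf("identity multiplier: %g\n", input_scale / output_scale); // 4

  // Negative inputs: input * alpha, rescaled per channel.
  for (double alpha_scale : alpha_scales)
    std::printf("alpha multiplier: %g\n", input_scale * alpha_scale / output_scale); // 0.2, 0.1
}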
diff --git a/compiler/luci-interpreter/src/kernels/Prelu.h b/compiler/luci-interpreter/src/kernels/Prelu.h
index c7911a63f..08cb0eaa5 100644
--- a/compiler/luci-interpreter/src/kernels/Prelu.h
+++ b/compiler/luci-interpreter/src/kernels/Prelu.h
@@ -18,17 +18,22 @@
#define LUCI_INTERPRETER_KERNELS_PRELU_H
#include "core/Kernel.h"
+#include <vector>
namespace luci_interpreter
{
namespace kernels
{
+class ChannelQuantMultipliers;
+
class Prelu : public Kernel
{
public:
Prelu(const Tensor *input, const Tensor *alpha, Tensor *output);
+ ~Prelu();
+
const Tensor *input() const { return _inputs[0]; }
const Tensor *alpha() const { return _inputs[1]; }
Tensor *output() const { return _outputs[0]; }
@@ -42,8 +47,8 @@ private:
void evalQuantizedS16() const;
private:
- int32_t _output_multiplier_alpha = 0;
- int32_t _output_shift_alpha = 0;
+ std::vector<ChannelQuantMultipliers> _alpha_multipliers;
+  // TODO: merge these into a single ChannelQuantMultipliers object
int32_t _output_multiplier_identity = 0;
int32_t _output_shift_identity = 0;
};
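The otherwise empty ~Prelu() declared above is the usual idiom for holding a
std::vector of a type that is only forward-declared in the header: the vector's
destructor needs the complete type, so it must be instantiated in the .cpp. A minimal
sketch of the idiom, with hypothetical Widget/Impl names:

// widget.h
#include <vector>

class Impl; // forward declaration, like ChannelQuantMultipliers above

class Widget
{
public:
  Widget();
  ~Widget(); // defined where Impl is complete

private:
  std::vector<Impl> _impls; // a vector of an incomplete type is allowed since C++17
};

// widget.cpp
class Impl
{
public:
  int value = 0;
};

Widget::Widget() = default;
Widget::~Widget() = default; // vector<Impl> is destroyed here, with Impl complete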
diff --git a/compiler/luci-interpreter/src/kernels/Prelu.test.cpp b/compiler/luci-interpreter/src/kernels/Prelu.test.cpp
index 30702c826..9d9adf66f 100644
--- a/compiler/luci-interpreter/src/kernels/Prelu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Prelu.test.cpp
@@ -52,18 +52,18 @@ TEST(PreluTest, FloatSimple)
/*output_shape=*/{2, 3},
/*input_data=*/
{
- 0.0f, 1.0f, 3.0f, // Row 1
- 1.0f, -1.0f, -2.0f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 1.0f, -1.0f, -2.0f, // Row 2
},
/*alpha_data=*/
{
- 0.0f, 0.5f, 0.1f, // Row 1
- 0.0f, 0.5f, 0.1f, // Row 2
+ 0.0f, 0.5f, 0.1f, // Row 1
+ 0.0f, 0.5f, 0.1f, // Row 2
},
/*output_data=*/
{
- 0.0f, 1.0f, 3.0f, // Row 1
- 1.0f, -0.5f, -0.2f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 1.0f, -0.5f, -0.2f, // Row 2
});
SUCCEED();
@@ -75,19 +75,19 @@ TEST(PreluTest, FloatBroadcast)
/*output_shape=*/{1, 2, 2, 3},
/*input_data=*/
{
- 0.0f, 0.0f, 0.0f, // Row 1, Column 1
- 1.0f, 1.0f, 1.0f, // Row 1, Column 2
- -1.0f, -1.0f, -1.0f, // Row 2, Column 1
- -2.0f, -2.0f, -2.0f, // Row 2, Column 2
+ 0.0f, 0.0f, 0.0f, // Row 1, Column 1
+ 1.0f, 1.0f, 1.0f, // Row 1, Column 2
+ -1.0f, -1.0f, -1.0f, // Row 2, Column 1
+ -2.0f, -2.0f, -2.0f, // Row 2, Column 2
},
/*alpha_data=*/
{0.0f, 1.0f, 2.0f},
/*output_data=*/
{
- 0.0f, 0.0f, 0.0f, // Row 1, Column 1
- 1.0f, 1.0f, 1.0f, // Row 1, Column 2
- 0.0f, -1.0f, -2.0f, // Row 2, Column 1
- 0.0f, -2.0f, -4.0f, // Row 2, Column 2
+ 0.0f, 0.0f, 0.0f, // Row 1, Column 1
+ 1.0f, 1.0f, 1.0f, // Row 1, Column 2
+ 0.0f, -1.0f, -2.0f, // Row 2, Column 1
+ 0.0f, -2.0f, -4.0f, // Row 2, Column 2
});
SUCCEED();
@@ -104,10 +104,10 @@ TEST(PreluTest, Uint8Simple)
float kQuantizedTolerance = GetTolerance(-1.0, 1.0);
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-1.0f, 1.0f);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first,
- quant_param.second, input_data);
- Tensor alpha_tensor = makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first,
- quant_param.second, alpha_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first, quant_param.second, input_data);
+ Tensor alpha_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 3, 1}, quant_param.first, quant_param.second, alpha_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
@@ -124,33 +124,33 @@ TEST(PreluTest, Uint8Simple)
TEST(PreluTest, Uint8Broadcast)
{
std::vector<float> input_data{
- 0.0f, 0.0f, 0.0f, // Row 1, Column 1
- 0.5f, 0.5f, 0.5f, // Row 1, Column 2
- -1.0f, -1.0f, -1.0f, // Row 2, Column 1
- -0.25f, -0.25f, -0.25f, // Row 2, Column 2
+ 0.0f, 0.0f, 0.0f, // Row 1, Column 1
+ 0.5f, 0.5f, 0.5f, // Row 1, Column 2
+ -1.0f, -1.0f, -1.0f, // Row 2, Column 1
+ -0.25f, -0.25f, -0.25f, // Row 2, Column 2
};
std::vector<float> alpha_data{0.0f, 0.5f, -0.5f};
std::vector<float> ref_output_data{
- 0.0f, 0.0f, 0.0f, // Row 1, Column 1
- 0.5f, 0.5f, 0.5f, // Row 1, Column 2
- 0.0f, -0.5f, 0.5f, // Row 2, Column 1
- 0.0f, -0.125f, 0.125f // Row 2, Column 2
+ 0.0f, 0.0f, 0.0f, // Row 1, Column 1
+ 0.5f, 0.5f, 0.5f, // Row 1, Column 2
+ 0.0f, -0.5f, 0.5f, // Row 2, Column 1
+ 0.0f, -0.125f, 0.125f // Row 2, Column 2
};
std::vector<float> ref_quant_output_data{
- 128, 128, 128, // Row 1, Column 1
- 192, 192, 192, // Row 1, Column 2
- 128, 64, 192, // Row 2, Column 1
- 128, 112, 144 // Row 2, Column 2
+ 128, 128, 128, // Row 1, Column 1
+ 192, 192, 192, // Row 1, Column 2
+ 128, 64, 192, // Row 2, Column 1
+ 128, 112, 144 // Row 2, Column 2
};
float kQuantizedTolerance = 2 * (1. / 256);
const float kMin = -1;
const float kMax = 127.f / 128.f;
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(kMin, kMax);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 2, 3}, quant_param.first,
- quant_param.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 2, 3}, quant_param.first, quant_param.second, input_data);
Tensor alpha_tensor =
- makeInputTensor<DataType::U8>({1, 1, 3}, quant_param.first, quant_param.second, alpha_data);
+ makeInputTensor<DataType::U8>({1, 1, 3}, quant_param.first, quant_param.second, alpha_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
@@ -164,42 +164,114 @@ TEST(PreluTest, Uint8Broadcast)
::testing::ElementsAreArray(ref_quant_output_data));
}
-TEST(PreluTest, SInt16Simple)
+TEST(PreluTest, SInt16_LWQ_NEG)
{
- std::vector<float> input_data{-0.8f, 0.2f, 0.9f, 0.7f, 0.1f, -0.4f};
- std::vector<float> alpha_data{0.5f, 0.5f, 0.5f, 0.25f, 1.0f, 0.25f};
- std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, 0.7f, 0.1f, -0.1f};
+  // Rewrite this test if layer-wise quantization for sint16 becomes supported
+ std::vector<float> input_data(6); // data is not important
+ std::vector<float> alpha_data(6);
Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, input_data);
Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 2, 3, 1}, 0.1, 0, alpha_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PreluTest, SInt16_CWQ_Simple)
+{
+ std::vector<float> input_data{-0.8f, 0.2f, 0.9f, -0.7f, 0.1f, -0.4f};
+ std::vector<float> alpha_data{0.5f, 0.25f};
+ std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, -0.175f, 0.1f, -0.1f};
+
+ std::vector<float> alpha_scales{0.05f, 0.025f};
+ std::vector<int32_t> zerop{0, 0};
+ Tensor input_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data);
+ Tensor alpha_tensor = makeInputTensor<DataType::S16>({2}, alpha_scales, zerop, 0, alpha_data);
+ Tensor output_tensor = makeOutputTensor(DataType::S16, 0.025, 0);
+
+ Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
kernel.configure();
kernel.execute();
- EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 2, 3, 1}));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 3, 2}));
+ EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(PreluTest, SInt16_CWQ_spatial_alpha_NEG)
+{
+ std::vector<float> input_data(6); // data is not important
+ std::vector<float> alpha_data(6);
+
+ std::vector<float> alpha_scales{0.25f, 0.05f};
+ std::vector<int32_t> zerop{0, 0};
+ Tensor input_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data);
+ Tensor alpha_tensor =
+ makeInputTensor<DataType::S16>({1, 1, 3, 2}, alpha_scales, zerop, 3, alpha_data);
+ Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
+
+ Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PreluTest, SInt16_CWQ_wrong_dim_quant_NEG)
+{
+ std::vector<float> input_data(6); // data is not important
+ std::vector<float> alpha_data(6);
+
+ std::vector<float> alpha_scales{0.25f};
+ std::vector<int32_t> zerop{0};
+ Tensor input_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data);
+ Tensor alpha_tensor =
+ makeInputTensor<DataType::S16>({1, 1, 1, 2}, alpha_scales, zerop, 1, alpha_data);
+ Tensor output_tensor = makeOutputTensor(DataType::S16, 0.1, 0);
+
+ Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PreluTest, SInt16_CWQ_uneven_shape1)
+{
+ std::vector<float> input_data{-0.8f, 0.2f, 0.9f, -0.7f, 0.1f, -0.4f};
+ std::vector<float> alpha_data{0.5f, 0.25f};
+ std::vector<float> ref_output_data{-0.4f, 0.2f, 0.9f, -0.175f, 0.1f, -0.1f};
+
+ std::vector<float> alpha_scales{0.05f, 0.025f};
+ std::vector<int32_t> zerop{0, 0};
+ Tensor input_tensor = makeInputTensor<DataType::S16>({1, 1, 3, 2}, 0.1, 0, input_data);
+ Tensor alpha_tensor =
+ makeInputTensor<DataType::S16>({1, 1, 2}, alpha_scales, zerop, 2, alpha_data);
+ Tensor output_tensor = makeOutputTensor(DataType::S16, 0.025, 0);
+
+ Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray({1, 1, 3, 2}));
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data));
}
-TEST(PreluTest, SInt16Broadcast)
+TEST(PreluTest, SInt16_CWQ_uneven_shape2)
{
std::vector<float> input_data{
- 0.0f, 0.0f, 0.0f, // Row 1, Column 1
- 0.5f, 0.5f, 0.5f, // Row 1, Column 2
- -1.0f, -1.0f, -1.0f, // Row 2, Column 1
- -0.25f, -0.25f, -0.25f, // Row 2, Column 2
+ 0.0f, 0.0f, 0.0f, // Row 1, Column 1
+ 0.5f, 0.5f, 0.5f, // Row 1, Column 2
+ -1.0f, -1.0f, -1.0f, // Row 2, Column 1
+ -0.25f, -0.25f, -0.25f, // Row 2, Column 2
};
std::vector<float> alpha_data{0.0f, 0.5f, -0.5f};
std::vector<float> ref_output_data{
- 0.0f, 0.0f, 0.0f, // Row 1, Column 1
- 0.5f, 0.5f, 0.5f, // Row 1, Column 2
- 0.0f, -0.5f, 0.5f, // Row 2, Column 1
- 0.0f, -0.125f, 0.125f // Row 2, Column 2
+ 0.0f, 0.0f, 0.0f, // Row 1, Column 1
+ 0.5f, 0.5f, 0.5f, // Row 1, Column 2
+ 0.0f, -0.5f, 0.5f, // Row 2, Column 1
+ 0.0f, -0.125f, 0.125f // Row 2, Column 2
};
+ std::vector<float> alpha_scales{1.f, 0.05f, 0.1f};
+ std::vector<int32_t> zerop{0, 0, 0};
Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 2, 3}, 0.01, 0, input_data);
- Tensor alpha_tensor = makeInputTensor<DataType::S16>({1, 1, 3}, 0.1, 0, alpha_data);
+ Tensor alpha_tensor =
+ makeInputTensor<DataType::S16>({1, 1, 1, 3}, alpha_scales, zerop, 3, alpha_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, 0.001, 0);
Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
@@ -241,6 +313,43 @@ TEST(PreluTest, Invalid_Input_Type_NEG)
EXPECT_ANY_THROW(kernel.execute());
}
+TEST(PreluTest, Input_Output_U8_CWQ_NEG)
+{
+ std::vector<float> scales{1.f, 1.f};
+ std::vector<int32_t> zerop{0, 0};
+ std::vector<float> dummy_data(4, 0.f);
+ Tensor input_tensor = makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data);
+ Tensor alpha_tensor = makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data);
+ Tensor output_tensor = makeInputTensor<DataType::U8>({2, 2}, scales, zerop, 0, dummy_data);
+
+ Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PreluTest, Input_Output_S16_CWQ_NEG)
+{
+ std::vector<float> scales{1.f, 1.f};
+ std::vector<int32_t> zerop{0, 0};
+ std::vector<float> dummy_data(4, 0.f);
+ Tensor input_tensor = makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data);
+ Tensor alpha_tensor = makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data);
+ Tensor output_tensor = makeInputTensor<DataType::S16>({2, 2}, scales, zerop, 0, dummy_data);
+
+ Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(PreluTest, Mixing_U8_S16_NEG)
+{
+ std::vector<float> dummy_data(4, 0.f);
+ Tensor input_tensor = makeInputTensor<DataType::U8>({2, 2}, 1.f, 0, dummy_data);
+ Tensor alpha_tensor = makeInputTensor<DataType::S16>({2, 2}, 1.f, 0, dummy_data);
+ Tensor output_tensor = makeInputTensor<DataType::U8>({2, 2}, 1.f, 0, dummy_data);
+
+ Prelu kernel(&input_tensor, &alpha_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
} // namespace
} // namespace kernels
} // namespace luci_interpreter
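Taken together, the CWQ NEG tests above pin down the alpha contract that
Prelu::configure() enforces: alpha is quantized along its last dimension, carries
exactly one scale per channel, matches the input's channel count, and has size 1 in
every other dimension. A condensed sketch of those checks (a hypothetical helper,
not the kernel's code):

#include <cstddef>
#include <stdexcept>
#include <vector>

void checkAlphaCWQ(const std::vector<int> &alpha_shape, std::size_t num_scales,
                   int quantized_dim, int input_channels)
{
  const int last = static_cast<int>(alpha_shape.size()) - 1;
  if (quantized_dim != last)
    throw std::runtime_error("alpha must be quantized along its last dimension");
  if (static_cast<int>(num_scales) != alpha_shape[last])
    throw std::runtime_error("need exactly one scale per channel");
  if (alpha_shape[last] != input_channels)
    throw std::runtime_error("alpha channels must match input channels");
  for (int d = 0; d < last; ++d)
    if (alpha_shape[d] != 1)
      throw std::runtime_error("non-channel dimensions of alpha must have size 1");
}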
diff --git a/compiler/luci-interpreter/src/kernels/Relu.cpp b/compiler/luci-interpreter/src/kernels/Relu.cpp
index a2e02d708..b5acf1d60 100644
--- a/compiler/luci-interpreter/src/kernels/Relu.cpp
+++ b/compiler/luci-interpreter/src/kernels/Relu.cpp
@@ -82,7 +82,7 @@ void Relu::evalQuantized() const
params.output_shift = _output_shift;
params.quantized_activation_min =
- std::max(static_cast<int32_t>(std::numeric_limits<uint8_t>::min()), params.output_offset);
+ std::max(static_cast<int32_t>(std::numeric_limits<uint8_t>::min()), params.output_offset);
params.quantized_activation_max = static_cast<int32_t>(std::numeric_limits<uint8_t>::max());
tflite::optimized_ops::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
@@ -103,7 +103,7 @@ void Relu::evalQuantizedS16() const
{
const int32_t input_val = input_data[i];
int32_t output_val =
- tflite::MultiplyByQuantizedMultiplier(input_val, _output_multiplier, _output_shift);
+ tflite::MultiplyByQuantizedMultiplier(input_val, _output_multiplier, _output_shift);
output_val = std::max(output_val, output_min);
output_val = std::min(output_val, output_max);
output_data[i] = static_cast<int16_t>(output_val);
diff --git a/compiler/luci-interpreter/src/kernels/Relu.test.cpp b/compiler/luci-interpreter/src/kernels/Relu.test.cpp
index cabefa733..6623a5b77 100644
--- a/compiler/luci-interpreter/src/kernels/Relu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Relu.test.cpp
@@ -30,13 +30,13 @@ using namespace testing;
TEST(ReluTest, FloatSimple)
{
std::vector<float> input_data{
- 0.0f, 1.0f, 3.0f, // Row 1
- 1.0f, -1.0f, -2.0f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 1.0f, -1.0f, -2.0f, // Row 2
};
std::vector<float> ref_output_data{
- 0.0f, 1.0f, 3.0f, // Row 1
- 1.0f, 0.0f, 0.0f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 1.0f, 0.0f, 0.0f, // Row 2
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input_data);
@@ -54,16 +54,16 @@ TEST(ReluTest, FloatSimple)
TEST(ReluTest, Uint8Quantized)
{
std::vector<float> input_data{
- 0, -6, 2, 4, //
- 3, -2, 7, 1, //
+ 0, -6, 2, 4, //
+ 3, -2, 7, 1, //
};
// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
const float f_min = (-128.0 / 128.0) * 8;
const float f_max = (127.0 / 128.0) * 8;
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(f_min, f_max);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
- quant_param.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
Relu kernel(&input_tensor, &output_tensor);
@@ -79,8 +79,8 @@ TEST(ReluTest, Uint8Quantized)
TEST(ReluTest, Uint8Requantized)
{
std::vector<float> input_data{
- 0, -6, 2, 4, //
- 3, -2, 7, 1, //
+ 0, -6, 2, 4, //
+ 3, -2, 7, 1, //
};
// Choose min / max in such a way that there are exactly 256 units to avoid rounding errors.
@@ -90,8 +90,8 @@ TEST(ReluTest, Uint8Requantized)
const float out_max = (255.0 / 256.0) * 8;
std::pair<float, int32_t> quant_input = quantizationParams<uint8_t>(in_min, in_max);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_input.first,
- quant_input.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_input.first, quant_input.second, input_data);
std::pair<float, int32_t> quant_output = quantizationParams<uint8_t>(out_min, out_max);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_output.first, quant_output.second);
@@ -109,12 +109,12 @@ TEST(ReluTest, Uint8Requantized)
TEST(ReluTest, SInt16)
{
std::vector<float> input_data{
- 0, -6, 2, 4, //
- 3, -2, 7, 1, //
+ 0, -6, 2, 4, //
+ 3, -2, 7, 1, //
};
std::vector<float> ref_output_data{
- 0, 0, 2, 4, //
- 3, 0, 7, 1, //
+ 0, 0, 2, 4, //
+ 3, 0, 7, 1, //
};
Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 4, 1}, 0.5, 0, input_data);
diff --git a/compiler/luci-interpreter/src/kernels/Relu6.cpp b/compiler/luci-interpreter/src/kernels/Relu6.cpp
index 1046ef27b..fa7aa504a 100644
--- a/compiler/luci-interpreter/src/kernels/Relu6.cpp
+++ b/compiler/luci-interpreter/src/kernels/Relu6.cpp
@@ -75,10 +75,10 @@ void Relu6::evalQuantized() const
params.output_shift = _output_shift;
params.quantized_activation_min =
- std::max(static_cast<int32_t>(std::numeric_limits<uint8_t>::min()), params.output_offset);
+ std::max(static_cast<int32_t>(std::numeric_limits<uint8_t>::min()), params.output_offset);
params.quantized_activation_max =
- std::min(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()),
- params.output_offset + static_cast<int32>(roundf(6.f / output()->scale())));
+ std::min(static_cast<int32_t>(std::numeric_limits<uint8_t>::max()),
+ params.output_offset + static_cast<int32>(roundf(6.f / output()->scale())));
tflite::optimized_ops::ReluX(params, getTensorShape(input()), getTensorData<uint8_t>(input()),
getTensorShape(output()), getTensorData<uint8_t>(output()));
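The clamp computed above maps the real interval [0, 6] into output units: the lower
bound is the output offset (never below uint8's 0), the upper bound is
offset + round(6 / scale), capped at 255. A worked instance with assumed example
values (output scale 6/255, zero point 0):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main()
{
  const float scale = 6.0f / 255.0f;
  const int32_t offset = 0;
  const int32_t act_min = std::max<int32_t>(0, offset);
  const int32_t act_max =
    std::min<int32_t>(255, offset + static_cast<int32_t>(std::round(6.f / scale)));
  std::printf("clamp to [%d, %d]\n", act_min, act_max); // clamp to [0, 255]
}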
diff --git a/compiler/luci-interpreter/src/kernels/Relu6.test.cpp b/compiler/luci-interpreter/src/kernels/Relu6.test.cpp
index a7f104d85..fe991389a 100644
--- a/compiler/luci-interpreter/src/kernels/Relu6.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Relu6.test.cpp
@@ -30,13 +30,13 @@ using namespace testing;
TEST(Relu6Test, FloatSimple)
{
std::vector<float> input_data{
- 0.0f, 1.0f, 3.0f, // Row 1
- 7.0f, -1.0f, -2.0f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 7.0f, -1.0f, -2.0f, // Row 2
};
std::vector<float> ref_output_data{
- 0.0f, 1.0f, 3.0f, // Row 1
- 6.0f, 0.0f, 0.0f, // Row 2
+ 0.0f, 1.0f, 3.0f, // Row 1
+ 6.0f, 0.0f, 0.0f, // Row 2
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 3}, input_data);
@@ -59,13 +59,13 @@ TEST(Relu6Test, Uint8Quantized)
const float tolerance = (f_max - f_min) / 255.0;
std::vector<float> input_data{
- 0, -6, 2, 8, //
- -2, 3, 7, 1, //
+ 0, -6, 2, 8, //
+ -2, 3, 7, 1, //
};
std::pair<float, int32_t> quant_param = quantizationParams<uint8_t>(f_min, f_max);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first,
- quant_param.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_param.first, quant_param.second, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.second);
Relu6 kernel(&input_tensor, &output_tensor);
@@ -89,13 +89,13 @@ TEST(Relu6Test, Uint8Requantized)
const float tolerance = (in_max - in_min) / 255.0;
std::vector<float> input_data{
- 0, -6, 2, 8, //
- -2, 3, 7, 1, //
+ 0, -6, 2, 8, //
+ -2, 3, 7, 1, //
};
std::pair<float, int32_t> quant_input = quantizationParams<uint8_t>(in_min, in_max);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_input.first,
- quant_input.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 4, 1}, quant_input.first, quant_input.second, input_data);
std::pair<float, int32_t> quant_output = quantizationParams<uint8_t>(out_min, out_max);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_output.first, quant_output.second);
diff --git a/compiler/luci-interpreter/src/kernels/Reshape.cpp b/compiler/luci-interpreter/src/kernels/Reshape.cpp
index d88b5392a..61d3300b2 100644
--- a/compiler/luci-interpreter/src/kernels/Reshape.cpp
+++ b/compiler/luci-interpreter/src/kernels/Reshape.cpp
@@ -65,7 +65,7 @@ static void resolveUnknownDimension(const Shape &input_shape, Shape *output_shap
}
Reshape::Reshape(const Tensor *input, const Tensor *shape, Tensor *output)
- : Kernel({input, shape}, {output})
+ : Kernel({input, shape}, {output})
{
}
diff --git a/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp b/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp
index 9385855cf..0e9bcc920 100644
--- a/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp
+++ b/compiler/luci-interpreter/src/kernels/ResizeBilinear.cpp
@@ -28,7 +28,7 @@ namespace kernels
ResizeBilinear::ResizeBilinear(const Tensor *input, const Tensor *size, Tensor *output,
const ResizeBilinearParams &params)
- : KernelWithParams<ResizeBilinearParams>({input, size}, {output}, params)
+ : KernelWithParams<ResizeBilinearParams>({input, size}, {output}, params)
{
}
@@ -57,14 +57,13 @@ void ResizeBilinear::execute() const
{
case DataType::FLOAT32:
tflite::optimized_ops::ResizeBilinear(
- op_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(size()),
- getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<float>(output()));
+ op_params, getTensorShape(input()), getTensorData<float>(input()), getTensorShape(size()),
+ getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<float>(output()));
break;
case DataType::U8:
tflite::optimized_ops::ResizeBilinear(
- op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
- getTensorShape(size()), getTensorData<int32_t>(size()), getTensorShape(output()),
- getTensorData<uint8_t>(output()));
+ op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(size()),
+ getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<uint8_t>(output()));
break;
default:
throw std::runtime_error("Unsupported type.");
diff --git a/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp b/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
index 51c1359da..68ef6e6c1 100644
--- a/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/ResizeBilinear.test.cpp
@@ -88,19 +88,19 @@ TYPED_TEST(ResizeBilinearTest, SimpleTest)
{
Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
{
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
},
{3, 3},
{
- 3, 5, 6, //
- 7, 9, 10, //
- 9, 11, 12, //
- 4, 8, 10, //
- 8, 12, 14, //
- 10, 14, 16, //
+ 3, 5, 6, //
+ 7, 9, 10, //
+ 9, 11, 12, //
+ 4, 8, 10, //
+ 8, 12, 14, //
+ 10, 14, 16, //
},
false, false);
SUCCEED();
@@ -110,19 +110,19 @@ TEST(ResizeBilinearTest, HalfPixelCenterFloatTest)
{
Check<float>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
{
- 1, 2, //
- 3, 4, //
- 1, 2, //
- 3, 4 //
+ 1, 2, //
+ 3, 4, //
+ 1, 2, //
+ 3, 4 //
},
{3, 3},
{
- 1, 1.5, 2, //
- 2, 2.5, 3, //
- 3, 3.5, 4, //
- 1, 1.5, 2, //
- 2, 2.5, 3, //
- 3, 3.5, 4, //
+ 1, 1.5, 2, //
+ 2, 2.5, 3, //
+ 3, 3.5, 4, //
+ 1, 1.5, 2, //
+ 2, 2.5, 3, //
+ 3, 3.5, 4, //
},
false, true);
SUCCEED();
@@ -132,19 +132,19 @@ TEST(ResizeBilinearTest, HalfPixelCenterUint8Test)
{
Check<uint8_t>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
{
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 12, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 12, 16 //
},
{3, 3},
{
- 2, 4, 6, //
- 6, 7, 9, //
- 9, 10, 12, //
- 4, 7, 10, //
- 8, 10, 13, //
- 12, 14, 16, //
+ 2, 4, 6, //
+ 6, 7, 9, //
+ 9, 10, 12, //
+ 4, 7, 10, //
+ 8, 10, 13, //
+ 12, 14, 16, //
},
false, true);
SUCCEED();
@@ -153,10 +153,10 @@ TEST(ResizeBilinearTest, HalfPixelCenterUint8Test)
TEST(ResizeBilinearTest, InputShapeInvalid_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2}, {
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
});
Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -172,10 +172,10 @@ TEST(ResizeBilinearTest, InputShapeInvalid_NEG)
TEST(ResizeBilinearTest, SizeShapeInvalid_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
});
Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -191,10 +191,10 @@ TEST(ResizeBilinearTest, SizeShapeInvalid_NEG)
TEST(ResizeBilinearTest, SizeDimInvalid_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
});
Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -210,10 +210,10 @@ TEST(ResizeBilinearTest, SizeDimInvalid_NEG)
TEST(ResizeBilinearTest, InvalidParams_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
});
Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
diff --git a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
index e4ad8f742..c52264997 100644
--- a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
+++ b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.cpp
@@ -30,7 +30,7 @@ namespace kernels
ResizeNearestNeighbor::ResizeNearestNeighbor(const Tensor *input, const Tensor *size,
Tensor *output,
const ResizeNearestNeighborParams &params)
- : KernelWithParams<ResizeNearestNeighborParams>({input, size}, {output}, params)
+ : KernelWithParams<ResizeNearestNeighborParams>({input, size}, {output}, params)
{
}
@@ -57,15 +57,13 @@ void ResizeNearestNeighbor::execute() const
{
case DataType::FLOAT32:
tflite::reference_ops::ResizeNearestNeighbor(
- op_params, getTensorShape(input()), getTensorData<int32_t>(input()),
- getTensorShape(size()), getTensorData<int32_t>(size()), getTensorShape(output()),
- getTensorData<int32_t>(output()));
+ op_params, getTensorShape(input()), getTensorData<int32_t>(input()), getTensorShape(size()),
+ getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<int32_t>(output()));
break;
case DataType::U8:
tflite::optimized_ops::ResizeNearestNeighbor(
- op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
- getTensorShape(size()), getTensorData<int32_t>(size()), getTensorShape(output()),
- getTensorData<uint8_t>(output()));
+ op_params, getTensorShape(input()), getTensorData<uint8_t>(input()), getTensorShape(size()),
+ getTensorData<int32_t>(size()), getTensorShape(output()), getTensorData<uint8_t>(output()));
break;
default:
throw std::runtime_error("Unsupported type.");
diff --git a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
index 9a804cca7..0b36a29af 100644
--- a/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/ResizeNearestNeighbor.test.cpp
@@ -59,10 +59,10 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
bool half_pixel_centers)
{
std::pair<float, int32_t> quant_param =
- quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
- std::max(input_data) > 0 ? std::max(input_data) : 0.f);
+ quantizationParams<uint8_t>(std::min(input_data) < 0 ? std::min(input_data) : 0.f,
+ std::max(input_data) > 0 ? std::max(input_data) : 0.f);
Tensor input_tensor =
- makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
+ makeInputTensor<DataType::U8>(input_shape, quant_param.first, quant_param.second, input_data);
Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, quant_param.first, quant_param.first);
@@ -90,19 +90,19 @@ TYPED_TEST(ResizeNearestNeighborTest, SimpleTest)
{
Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
{
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
},
{3, 3},
{
- 3, 3, 6, //
- 3, 3, 6, //
- 9, 9, 12, //
- 4, 4, 10, //
- 4, 4, 10, //
- 10, 10, 16, //
+ 3, 3, 6, //
+ 3, 3, 6, //
+ 9, 9, 12, //
+ 4, 4, 10, //
+ 4, 4, 10, //
+ 10, 10, 16, //
},
false, false);
}
@@ -111,19 +111,19 @@ TYPED_TEST(ResizeNearestNeighborTest, AlignCenterTest)
{
Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
{
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
},
{3, 3},
{
- 3, 6, 6, //
- 9, 12, 12, //
- 9, 12, 12, //
- 4, 10, 10, //
- 10, 16, 16, //
- 10, 16, 16, //
+ 3, 6, 6, //
+ 9, 12, 12, //
+ 9, 12, 12, //
+ 4, 10, 10, //
+ 10, 16, 16, //
+ 10, 16, 16, //
},
true, false);
}
@@ -132,19 +132,19 @@ TYPED_TEST(ResizeNearestNeighborTest, HalfPixelCenterTest)
{
Check<TypeParam>({2, 2, 2, 1}, {2}, {2, 3, 3, 1},
{
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
},
{3, 3},
{
- 3, 6, 6, //
- 9, 12, 12, //
- 9, 12, 12, //
- 4, 10, 10, //
- 10, 16, 16, //
- 10, 16, 16, //
+ 3, 6, 6, //
+ 9, 12, 12, //
+ 9, 12, 12, //
+ 4, 10, 10, //
+ 10, 16, 16, //
+ 10, 16, 16, //
},
false, true);
}
@@ -152,10 +152,10 @@ TYPED_TEST(ResizeNearestNeighborTest, HalfPixelCenterTest)
TEST(ResizeNearestNeighborTest, InputShapeInvalid_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2}, {
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
});
Tensor size_tensor = makeInputTensor<DataType::S32>({2}, {3, 3});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -171,10 +171,10 @@ TEST(ResizeNearestNeighborTest, InputShapeInvalid_NEG)
TEST(ResizeNearestNeighborTest, SizeShapeInvalid_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
});
Tensor size_tensor = makeInputTensor<DataType::S32>({2, 1}, {3, 3});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -190,10 +190,10 @@ TEST(ResizeNearestNeighborTest, SizeShapeInvalid_NEG)
TEST(ResizeNearestNeighborTest, SizeDimInvalid_NEG)
{
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 2, 2, 1}, {
- 3, 6, //
- 9, 12, //
- 4, 10, //
- 10, 16 //
+ 3, 6, //
+ 9, 12, //
+ 4, 10, //
+ 10, 16 //
});
Tensor size_tensor = makeInputTensor<DataType::S32>({3}, {3, 3, 1});
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
diff --git a/compiler/luci-interpreter/src/kernels/Reverse.cpp b/compiler/luci-interpreter/src/kernels/Reverse.cpp
index a46308412..e9893fadc 100644
--- a/compiler/luci-interpreter/src/kernels/Reverse.cpp
+++ b/compiler/luci-interpreter/src/kernels/Reverse.cpp
@@ -25,7 +25,7 @@ namespace kernels
{
Reverse::Reverse(const Tensor *input, const Tensor *axes, Tensor *output)
- : Kernel({input, axes}, {output})
+ : Kernel({input, axes}, {output})
{
}
@@ -69,8 +69,8 @@ void Reverse::execute() const
break;
case DataType::U8:
tflite::reference_ops::Reverse<uint8_t>(
- axis_value, getTensorShape(input()), getTensorData<uint8_t>(input()),
- getTensorShape(output()), getTensorData<uint8_t>(output()));
+ axis_value, getTensorShape(input()), getTensorData<uint8_t>(input()),
+ getTensorShape(output()), getTensorData<uint8_t>(output()));
break;
default:
throw std::runtime_error("Unsupported output type");
diff --git a/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp b/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
index d33b800be..b93a04ddd 100644
--- a/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Rsqrt.test.cpp
@@ -43,17 +43,17 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
TEST(RsqrtTest, SimpleRsqrt)
{
Check(
- /*input_shape=*/{1, 2, 4, 1}, /*output_shape=*/{1, 2, 4, 1},
- /*input_data=*/
- {
- 5, 4, 8, 2, //
- 6, 7.5, 9, 0.3, //
- },
- /*output_data=*/
- {
- 0.44721360, 0.5, 0.35355339, 0.70710678, //
- 0.40824829, 0.36514837, 0.33333333, 1.8257419, //
- });
+ /*input_shape=*/{1, 2, 4, 1}, /*output_shape=*/{1, 2, 4, 1},
+ /*input_data=*/
+ {
+ 5, 4, 8, 2, //
+ 6, 7.5, 9, 0.3, //
+ },
+ /*output_data=*/
+ {
+ 0.44721360, 0.5, 0.35355339, 0.70710678, //
+ 0.40824829, 0.36514837, 0.33333333, 1.8257419, //
+ });
}
TEST(RsqrtTest, Input_Output_Type_NEG)
diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp
index c4bc3c57c..626521815 100644
--- a/compiler/luci-interpreter/src/kernels/Slice.cpp
+++ b/compiler/luci-interpreter/src/kernels/Slice.cpp
@@ -29,7 +29,7 @@ namespace kernels
const int max_dim = 4;
Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output)
- : Kernel({input, begin, size}, {output})
+ : Kernel({input, begin, size}, {output})
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Softmax.cpp b/compiler/luci-interpreter/src/kernels/Softmax.cpp
index 642c0ad75..8e29f53ee 100644
--- a/compiler/luci-interpreter/src/kernels/Softmax.cpp
+++ b/compiler/luci-interpreter/src/kernels/Softmax.cpp
@@ -30,7 +30,7 @@ namespace kernels
{
Softmax::Softmax(const Tensor *input, Tensor *output, const SoftmaxParams &params)
- : KernelWithParams<SoftmaxParams>({input}, {output}, params)
+ : KernelWithParams<SoftmaxParams>({input}, {output}, params)
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Softmax.test.cpp b/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
index d3d8209a5..c69a2f9cc 100644
--- a/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Softmax.test.cpp
@@ -51,15 +51,15 @@ void Check<uint8_t>(std::initializer_list<int32_t> input_shape,
std::initializer_list<float> output_data)
{
std::pair<float, int32_t> input_quant_param =
- quantizationParams<uint8_t>(std::min<float>(std::min<float>(input_data), 0.f),
- std::max<float>(std::max<float>(input_data), 0.f));
+ quantizationParams<uint8_t>(std::min<float>(std::min<float>(input_data), 0.f),
+ std::max<float>(std::max<float>(input_data), 0.f));
std::pair<float, int32_t> output_quant_param =
- quantizationParams<uint8_t>(std::min<float>(std::min<float>(output_data), 0.f),
- std::max<float>(std::max<float>(output_data), 0.f));
+ quantizationParams<uint8_t>(std::min<float>(std::min<float>(output_data), 0.f),
+ std::max<float>(std::max<float>(output_data), 0.f));
Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
input_quant_param.second, input_data);
Tensor output_tensor =
- makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+ makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
SoftmaxParams params{};
params.beta = 0.1;
@@ -84,16 +84,16 @@ TYPED_TEST(SoftmaxTest, Simple)
{
Check<TypeParam>({2, 1, 2, 3}, {2, 1, 2, 3},
{
- 5, -9, 8, //
- -7, 2, -4, //
- 1, -2, 9, //
- 3, -6, -1, //
+ 5, -9, 8, //
+ -7, 2, -4, //
+ 1, -2, 9, //
+ 3, -6, -1, //
},
{
- 0.38514, 0.09497, 0.51989, //
- 0.20792, 0.51141, 0.28067, //
- 0.25212, 0.18678, 0.56110, //
- 0.48149, 0.19576, 0.32275, //
+ 0.38514, 0.09497, 0.51989, //
+ 0.20792, 0.51141, 0.28067, //
+ 0.25212, 0.18678, 0.56110, //
+ 0.48149, 0.19576, 0.32275, //
});
}
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.cpp b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.cpp
new file mode 100644
index 000000000..2f6a47925
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SpaceToBatchND.h"
+#include "kernels/Utils.h"
+
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+#include <stdexcept>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+const int kInputMinDimensionNum = 3;
+const int kInputMaxDimensionNum = 4;
+
+} // namespace
+
+SpaceToBatchND::SpaceToBatchND(const Tensor *input, const Tensor *block_shape,
+ const Tensor *paddings, Tensor *output)
+ : Kernel({input, block_shape, paddings}, {output})
+{
+}
+
+void SpaceToBatchND::configure()
+{
+ const auto *block_shape_data = block_shape()->data<int32_t>();
+ const auto *paddings_data = paddings()->data<int32_t>();
+ LUCI_INTERPRETER_CHECK(input()->shape().num_dims() >= kInputMinDimensionNum);
+ LUCI_INTERPRETER_CHECK(input()->shape().num_dims() <= kInputMaxDimensionNum);
+ LUCI_INTERPRETER_CHECK(input()->element_type() == output()->element_type());
+
+ int spatial_dims_num = input()->shape().num_dims() - 2;
+
+ LUCI_INTERPRETER_CHECK(block_shape()->shape().num_dims() == 1);
+ LUCI_INTERPRETER_CHECK(block_shape()->shape().dim(0) == spatial_dims_num);
+
+ LUCI_INTERPRETER_CHECK(paddings()->shape().num_dims() == 2);
+ LUCI_INTERPRETER_CHECK(paddings()->shape().dim(0) == spatial_dims_num);
+ LUCI_INTERPRETER_CHECK(paddings()->shape().dim(1) == 2);
+
+ Shape output_shape = Shape(input()->shape().num_dims());
+ int output_batch_size = input()->shape().dim(0);
+ for (int i = 0; i < spatial_dims_num; ++i)
+ {
+ int final_dim_size =
+ (input()->shape().dim(i + 1) + paddings_data[i * 2] + paddings_data[i * 2 + 1]);
+ LUCI_INTERPRETER_CHECK(final_dim_size % block_shape_data[i] == 0);
+ output_shape.dim(i + 1) = final_dim_size / block_shape_data[i];
+ output_batch_size = output_batch_size * block_shape_data[i];
+ }
+ output_shape.dim(0) = output_batch_size;
+ output_shape.dim(input()->shape().num_dims() - 1) =
+ input()->shape().dim(input()->shape().num_dims() - 1);
+ output()->resize(output_shape);
+}
+
+void SpaceToBatchND::execute() const
+{
+  tflite::SpaceToBatchParams op_params;
+  switch (input()->element_type())
+  {
+    case DataType::FLOAT32:
+ op_params.output_offset = 0;
+ tflite::optimized_ops::SpaceToBatchND(
+ op_params, getTensorShape(input()), getTensorData<float>(input()),
+ getTensorShape(block_shape()), getTensorData<int32_t>(block_shape()),
+ getTensorShape(paddings()), getTensorData<int32_t>(paddings()), getTensorShape(output()),
+ getTensorData<float>(output()));
+ break;
+ case DataType::U8:
+ op_params.output_offset = output()->zero_point();
+ tflite::optimized_ops::SpaceToBatchND(
+ op_params, getTensorShape(input()), getTensorData<uint8_t>(input()),
+ getTensorShape(block_shape()), getTensorData<int32_t>(block_shape()),
+ getTensorShape(paddings()), getTensorData<int32_t>(paddings()), getTensorShape(output()),
+ getTensorData<uint8_t>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
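The output shape computed in configure() above follows the SpaceToBatchND rule: each
padded spatial dimension must divide evenly by its block size, and the batch grows by
the product of the block sizes. A worked instance with the figures from the Simple
test in SpaceToBatchND.test.cpp below:

#include <cassert>
#include <vector>

int main()
{
  const std::vector<int> input{1, 5, 2, 1}; // NHWC
  const std::vector<int> block{3, 2};       // block size per spatial dim
  const std::vector<int> pad{1, 0, 2, 0};   // {before, after} per spatial dim

  std::vector<int> out(4);
  int batch = input[0];
  for (int i = 0; i < 2; ++i)
  {
    const int padded = input[i + 1] + pad[i * 2] + pad[i * 2 + 1]; // 6 and 4
    assert(padded % block[i] == 0); // configure() rejects non-divisible sizes
    out[i + 1] = padded / block[i]; // 2 and 2
    batch *= block[i];
  }
  out[0] = batch;    // 1 * 3 * 2 = 6
  out[3] = input[3]; // the channel dimension is untouched
  assert((out == std::vector<int>{6, 2, 2, 1})); // matches the test's output_shape
}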
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.h b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.h
new file mode 100644
index 000000000..0893003bb
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SPACETOBATCHND_H
+#define LUCI_INTERPRETER_KERNELS_SPACETOBATCHND_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class SpaceToBatchND : public Kernel
+{
+public:
+ SpaceToBatchND(const Tensor *input, const Tensor *block_shape, const Tensor *paddings,
+ Tensor *output);
+
+ const Tensor *input() const { return _inputs[0]; }
+ const Tensor *block_shape() const { return _inputs[1]; }
+ const Tensor *paddings() const { return _inputs[2]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SPACETOBATCHND_H
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp
new file mode 100644
index 000000000..a6ec6f23f
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SpaceToBatchND.test.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SpaceToBatchND.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T>
+void Check(std::initializer_list<int32_t> input_shape,
+ std::initializer_list<int32_t> block_shape_shape,
+ std::initializer_list<int32_t> paddings_shape,
+ std::initializer_list<int32_t> output_shape, std::initializer_list<float> input_data,
+ std::initializer_list<int32_t> block_shape_data,
+ std::initializer_list<int32_t> paddings_data, std::initializer_list<float> output_data)
+{
+ constexpr DataType element_type = getElementType<T>();
+ Tensor input_tensor = makeInputTensor<element_type>(input_shape, input_data);
+ Tensor block_shape_tensor = makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data);
+ Tensor paddings_tensor = makeInputTensor<DataType::S32>(paddings_shape, paddings_data);
+ Tensor output_tensor = makeOutputTensor(element_type);
+
+ SpaceToBatchND kernel(&input_tensor, &block_shape_tensor, &paddings_tensor, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<T>(output_tensor), ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), output_shape);
+}
+
+template <>
+void Check<uint8_t>(
+ std::initializer_list<int32_t> input_shape, std::initializer_list<int32_t> block_shape_shape,
+ std::initializer_list<int32_t> paddings_shape, std::initializer_list<int32_t> output_shape,
+ std::initializer_list<float> input_data, std::initializer_list<int32_t> block_shape_data,
+ std::initializer_list<int32_t> paddings_data, std::initializer_list<float> output_data)
+{
+ std::pair<float, int32_t> input_quant_param =
+ quantizationParams<uint8_t>(std::min(input_data), std::max(input_data));
+ Tensor input_tensor = makeInputTensor<DataType::U8>(input_shape, input_quant_param.first,
+ input_quant_param.second, input_data);
+ Tensor block_shape_tensor = makeInputTensor<DataType::S32>(block_shape_shape, block_shape_data);
+ Tensor paddings_tensor = makeInputTensor<DataType::S32>(paddings_shape, paddings_data);
+ Tensor output_tensor =
+ makeOutputTensor(DataType::U8, input_quant_param.first, input_quant_param.second);
+
+ SpaceToBatchND kernel(&input_tensor, &block_shape_tensor, &paddings_tensor, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(dequantizeTensorData(output_tensor),
+ FloatArrayNear(output_data, output_tensor.scale()));
+ EXPECT_THAT(extractTensorShape(output_tensor), output_shape);
+}
+
+template <typename T> class SpaceToBatchNDTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(SpaceToBatchNDTest, DataTypes);
+
+TYPED_TEST(SpaceToBatchNDTest, Simple)
+{
+ Check<TypeParam>(/*input_shape=*/{1, 5, 2, 1}, /*block_shape_shape=*/{2},
+ /*paddings_shape=*/{2, 2},
+ /*output_shape=*/{6, 2, 2, 1},
+ /*input_data=*/{-1.0, 0.2, -0.3, 0.4, -0.5, 0.6, -0.7, 0.8, -0.9, 1.0},
+ /*block_shape_data=*/{3, 2}, /*paddings_data=*/{1, 0, 2, 0},
+ /*output_data=*/{0, 0, 0, -0.5, 0, 0, 0, 0.6, 0, -1.0, 0, -0.7,
+ 0, 0.2, 0, 0.8, 0, -0.3, 0, -0.9, 0, 0.4, 0, 1.0});
+}
+
+TEST(SpaceToBatchNDTest, Invalid_Shape_NEG)
+{
+ Tensor input_tensor =
+ makeInputTensor<DataType::FLOAT32>({1, 3, 3, 1}, {1, 2, 3, 4, 5, 6, 7, 8, 9});
+ Tensor block_shape_tensor = makeInputTensor<DataType::S32>({2}, {2, 2});
+ Tensor paddings_tensor = makeInputTensor<DataType::S32>({2, 2}, {0, 0, 0, 0});
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+ SpaceToBatchND kernel(&input_tensor, &block_shape_tensor, &paddings_tensor, &output_tensor);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp b/compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp
index 6a5bd7cf8..fc999372a 100644
--- a/compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp
+++ b/compiler/luci-interpreter/src/kernels/SpaceToDepth.cpp
@@ -24,7 +24,7 @@ namespace kernels
{
SpaceToDepth::SpaceToDepth(const Tensor *input, Tensor *output, const SpaceToDepthParams &params)
- : KernelWithParams<SpaceToDepthParams>({input}, {output}, params)
+ : KernelWithParams<SpaceToDepthParams>({input}, {output}, params)
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Split.cpp b/compiler/luci-interpreter/src/kernels/Split.cpp
index 325b1c22f..0da0f3779 100644
--- a/compiler/luci-interpreter/src/kernels/Split.cpp
+++ b/compiler/luci-interpreter/src/kernels/Split.cpp
@@ -26,7 +26,7 @@ namespace kernels
{
Split::Split(const Tensor *axis, const Tensor *input, std::vector<Tensor *> outputs)
- : Kernel({axis, input}, std::move(outputs))
+ : Kernel({axis, input}, std::move(outputs))
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Split.test.cpp b/compiler/luci-interpreter/src/kernels/Split.test.cpp
index 2147d15c1..c558928e8 100644
--- a/compiler/luci-interpreter/src/kernels/Split.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Split.test.cpp
@@ -72,44 +72,48 @@ TYPED_TEST(SplitTest, FourDimensional)
Check<TypeParam>(/*axis=*/0, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
{
- {1, 2, 3, 4, 5, 6, 7, 8}, //
- {9, 10, 11, 12, 13, 14, 15, 16}, //
+ {1, 2, 3, 4, 5, 6, 7, 8}, //
+ {9, 10, 11, 12, 13, 14, 15, 16}, //
});
Check<TypeParam>(
- /*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
- {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {
- {1, 2, 3, 4, 9, 10, 11, 12}, //
- {5, 6, 7, 8, 13, 14, 15, 16}, //
- });
+ /*axis=*/1, /*num_splits=*/2, {2, 2, 2, 2}, {2, 1, 2, 2},
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+ {
+ {1, 2, 3, 4, 9, 10, 11, 12}, //
+ {5, 6, 7, 8, 13, 14, 15, 16}, //
+ });
Check<TypeParam>(
- /*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
- {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {
- {1, 2, 5, 6, 9, 10, 13, 14}, //
- {3, 4, 7, 8, 11, 12, 15, 16}, //
- });
+ /*axis=*/2, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 1, 2},
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+ {
+ {1, 2, 5, 6, 9, 10, 13, 14}, //
+ {3, 4, 7, 8, 11, 12, 15, 16}, //
+ });
Check<TypeParam>(
- /*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
- {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {
- {1, 3, 5, 7, 9, 11, 13, 15}, //
- {2, 4, 6, 8, 10, 12, 14, 16}, //
- });
+ /*axis=*/3, /*num_splits=*/2, {2, 2, 2, 2}, {2, 2, 2, 1},
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+ {
+ {1, 3, 5, 7, 9, 11, 13, 15}, //
+ {2, 4, 6, 8, 10, 12, 14, 16}, //
+ });
}
TYPED_TEST(SplitTest, OneDimensional)
{
Check<TypeParam>(
- /*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
- {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+ /*axis=*/0, /*num_splits=*/8, {8}, {1}, {1, 2, 3, 4, 5, 6, 7, 8},
+ {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
}
TYPED_TEST(SplitTest, NegativeAxis)
{
Check<TypeParam>(
- /*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
- {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {
- {1, 2, 3, 4, 5, 6, 7, 8}, //
- {9, 10, 11, 12, 13, 14, 15, 16},
- });
+ /*axis=*/-4, /*num_splits=*/2, {2, 2, 2, 2}, {1, 2, 2, 2},
+ {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+ {
+ {1, 2, 3, 4, 5, 6, 7, 8}, //
+ {9, 10, 11, 12, 13, 14, 15, 16},
+ });
}
} // namespace
diff --git a/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp b/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
index 504db4493..e40a91e97 100644
--- a/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Sqrt.test.cpp
@@ -43,17 +43,17 @@ void Check(std::initializer_list<int32_t> input_shape, std::initializer_list<int
TEST(SqrtTest, SimpleSqrt)
{
Check(
- /*input_shape=*/{1, 2, 4, 1}, /*output_shape=*/{1, 2, 4, 1},
- /*input_data=*/
- {
- 0, 8, 2, 4, //
- 3, 7, 10, 0.3, //
- },
- /*output_data=*/
- {
- 0.0, 2.8284271, 1.4142136, 2, //
- 1.7320508, 2.6457513, 3.1622777, 0.54772256, //
- });
+ /*input_shape=*/{1, 2, 4, 1}, /*output_shape=*/{1, 2, 4, 1},
+ /*input_data=*/
+ {
+ 0, 8, 2, 4, //
+ 3, 7, 10, 0.3, //
+ },
+ /*output_data=*/
+ {
+ 0.0, 2.8284271, 1.4142136, 2, //
+ 1.7320508, 2.6457513, 3.1622777, 0.54772256, //
+ });
}
TEST(SqrtTest, Input_Output_Type_NEG)
diff --git a/compiler/luci-interpreter/src/kernels/SquaredDifference.cpp b/compiler/luci-interpreter/src/kernels/SquaredDifference.cpp
new file mode 100644
index 000000000..3bafeba4a
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SquaredDifference.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SquaredDifference.h"
+
+#include "kernels/Utils.h"
+
+#include "kernels/BinaryOpCommon.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+SquaredDifference::SquaredDifference(const Tensor *input1, const Tensor *input2, Tensor *output)
+ : Kernel({input1, input2}, {output})
+{
+}
+
+void SquaredDifference::configure()
+{
+ LUCI_INTERPRETER_CHECK(input1()->element_type() == input2()->element_type())
+ LUCI_INTERPRETER_CHECK(input1()->element_type() == output()->element_type())
+ output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
+}
+
+void SquaredDifference::execute() const
+{
+ switch (input1()->element_type())
+ {
+ case DataType::FLOAT32:
+ evalSquaredDifference<float>();
+ break;
+ default:
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+template <typename T> inline void SquaredDifference::evalSquaredDifference() const
+{
+ BinaryOpBroadcastSlow(getTensorShape(input1()), getTensorData<T>(input1()),
+ getTensorShape(input2()), getTensorData<T>(input2()),
+ getTensorShape(output()), getTensorData<T>(output()), [](T x, T y) {
+ const T difference = x - y;
+ return difference * difference;
+ });
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
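The kernel's math is exactly the lambda handed to BinaryOpBroadcastSlow. A dependency-free spot-check (illustration only; the values are taken from the Float test in the new test file below):

#include <cstdio>
#include <vector>

int main()
{
  // Inputs from the SquaredDifferenceTest.Float case.
  std::vector<float> x{1.0f, 0.0f, -1.0f, 11.0f, -2.0f, -1.44f};
  std::vector<float> y{-1.0f, 0.0f, 1.0f, 12.0f, -3.0f, -1.43f};
  for (size_t i = 0; i < x.size(); ++i)
  {
    const float difference = x[i] - y[i];        // lambda body from evalSquaredDifference
    std::printf("%g ", difference * difference); // 4 0 4 1 1 0.0001
  }
  std::printf("\n");
  return 0;
}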
diff --git a/compiler/luci-interpreter/src/kernels/SquaredDifference.h b/compiler/luci-interpreter/src/kernels/SquaredDifference.h
new file mode 100644
index 000000000..9327caf93
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SquaredDifference.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SQUAREDDIFFERENCE_H
+#define LUCI_INTERPRETER_KERNELS_SQUAREDDIFFERENCE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class SquaredDifference : public Kernel
+{
+public:
+ SquaredDifference(const Tensor *input1, const Tensor *input2, Tensor *output);
+
+ const Tensor *input1() const { return _inputs[0]; }
+ const Tensor *input2() const { return _inputs[1]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+
+private:
+ template <typename T> inline void evalSquaredDifference() const;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SQUAREDDIFFERENCE_H
diff --git a/compiler/luci-interpreter/src/kernels/SquaredDifference.test.cpp b/compiler/luci-interpreter/src/kernels/SquaredDifference.test.cpp
new file mode 100644
index 000000000..a72eaadfa
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/SquaredDifference.test.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/SquaredDifference.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+TEST(SquaredDifferenceTest, Float)
+{
+ Shape input_shape{3, 1, 2};
+ std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+ std::vector<float> input_data2{-1.0, 0.0, 1.0, 12.0, -3.0, -1.43};
+ Tensor input_tensor1 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data1);
+ Tensor input_tensor2 = makeInputTensor<DataType::FLOAT32>(input_shape, input_data2);
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+ SquaredDifference kernel(&input_tensor1, &input_tensor2, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ std::vector<float> ref_output_data{4.0, 0.0, 4.0, 1.0, 1.0, 0.0001};
+ EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+TEST(SquaredDifferenceTest, FloatBroadcast)
+{
+ Shape input_shape1{3, 1, 2};
+ Shape input_shape2{1};
+ std::vector<float> input_data1{1.0, 0.0, -1.0, 11.0, -2.0, -1.44};
+ std::vector<float> input_data2{1.0};
+ Tensor input_tensor1 = makeInputTensor<DataType::FLOAT32>(input_shape1, input_data1);
+ Tensor input_tensor2 = makeInputTensor<DataType::FLOAT32>(input_shape2, input_data2);
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+ SquaredDifference kernel(&input_tensor1, &input_tensor2, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ std::vector<float> ref_output_data{0.0, 1.0, 4.0, 100.0, 9.0, 5.9536};
+ EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Squeeze.cpp b/compiler/luci-interpreter/src/kernels/Squeeze.cpp
index ce43ef789..4a75518c7 100644
--- a/compiler/luci-interpreter/src/kernels/Squeeze.cpp
+++ b/compiler/luci-interpreter/src/kernels/Squeeze.cpp
@@ -27,7 +27,7 @@ namespace kernels
{
Squeeze::Squeeze(const Tensor *input, Tensor *output, const SqueezeParams &params)
- : KernelWithParams<SqueezeParams>({input}, {output}, params)
+ : KernelWithParams<SqueezeParams>({input}, {output}, params)
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp b/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
index ff9fb09d2..1c81893b9 100644
--- a/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Squeeze.test.cpp
@@ -56,12 +56,12 @@ TYPED_TEST_CASE(SqueezeTest, DataTypes);
TYPED_TEST(SqueezeTest, TotalTest)
{
Check<TypeParam>(
- /*input_shape=*/{1, 24, 1}, /*output_shape=*/{24},
- /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
- 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},
- /*output_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
- 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},
- {-1, 0});
+ /*input_shape=*/{1, 24, 1}, /*output_shape=*/{24},
+ /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},
+ /*output_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24},
+ {-1, 0});
}
} // namespace
diff --git a/compiler/luci-interpreter/src/kernels/StridedSlice.cpp b/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
index 679485439..37b0dd8c5 100644
--- a/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
+++ b/compiler/luci-interpreter/src/kernels/StridedSlice.cpp
@@ -31,7 +31,7 @@ namespace kernels
StridedSlice::StridedSlice(const Tensor *input, const Tensor *begin, const Tensor *end,
const Tensor *strides, Tensor *output, const StridedSliceParams &params)
- : KernelWithParams<StridedSliceParams>({input, begin, end, strides}, {output}, params)
+ : KernelWithParams<StridedSliceParams>({input, begin, end, strides}, {output}, params)
{
}
@@ -82,7 +82,7 @@ void StridedSlice::configure()
assert(stride != 0);
int32_t begin = ::tflite::strided_slice::StartForAxis(op_params, getTensorShape(input()), idx);
int32_t end =
- ::tflite::strided_slice::StopForAxis(op_params, getTensorShape(input()), idx, begin);
+ ::tflite::strided_slice::StopForAxis(op_params, getTensorShape(input()), idx, begin);
const bool shrink_axis = params().shrink_axis_mask & (1 << idx);
if (shrink_axis)
diff --git a/compiler/luci-interpreter/src/kernels/Sub.cpp b/compiler/luci-interpreter/src/kernels/Sub.cpp
index dd9c1102f..3c7588d62 100644
--- a/compiler/luci-interpreter/src/kernels/Sub.cpp
+++ b/compiler/luci-interpreter/src/kernels/Sub.cpp
@@ -28,7 +28,7 @@ namespace kernels
{
Sub::Sub(const Tensor *input1, const Tensor *input2, Tensor *output, const SubParams &params)
- : KernelWithParams<SubParams>({input1, input2}, {output}, params)
+ : KernelWithParams<SubParams>({input1, input2}, {output}, params)
{
}
@@ -64,13 +64,13 @@ void Sub::evalFloat() const
params.float_activation_max = activation_max;
const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), &params);
+ getTensorShape(input1()), getTensorShape(input2()), &params);
if (need_broadcast)
{
tflite::reference_ops::BroadcastSubSlow(
- params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
- getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
+ params, getTensorShape(input1()), getTensorData<float>(input1()), getTensorShape(input2()),
+ getTensorData<float>(input2()), getTensorShape(output()), getTensorData<float>(output()));
}
else
{
@@ -118,14 +118,13 @@ void Sub::evalQuantized() const
params.quantized_activation_max = activation_max;
const bool need_broadcast = tflite::reference_ops::ProcessBroadcastShapes(
- getTensorShape(input1()), getTensorShape(input2()), &params);
+ getTensorShape(input1()), getTensorShape(input2()), &params);
if (need_broadcast)
{
tflite::reference_ops::BroadcastSubSlow(
- params, getTensorShape(input1()), getTensorData<uint8_t>(input1()),
- getTensorShape(input2()), getTensorData<uint8_t>(input2()), getTensorShape(output()),
- getTensorData<uint8_t>(output()));
+ params, getTensorShape(input1()), getTensorData<uint8_t>(input1()), getTensorShape(input2()),
+ getTensorData<uint8_t>(input2()), getTensorShape(output()), getTensorData<uint8_t>(output()));
}
else
{
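ProcessBroadcastShapes decides between the fast same-shape path and BroadcastSubSlow. A much-simplified 1-D flavor of that decision (hypothetical helper, not the TFLite API, which compares full 4-D shapes):

#include <algorithm>
#include <cstdio>
#include <vector>

// A size-1 operand is stretched across the other; equal sizes take the
// plain element-wise path.
std::vector<float> subRef(const std::vector<float> &a, const std::vector<float> &b)
{
  const bool need_broadcast = a.size() != b.size();
  std::vector<float> out(std::max(a.size(), b.size()));
  for (size_t i = 0; i < out.size(); ++i)
  {
    const float x = a[(need_broadcast && a.size() == 1) ? 0 : i];
    const float y = b[(need_broadcast && b.size() == 1) ? 0 : i];
    out[i] = x - y;
  }
  return out;
}

int main()
{
  for (float v : subRef({5, 6, 7}, {1})) // {1} broadcasts over {5, 6, 7}
    std::printf("%g ", v);               // 4 5 6
  std::printf("\n");
  return 0;
}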
diff --git a/compiler/luci-interpreter/src/kernels/Sub.test.cpp b/compiler/luci-interpreter/src/kernels/Sub.test.cpp
index 9f77fe7e0..f560ceb36 100644
--- a/compiler/luci-interpreter/src/kernels/Sub.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Sub.test.cpp
@@ -49,25 +49,25 @@ TEST(SubTest, Uint8)
vector<float> test_data = {0.2f, 0.3f, -0.4f, 0.5f, 1.0f, 0.9f};
vector<vector<int32_t>> output_shapes = {{2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
vector<vector<float>> output_data = {
- {-0.5f, 2.0f, 0.1f, 1.8f, -1.3f, 1.4f, 0.7f, 0.2f, 1.3f, 0.0f, -0.1f, -0.4f,
- 0.6f, -1.4f, 1.2f, -1.6f, -0.2f, -2.0f, 1.0f, 2.5f, 1.6f, 2.3f, 0.2f, 1.9f,
- -1.8f, -0.3f, -1.2f, -0.5f, -2.6f, -0.9f, 0.5f, -2.5f, 1.1f, -2.7f, -0.3f, -3.0f},
- {-0.5f, 2.0f, 1.3f, 0.0f, -0.2f, -2.0f, 1.0f, 2.5f, -1.2f, -0.5f, -0.3f, -3.0f},
- {-0.5f, 2.1f, -0.6f, 2.0f, 0.1f, 2.7f, 0.7f, 0.3f, 0.6f, 0.2f, 1.3f, 0.9f,
- 0.6f, -1.3f, 0.5f, -1.4f, 1.2f, -0.7f, 0.7f, 2.3f, 0.2f, 1.8f, 0.3f, 1.9f,
- -2.1f, -0.5f, -2.6f, -1.0f, -2.5f, -0.9f, 0.2f, -2.7f, -0.3f, -3.0f, -0.2f, -3.0f},
- {-0.5f, 2.1f, 0.6f, 0.2f, 1.2f, -0.7f, 0.7f, 2.3f, -2.6f, -1.0f, -0.2f, -3.0f}};
+ {-0.5f, 2.0f, 0.1f, 1.8f, -1.3f, 1.4f, 0.7f, 0.2f, 1.3f, 0.0f, -0.1f, -0.4f,
+ 0.6f, -1.4f, 1.2f, -1.6f, -0.2f, -2.0f, 1.0f, 2.5f, 1.6f, 2.3f, 0.2f, 1.9f,
+ -1.8f, -0.3f, -1.2f, -0.5f, -2.6f, -0.9f, 0.5f, -2.5f, 1.1f, -2.7f, -0.3f, -3.0f},
+ {-0.5f, 2.0f, 1.3f, 0.0f, -0.2f, -2.0f, 1.0f, 2.5f, -1.2f, -0.5f, -0.3f, -3.0f},
+ {-0.5f, 2.1f, -0.6f, 2.0f, 0.1f, 2.7f, 0.7f, 0.3f, 0.6f, 0.2f, 1.3f, 0.9f,
+ 0.6f, -1.3f, 0.5f, -1.4f, 1.2f, -0.7f, 0.7f, 2.3f, 0.2f, 1.8f, 0.3f, 1.9f,
+ -2.1f, -0.5f, -2.6f, -1.0f, -2.5f, -0.9f, 0.2f, -2.7f, -0.3f, -3.0f, -0.2f, -3.0f},
+ {-0.5f, 2.1f, 0.6f, 0.2f, 1.2f, -0.7f, 0.7f, 2.3f, -2.6f, -1.0f, -0.2f, -3.0f}};
float kQuantizedTolerance = GetTolerance(-3.f, 3.f);
pair<float, int32_t> quant_param = quantizationParams<uint8_t>(-3.f, 3.f);
for (size_t i = 0; i < output_data.size(); ++i)
{
Tensor input1_tensor =
- makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+ makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
Tensor input2_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
quant_param.second, test_data);
Tensor output_tensor =
- makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+ makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
SubParams params{};
params.activation = Activation::NONE;
@@ -93,9 +93,9 @@ TEST(SubTest, Uint8)
Tensor input1_tensor = makeInputTensor<DataType::U8>(test_shapes[i], quant_param.first,
quant_param.second, test_data);
Tensor input2_tensor =
- makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
+ makeInputTensor<DataType::U8>(base_shape, quant_param.first, quant_param.second, base_data);
Tensor output_tensor =
- makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
+ makeOutputTensor(getElementType<uint8_t>(), quant_param.first, quant_param.second);
SubParams params{};
params.activation = Activation::NONE;
@@ -116,14 +116,14 @@ TEST(SubTest, Float)
vector<Shape> test_shapes{{1, 1, 3, 2}, {1, 3, 1, 2}, {2, 1, 3, 1}, {2, 3, 1, 1}};
vector<vector<int32_t>> output_shapes{{2, 3, 3, 2}, {2, 3, 1, 2}, {2, 3, 3, 2}, {2, 3, 1, 2}};
vector<vector<float>> test_outputs = {
- {0.0f, 2.0f, 0.1f, 1.8f, 0.0f, 1.4f, 0.7f, 0.2f, 1.3f, 0.0f, 0.0f, 0.0f,
- 0.6f, 0.0f, 1.2f, 0.0f, 0.0f, 0.0f, 1.0f, 2.5f, 1.6f, 2.3f, 0.2f, 1.9f,
- 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.5f, 0.0f, 1.1f, 0.0f, 0.0f, 0.0f},
- {0.0f, 2.0f, 1.3f, 0.0f, 0.0f, 0.0f, 1.0f, 2.5f, 0.0f, 0.0f, 0.0f, 0.0f},
- {0.0f, 2.1f, 0.0f, 2.0f, 0.1f, 2.7f, 0.7f, 0.3f, 0.6f, 0.2f, 1.3f, 0.9f,
- 0.6f, 0.0f, 0.5f, 0.0f, 1.2f, 0.0f, 0.7f, 2.3f, 0.2f, 1.8f, 0.3f, 1.9f,
- 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.2f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
- {0.0f, 2.1f, 0.6f, 0.2f, 1.2f, 0.0f, 0.7f, 2.3f, 0.0f, 0.0f, 0.0f, 0.0f}};
+ {0.0f, 2.0f, 0.1f, 1.8f, 0.0f, 1.4f, 0.7f, 0.2f, 1.3f, 0.0f, 0.0f, 0.0f,
+ 0.6f, 0.0f, 1.2f, 0.0f, 0.0f, 0.0f, 1.0f, 2.5f, 1.6f, 2.3f, 0.2f, 1.9f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.5f, 0.0f, 1.1f, 0.0f, 0.0f, 0.0f},
+ {0.0f, 2.0f, 1.3f, 0.0f, 0.0f, 0.0f, 1.0f, 2.5f, 0.0f, 0.0f, 0.0f, 0.0f},
+ {0.0f, 2.1f, 0.0f, 2.0f, 0.1f, 2.7f, 0.7f, 0.3f, 0.6f, 0.2f, 1.3f, 0.9f,
+ 0.6f, 0.0f, 0.5f, 0.0f, 1.2f, 0.0f, 0.7f, 2.3f, 0.2f, 1.8f, 0.3f, 1.9f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.2f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f},
+ {0.0f, 2.1f, 0.6f, 0.2f, 1.2f, 0.0f, 0.7f, 2.3f, 0.0f, 0.0f, 0.0f, 0.0f}};
vector<float> input1_data{-0.3f, 2.3f, 0.9f, 0.5f, 0.8f, -1.1f,
1.2f, 2.8f, -1.6f, 0.0f, 0.7f, -2.2f};
@@ -142,7 +142,7 @@ TEST(SubTest, Float)
kernel.execute();
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(test_outputs[i], 0.0001f))
- << "With shape number " << i;
+ << "With shape number " << i;
EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shapes[i]));
}
diff --git a/compiler/luci-interpreter/src/kernels/Tanh.test.cpp b/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
index 17b50f259..ef727d6eb 100644
--- a/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Tanh.test.cpp
@@ -31,8 +31,8 @@ TEST(TanhTest, Float)
{
Shape input_shape{1, 2, 4, 1};
std::vector<float> input_data{
- 0, -6, 2, 4, //
- 3, -2, 10, 1, //
+ 0, -6, 2, 4, //
+ 3, -2, 10, 1, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -42,8 +42,8 @@ TEST(TanhTest, Float)
kernel.execute();
std::vector<float> ref_output_data{
- 0, -0.9999877, 0.9640275, 0.999329, //
- 0.99505475, -0.9640275, 1, 0.7615941, //
+ 0, -0.9999877, 0.9640275, 0.999329, //
+ 0.99505475, -0.9640275, 1, 0.7615941, //
};
EXPECT_THAT(extractTensorData<float>(output_tensor), FloatArrayNear(ref_output_data));
}
@@ -56,41 +56,41 @@ TEST(TanhTest, Uint8)
std::pair<float, int32_t> input_quant_param = quantizationParams<uint8_t>(8 * kMin, 8 * kMax);
std::pair<float, int32_t> output_quant_param = quantizationParams<uint8_t>(kMin, kMax);
std::vector<float> input_data{
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
};
Tensor input_tensor = makeInputTensor<DataType::U8>({2, 6, 4, 1}, input_quant_param.first,
input_quant_param.second, input_data);
Tensor output_tensor =
- makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
+ makeOutputTensor(DataType::U8, output_quant_param.first, output_quant_param.second);
Tanh kernel(&input_tensor, &output_tensor);
kernel.configure();
kernel.execute();
std::vector<float> ref_output_data{
- 0.0, -0.999987, 0.964027, 0.999329, //
- -0.999329, -0.96402, 0.99999, 0.76159, //
- 0.0, -0.999987, 0.964027, 0.999329, //
- -0.999329, -0.96402, 0.99999, 0.76159, //
- 0.0, -0.999987, 0.964027, 0.999329, //
- -0.999329, -0.96402, 0.99999, 0.76159, //
- 0.0, -0.999987, 0.964027, 0.999329, //
- -0.999329, -0.96402, 0.99999, 0.76159, //
- 0.0, -0.999987, 0.964027, 0.999329, //
- -0.999329, -0.96402, 0.99999, 0.76159, //
- 0.0, -0.999987, 0.964027, 0.999329, //
- -0.999329, -0.96402, 0.99999, 0.76159, //
+ 0.0, -0.999987, 0.964027, 0.999329, //
+ -0.999329, -0.96402, 0.99999, 0.76159, //
+ 0.0, -0.999987, 0.964027, 0.999329, //
+ -0.999329, -0.96402, 0.99999, 0.76159, //
+ 0.0, -0.999987, 0.964027, 0.999329, //
+ -0.999329, -0.96402, 0.99999, 0.76159, //
+ 0.0, -0.999987, 0.964027, 0.999329, //
+ -0.999329, -0.96402, 0.99999, 0.76159, //
+ 0.0, -0.999987, 0.964027, 0.999329, //
+ -0.999329, -0.96402, 0.99999, 0.76159, //
+ 0.0, -0.999987, 0.964027, 0.999329, //
+ -0.999329, -0.96402, 0.99999, 0.76159, //
};
std::vector<int32_t> ref_output_shape{2, 6, 4, 1};
EXPECT_THAT(dequantizeTensorData(output_tensor), FloatArrayNear(ref_output_data, kTanhTolerance));
@@ -100,18 +100,18 @@ TEST(TanhTest, Uint8)
TEST(TanhTest, InputTypeInvalid_NEG)
{
std::vector<int64_t> input_data{
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
};
Tensor input_tensor = makeInputTensor<DataType::S64>({2, 6, 4, 1}, input_data);
Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
@@ -123,18 +123,18 @@ TEST(TanhTest, InputTypeInvalid_NEG)
TEST(TanhTest, InputOutputMismatch_NEG)
{
std::vector<float> input_data{
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
- 0, -6, 2, 4, //
- -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
+ 0, -6, 2, 4, //
+ -4, -2, 8, 1, //
};
Tensor input_tensor = makeInputTensor<DataType::FLOAT32>({2, 6, 4, 1}, input_data);
Tensor output_tensor = makeOutputTensor(DataType::U8);
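For reference, the Float expectations in this test file are plain std::tanh of the inputs; a quick standalone check (illustrative only):

#include <cmath>
#include <cstdio>

int main()
{
  // One row pair of TanhTest.Float inputs.
  const float in[] = {0, -6, 2, 4, 3, -2, 10, 1};
  for (float v : in)
    std::printf("%.7f ", std::tanh(v));
  // 0.0000000 -0.9999877 0.9640276 0.9993293 0.9950547 -0.9640276 1.0000000 0.7615941
  std::printf("\n");
  return 0;
}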
diff --git a/compiler/luci-interpreter/src/kernels/TestUtils.cpp b/compiler/luci-interpreter/src/kernels/TestUtils.cpp
index c3c0b5a7d..831dc4247 100644
--- a/compiler/luci-interpreter/src/kernels/TestUtils.cpp
+++ b/compiler/luci-interpreter/src/kernels/TestUtils.cpp
@@ -84,7 +84,7 @@ std::vector<float> dequantizeTensorData(const Tensor &tensor)
float scale = tensor.scales()[channel];
size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel);
std::vector<float> part_dequantized_data =
- dequantize(data.data() + offset, inner_dims_size, scale, 0);
+ dequantize(data.data() + offset, inner_dims_size, scale, 0);
dequantized_data.insert(dequantized_data.end(), part_dequantized_data.begin(),
part_dequantized_data.end());
}
diff --git a/compiler/luci-interpreter/src/kernels/TestUtils.h b/compiler/luci-interpreter/src/kernels/TestUtils.h
index 1f17e39e1..c4c73d546 100644
--- a/compiler/luci-interpreter/src/kernels/TestUtils.h
+++ b/compiler/luci-interpreter/src/kernels/TestUtils.h
@@ -59,7 +59,7 @@ Tensor makeInputTensor(const Shape &shape, float scale, int32_t zero_point,
using NativeT = typename DataTypeImpl<DT>::Type;
Tensor tensor(DT, shape, {{scale}, {zero_point}}, "");
std::vector<NativeT> quantized_data =
- quantize<NativeT>(data.data(), data.size(), scale, zero_point);
+ quantize<NativeT>(data.data(), data.size(), scale, zero_point);
tensor.writeData(quantized_data.data(), quantized_data.size() * sizeof(NativeT));
return tensor;
}
@@ -108,7 +108,7 @@ Tensor makeInputTensor(const Shape &shape, const std::vector<float> &scales,
float scale = scales[channel];
size_t offset = inner_dims_size * (quant_dim_size * outer_it + channel);
std::vector<NativeT> part_quantized_data =
- quantize<NativeT>(data.data() + offset, inner_dims_size, scale, zero_point);
+ quantize<NativeT>(data.data() + offset, inner_dims_size, scale, zero_point);
quantized_data.insert(quantized_data.end(), part_quantized_data.begin(),
part_quantized_data.end());
}
@@ -172,7 +172,7 @@ std::vector<T> quantize(const float *data, size_t num_elements, float scale, int
{
const auto &f = data[i];
q.push_back(static_cast<T>(
- std::max<float>(q_min, std::min<float>(q_max, std::round(zero_point + (f / scale))))));
+ std::max<float>(q_min, std::min<float>(q_max, std::round(zero_point + (f / scale))))));
}
return q;
}
@@ -233,8 +233,8 @@ template <typename T> std::pair<float, int32_t> quantizationParams(float f_min,
const float zero_point_from_max_error = std::abs(qmax_double) + std::abs(f_max / scale);
const float zero_point_double = zero_point_from_min_error < zero_point_from_max_error
- ? zero_point_from_min
- : zero_point_from_max;
+ ? zero_point_from_min
+ : zero_point_from_max;
// Now we need to nudge the zero point to be an integer
// (our zero points are integer, and this is motivated by the requirement
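quantize() and quantizationParams() above implement the usual affine scheme: the scale maps [f_min, f_max] onto the integer range, and the zero point is the nudged integer image of real 0. A hedged sketch with illustrative names (simplified: the real helper weighs both zero-point candidates, as the hunk above shows):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <utility>

std::pair<float, int32_t> simpleQuantParams(float f_min, float f_max)
{
  const float q_min = 0.0f, q_max = 255.0f; // uint8
  const float scale = (f_max - f_min) / (q_max - q_min);
  float zero_point = q_min - f_min / scale;                              // candidate from the min end
  zero_point = std::max(q_min, std::min(q_max, std::round(zero_point))); // nudge to an integer
  return {scale, static_cast<int32_t>(zero_point)};
}

int main()
{
  auto qp = simpleQuantParams(-3.f, 3.f);                       // range used by the Sub test
  std::printf("scale=%f zero_point=%d\n", qp.first, qp.second); // scale=0.023529 zero_point=128
  // quantize, as in TestUtils: clamp(round(zp + f / scale))
  for (float f : {-2.f, 0.f, 2.f})
    std::printf("%d ", (int)std::max(0.f, std::min(255.f, std::round(qp.second + f / qp.first))));
  // 43 128 213
  return 0;
}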
diff --git a/compiler/luci-interpreter/src/kernels/Transpose.cpp b/compiler/luci-interpreter/src/kernels/Transpose.cpp
index 8265d9937..c1a11cdb0 100644
--- a/compiler/luci-interpreter/src/kernels/Transpose.cpp
+++ b/compiler/luci-interpreter/src/kernels/Transpose.cpp
@@ -29,7 +29,7 @@ namespace kernels
{
Transpose::Transpose(const Tensor *input, const Tensor *perm, Tensor *output)
- : Kernel({input, perm}, {output})
+ : Kernel({input, perm}, {output})
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Transpose.test.cpp b/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
index 1c99223a8..f0a915c35 100644
--- a/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Transpose.test.cpp
@@ -63,46 +63,47 @@ TYPED_TEST(TransposeTest, Small3D)
TYPED_TEST(TransposeTest, Large4D)
{
Check<TypeParam>(
- /*input_shape=*/{2, 3, 4, 5}, /*perm_shape=*/{4}, /*output_shape=*/{4, 2, 3, 5},
- /*input_data=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
- 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
- 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
- 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
- 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
- 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
- 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119},
- /*perm_data=*/{2, 0, 1, 3},
- /*output_data=*/{0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 40, 41, 42, 43, 44,
- 60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
- 5, 6, 7, 8, 9, 25, 26, 27, 28, 29, 45, 46, 47, 48, 49,
- 65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
- 10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50, 51, 52, 53, 54,
- 70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
- 15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55, 56, 57, 58, 59,
- 75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119});
+ /*input_shape=*/{2, 3, 4, 5}, /*perm_shape=*/{4}, /*output_shape=*/{4, 2, 3, 5},
+ /*input_data=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119},
+ /*perm_data=*/{2, 0, 1, 3},
+ /*output_data=*/{0, 1, 2, 3, 4, 20, 21, 22, 23, 24, 40, 41, 42, 43, 44,
+ 60, 61, 62, 63, 64, 80, 81, 82, 83, 84, 100, 101, 102, 103, 104,
+ 5, 6, 7, 8, 9, 25, 26, 27, 28, 29, 45, 46, 47, 48, 49,
+ 65, 66, 67, 68, 69, 85, 86, 87, 88, 89, 105, 106, 107, 108, 109,
+ 10, 11, 12, 13, 14, 30, 31, 32, 33, 34, 50, 51, 52, 53, 54,
+ 70, 71, 72, 73, 74, 90, 91, 92, 93, 94, 110, 111, 112, 113, 114,
+ 15, 16, 17, 18, 19, 35, 36, 37, 38, 39, 55, 56, 57, 58, 59,
+ 75, 76, 77, 78, 79, 95, 96, 97, 98, 99, 115, 116, 117, 118, 119});
}
TYPED_TEST(TransposeTest, Large2D)
{
Check<TypeParam>(
- /*input_shape=*/{10, 12}, /*perm_shape=*/{2}, /*output_shape=*/{12, 10},
- /*input_data=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
- 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
- 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
- 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
- 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
- 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
- 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
- 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119},
- /*perm_data=*/{1, 0},
- /*output_data=*/{
- 0, 12, 24, 36, 48, 60, 72, 84, 96, 108, 1, 13, 25, 37, 49, 61, 73, 85, 97, 109,
- 2, 14, 26, 38, 50, 62, 74, 86, 98, 110, 3, 15, 27, 39, 51, 63, 75, 87, 99, 111,
- 4, 16, 28, 40, 52, 64, 76, 88, 100, 112, 5, 17, 29, 41, 53, 65, 77, 89, 101, 113,
- 6, 18, 30, 42, 54, 66, 78, 90, 102, 114, 7, 19, 31, 43, 55, 67, 79, 91, 103, 115,
- 8, 20, 32, 44, 56, 68, 80, 92, 104, 116, 9, 21, 33, 45, 57, 69, 81, 93, 105, 117,
- 10, 22, 34, 46, 58, 70, 82, 94, 106, 118, 11, 23, 35, 47, 59, 71, 83, 95, 107, 119});
+ /*input_shape=*/{10, 12}, /*perm_shape=*/{2}, /*output_shape=*/{12, 10},
+ /*input_data=*/{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119},
+ /*perm_data=*/{1, 0},
+ /*output_data=*/{0, 12, 24, 36, 48, 60, 72, 84, 96, 108, 1, 13, 25, 37, 49,
+ 61, 73, 85, 97, 109, 2, 14, 26, 38, 50, 62, 74, 86, 98, 110,
+ 3, 15, 27, 39, 51, 63, 75, 87, 99, 111, 4, 16, 28, 40, 52,
+ 64, 76, 88, 100, 112, 5, 17, 29, 41, 53, 65, 77, 89, 101, 113,
+ 6, 18, 30, 42, 54, 66, 78, 90, 102, 114, 7, 19, 31, 43, 55,
+ 67, 79, 91, 103, 115, 8, 20, 32, 44, 56, 68, 80, 92, 104, 116,
+ 9, 21, 33, 45, 57, 69, 81, 93, 105, 117, 10, 22, 34, 46, 58,
+ 70, 82, 94, 106, 118, 11, 23, 35, 47, 59, 71, 83, 95, 107, 119});
}
} // namespace
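The Large2D expectations above are an ordinary matrix transpose of the row-major input; a standalone spot-check (illustration only):

#include <cstdio>

int main()
{
  const int R = 10, C = 12; // input shape {10, 12}, perm {1, 0}
  int in[R * C], out[C * R];
  for (int i = 0; i < R * C; ++i)
    in[i] = i;
  for (int r = 0; r < R; ++r)
    for (int c = 0; c < C; ++c)
      out[c * R + r] = in[r * C + c]; // output[c][r] = input[r][c]
  std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 0 12 24 36
  return 0;
}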
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
index 491ae51ae..0c70756b2 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.cpp
@@ -31,7 +31,7 @@ namespace kernels
TransposeConv::TransposeConv(const Tensor *output_shape, const Tensor *filter, const Tensor *input,
const Tensor *bias, Tensor *output, const TransposeConvParams &params)
- : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias}, {output}, params)
+ : KernelWithParams<TransposeConvParams>({output_shape, filter, input, bias}, {output}, params)
{
}
@@ -63,23 +63,23 @@ void TransposeConv::configure()
const int32_t output_width = out_shape.dim(2);
const int32_t unused_output_height =
- computeOutputSize(params().padding, output_height, filter_height, params().stride_height, 1);
+ computeOutputSize(params().padding, output_height, filter_height, params().stride_height, 1);
const int32_t unused_output_width =
- computeOutputSize(params().padding, output_width, filter_width, params().stride_width, 1);
+ computeOutputSize(params().padding, output_width, filter_width, params().stride_width, 1);
_padding_height =
- computePadding(params().stride_height, 1, output_height, filter_height, unused_output_height);
+ computePadding(params().stride_height, 1, output_height, filter_height, unused_output_height);
_padding_width =
- computePadding(params().stride_width, 1, output_width, filter_width, unused_output_width);
+ computePadding(params().stride_width, 1, output_width, filter_width, unused_output_width);
if (input()->element_type() == DataType::U8 || input()->element_type() == DataType::S16)
{
DataType scratch_data_type =
- input()->element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
+ input()->element_type() == DataType::S16 ? DataType::S64 : DataType::S32;
_scratch_tensor =
- std::make_unique<Tensor>(scratch_data_type, output()->shape(), AffineQuantization{}, "");
+ std::make_unique<Tensor>(scratch_data_type, output()->shape(), AffineQuantization{}, "");
const std::vector<double> real_multipliers =
- getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
+ getQuantizedConvolutionMultiplers(input()->scale(), filter()->scales(), output()->scale());
_quant_multipliers = quantizeMultipliers(real_multipliers);
}
@@ -210,12 +210,12 @@ void TransposeConv::evalQuantizedPerChannel() const
for (int32_t out_c = 0; out_c < output_depth; ++out_c)
{
const uint8_t input_val =
- input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+ input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
const uint8_t filter_val =
- filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
+ filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
- static_cast<int32_t>(input_val - input()->zero_point()) *
- static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
+ static_cast<int32_t>(input_val - input()->zero_point()) *
+ static_cast<int32_t>(filter_val - filter()->zero_points()[out_c]);
}
}
}
@@ -236,7 +236,7 @@ void TransposeConv::evalQuantizedPerChannel() const
}
int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
- acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);
+ acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);
scaled_acc += output()->zero_point();
scaled_acc = std::max(scaled_acc, activation_min);
@@ -302,11 +302,11 @@ void TransposeConv::evalQuantizedS16() const
for (int32_t out_c = 0; out_c < output_depth; ++out_c)
{
const int16_t input_val =
- input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
+ input_data[calcOffset(input_shape, batch, in_y, in_x, in_c)];
const int16_t filter_val =
- filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
+ filter_data[calcOffset(filter_shape, out_c, filter_y, filter_x, in_c)];
scratch_data[calcOffset(output_shape, batch, out_y, out_x, out_c)] +=
- static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
+ static_cast<int64_t>(input_val) * static_cast<int64_t>(filter_val);
}
}
}
@@ -326,7 +326,7 @@ void TransposeConv::evalQuantizedS16() const
acc += bias_data[out_c];
}
int32_t scaled_acc = tflite::MultiplyByQuantizedMultiplier(
- acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);
+ acc, output_multipliers[out_c].multiplier, output_multipliers[out_c].shift);
scaled_acc = std::max(scaled_acc, activation_min);
scaled_acc = std::min(scaled_acc, activation_max);
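configure() above derives the implicit padding by treating TransposeConv's output as the input of a forward convolution (dilation fixed at 1 in the calls shown). A hedged sketch of that arithmetic, following the usual TFLite conventions (names and signatures simplified, not the kernel's actual helpers):

#include <algorithm>
#include <cstdio>

enum class Padding { SAME, VALID };

int outputSize(Padding padding, int in, int filter, int stride) // dilation == 1
{
  return padding == Padding::SAME ? (in + stride - 1) / stride
                                  : (in - filter + stride) / stride;
}

int paddingFor(int stride, int in, int filter, int out)
{
  return std::max(((out - 1) * stride + filter - in) / 2, 0);
}

int main()
{
  // FloatSimple in the test file: output {1,4,4,1}, 3x3 filter, SAME, stride 1.
  const int h = 4, fh = 3, sh = 1;
  const int unused_out = outputSize(Padding::SAME, h, fh, sh);  // 4
  std::printf("pad_h=%d\n", paddingFor(sh, h, fh, unused_out)); // pad_h=1
  return 0;
}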
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
index b1309c128..9bcb015c1 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
@@ -37,7 +37,7 @@ void Check(std::initializer_list<int32_t> output_shape_shape,
{
constexpr DataType element_type = getElementType<T>();
Tensor output_shape_tensor =
- makeInputTensor<DataType::S32>(output_shape_shape, output_shape_data);
+ makeInputTensor<DataType::S32>(output_shape_shape, output_shape_data);
Tensor weight_tensor = makeInputTensor<element_type>(weight_shape, weight_data);
Tensor input_data_tensor = makeInputTensor<element_type>(input_shape, input_data);
Tensor output_tensor = makeOutputTensor(element_type);
@@ -68,13 +68,13 @@ void Check(std::initializer_list<int32_t> output_shape_shape,
TEST(TransposeConvTest, FloatSimple)
{
Check<float, float>(
- /*output_shape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 1}, /*input_shape=*/{1, 4, 4, 1},
- /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 4, 4, 1},
- /*weight_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9},
- /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
- /*bias_data=*/{},
- /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365},
- /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1);
+ /*output_shape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 1}, /*input_shape=*/{1, 4, 4, 1},
+ /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 4, 4, 1},
+ /*weight_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9},
+ /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+ /*bias_data=*/{},
+ /*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365},
+ /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1);
SUCCEED();
}
@@ -82,15 +82,15 @@ TEST(TransposeConvTest, FloatSimple)
TEST(TransposeConvTest, FloatTwoFiltersTest)
{
Check<float, float>(
- /*output_shape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 2}, /*input_shape=*/{1, 4, 4, 2},
- /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 4, 4, 1},
- /*weight_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
- /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
- 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
- /*bias_data=*/{},
- /*output_data=*/
- {184, 412, 568, 528, 678, 1347, 1689, 1434, 1494, 2715, 3057, 2442, 1968, 3352, 3652, 2760},
- /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1);
+ /*output_shape_shape=*/{4}, /*weight_shape=*/{1, 3, 3, 2}, /*input_shape=*/{1, 4, 4, 2},
+ /*bias_shape=*/{}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 4, 4, 1},
+ /*weight_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18},
+ /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32},
+ /*bias_data=*/{},
+ /*output_data=*/
+ {184, 412, 568, 528, 678, 1347, 1689, 1434, 1494, 2715, 3057, 2442, 1968, 3352, 3652, 2760},
+ /*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1);
SUCCEED();
}
@@ -98,16 +98,16 @@ TEST(TransposeConvTest, FloatTwoFiltersTest)
TEST(TransposeConvTest, SimpleBiasTest)
{
Check<float, float>(
- /*output_shape_shape=*/{4}, /*weight_shape=*/{2, 3, 3, 1},
- /*input_shape=*/{1, 2, 2, 1},
- /*bias_shape=*/{2}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 5, 5, 2},
- /*weight_data=*/{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18},
- /*input_data=*/{1, 2, 3, 4},
- /*bias_data=*/{3, 4},
- /*output_data=*/{4, 6, 6, 8, 10, 14, 9, 12, 13, 16, 10, 12, 12, 14, 28, 32, 21,
- 24, 25, 28, 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, 24, 28, 30, 34,
- 64, 72, 39, 44, 47, 52, 42, 46, 48, 52, 106, 114, 63, 68, 71, 76},
- /*params.padding=*/luci::Padding::VALID, /*stride_height=*/2, /*stride_width=*/2);
+ /*output_shape_shape=*/{4}, /*weight_shape=*/{2, 3, 3, 1},
+ /*input_shape=*/{1, 2, 2, 1},
+ /*bias_shape=*/{2}, /*output_shape=*/{1, 4, 4, 1}, /*output_shape_data=*/{1, 5, 5, 2},
+ /*weight_data=*/{1, 3, 5, 7, 9, 11, 13, 15, 17, 2, 4, 6, 8, 10, 12, 14, 16, 18},
+ /*input_data=*/{1, 2, 3, 4},
+ /*bias_data=*/{3, 4},
+ /*output_data=*/{4, 6, 6, 8, 10, 14, 9, 12, 13, 16, 10, 12, 12, 14, 28, 32, 21,
+ 24, 25, 28, 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, 24, 28, 30, 34,
+ 64, 72, 39, 44, 47, 52, 42, 46, 48, 52, 106, 114, 63, 68, 71, 76},
+ /*params.padding=*/luci::Padding::VALID, /*stride_height=*/2, /*stride_width=*/2);
SUCCEED();
}
@@ -119,11 +119,11 @@ TEST(TransposeConvTest, UInt8)
std::vector<float> bias_data{3, 4};
std::vector<int32_t> output_shape_data{1, 5, 5, 2};
std::vector<float> ref_output_data{
- 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, //
- 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, //
- 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, //
- 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, //
- 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+ 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, //
+ 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, //
+ 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, //
+ 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, //
+ 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
};
// Choose quantization parameters carefully.
@@ -131,12 +131,12 @@ TEST(TransposeConvTest, UInt8)
auto filter_quant = quantizationParams<uint8_t>(-24.0, 39.75); // s = 1 / 4, zp = 96
auto output_quant = quantizationParams<uint8_t>(-64.0, 191.0); // s = 1, zp = 64
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 2, 1}, input_quant.first,
- input_quant.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 2, 1}, input_quant.first, input_quant.second, input_data);
Tensor filter_tensor = makeInputTensor<DataType::U8>({2, 3, 3, 1}, filter_quant.first,
filter_quant.second, filter_data);
Tensor bias_tensor =
- makeInputTensor<DataType::S32>({2}, input_quant.first * filter_quant.first, 0, bias_data);
+ makeInputTensor<DataType::S32>({2}, input_quant.first * filter_quant.first, 0, bias_data);
Tensor output_shape_tensor = makeInputTensor<DataType::S32>({4}, output_shape_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant.first, output_quant.second);
@@ -162,11 +162,11 @@ TEST(TransposeConvTest, UInt8_CWQ)
std::vector<float> bias_data{3, 4};
std::vector<int32_t> output_shape_data{1, 5, 5, 2};
std::vector<float> ref_output_data{
- 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, //
- 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, //
- 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, //
- 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, //
- 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+ 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, //
+ 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, //
+ 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, //
+ 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, //
+ 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
};
// Choose quantization parameters carefully.
@@ -190,12 +190,12 @@ TEST(TransposeConvTest, UInt8_CWQ)
bias_scales.push_back(filter_quant_params[i].first * input_quant.first);
std::vector<int32_t> zerop(output_channels, 0);
- Tensor input_tensor = makeInputTensor<DataType::U8>({1, 2, 2, 1}, input_quant.first,
- input_quant.second, input_data);
+ Tensor input_tensor =
+ makeInputTensor<DataType::U8>({1, 2, 2, 1}, input_quant.first, input_quant.second, input_data);
Tensor filter_tensor = makeInputTensor<DataType::U8>({output_channels, 3, 3, 1}, filter_scales,
filter_zerops, 0, filter_data);
Tensor bias_tensor =
- makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0, bias_data);
+ makeInputTensor<DataType::S32>({output_channels}, bias_scales, zerop, 0, bias_data);
Tensor output_shape_tensor = makeInputTensor<DataType::S32>({4}, output_shape_data);
Tensor output_tensor = makeOutputTensor(DataType::U8, output_quant.first, output_quant.second);
@@ -220,11 +220,11 @@ TEST(TransposeConvTest, SInt16)
std::vector<float> bias_data{3, 4};
std::vector<int32_t> output_shape_data{1, 5, 5, 2};
std::vector<float> ref_output_data{
- 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, //
- 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, //
- 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, //
- 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, //
- 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+ 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, //
+ 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, //
+ 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, //
+ 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, //
+ 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
};
Tensor input_tensor = makeInputTensor<DataType::S16>({1, 2, 2, 1}, 0.25, 0, input_data);
@@ -260,11 +260,11 @@ TEST(TransposeConvTest, SInt16_CWQ_weights)
std::vector<float> bias_data{3, 4};
std::vector<float> ref_output_data{
- 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, //
- 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, //
- 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, //
- 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, //
- 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
+ 4, 6, 6, 8, 10, 14, 9, 12, 13, 16, //
+ 10, 12, 12, 14, 28, 32, 21, 24, 25, 28, //
+ 19, 24, 27, 32, 65, 76, 45, 52, 57, 64, //
+ 24, 28, 30, 34, 64, 72, 39, 44, 47, 52, //
+ 42, 46, 48, 52, 106, 114, 63, 68, 71, 76, //
};
const float input_scale = 0.25;
@@ -275,7 +275,7 @@ TEST(TransposeConvTest, SInt16_CWQ_weights)
Tensor input_tensor = makeInputTensor<DataType::S16>(input_shape, input_scale, 0, input_data);
Tensor filter_tensor =
- makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0, filter_data);
+ makeInputTensor<DataType::S16>(filter_shape, filter_scales, zerop, 0, filter_data);
Tensor bias_tensor = makeInputTensor<DataType::S64>(bias_shape, bias_scales, zerop, 0, bias_data);
Tensor output_shape_tensor = makeInputTensor<DataType::S32>({4}, output_shape_data);
Tensor output_tensor = makeOutputTensor(DataType::S16, output_scale, 0);
diff --git a/compiler/luci-interpreter/src/kernels/Unpack.cpp b/compiler/luci-interpreter/src/kernels/Unpack.cpp
index 834b79926..9127241c0 100644
--- a/compiler/luci-interpreter/src/kernels/Unpack.cpp
+++ b/compiler/luci-interpreter/src/kernels/Unpack.cpp
@@ -29,7 +29,7 @@ namespace kernels
{
Unpack::Unpack(const Tensor *input, std::vector<Tensor *> outputs, const UnpackParams &params)
- : KernelWithParams<UnpackParams>({input}, std::move(outputs), params)
+ : KernelWithParams<UnpackParams>({input}, std::move(outputs), params)
{
}
diff --git a/compiler/luci-interpreter/src/kernels/Unpack.test.cpp b/compiler/luci-interpreter/src/kernels/Unpack.test.cpp
index f70c5847a..6d611e12e 100644
--- a/compiler/luci-interpreter/src/kernels/Unpack.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Unpack.test.cpp
@@ -121,11 +121,11 @@ TYPED_TEST(UnpackTest, ThreeDimensionsTwoOutputs)
TYPED_TEST(UnpackTest, FiveDimensionsTwoOutputs)
{
Check<TypeParam>(
- /*axis=*/2, /*input_shape=*/{2, 2, 2, 2, 1},
- /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
- /*exp_output_shape=*/{{2, 2, 2, 1}, {2, 2, 2, 1}},
- /*exp_output_data=*/
- {{1, 2, 5, 6, 9, 10, 13, 14}, {3, 4, 7, 8, 11, 12, 15, 16}});
+ /*axis=*/2, /*input_shape=*/{2, 2, 2, 2, 1},
+ /*input_data=*/{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16},
+ /*exp_output_shape=*/{{2, 2, 2, 1}, {2, 2, 2, 1}},
+ /*exp_output_data=*/
+ {{1, 2, 5, 6, 9, 10, 13, 14}, {3, 4, 7, 8, 11, 12, 15, 16}});
}
TYPED_TEST(UnpackTest, VectorToScalar)
diff --git a/compiler/luci-interpreter/src/kernels/Utils.h b/compiler/luci-interpreter/src/kernels/Utils.h
index 4b5e72917..817a42f83 100644
--- a/compiler/luci-interpreter/src/kernels/Utils.h
+++ b/compiler/luci-interpreter/src/kernels/Utils.h
@@ -108,6 +108,8 @@ inline double getQuantizedConvolutionMultipler(float input_scale, float filter_s
return input_product_scale / static_cast<double>(output_scale);
}
+// TODO Rename getQuantizedConvolutionMultiplers to something more general;
+// it is used for non-conv operators too.

inline std::vector<double> getQuantizedConvolutionMultiplers(float input_scale,
const std::vector<float> &filter_scale,
float output_scale)
@@ -118,7 +120,7 @@ inline std::vector<double> getQuantizedConvolutionMultiplers(float input_scale,
for (size_t i = 0; i < n; ++i)
{
effective_output_scales.push_back(
- getQuantizedConvolutionMultipler(input_scale, filter_scale[i], output_scale));
+ getQuantizedConvolutionMultipler(input_scale, filter_scale[i], output_scale));
}
return effective_output_scales;
}
@@ -149,6 +151,7 @@ public:
BroadcastableWrapper(const std::vector<T> &v) : _v(v), _stride(v.size() == 1 ? 0 : 1) {}
T operator[](int idx) { return _v[idx * _stride]; }
+
private:
const std::vector<T> &_v;
int _stride;
@@ -236,7 +239,7 @@ public:
// Build with the tensors in 'tensor_list'.
explicit VectorOfQuantizedTensors(const std::vector<TensorT *> &tensor_list)
- : VectorOfTensors<uint8_t, is_const>(tensor_list)
+ : VectorOfTensors<uint8_t, is_const>(tensor_list)
{
for (TensorT *tensor : tensor_list)
{
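BroadcastableWrapper above reads a single-element vector with stride 0, so per-tensor and per-channel quantization parameters can share one loop. A self-contained illustration (simplified copy of the class, illustrative names):

#include <cstdio>
#include <vector>

template <typename T> class Broadcastable // simplified BroadcastableWrapper
{
public:
  explicit Broadcastable(const std::vector<T> &v) : _v(v), _stride(v.size() == 1 ? 0 : 1) {}
  T operator[](int idx) const { return _v[idx * _stride]; }

private:
  const std::vector<T> &_v;
  int _stride;
};

int main()
{
  std::vector<float> per_tensor{0.5f};                 // one scale for the whole tensor
  std::vector<float> per_channel{0.5f, 0.25f, 0.125f}; // one scale per channel
  Broadcastable<float> a(per_tensor), b(per_channel);
  for (int c = 0; c < 3; ++c)
    std::printf("%g %g\n", a[c], b[c]); // a[c] is always 0.5
  return 0;
}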
diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt
index d99485d06..20a6f03cd 100644
--- a/compiler/luci-interpreter/src/loader/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt
@@ -1,5 +1,3 @@
-nnas_find_package(GTest REQUIRED)
-
set(SOURCES
GraphLoader.h
GraphLoader.cpp
@@ -16,6 +14,12 @@ target_link_libraries(luci_interpreter_loader
PUBLIC luci_lang luci_interpreter_core
PRIVATE luci_interpreter_kernels nncc_common)
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
set(TEST_SOURCES KernelBuilder.test.cpp)
GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES})
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
index 09e923597..bc44c7efa 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
@@ -107,11 +107,11 @@ bool isTensorProducingNode(const luci::CircleNode *node)
} // namespace
GraphLoader::GraphLoader(
- const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
- const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
- std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
- : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir),
- _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
+ const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+ std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+ : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir),
+ _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
{
}
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
index 7b723e88a..913a062d7 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
@@ -19,6 +19,7 @@
#include "kernels/Add.h"
#include "kernels/ArgMax.h"
#include "kernels/AveragePool2D.h"
+#include "kernels/BatchToSpaceND.h"
#include "kernels/Concatenation.h"
#include "kernels/Conv2D.h"
#include "kernels/DepthToSpace.h"
@@ -50,7 +51,9 @@
#include "kernels/Mean.h"
#include "kernels/Minimum.h"
#include "kernels/Mul.h"
+#include "kernels/Neg.h"
#include "kernels/NotEqual.h"
+#include "kernels/Pack.h"
#include "kernels/Pad.h"
#include "kernels/Pow.h"
#include "kernels/Prelu.h"
@@ -63,12 +66,14 @@
#include "kernels/Rsqrt.h"
#include "kernels/Slice.h"
#include "kernels/Softmax.h"
+#include "kernels/SpaceToBatchND.h"
#include "kernels/SpaceToDepth.h"
#include "kernels/Split.h"
#include "kernels/StridedSlice.h"
#include "kernels/Sqrt.h"
-#include "kernels/Sub.h"
+#include "kernels/SquaredDifference.h"
#include "kernels/Squeeze.h"
+#include "kernels/Sub.h"
#include "kernels/Tanh.h"
#include "kernels/Unpack.h"
#include "kernels/Transpose.h"
@@ -134,6 +139,11 @@ RuntimeGraph *KernelBuilder::getRuntimeGraph(const loco::Graph *graph) const
return runtime_graph;
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleNode *)
+{
+ throw std::invalid_argument("Unsupported operator.");
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleAdd *node)
{
assert(node->arity() == 2);
@@ -179,6 +189,18 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleAveragePool2D *no
return std::make_unique<kernels::AveragePool2D>(input, output, params);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleBatchToSpaceND *node)
+{
+ assert(node->arity() == 3);
+
+ const Tensor *input = getInputTensor(node->input());
+ const Tensor *block_shape = getInputTensor(node->block_shape());
+ const Tensor *crops = getInputTensor(node->crops());
+ Tensor *output = getOutputTensor(node);
+
+ return std::make_unique<kernels::BatchToSpaceND>(input, block_shape, crops, output);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleConcatenation *node)
{
std::vector<const Tensor *> inputs(node->numValues());
@@ -190,6 +212,7 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleConcatenation *no
ConcatenationParams params{};
params.axis = node->axis();
+ params.activation = node->fusedActivationFunction();
return std::make_unique<kernels::Concatenation>(std::move(inputs), output, params);
}
@@ -598,6 +621,16 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMul *node)
return std::make_unique<kernels::Mul>(input1, input2, output, params);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleNeg *node)
+{
+ assert(node->arity() == 1);
+
+ const Tensor *input = getInputTensor(node->x());
+ Tensor *output = getOutputTensor(node);
+
+ return std::make_unique<kernels::Neg>(input, output);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleNotEqual *node)
{
assert(node->arity() == 2);
@@ -614,6 +647,24 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleOutput *)
throw std::runtime_error("Output node cannot be executed.");
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CirclePack *node)
+{
+ assert(node->arity() == node->values_count());
+
+ std::vector<const Tensor *> inputs(node->values_count());
+ for (uint32_t i = 0; i < node->values_count(); ++i)
+ {
+ inputs[i] = getInputTensor(node->values(i));
+ }
+ Tensor *output = getOutputTensor(node);
+
+ PackParams params{};
+ params.axis = node->axis();
+ params.values_count = node->values_count();
+
+ return std::make_unique<kernels::Pack>(std::move(inputs), output, params);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CirclePad *node)
{
assert(node->arity() == 2);
@@ -735,20 +786,6 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleRsqrt *node)
return std::make_unique<kernels::Rsqrt>(input, output);
}
-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSub *node)
-{
- assert(node->arity() == 2);
-
- const Tensor *input1 = getInputTensor(node->x());
- const Tensor *input2 = getInputTensor(node->y());
- Tensor *output = getOutputTensor(node);
-
- SubParams params{};
- params.activation = node->fusedActivationFunction();
-
- return std::make_unique<kernels::Sub>(input1, input2, output, params);
-}
-
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSlice *node)
{
assert(node->arity() == 3);
@@ -775,6 +812,20 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSoftmax *node)
return std::make_unique<kernels::Softmax>(input, output, params);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSpaceToBatchND *node)
+{
+ assert(node->arity() == 3);
+
+ const Tensor *input = getInputTensor(node->input());
+ const Tensor *block_shape = getInputTensor(node->block_shape());
+ const Tensor *paddings = getInputTensor(node->paddings());
+
+ Tensor *output = getOutputTensor(node);
+
+ return std::make_unique<kernels::SpaceToBatchND>(input, block_shape, paddings, output);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSpaceToDepth *node)
{
assert(node->arity() == 1);
@@ -812,6 +863,17 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSqrt *node)
return std::make_unique<kernels::Sqrt>(input, output);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSquaredDifference *node)
+{
+ assert(node->arity() == 2);
+
+ const Tensor *input1 = getInputTensor(node->x());
+ const Tensor *input2 = getInputTensor(node->y());
+ Tensor *output = getOutputTensor(node);
+
+ return std::make_unique<kernels::SquaredDifference>(input1, input2, output);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSqueeze *node)
{
assert(node->arity() == 1);
@@ -846,6 +908,20 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleStridedSlice *nod
return std::make_unique<kernels::StridedSlice>(input, begin, end, strides, output, params);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSub *node)
+{
+ assert(node->arity() == 2);
+
+ const Tensor *input1 = getInputTensor(node->x());
+ const Tensor *input2 = getInputTensor(node->y());
+ Tensor *output = getOutputTensor(node);
+
+ SubParams params{};
+ params.activation = node->fusedActivationFunction();
+
+ return std::make_unique<kernels::Sub>(input1, input2, output, params);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTanh *node)
{
assert(node->arity() == 1);
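The new catch-all visit(const luci::CircleNode *) turns any operator without a dedicated overload into an explicit "Unsupported operator." error instead of a silent miss. A hedged sketch of the pattern using plain overload resolution (stand-in types; the real builder dispatches virtually through luci::CircleNodeVisitor):

#include <cstdio>
#include <memory>
#include <stdexcept>

struct Kernel { virtual ~Kernel() = default; };
struct NegKernel : Kernel {};

struct CircleNode {};                // base operator type (stand-in)
struct CircleNeg : CircleNode {};    // has a dedicated overload
struct CircleExotic : CircleNode {}; // does not

struct Builder
{
  std::unique_ptr<Kernel> visit(const CircleNeg &) { return std::make_unique<NegKernel>(); }
  // Catch-all, as in KernelBuilder::visit(const luci::CircleNode *):
  std::unique_ptr<Kernel> visit(const CircleNode &)
  {
    throw std::invalid_argument("Unsupported operator.");
  }
};

int main()
{
  Builder b;
  b.visit(CircleNeg{}); // dedicated overload builds a NegKernel
  try
  {
    b.visit(CircleExotic{}); // falls through to the catch-all
  }
  catch (const std::invalid_argument &e)
  {
    std::printf("%s\n", e.what()); // Unsupported operator.
  }
  return 0;
}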
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h
index 1546ba01b..6f482b29e 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.h
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h
@@ -33,15 +33,17 @@ class KernelBuilder : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>
{
public:
KernelBuilder(
- const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
- const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
- : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+ const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+ : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
{
}
+ std::unique_ptr<Kernel> visit(const luci::CircleNode *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleAdd *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleArgMax *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleAveragePool2D *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleBatchToSpaceND *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleConcatenation *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleConv2D *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleConst *node) override;
@@ -75,8 +77,10 @@ public:
std::unique_ptr<Kernel> visit(const luci::CircleMean *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleMinimum *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleMul *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleNeg *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleNotEqual *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleOutput *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CirclePack *node) override;
std::unique_ptr<Kernel> visit(const luci::CirclePad *node) override;
std::unique_ptr<Kernel> visit(const luci::CirclePow *node) override;
std::unique_ptr<Kernel> visit(const luci::CirclePRelu *node) override;
@@ -87,14 +91,16 @@ public:
std::unique_ptr<Kernel> visit(const luci::CircleResizeNearestNeighbor *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleReverseV2 *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleRsqrt *node) override;
- std::unique_ptr<Kernel> visit(const luci::CircleSub *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSlice *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSoftmax *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleSpaceToBatchND *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSpaceToDepth *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSplit *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleStridedSlice *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSqrt *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleSquaredDifference *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSqueeze *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleSub *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleTanh *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleTranspose *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleTransposeConv *node) override;
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
index c49a05a49..b49085325 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
@@ -50,6 +50,7 @@
#include <kernels/Mean.h>
#include <kernels/Minimum.h>
#include <kernels/Mul.h>
+#include <kernels/Neg.h>
#include <kernels/NotEqual.h>
#include <kernels/Pad.h>
#include <kernels/Pow.h>
@@ -66,9 +67,10 @@
#include <kernels/SpaceToDepth.h>
#include <kernels/Split.h>
#include <kernels/Sqrt.h>
-#include <kernels/Sub.h>
+#include <kernels/SquaredDifference.h>
#include <kernels/Squeeze.h>
#include <kernels/StridedSlice.h>
+#include <kernels/Sub.h>
#include <kernels/Tanh.h>
#include <kernels/Transpose.h>
#include <kernels/TransposeConv.h>
@@ -216,6 +218,7 @@ TEST_F(KernelBuilderTest, Concatenation)
checkTensor(kernel->input(1), input2);
checkTensor(kernel->output(), op);
EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
}
TEST_F(KernelBuilderTest, Conv2D)
@@ -776,6 +779,20 @@ TEST_F(KernelBuilderTest, Mul)
EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
}
+TEST_F(KernelBuilderTest, Neg)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleNeg>();
+ op->x(input);
+
+ auto kernel = buildKernel<kernels::Neg>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+}
+
TEST_F(KernelBuilderTest, NotEqual)
{
auto *x_input = createInputNode();
@@ -1052,24 +1069,21 @@ TEST_F(KernelBuilderTest, Sqrt)
checkTensor(kernel->output(), op);
}
-TEST_F(KernelBuilderTest, Sub)
+TEST_F(KernelBuilderTest, SquaredDifference)
{
auto *input1 = createInputNode();
auto *input2 = createInputNode();
- auto *op = createNode<luci::CircleSub>();
+ auto *op = createNode<luci::CircleSquaredDifference>();
op->x(input1);
op->y(input2);
- op->fusedActivationFunction(luci::FusedActFunc::RELU);
-
- auto kernel = buildKernel<kernels::Sub>(op);
+ auto kernel = buildKernel<kernels::SquaredDifference>(op);
ASSERT_THAT(kernel, NotNull());
checkTensor(kernel->input1(), input1);
checkTensor(kernel->input2(), input2);
checkTensor(kernel->output(), op);
- EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
}
TEST_F(KernelBuilderTest, Squeeze)
@@ -1123,6 +1137,26 @@ TEST_F(KernelBuilderTest, StridedSlice)
EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask()));
}
+TEST_F(KernelBuilderTest, Sub)
+{
+ auto *input1 = createInputNode();
+ auto *input2 = createInputNode();
+
+ auto *op = createNode<luci::CircleSub>();
+ op->x(input1);
+ op->y(input2);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::Sub>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input1(), input1);
+ checkTensor(kernel->input2(), input2);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
TEST_F(KernelBuilderTest, Tanh)
{
auto *input = createInputNode();
diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
index b9a2ae0a9..ff211bf09 100644
--- a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
@@ -24,8 +24,8 @@ namespace luci_interpreter
ModuleLoader::ModuleLoader(const luci::Module *module, RuntimeModule *runtime_module,
RuntimeToIR &runtime_to_ir,
std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
- : _module(module), _runtime_module(runtime_module), _runtime_to_ir(runtime_to_ir),
- _node_to_tensor(node_to_tensor)
+ : _module(module), _runtime_module(runtime_module), _runtime_to_ir(runtime_to_ir),
+ _node_to_tensor(node_to_tensor)
{
}
diff --git a/compiler/luci-pass-value-test/.gitignore b/compiler/luci-pass-value-test/.gitignore
new file mode 100644
index 000000000..8dbfa9012
--- /dev/null
+++ b/compiler/luci-pass-value-test/.gitignore
@@ -0,0 +1 @@
+/test.local.lst
diff --git a/compiler/luci-pass-value-test/CMakeLists.txt b/compiler/luci-pass-value-test/CMakeLists.txt
new file mode 100644
index 000000000..2d2befe57
--- /dev/null
+++ b/compiler/luci-pass-value-test/CMakeLists.txt
@@ -0,0 +1,44 @@
+unset(TEST_DEPS)
+unset(LUCI_PASS_VALUE_TESTS)
+
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
+
+macro(addeval RECIPE PASS_OPTION)
+ list(APPEND LUCI_PASS_VALUE_TESTS ${RECIPE})
+
+ set(CIRCLE_FILE "${RECIPE}.circle")
+ set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${CIRCLE_FILE}")
+
+ set(PASS_CIRCLE_FILE "${RECIPE}.pass.circle")
+ set(PASS_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${PASS_CIRCLE_FILE}")
+
+ set(DASH_PASS_OPTION "--${PASS_OPTION}")
+
+ # Generate optimized .circle
+ add_custom_command(OUTPUT ${PASS_CIRCLE_OUTPUT_PATH}
+ COMMAND $<TARGET_FILE:circle2circle> ${DASH_PASS_OPTION} ${CIRCLE_PATH} ${PASS_CIRCLE_OUTPUT_PATH}
+ DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_PATH}
+ COMMENT "Generate ${PASS_CIRCLE_FILE} with ${DASH_PASS_OPTION}"
+ )
+
+ # depends
+ list(APPEND TEST_DEPS ${PASS_CIRCLE_OUTPUT_PATH})
+
+endmacro(addeval)
+
+# Read "test.lst"
+include("test.lst")
+# Read "test.local.lst" if exists
+include("test.local.lst" OPTIONAL)
+
+add_custom_target(luci_pass_value_test_files ALL DEPENDS ${TEST_DEPS})
+add_dependencies(luci_pass_value_test_files common_artifacts_deps)
+
+add_test(NAME luci_pass_value_test
+ COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/eval_driver.sh"
+ "${CMAKE_CURRENT_BINARY_DIR}"
+ "${ARTIFACTS_BIN_PATH}"
+ "${NNCC_OVERLAY_DIR}/venv_2_3_0"
+ "$<TARGET_FILE:luci_eval_driver>"
+ ${LUCI_PASS_VALUE_TESTS}
+)
diff --git a/compiler/luci-pass-value-test/README.md b/compiler/luci-pass-value-test/README.md
new file mode 100644
index 000000000..f09619da6
--- /dev/null
+++ b/compiler/luci-pass-value-test/README.md
@@ -0,0 +1,20 @@
+# luci-pass-value-test
+
+`luci-pass-value-test` validates that a circle model transformed by a specific
+optimization still produces the same execution results as the source tflite model.
+
+The test proceeds as follows:
+
+Step 0: Use the tflite and circle files in the 'common-artifacts' folder as the source models.
+   - the tflite file is used to generate the reference execution result
+   - the circle file is used as the source to which the optimization is applied
+
+Step 1: Run circle2circle with the given optimization option to produce the transformed circle.
+ - "modelfile.circle" -> circle2circle -> "modelfile.pass.circle"
+
+Step 2: Run the TFLite interpreter on the source tflite model and luci-interpreter
+        on the transformed circle model, feeding both the same randomly filled input tensors.
+ - "modelfile.tflite" ------> TFLite interpreter -> Execution result 1
+ - "modelfile.pass.circle" -> luci-interpreter ---> Execution result 2
+
+Step 3: Compare execution results 1 and 2. The test PASSES if the results are the same.
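+
+For reference, the heart of the Step 3 comparison looks roughly like the sketch
+below (simplified from `eval_result_verifier.py`; the exact per-dtype tolerances
+live in that script):
+
+```python
+import numpy as np
+
+def results_match(tflite_output, luci_output, dtype):
+    # float32 allows a small numeric tolerance; other dtypes must match exactly.
+    if dtype == np.float32:
+        return np.allclose(luci_output, tflite_output, rtol=1e-5, atol=1e-5)
+    return np.array_equal(luci_output, tflite_output)
+```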
diff --git a/compiler/luci-pass-value-test/eval_driver.sh b/compiler/luci-pass-value-test/eval_driver.sh
new file mode 100755
index 000000000..848b6419a
--- /dev/null
+++ b/compiler/luci-pass-value-test/eval_driver.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+# This script verifies the tflite and circle execution result values
+#
+# HOW TO USE
+#
+# ./eval_driver.sh <path/to/bin_dir> <path/to/work_dir> <path/to/venv_dir> <path/to/intp_dir>
+#                  <TEST 1> <TEST 2> ...
+# bin_dir  : build directory of luci-pass-value-test (ex: build/compiler/luci-pass-value-test)
+# work_dir : artifacts directory where test materials exist
+# venv_dir : python virtual environment home directory
+# intp_dir : path to the luci_eval_driver executable (from luci-eval-driver)
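+#
+# Example (paths are illustrative):
+#   ./eval_driver.sh build/compiler/luci-pass-value-test \
+#                    build/compiler/common-artifacts \
+#                    overlay/venv_2_3_0 \
+#                    build/compiler/luci-eval-driver/luci_eval_driver \
+#                    Net_InstanceNorm_003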
+
+VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/eval_result_verifier.py"
+BINDIR="$1"; shift
+WORKDIR="$1"; shift
+VIRTUALENV="$1"; shift
+INTERPRETER_DRIVER_PATH="$1"; shift
+
+TESTED=()
+PASSED=()
+FAILED=()
+
+for TESTCASE in "$@"; do
+ TESTED+=("${TESTCASE}")
+
+ TESTCASE_TFLITE_FILE="${WORKDIR}/${TESTCASE}.tflite"
+ TESTCASE_CIRCLE_FILE="${BINDIR}/${TESTCASE}.pass.circle"
+ TEST_RESULT_FILE="${BINDIR}/${TESTCASE}"
+
+ PASSED_TAG="${TEST_RESULT_FILE}.passed"
+ rm -f "${PASSED_TAG}"
+
+ cat > "${TEST_RESULT_FILE}.log" <(
+ exec 2>&1
+ set -ex
+
+ source "${VIRTUALENV}/bin/activate"
+
+ "${VIRTUALENV}/bin/python" "${VERIFY_SCRIPT_PATH}" \
+ --driver "${INTERPRETER_DRIVER_PATH}" \
+ --tflite "${TESTCASE_TFLITE_FILE}" \
+ --circle "${TESTCASE_CIRCLE_FILE}"
+
+ if [[ $? -eq 0 ]]; then
+ touch "${PASSED_TAG}"
+ fi
+ )
+
+ if [[ -f "${PASSED_TAG}" ]]; then
+ PASSED+=("${TESTCASE}")
+ else
+ FAILED+=("${TESTCASE}")
+ fi
+done
+
+if [[ ${#TESTED[@]} -ne ${#PASSED[@]} ]]; then
+ echo "FAILED"
+ for TEST in "${FAILED[@]}"
+ do
+ echo "- ${TEST}"
+ done
+ exit 255
+fi
+
+echo "PASSED"
+exit 0
diff --git a/compiler/luci-pass-value-test/eval_result_verifier.py b/compiler/luci-pass-value-test/eval_result_verifier.py
new file mode 100644
index 000000000..c6005edfc
--- /dev/null
+++ b/compiler/luci-pass-value-test/eval_result_verifier.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python3
+import numpy as np
+import tensorflow as tf
+import subprocess
+import argparse
+import traceback
+
+#
+# This script was copied from luci-value-test; its input arguments are the tflite and circle paths.
+#
+parser = argparse.ArgumentParser()
+parser.add_argument('--driver', type=str, required=True)
+parser.add_argument('--tflite', type=str, required=True)
+parser.add_argument('--circle', type=str, required=True)
+args = parser.parse_args()
+
+driver = args.driver
+tflite_model = args.tflite
+circle_model = args.circle
+
+# Build TFLite interpreter.
+interpreter = tf.lite.Interpreter(tflite_model)
+interpreter.allocate_tensors()
+
+# Generate random input data.
+num_inputs = len(interpreter.get_input_details())
+for i in range(num_inputs):
+ input_details = interpreter.get_input_details()[i]
+ if input_details["dtype"] == np.float32:
+ input_data = np.array(
+ np.random.random_sample(input_details["shape"]), input_details["dtype"])
+ elif input_details["dtype"] == np.uint8:
+ input_data = np.array(
+ np.random.randint(0, 256, size=input_details["shape"]),
+ input_details["dtype"])
+ elif input_details["dtype"] == np.bool_:
+ input_data = np.array(
+ np.random.choice(a=[True, False], size=input_details["shape"]),
+ input_details["dtype"])
+ else:
+ raise SystemExit("Unsupported input dtype")
+
+ interpreter.set_tensor(input_details["index"], input_data)
+ input_data.tofile(circle_model + ".input" + str(i))
+
+# Do inference
+interpreter.invoke()
+
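+# The driver is assumed to read each input from "<circle>.input<N>" (written
+# above) and to write each output to "<circle>.output<N>" along with a
+# "<circle>.output<N>.shape" file holding comma-separated dimensions, which is
+# what the comparison below parses.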
+# Execute luci interpreter.
+subprocess.run(
+ [
+ driver, circle_model,
+ str(num_inputs), circle_model + ".input", circle_model + ".output"
+ ],
+ check=True)
+
+# Compare the results.
+for idx in range(len(interpreter.get_output_details())):
+    output_details = interpreter.get_output_details()[idx]
+    output_data = np.fromfile(circle_model + ".output" + str(idx),
+                              output_details["dtype"])
+    with open(circle_model + ".output" + str(idx) + ".shape", 'r') as shape_file:
+        output_shape = [int(i) for i in shape_file.read().split(',')]
+    luci_output_data = np.reshape(output_data, output_shape)
+    try:
+        # Integer outputs must match exactly; float32 outputs allow a small tolerance.
+        if output_details["dtype"] in (np.uint8, np.int32, np.int64):
+            rtol, atol = 0, 0
+        elif output_details["dtype"] == np.float32:
+            rtol, atol = 1.e-5, 1.e-5
+        else:
+            raise SystemExit("Unsupported data type: " + str(output_details["dtype"]))
+        intp_output_data = interpreter.get_tensor(output_details["index"])
+        if not np.allclose(luci_output_data, intp_output_data, rtol=rtol, atol=atol):
+            raise SystemExit("Execution result of " + tflite_model +
+                             " does not match with " + circle_model)
+    except:
+        print(traceback.format_exc())
+        quit(255)
+
+quit(0)
diff --git a/compiler/luci-pass-value-test/requires.cmake b/compiler/luci-pass-value-test/requires.cmake
new file mode 100644
index 000000000..d977a51b6
--- /dev/null
+++ b/compiler/luci-pass-value-test/requires.cmake
@@ -0,0 +1,7 @@
+require("common-artifacts")
+require("luci-interpreter")
+require("safemain")
+require("oops")
+require("loco")
+require("luci-value-test")
+require("luci-eval-driver")
diff --git a/compiler/luci-pass-value-test/test.lst b/compiler/luci-pass-value-test/test.lst
new file mode 100644
index 000000000..e607904cb
--- /dev/null
+++ b/compiler/luci-pass-value-test/test.lst
@@ -0,0 +1,30 @@
+#
+# Format:
+# addeval(MODEL PASS)
+# MODEL: tflite model file name in build/compiler/common-artifacts folder.
+# PASS: Optimization Pass to test. Supports only one Pass for now.
+#
+
+# addeval(Net_Preactivation_BN_000 fuse_preactivation_batchnorm) : value diff exists
+# --> https://github.com/Samsung/ONE/issues/5782
+addeval(Net_Conv_Add_Mul_000 fuse_batchnorm_with_conv)
+addeval(Net_Conv_Add_Mul_001 fuse_batchnorm_with_conv)
+addeval(Net_Conv_Add_Mul_002 fuse_batchnorm_with_conv)
+addeval(Net_Conv_Min_Max_000 transform_min_max_to_relu6)
+addeval(Net_Conv_Relu6_000 fuse_activation_function)
+addeval(Net_DwConv_BN_000 fuse_batchnorm_with_dwconv)
+addeval(Net_DwConv_BN_001 fuse_batchnorm_with_dwconv)
+addeval(Net_Reshape_Neg_000 forward_reshape_to_unaryop)
+addeval(Net_Reshape_Reshape_000 remove_redundant_reshape)
+addeval(Net_Squeeze_Squeeze_000 substitute_squeeze_to_reshape)
+addeval(Net_TConv_Add_000 fuse_add_with_tconv)
+addeval(Net_TConv_Add_001 fuse_add_with_tconv)
+addeval(Net_TConv_Add_002 fuse_add_with_tconv)
+addeval(Net_TConv_BN_000 fuse_batchnorm_with_tconv)
+addeval(Net_TConv_BN_001 fuse_batchnorm_with_tconv)
+addeval(Net_TConv_BN_002 fuse_batchnorm_with_tconv)
+addeval(Net_InstanceNorm_001 fuse_instnorm)
+addeval(Net_InstanceNorm_002 fuse_instnorm)
+addeval(Net_InstanceNorm_003 fuse_instnorm)
+addeval(Net_StridedSlice_StridedSlice_000 remove_unnecessary_strided_slice)
diff --git a/compiler/luci-value-test/.gitignore b/compiler/luci-value-test/.gitignore
new file mode 100644
index 000000000..8dbfa9012
--- /dev/null
+++ b/compiler/luci-value-test/.gitignore
@@ -0,0 +1 @@
+/test.local.lst
diff --git a/compiler/luci-value-test/CMakeLists.txt b/compiler/luci-value-test/CMakeLists.txt
index ec7463409..124f120d4 100644
--- a/compiler/luci-value-test/CMakeLists.txt
+++ b/compiler/luci-value-test/CMakeLists.txt
@@ -12,8 +12,6 @@ include("test.local.lst" OPTIONAL)
# Generate dependencies
add_custom_target(luci_eval_testfiles ALL DEPENDS ${TESTFILES})
-add_subdirectory(tester)
-
get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
add_test(NAME luci_value_test
@@ -21,5 +19,6 @@ add_test(NAME luci_value_test
"${CMAKE_CURRENT_BINARY_DIR}"
"${ARTIFACTS_BIN_PATH}"
"${NNCC_OVERLAY_DIR}/venv_2_3_0"
+ "$<TARGET_FILE:luci_eval_driver>"
${LUCI_VALUE_TESTS}
)
diff --git a/compiler/luci-value-test/evalverify.sh b/compiler/luci-value-test/evalverify.sh
index 12c9a459a..01c4bce46 100755
--- a/compiler/luci-value-test/evalverify.sh
+++ b/compiler/luci-value-test/evalverify.sh
@@ -14,7 +14,7 @@ VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/luci_eval_verifier.py"
BINDIR="$1"; shift
WORKDIR="$1"; shift
VIRTUALENV="$1"; shift
-INTERPRETER_DRIVER_PATH="${BINDIR}/tester/luci_eval_tester"
+INTERPRETER_DRIVER_PATH="$1"; shift
TESTED=()
PASSED=()
diff --git a/compiler/luci-value-test/luci_eval_verifier.py b/compiler/luci-value-test/luci_eval_verifier.py
index 7a2cebb91..f6b0620d8 100755
--- a/compiler/luci-value-test/luci_eval_verifier.py
+++ b/compiler/luci-value-test/luci_eval_verifier.py
@@ -9,7 +9,7 @@ import traceback
# This script compares the execution result of luci-interpreter with that of TFLite interpreter
#
# Basic usage:
-# eval_verifier.py --driver build/compiler/luci-value-test/tester/luci_eval_tester
+# eval_verifier.py --driver build/compiler/luci-eval-driver/luci_eval_driver
# --model inception_v3
parser = argparse.ArgumentParser()
parser.add_argument('--driver', type=str, required=True)
diff --git a/compiler/luci-value-test/requires.cmake b/compiler/luci-value-test/requires.cmake
index f8af5f27e..e1a0f8367 100644
--- a/compiler/luci-value-test/requires.cmake
+++ b/compiler/luci-value-test/requires.cmake
@@ -4,3 +4,4 @@ require("luci-interpreter")
require("safemain")
require("oops")
require("loco")
+require("luci-eval-driver")
diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst
index 0e5231eca..edf329aff 100644
--- a/compiler/luci-value-test/test.lst
+++ b/compiler/luci-value-test/test.lst
@@ -155,6 +155,7 @@ addeval(Split_000)
#addeval(Square_000)
#addeval(SquaredDifference_000)
addeval(Squeeze_000)
+addeval(Squeeze_001)
addeval(StridedSlice_000)
addeval(StridedSlice_001)
addeval(StridedSlice_002)
diff --git a/compiler/luci-value-test/tester/CMakeLists.txt b/compiler/luci-value-test/tester/CMakeLists.txt
deleted file mode 100644
index f2a4ff4b6..000000000
--- a/compiler/luci-value-test/tester/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-
-set(SRCS_EVAL_TESTER
- src/EvalTester.cpp
- )
-
-add_executable(luci_eval_tester ${SRCS_EVAL_TESTER})
-target_link_libraries(luci_eval_tester PRIVATE oops)
-target_link_libraries(luci_eval_tester PRIVATE loco)
-target_link_libraries(luci_eval_tester PRIVATE luci_import)
-target_link_libraries(luci_eval_tester PRIVATE luci_export)
-target_link_libraries(luci_eval_tester PRIVATE luci_lang)
-target_link_libraries(luci_eval_tester PRIVATE luci_interpreter)
-target_link_libraries(luci_eval_tester PRIVATE safemain)
diff --git a/compiler/luci/CMakeLists.txt b/compiler/luci/CMakeLists.txt
index 214a1bbf2..3771176f0 100644
--- a/compiler/luci/CMakeLists.txt
+++ b/compiler/luci/CMakeLists.txt
@@ -1,8 +1,11 @@
add_subdirectory(env)
add_subdirectory(log)
add_subdirectory(lang)
+add_subdirectory(testhelper)
add_subdirectory(service)
add_subdirectory(pass)
+add_subdirectory(profile)
+add_subdirectory(partition)
add_subdirectory(logex)
add_subdirectory(import)
add_subdirectory(export)
diff --git a/compiler/luci/env/include/luci/UserSettings.h b/compiler/luci/env/include/luci/UserSettings.h
index bcfd16071..b56bd65e2 100644
--- a/compiler/luci/env/include/luci/UserSettings.h
+++ b/compiler/luci/env/include/luci/UserSettings.h
@@ -32,6 +32,7 @@ struct UserSettings
Undefined,
MuteWarnings,
DisableValidation,
+ ProfilingDataGen,
};
static UserSettings *settings();
diff --git a/compiler/luci/env/src/UserSettings.cpp b/compiler/luci/env/src/UserSettings.cpp
index 27dec762d..b4c661190 100644
--- a/compiler/luci/env/src/UserSettings.cpp
+++ b/compiler/luci/env/src/UserSettings.cpp
@@ -30,6 +30,7 @@ public:
private:
bool _MuteWarnings{false};
bool _DisableValidation{false};
+ bool _ProfilingDataGen{false};
};
void UserSettingsImpl::set(const Key key, bool value)
@@ -42,6 +43,9 @@ void UserSettingsImpl::set(const Key key, bool value)
case Key::DisableValidation:
_DisableValidation = value;
break;
+ case Key::ProfilingDataGen:
+ _ProfilingDataGen = value;
+ break;
default:
throw std::runtime_error("Invalid key in boolean set");
break;
@@ -56,6 +60,8 @@ bool UserSettingsImpl::get(const Key key) const
return _MuteWarnings;
case Key::DisableValidation:
return _DisableValidation;
+ case Key::ProfilingDataGen:
+ return _ProfilingDataGen;
default:
throw std::runtime_error("Invalid key in boolean get");
break;
diff --git a/compiler/luci/env/src/UserSettings.test.cpp b/compiler/luci/env/src/UserSettings.test.cpp
index 8d9d1875b..899c0c2a1 100644
--- a/compiler/luci/env/src/UserSettings.test.cpp
+++ b/compiler/luci/env/src/UserSettings.test.cpp
@@ -51,6 +51,18 @@ TEST(UserSettings, DisableValidation)
ASSERT_TRUE(settings->get(luci::UserSettings::Key::DisableValidation));
}
+TEST(UserSettings, ProfilingDataGen)
+{
+ auto settings = luci::UserSettings::settings();
+ ASSERT_NE(nullptr, settings);
+
+ settings->set(luci::UserSettings::Key::ProfilingDataGen, false);
+ ASSERT_FALSE(settings->get(luci::UserSettings::Key::ProfilingDataGen));
+
+ settings->set(luci::UserSettings::Key::ProfilingDataGen, true);
+ ASSERT_TRUE(settings->get(luci::UserSettings::Key::ProfilingDataGen));
+}
+
TEST(UserSettings, undefined_set_NEG)
{
auto settings = luci::UserSettings::settings();
diff --git a/compiler/luci/export/CMakeLists.txt b/compiler/luci/export/CMakeLists.txt
index fe4382ecd..01f737110 100644
--- a/compiler/luci/export/CMakeLists.txt
+++ b/compiler/luci/export/CMakeLists.txt
@@ -13,6 +13,7 @@ target_link_libraries(luci_export PRIVATE mio_circle)
target_link_libraries(luci_export PRIVATE luci_env)
target_link_libraries(luci_export PRIVATE luci_log)
target_link_libraries(luci_export PRIVATE luci_logex)
+target_link_libraries(luci_export PRIVATE luci_profile)
target_link_libraries(luci_export PRIVATE nncc_common)
target_link_libraries(luci_export PRIVATE locop)
target_link_libraries(luci_export PRIVATE oops)
diff --git a/compiler/luci/export/include/luci/CircleFileExpContract.h b/compiler/luci/export/include/luci/CircleFileExpContract.h
index eeaf2d9bb..8ef1b5e0c 100644
--- a/compiler/luci/export/include/luci/CircleFileExpContract.h
+++ b/compiler/luci/export/include/luci/CircleFileExpContract.h
@@ -33,7 +33,7 @@ struct CircleFileExpContract : public luci::CircleExporter::Contract
{
public:
CircleFileExpContract(luci::Module *module, const std::string &filename)
- : _module(module), _filepath(filename)
+ : _module(module), _filepath(filename)
{
// NOTHING TO DO
}
diff --git a/compiler/luci/export/src/CircleExportMetadata.cpp b/compiler/luci/export/src/CircleExportMetadata.cpp
new file mode 100644
index 000000000..ef905a882
--- /dev/null
+++ b/compiler/luci/export/src/CircleExportMetadata.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleExportMetadata.h"
+
+#include <luci/UserSettings.h>
+
+namespace
+{
+
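+// Append 'value' to 'to' as four bytes in little-endian byte order.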
+void write_u32(std::vector<uint8_t> &to, uint32_t value)
+{
+ to.emplace_back(0xFF & (value >> 0 * 8));
+ to.emplace_back(0xFF & (value >> 1 * 8));
+ to.emplace_back(0xFF & (value >> 2 * 8));
+ to.emplace_back(0xFF & (value >> 3 * 8));
+}
+
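+// Registers 'data' as a new model buffer and returns a Metadata entry
+// that refers to it by buffer index.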
+flatbuffers::Offset<circle::Metadata> metadata_offset(flatbuffers::FlatBufferBuilder &builder,
+ luci::SerializedModelData &md,
+ const std::vector<uint8_t> &data,
+ const std::string &metadata_name)
+{
+ auto buffer_id = static_cast<uint32_t>(md._buffers.size());
+ md._buffers.push_back(circle::CreateBufferDirect(builder, &data));
+ return circle::CreateMetadataDirect(builder, metadata_name.c_str(), buffer_id);
+}
+
+} // namespace
+
+namespace luci
+{
+
+// 'source_table' is encoded to binary format.
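+// Layout (each integer is a little-endian uint32):
+//   [entry count] { [id] [name length + 1] [name bytes] [NUL] }*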
+const std::vector<uint8_t> CircleExportMetadata::encoded_source_table(void)
+{
+ std::vector<uint8_t> data;
+
+ write_u32(data, _source_table.size());
+
+ for (auto &kv : _source_table)
+ {
+ const auto id = kv.first;
+ write_u32(data, id);
+
+ const auto origin_name = kv.second;
+ const auto length = origin_name.length();
+    write_u32(data, length + 1); // name + '\0'
+
+ for (uint32_t i = 0; i < length; ++i)
+ {
+ data.emplace_back(origin_name.at(i));
+ }
+ data.emplace_back('\0');
+ }
+
+ return data;
+}
+
+// 'op_table' is encoded to binary format.
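+// Layout (each integer is a little-endian uint32):
+//   [entry count] { [id] [origin count] [origin id]* }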
+const std::vector<uint8_t> CircleExportMetadata::encoded_op_table(void)
+{
+ std::vector<uint8_t> data;
+
+ write_u32(data, _op_table.size());
+
+ for (auto &kv : _op_table)
+ {
+ const auto id = kv.first;
+ write_u32(data, id);
+
+ const auto origins = kv.second;
+ const auto node_num = origins.size();
+ write_u32(data, node_num);
+
+ for (auto origin : origins)
+ {
+ write_u32(data, origin);
+ }
+ }
+
+ return data;
+}
+
+} // namespace luci
+
+namespace luci
+{
+
+std::vector<flatbuffers::Offset<circle::Metadata>>
+createCircleMetadataVector(flatbuffers::FlatBufferBuilder &builder, luci::SerializedModelData &md)
+{
+ std::vector<flatbuffers::Offset<circle::Metadata>> metadata_vec;
+
+ auto settings = luci::UserSettings::settings();
+ if (settings->get(luci::UserSettings::Key::ProfilingDataGen))
+ {
+ metadata_vec.emplace_back(
+ metadata_offset(builder, md, md._metadata.encoded_source_table(), "ONE_source_table"));
+
+ metadata_vec.emplace_back(
+ metadata_offset(builder, md, md._metadata.encoded_op_table(), "ONE_op_table"));
+ }
+
+ return metadata_vec;
+}
+
+} // namespace luci
diff --git a/compiler/luci/export/src/CircleExportMetadata.h b/compiler/luci/export/src/CircleExportMetadata.h
new file mode 100644
index 000000000..10cda421e
--- /dev/null
+++ b/compiler/luci/export/src/CircleExportMetadata.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_CIRCLE_EXPORT_METADATA_H__
+#define __LUCI_CIRCLE_EXPORT_METADATA_H__
+
+#include "SerializedData.h"
+
+#include <flatbuffers/flatbuffers.h>
+#include <mio/circle/schema_generated.h>
+
+namespace luci
+{
+
+/**
+ * @brief Create Metadata corresponding to model metadata
+ */
+std::vector<flatbuffers::Offset<circle::Metadata>>
+createCircleMetadataVector(flatbuffers::FlatBufferBuilder &builder, SerializedModelData &md);
+
+} // namespace luci
+
+#endif // __LUCI_CIRCLE_EXPORT_METADATA_H__
diff --git a/compiler/luci/export/src/CircleExporterImpl.cpp b/compiler/luci/export/src/CircleExporterImpl.cpp
index df7542797..7e218191c 100644
--- a/compiler/luci/export/src/CircleExporterImpl.cpp
+++ b/compiler/luci/export/src/CircleExporterImpl.cpp
@@ -16,10 +16,13 @@
#include "CircleExporterImpl.h"
#include "Optimize.h"
+#include "CircleExportMetadata.h"
#include "CircleTensorExporter.h"
#include "CircleOperationExporter.h"
#include "CircleExporterUtils.h"
+#include <luci/IR/CircleNodes.h>
+
#include <oops/InternalExn.h>
#include <mio/circle/schema_generated.h>
#include <flatbuffers/flatbuffers.h>
@@ -27,46 +30,16 @@
#include <cassert>
#include <unordered_map>
#include <string>
-#include <stdexcept>
+#include <vector>
namespace
{
-luci::CircleInput *input_node(loco::Graph *g, const loco::GraphInputIndex &index)
-{
- for (uint32_t n = 0; n < g->nodes()->size(); ++n)
- {
- if (auto input = dynamic_cast<luci::CircleInput *>(g->nodes()->at(n)))
- {
- if (input->indexed() && input->index() == index)
- {
- return input;
- }
- }
- }
- return nullptr;
-}
-
-luci::CircleOutput *output_node(loco::Graph *g, const loco::GraphOutputIndex &index)
-{
- for (uint32_t n = 0; n < g->nodes()->size(); ++n)
- {
- if (auto output = dynamic_cast<luci::CircleOutput *>(g->nodes()->at(n)))
- {
- if (output->indexed() && output->index() == index)
- {
- return output;
- }
- }
- }
- return nullptr;
-}
-
void registerGraphInputTensors(loco::Graph *graph, luci::SubGraphContext &ctx)
{
for (uint32_t n = 0; n < graph->inputs()->size(); ++n)
{
- auto node = input_node(graph, n);
+ auto node = luci::input_node(graph, n);
assert(node != nullptr);
ctx._inputs.push_back(luci::get_tensor_index(node));
}
@@ -76,7 +49,7 @@ void registerGraphOutputTensors(loco::Graph *graph, luci::SubGraphContext &ctx)
{
for (uint32_t n = 0; n < graph->outputs()->size(); ++n)
{
- auto push = output_node(graph, n);
+ auto push = luci::output_node(graph, n);
assert(push != nullptr);
auto node = push->from();
assert(node != nullptr);
@@ -113,7 +86,7 @@ encodeOperatorCodes(FlatBufferBuilder &builder, std::unordered_map<luci::OpCode,
else
{
operator_codes_vec[idx] =
- CreateOperatorCode(builder, it.first.opcode, builder.CreateString(it.first.custom_code));
+ CreateOperatorCode(builder, it.first.opcode, builder.CreateString(it.first.custom_code));
}
}
@@ -186,16 +159,16 @@ void CircleExporterImpl::exportGraph(loco::Graph *graph)
std::string description_str = "nnpackage";
auto description = _builder.CreateString(description_str);
+ // Metadata
+ auto metadata_vec = createCircleMetadataVector(_builder, md);
+ auto metadata = _builder.CreateVector(std::vector<Offset<Metadata>>(metadata_vec));
+
// create array of buffers
auto buffers = _builder.CreateVector(md._buffers);
- // empty metadata
- std::vector<int> metadata_buffer_vec;
- auto metadata_buffer = _builder.CreateVector(metadata_buffer_vec);
-
// Model
auto model_offset = CreateModel(_builder, version, operator_codes, subgraphs, description,
- buffers, metadata_buffer);
+ buffers, 0 /* metadata_buffer */, metadata);
FinishModelBuffer(_builder, model_offset);
}
@@ -250,19 +223,19 @@ void CircleExporterImpl::exportModule(Module *module)
std::string description_str = "nnpackage";
auto description = _builder.CreateString(description_str);
+ // Metadata
+ auto metadata_vec = createCircleMetadataVector(_builder, md);
+ auto metadata = _builder.CreateVector(std::vector<Offset<Metadata>>(metadata_vec));
+
// create array of buffers
auto buffers = _builder.CreateVector(md._buffers);
- // empty metadata
- std::vector<int> metadata_buffer_vec;
- auto metadata_buffer = _builder.CreateVector(metadata_buffer_vec);
-
// This version is taken from comment in fbs
constexpr uint32_t version = 0;
// Model
auto model_offset = CreateModel(_builder, version, operator_codes, subgraphs, description,
- buffers, metadata_buffer);
+ buffers, 0 /* metadata_buffer */, metadata);
FinishModelBuffer(_builder, model_offset);
}
diff --git a/compiler/luci/export/src/CircleExporterImpl.h b/compiler/luci/export/src/CircleExporterImpl.h
index e5d5b5a00..069f62afd 100644
--- a/compiler/luci/export/src/CircleExporterImpl.h
+++ b/compiler/luci/export/src/CircleExporterImpl.h
@@ -22,8 +22,6 @@
#include "SerializedData.h"
-#include "SerializedData.h"
-
#include <mio/circle/schema_generated.h>
#include <loco.h>
diff --git a/compiler/luci/export/src/CircleExporterUtils.cpp b/compiler/luci/export/src/CircleExporterUtils.cpp
index 3715513e0..1b21fdd86 100644
--- a/compiler/luci/export/src/CircleExporterUtils.cpp
+++ b/compiler/luci/export/src/CircleExporterUtils.cpp
@@ -208,13 +208,13 @@ circle::Padding getOpPadding(const loco::Padding2D *pad, const loco::Stride<2> *
//
// NOTE input and output 'feature' map are shape of NHWC
bool same_padding_criterion_1 =
- (static_cast<uint32_t>(ofm._dims[1]) == (ifm._dims[1] - 1) / stride->vertical() + 1) &&
- (static_cast<uint32_t>(ofm._dims[2]) == (ifm._dims[2] - 1) / stride->horizontal() + 1);
+ (static_cast<uint32_t>(ofm._dims[1]) == (ifm._dims[1] - 1) / stride->vertical() + 1) &&
+ (static_cast<uint32_t>(ofm._dims[2]) == (ifm._dims[2] - 1) / stride->horizontal() + 1);
// For same padding, rear padding is same or bigger than front padding by at most 1
bool same_padding_criterion_2 =
- (pad->top() <= pad->bottom()) && (pad->bottom() <= pad->top() + 1) &&
- (pad->left() <= pad->right()) && (pad->right() <= pad->left() + 1);
+ (pad->top() <= pad->bottom()) && (pad->bottom() <= pad->top() + 1) &&
+ (pad->left() <= pad->right()) && (pad->right() <= pad->left() + 1);
if (same_padding_criterion_1 && same_padding_criterion_2)
return circle::Padding_SAME;
diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp
index 4343cf3c9..4bf674b9b 100644
--- a/compiler/luci/export/src/CircleOperationExporter.cpp
+++ b/compiler/luci/export/src/CircleOperationExporter.cpp
@@ -21,6 +21,7 @@
#include <luci/IR/CircleNode.h>
#include <luci/IR/CircleNodes.h>
#include <luci/IR/CircleNodeVisitor.h>
+#include <luci/Profile/CircleNodeOrigin.h>
#include <luci/UserSettings.h>
#include <luci/Log.h>
@@ -53,8 +54,8 @@ template <class CirclePool2D>
void export_pool_2d(ExportContext &ctx, CirclePool2D *node, circle::BuiltinOperator builtin_op)
{
LUCI_ASSERT(builtin_op == circle::BuiltinOperator_MAX_POOL_2D ||
- builtin_op == circle::BuiltinOperator_L2_POOL_2D ||
- builtin_op == circle::BuiltinOperator_AVERAGE_POOL_2D,
+ builtin_op == circle::BuiltinOperator_L2_POOL_2D ||
+ builtin_op == circle::BuiltinOperator_AVERAGE_POOL_2D,
"Should be L2Pool, MaxPool or AvgPool");
LUCI_ASSERT(node->padding() != luci::Padding::UNDEFINED, "Padding is not set");
@@ -81,7 +82,7 @@ void export_node(ExportContext &ctx, loco::Node *node, circle::BuiltinOperator b
circle::BuiltinOptions bot, flatbuffers::Offset<void> options_offset)
{
uint32_t op_idx =
- ctx.md.registerBuiltinOpcode(bop, loco::must_cast<luci::CircleNode *>(node)->op_version());
+ ctx.md.registerBuiltinOpcode(bop, loco::must_cast<luci::CircleNode *>(node)->op_version());
std::vector<int32_t> inputs_vec;
std::vector<int32_t> outputs_vec{get_tensor_index(node)};
for (uint32_t i = 0; i < node->arity(); ++i)
@@ -98,7 +99,7 @@ void export_node(ExportContext &ctx, loco::Node *node, circle::BuiltinOperator b
void export_node(ExportContext &ctx, loco::Node *node, circle::BuiltinOperator bop)
{
uint32_t op_idx =
- ctx.md.registerBuiltinOpcode(bop, loco::must_cast<luci::CircleNode *>(node)->op_version());
+ ctx.md.registerBuiltinOpcode(bop, loco::must_cast<luci::CircleNode *>(node)->op_version());
std::vector<int32_t> inputs_vec;
std::vector<int32_t> outputs_vec{get_tensor_index(static_cast<loco::Node *>(node))};
for (uint32_t i = 0; i < node->arity(); ++i)
@@ -152,7 +153,7 @@ void export_node(ExportContext &ctx, luci::CircleCast *node)
void export_node(ExportContext &ctx, luci::CircleConcatenation *node)
{
uint32_t op_idx =
- ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_CONCATENATION, node->op_version());
+ ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_CONCATENATION, node->op_version());
std::vector<int32_t> inputs_vec;
std::vector<int32_t> outputs_vec{get_tensor_index(static_cast<loco::Node *>(node))};
@@ -171,6 +172,7 @@ void export_node(ExportContext &ctx, luci::CircleConcatenation *node)
void export_node(ExportContext &ctx, luci::CircleCustom *node)
{
auto custom_outputs = loco::succs(node);
+ assert(custom_outputs.size() == node->numOutputs());
uint32_t op_idx = ctx.md.registerCustomOpcode(node->custom_code());
std::vector<int32_t> inputs_vec;
@@ -260,9 +262,9 @@ void export_node(ExportContext &ctx, luci::CircleNonMaxSuppressionV4 *node)
uint32_t op_idx = ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_NON_MAX_SUPPRESSION_V4,
node->op_version());
std::vector<int32_t> inputs_vec{
- get_tensor_index(node->boxes()), get_tensor_index(node->scores()),
- get_tensor_index(node->max_output_size()), get_tensor_index(node->iou_threshold()),
- get_tensor_index(node->score_threshold()),
+ get_tensor_index(node->boxes()), get_tensor_index(node->scores()),
+ get_tensor_index(node->max_output_size()), get_tensor_index(node->iou_threshold()),
+ get_tensor_index(node->score_threshold()),
};
std::vector<int32_t> outputs_vec;
@@ -290,8 +292,8 @@ void export_node(ExportContext &ctx, luci::CircleNonMaxSuppressionV4 *node)
auto outputs = ctx.builder.CreateVector(outputs_vec);
auto options = CreateNonMaxSuppressionV4Options(ctx.builder);
auto op_offset =
- CreateOperator(ctx.builder, op_idx, inputs, outputs,
- circle::BuiltinOptions_NonMaxSuppressionV4Options, options.Union());
+ CreateOperator(ctx.builder, op_idx, inputs, outputs,
+ circle::BuiltinOptions_NonMaxSuppressionV4Options, options.Union());
ctx.gd._operators.push_back(op_offset);
}
@@ -303,9 +305,9 @@ void export_node(ExportContext &ctx, luci::CircleNonMaxSuppressionV5 *node)
uint32_t op_idx = ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_NON_MAX_SUPPRESSION_V5,
node->op_version());
std::vector<int32_t> inputs_vec{
- get_tensor_index(node->boxes()), get_tensor_index(node->scores()),
- get_tensor_index(node->max_output_size()), get_tensor_index(node->iou_threshold()),
- get_tensor_index(node->score_threshold()), get_tensor_index(node->soft_nms_sigma()),
+ get_tensor_index(node->boxes()), get_tensor_index(node->scores()),
+ get_tensor_index(node->max_output_size()), get_tensor_index(node->iou_threshold()),
+ get_tensor_index(node->score_threshold()), get_tensor_index(node->soft_nms_sigma()),
};
std::vector<int32_t> outputs_vec;
@@ -333,15 +335,15 @@ void export_node(ExportContext &ctx, luci::CircleNonMaxSuppressionV5 *node)
auto outputs = ctx.builder.CreateVector(outputs_vec);
auto options = CreateNonMaxSuppressionV5Options(ctx.builder);
auto op_offset =
- CreateOperator(ctx.builder, op_idx, inputs, outputs,
- circle::BuiltinOptions_NonMaxSuppressionV5Options, options.Union());
+ CreateOperator(ctx.builder, op_idx, inputs, outputs,
+ circle::BuiltinOptions_NonMaxSuppressionV5Options, options.Union());
ctx.gd._operators.push_back(op_offset);
}
void export_node(ExportContext &ctx, luci::CircleReverseV2 *node)
{
uint32_t op_idx =
- ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_REVERSE_V2, node->op_version());
+ ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_REVERSE_V2, node->op_version());
std::vector<int32_t> inputs_vec{get_tensor_index(node->tensor()), get_tensor_index(node->axis())};
std::vector<int32_t> outputs_vec{get_tensor_index(static_cast<loco::Node *>(node))};
auto inputs = ctx.builder.CreateVector(inputs_vec);
@@ -397,7 +399,7 @@ void export_node(ExportContext &ctx, luci::CircleSplitV *node)
assert(int32_t(split_outs.size()) == node->num_split());
uint32_t op_idx =
- ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_SPLIT_V, node->op_version());
+ ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_SPLIT_V, node->op_version());
std::vector<int32_t> inputs_vec{get_tensor_index(node->input()),
get_tensor_index(node->size_splits()),
get_tensor_index(node->split_dim())};
@@ -438,7 +440,7 @@ void export_node(ExportContext &ctx, luci::CircleTopKV2 *node)
assert(outs_count == 2);
uint32_t op_idx =
- ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_TOPK_V2, node->op_version());
+ ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_TOPK_V2, node->op_version());
std::vector<int32_t> inputs_vec{get_tensor_index(node->input()), get_tensor_index(node->k())};
std::vector<int32_t> outputs_vec;
@@ -475,7 +477,7 @@ void export_node(ExportContext &ctx, luci::CircleUnique *node)
auto unique_outs = loco::succs(node);
assert(int32_t(unique_outs.size()) == 2);
uint32_t op_idx =
- ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_UNIQUE, node->op_version());
+ ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_UNIQUE, node->op_version());
std::vector<int32_t> inputs_vec{get_tensor_index(node->input())};
std::vector<int32_t> outputs_vec;
@@ -526,7 +528,7 @@ void export_node(ExportContext &ctx, luci::CircleUnpack *node)
}
uint32_t op_idx =
- ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_UNPACK, node->op_version());
+ ctx.md.registerBuiltinOpcode(circle::BuiltinOperator_UNPACK, node->op_version());
std::vector<int32_t> inputs_vec{get_tensor_index(node->value())};
std::vector<int32_t> outputs_vec;
@@ -622,6 +624,7 @@ public:
void visit(luci::CircleAveragePool2D *) final;
void visit(luci::CircleBatchMatMul *) final;
void visit(luci::CircleBatchToSpaceND *) final;
+ void visit(luci::CircleBidirectionalSequenceLSTM *) final;
void visit(luci::CircleCast *) final;
void visit(luci::CircleCeil *) final;
void visit(luci::CircleConcatenation *) final;
@@ -637,6 +640,7 @@ public:
void visit(luci::CircleEqual *) final;
void visit(luci::CircleExp *) final;
void visit(luci::CircleExpandDims *) final;
+ void visit(luci::CircleFakeQuant *) final;
void visit(luci::CircleFill *) final;
void visit(luci::CircleFloor *) final;
void visit(luci::CircleFloorDiv *) final;
@@ -734,6 +738,7 @@ public:
void visit(luci::CircleOutputDummy *) final {}
void visit(luci::CircleOutputExclude *) final {}
// Virtual for multiple-outputs
+ void visit(luci::CircleBidirectionalSequenceLSTMOut *) final {}
void visit(luci::CircleCustomOut *) final {}
void visit(luci::CircleIfOut *) final {}
void visit(luci::CircleNonMaxSuppressionV4Out *) final {}
@@ -782,8 +787,8 @@ void OperationExporter::visit(luci::CircleAbs *node)
void OperationExporter::visit(luci::CircleAdd *node)
{
export_simple(
- node, circle::BuiltinOperator_ADD, circle::BuiltinOptions_AddOptions,
- CreateAddOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
+ node, circle::BuiltinOperator_ADD, circle::BuiltinOptions_AddOptions,
+ CreateAddOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
}
void OperationExporter::visit(luci::CircleAddN *node) { export_node(_ctx, node); }
@@ -791,15 +796,15 @@ void OperationExporter::visit(luci::CircleAddN *node) { export_node(_ctx, node);
void OperationExporter::visit(luci::CircleArgMax *node)
{
export_simple(
- node, circle::BuiltinOperator_ARG_MAX, circle::BuiltinOptions_ArgMaxOptions,
- CreateArgMaxOptions(_ctx.builder, to_circle_tensortype(node->output_type())).Union());
+ node, circle::BuiltinOperator_ARG_MAX, circle::BuiltinOptions_ArgMaxOptions,
+ CreateArgMaxOptions(_ctx.builder, to_circle_tensortype(node->output_type())).Union());
}
void OperationExporter::visit(luci::CircleArgMin *node)
{
export_simple(
- node, circle::BuiltinOperator_ARG_MIN, circle::BuiltinOptions_ArgMinOptions,
- CreateArgMinOptions(_ctx.builder, to_circle_tensortype(node->output_type())).Union());
+ node, circle::BuiltinOperator_ARG_MIN, circle::BuiltinOptions_ArgMinOptions,
+ CreateArgMinOptions(_ctx.builder, to_circle_tensortype(node->output_type())).Union());
}
void OperationExporter::visit(luci::CircleAveragePool2D *node)
@@ -814,6 +819,48 @@ void OperationExporter::visit(luci::CircleBatchMatMul *node)
CreateBatchMatMulOptions(_ctx.builder, node->adj_x(), node->adj_y()).Union());
}
+void OperationExporter::visit(luci::CircleBidirectionalSequenceLSTM *node)
+{
+ auto bidi_lstm_outs = loco::succs(node);
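+  // merge_outputs == true exports a single (merged) output; otherwise the
+  // forward and backward outputs are exported as two separate tensors.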
+ assert((bidi_lstm_outs.size() == 1) || (bidi_lstm_outs.size() == 2));
+ uint32_t op_idx = _ctx.md.registerBuiltinOpcode(
+ circle::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM, node->op_version());
+
+ std::vector<int32_t> inputs_vec{get_tensor_index(node->input())};
+ std::vector<int32_t> outputs_vec;
+
+ for (int32_t index = 0; index < 2; index++)
+ {
+ // store in order of index
+ bool found = false;
+ for (auto out : bidi_lstm_outs)
+ {
+ auto bidi_lstm_out = loco::must_cast<luci::CircleBidirectionalSequenceLSTMOut *>(out);
+ if (bidi_lstm_out->index() == index)
+ {
+ outputs_vec.push_back(get_tensor_index(bidi_lstm_out));
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ {
+ INTERNAL_EXN("Invalid BidirectionalSequenceLSTM output");
+ }
+ }
+
+ auto inputs = _ctx.builder.CreateVector(inputs_vec);
+ auto outputs = _ctx.builder.CreateVector(outputs_vec);
+ auto options = CreateBidirectionalSequenceLSTMOptions(
+ _ctx.builder, to_circle_actfunc(node->fusedActivationFunction()), node->cell_clip(),
+ node->proj_clip(), node->merge_outputs(), node->time_major(),
+ node->asymmetric_quantize_inputs());
+ auto op_offset =
+ CreateOperator(_ctx.builder, op_idx, inputs, outputs,
+ circle::BuiltinOptions_BidirectionalSequenceLSTMOptions, options.Union());
+ _ctx.gd._operators.push_back(op_offset);
+}
+
void OperationExporter::visit(luci::CircleCast *node) { export_node(_ctx, node); }
void OperationExporter::visit(luci::CircleCeil *node)
@@ -837,7 +884,7 @@ void OperationExporter::visit(luci::CircleConv2D *node)
node->stride()->w(), node->stride()->h(),
to_circle_actfunc(node->fusedActivationFunction()),
node->dilation()->w(), node->dilation()->h())
- .Union());
+ .Union());
}
void OperationExporter::visit(luci::CircleCos *node)
@@ -857,14 +904,13 @@ void OperationExporter::visit(luci::CircleDepthToSpace *node)
void OperationExporter::visit(luci::CircleDepthwiseConv2D *node)
{
- export_simple(node, circle::BuiltinOperator_DEPTHWISE_CONV_2D,
- circle::BuiltinOptions_DepthwiseConv2DOptions,
- CreateDepthwiseConv2DOptions(_ctx.builder, getOpPadding(node->padding()),
- node->stride()->w(), node->stride()->h(),
- node->depthMultiplier(),
- to_circle_actfunc(node->fusedActivationFunction()),
- node->dilation()->w(), node->dilation()->h())
- .Union());
+ export_simple(
+ node, circle::BuiltinOperator_DEPTHWISE_CONV_2D, circle::BuiltinOptions_DepthwiseConv2DOptions,
+ CreateDepthwiseConv2DOptions(_ctx.builder, getOpPadding(node->padding()), node->stride()->w(),
+ node->stride()->h(), node->depthMultiplier(),
+ to_circle_actfunc(node->fusedActivationFunction()),
+ node->dilation()->w(), node->dilation()->h())
+ .Union());
}
void OperationExporter::visit(luci::CircleDequantize *node)
@@ -875,8 +921,8 @@ void OperationExporter::visit(luci::CircleDequantize *node)
void OperationExporter::visit(luci::CircleDiv *node)
{
export_simple(
- node, circle::BuiltinOperator_DIV, circle::BuiltinOptions_DivOptions,
- CreateDivOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
+ node, circle::BuiltinOperator_DIV, circle::BuiltinOptions_DivOptions,
+ CreateDivOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
}
void OperationExporter::visit(luci::CircleElu *node)
@@ -902,6 +948,14 @@ void OperationExporter::visit(luci::CircleExpandDims *node)
CreateExpandDimsOptions(_ctx.builder).Union());
}
+void OperationExporter::visit(luci::CircleFakeQuant *node)
+{
+ export_simple(node, circle::BuiltinOperator_FAKE_QUANT, circle::BuiltinOptions_FakeQuantOptions,
+ CreateFakeQuantOptions(_ctx.builder, node->min(), node->max(), node->num_bits(),
+ node->narrow_range())
+ .Union());
+}
+
void OperationExporter::visit(luci::CircleFill *node)
{
export_simple(node, circle::BuiltinOperator_FILL, circle::BuiltinOptions_FillOptions,
@@ -928,10 +982,10 @@ void OperationExporter::visit(luci::CircleFloorMod *node)
void OperationExporter::visit(luci::CircleFullyConnected *node)
{
export_simple(
- node, circle::BuiltinOperator_FULLY_CONNECTED, circle::BuiltinOptions_FullyConnectedOptions,
- CreateFullyConnectedOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction()),
- to_circle_weightsformat(node->weights_format()))
- .Union());
+ node, circle::BuiltinOperator_FULLY_CONNECTED, circle::BuiltinOptions_FullyConnectedOptions,
+ CreateFullyConnectedOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction()),
+ to_circle_weightsformat(node->weights_format()))
+ .Union());
}
void OperationExporter::visit(luci::CircleGather *node)
@@ -964,9 +1018,8 @@ void OperationExporter::visit(luci::CircleIf *node) { export_node(_ctx, node); }
void OperationExporter::visit(luci::CircleL2Normalize *node)
{
export_simple(
- node, circle::BuiltinOperator_L2_NORMALIZATION, circle::BuiltinOptions_L2NormOptions,
- CreateL2NormOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction()))
- .Union());
+ node, circle::BuiltinOperator_L2_NORMALIZATION, circle::BuiltinOptions_L2NormOptions,
+ CreateL2NormOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
}
void OperationExporter::visit(luci::CircleL2Pool2D *node)
@@ -998,7 +1051,7 @@ void OperationExporter::visit(luci::CircleLocalResponseNormalization *node)
circle::BuiltinOptions_LocalResponseNormalizationOptions,
CreateLocalResponseNormalizationOptions(_ctx.builder, node->radius(), node->bias(),
node->alpha(), node->beta())
- .Union());
+ .Union());
}
void OperationExporter::visit(luci::CircleLog *node)
@@ -1074,15 +1127,15 @@ void OperationExporter::visit(luci::CircleMinimum *node)
void OperationExporter::visit(luci::CircleMirrorPad *node)
{
export_simple(
- node, circle::BuiltinOperator_MIRROR_PAD, circle::BuiltinOptions_MirrorPadOptions,
- CreateMirrorPadOptions(_ctx.builder, to_circle_mirrorpadmode(node->mode())).Union());
+ node, circle::BuiltinOperator_MIRROR_PAD, circle::BuiltinOptions_MirrorPadOptions,
+ CreateMirrorPadOptions(_ctx.builder, to_circle_mirrorpadmode(node->mode())).Union());
}
void OperationExporter::visit(luci::CircleMul *node)
{
export_simple(
- node, circle::BuiltinOperator_MUL, circle::BuiltinOptions_MulOptions,
- CreateMulOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
+ node, circle::BuiltinOperator_MUL, circle::BuiltinOptions_MulOptions,
+ CreateMulOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
}
void OperationExporter::visit(luci::CircleNeg *node)
@@ -1190,7 +1243,7 @@ void OperationExporter::visit(luci::CircleReluN1To1 *node)
void OperationExporter::visit(luci::CircleReshape *node)
{
auto new_shape = _ctx.builder.CreateVector<int32_t>(
- node->newShape()->rank(), [node](size_t i) { return node->newShape()->dim(i); });
+ node->newShape()->rank(), [node](size_t i) { return node->newShape()->dim(i); });
export_simple(node, circle::BuiltinOperator_RESHAPE, circle::BuiltinOptions_ReshapeOptions,
CreateReshapeOptions(_ctx.builder, new_shape).Union());
@@ -1199,9 +1252,9 @@ void OperationExporter::visit(luci::CircleReshape *node)
void OperationExporter::visit(luci::CircleResizeBilinear *node)
{
export_simple(
- node, circle::BuiltinOperator_RESIZE_BILINEAR, circle::BuiltinOptions_ResizeBilinearOptions,
- CreateResizeBilinearOptions(_ctx.builder, node->align_corners(), node->half_pixel_centers())
- .Union());
+ node, circle::BuiltinOperator_RESIZE_BILINEAR, circle::BuiltinOptions_ResizeBilinearOptions,
+ CreateResizeBilinearOptions(_ctx.builder, node->align_corners(), node->half_pixel_centers())
+ .Union());
}
void OperationExporter::visit(luci::CircleResizeNearestNeighbor *node)
@@ -1214,8 +1267,8 @@ void OperationExporter::visit(luci::CircleResizeNearestNeighbor *node)
void OperationExporter::visit(luci::CircleReverseSequence *node)
{
export_simple(
- node, circle::BuiltinOperator_REVERSE_SEQUENCE, circle::BuiltinOptions_ReverseSequenceOptions,
- CreateReverseSequenceOptions(_ctx.builder, node->seq_axis(), node->batch_axis()).Union());
+ node, circle::BuiltinOperator_REVERSE_SEQUENCE, circle::BuiltinOptions_ReverseSequenceOptions,
+ CreateReverseSequenceOptions(_ctx.builder, node->seq_axis(), node->batch_axis()).Union());
}
void OperationExporter::visit(luci::CircleReverseV2 *node) { export_node(_ctx, node); }
@@ -1334,14 +1387,14 @@ void OperationExporter::visit(luci::CircleStridedSlice *node)
CreateStridedSliceOptions(_ctx.builder, node->begin_mask(), node->end_mask(),
node->ellipsis_mask(), node->new_axis_mask(),
node->shrink_axis_mask())
- .Union());
+ .Union());
}
void OperationExporter::visit(luci::CircleSub *node)
{
export_simple(
- node, circle::BuiltinOperator_SUB, circle::BuiltinOptions_SubOptions,
- CreateSubOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
+ node, circle::BuiltinOperator_SUB, circle::BuiltinOptions_SubOptions,
+ CreateSubOptions(_ctx.builder, to_circle_actfunc(node->fusedActivationFunction())).Union());
}
void OperationExporter::visit(luci::CircleSum *node)
@@ -1375,7 +1428,7 @@ void OperationExporter::visit(luci::CircleTransposeConv *node)
circle::BuiltinOptions_TransposeConvOptions,
CreateTransposeConvOptions(_ctx.builder, getOpPadding(node->padding()),
node->stride()->w(), node->stride()->h())
- .Union());
+ .Union());
}
void OperationExporter::visit(luci::CircleUnidirectionalSequenceLSTM *node)
@@ -1383,10 +1436,10 @@ void OperationExporter::visit(luci::CircleUnidirectionalSequenceLSTM *node)
export_simple(node, circle::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM,
circle::BuiltinOptions_UnidirectionalSequenceLSTMOptions,
CreateUnidirectionalSequenceLSTMOptions(
- _ctx.builder, to_circle_actfunc(node->fusedActivationFunction()),
- node->cell_clip(), node->proj_clip(), node->time_major(),
- node->asymmetric_quantize_inputs())
- .Union());
+ _ctx.builder, to_circle_actfunc(node->fusedActivationFunction()),
+ node->cell_clip(), node->proj_clip(), node->time_major(),
+ node->asymmetric_quantize_inputs())
+ .Union());
}
void OperationExporter::visit(luci::CircleUnique *node) { export_node(_ctx, node); }
@@ -1413,14 +1466,14 @@ void OperationExporter::visit(luci::CircleBCQFullyConnected *node)
circle::BuiltinOptions_BCQFullyConnectedOptions,
CreateBCQFullyConnectedOptions(_ctx.builder, node->weights_hidden_size(),
to_circle_actfunc(node->fusedActivationFunction()))
- .Union());
+ .Union());
}
void OperationExporter::visit(luci::CircleBCQGather *node)
{
export_simple(
- node, circle::BuiltinOperator_BCQ_GATHER, circle::BuiltinOptions_BCQGatherOptions,
- CreateBCQGatherOptions(_ctx.builder, node->input_hidden_size(), node->axis()).Union());
+ node, circle::BuiltinOperator_BCQ_GATHER, circle::BuiltinOptions_BCQGatherOptions,
+ CreateBCQGatherOptions(_ctx.builder, node->input_hidden_size(), node->axis()).Union());
}
void OperationExporter::visit(luci::CircleInstanceNorm *node)
@@ -1429,7 +1482,7 @@ void OperationExporter::visit(luci::CircleInstanceNorm *node)
circle::BuiltinOptions_InstanceNormOptions,
CreateInstanceNormOptions(_ctx.builder, node->epsilon(),
to_circle_actfunc(node->fusedActivationFunction()))
- .Union());
+ .Union());
}
void exportNode(loco::Node *node, flatbuffers::FlatBufferBuilder &builder, SerializedModelData &md,
@@ -1439,7 +1492,19 @@ void exportNode(loco::Node *node, flatbuffers::FlatBufferBuilder &builder, Seria
{
ExportContext ctx{builder, md, gd};
OperationExporter exporter{ctx};
+
+ const auto ops_size = gd._operators.size();
+
circle_node->accept(&exporter);
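+ // When the visit actually emitted an operator (virtual nodes emit none) and
+ // the node carries origin info, record profiling metadata mapping the last
+ // operator index to its source origins.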
+ if (has_origin(circle_node) && ops_size != gd._operators.size())
+ {
+ const auto node_id = gd._operators.size() - 1;
+ for (auto source : get_origin(circle_node)->sources())
+ {
+ md._metadata.add_source_table(source->id(), source->name());
+ md._metadata.add_op_table(node_id, source->id());
+ }
+ }
}
else
{
diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
index 9bdfa0079..fefdf4e73 100644
--- a/compiler/luci/export/src/CircleTensorExporter.cpp
+++ b/compiler/luci/export/src/CircleTensorExporter.cpp
@@ -15,11 +15,9 @@
*/
#include "CircleTensorExporter.h"
-#include "TypeBridge.h"
#include <luci/IR/CircleNodes.h>
#include <luci/IR/CircleNodeVisitor.h>
-#include <luci/IR/CircleShapeSignature.h>
#include <luci/Service/CircleTypeInference.h>
#include <luci/Service/CircleShapeInference.h>
#include <luci/Log.h>
@@ -38,10 +36,10 @@ namespace
using namespace luci;
-class CircleTensoInfo
+class CircleTensorInfo
{
public:
- CircleTensoInfo() = default;
+ CircleTensorInfo() = default;
public:
void name(const std::string &name) { _name = name; }
@@ -54,9 +52,6 @@ public:
const ShapeDescription &shape(void) const { return _shape; }
void shape(const ShapeDescription &shape) { _shape = shape; }
- const ShapeSignature &shape_signature(void) const { return _shape_signature; }
- void shape_signature(const ShapeSignature &ss) { _shape_signature = ss; }
-
luci::ShapeStatus shape_status(void) const { return _shape_status; }
void shape_status(luci::ShapeStatus ss) { _shape_status = ss; }
@@ -75,7 +70,6 @@ private:
circle::TensorType _dtype{circle::TensorType_FLOAT32};
ShapeDescription _shape{};
- ShapeSignature _shape_signature;
luci::ShapeStatus _shape_status{luci::ShapeStatus::UNDEFINED};
luci::CircleConst *_content = nullptr;
@@ -83,7 +77,29 @@ private:
luci::SparsityParam *_sparsityparam = nullptr;
};
-using CircleTensorContext = std::vector<CircleTensoInfo>;
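+// Keeps CircleTensorInfo entries in insertion order while asserting that
+// tensor names stay unique within the graph.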
+class CircleTensorContext
+{
+public:
+ CircleTensorContext() = default;
+
+public:
+ void emplace_back(CircleTensorInfo &ti)
+ {
+ assert(_names.find(ti.name()) == _names.end());
+ _tis.emplace_back(ti);
+ _names.insert(ti.name());
+ }
+ size_t size(void) const { return _tis.size(); }
+ std::vector<CircleTensorInfo>::iterator begin(void) { return _tis.begin(); }
+ std::vector<CircleTensorInfo>::iterator end(void) { return _tis.end(); }
+
+public:
+ bool exist(const std::string &name) const { return _names.find(name) != _names.end(); }
+
+private:
+ std::vector<CircleTensorInfo> _tis;
+ std::set<std::string> _names;
+};
struct NoOpDetector final : public luci::CircleNodeMutableVisitor<bool>
{
@@ -102,17 +118,23 @@ void allocateCircleTensorInfo(CircleNode *node, CircleTensorContext &ctx)
auto tensor_index = static_cast<CircleTensorIndex>(ctx.size());
// TODO Use Graph-level metadata for Input & Output
- // auto tensor_name = "t_" + std::to_string(tensor_index);
std::string tensor_name = node->name();
- if (tensor_name.empty())
- tensor_name = "t_" + std::to_string(tensor_index);
+ // NOTE tensor_name may be empty; this assertion will fire when that happens.
+ // Currently we require every tensor to have a name.
+ // TODO if this breaks, fix the cause or permit an empty tensor_name.
+ assert(!tensor_name.empty());
+ if (ctx.exist(tensor_name))
+ {
+ // NOTE appending the tensor index should give the tensor a unique name.
+ tensor_name = tensor_name + "_" + std::to_string(tensor_index);
+ assert(!ctx.exist(tensor_name));
+ }
INFO(l) << "[luci] Tensor for " << tensor_name << ": " << tensor_index << std::endl;
- CircleTensoInfo tensor_info;
+ CircleTensorInfo tensor_info;
tensor_info.name(tensor_name);
tensor_info.dtype(to_circle_tensortype(node->dtype()));
- tensor_info.shape_signature(node->shape_signature());
if (node->shape_status() == ShapeStatus::VALID)
tensor_info.shape(to_shape_description(node));
tensor_info.shape_status(node->shape_status());
@@ -146,19 +168,55 @@ private:
}
public:
+ bool visit(luci::CircleBidirectionalSequenceLSTMOut *) final { return true; }
+ bool visit(luci::CircleCustomOut *) final { return true; }
bool visit(luci::CircleIfOut *) final { return true; }
+ bool visit(luci::CircleNonMaxSuppressionV4Out *) final { return true; }
+ bool visit(luci::CircleNonMaxSuppressionV5Out *) final { return true; }
bool visit(luci::CircleSplitOut *) final { return true; }
bool visit(luci::CircleSplitVOut *) final { return true; }
bool visit(luci::CircleTopKV2Out *) final { return true; }
bool visit(luci::CircleUnpackOut *) final { return true; }
+ bool visit(luci::CircleUniqueOut *) final { return true; }
bool visit(luci::CircleWhileOut *) final { return true; }
+ bool visit(luci::CircleBidirectionalSequenceLSTM *node) final
+ {
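+ // One merged output tensor when merge_outputs() is set; otherwise the
+ // forward and backward sequences are stored as two separate outputs.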
+ if (node->merge_outputs())
+ {
+ store_outputs(node, 1);
+ }
+ else
+ {
+ store_outputs(node, 2);
+ }
+ return true;
+ }
+
+ bool visit(luci::CircleCustom *node) final
+ {
+ store_outputs(node, node->numOutputs());
+ return true;
+ }
+
bool visit(luci::CircleIf *node) final
{
store_outputs(node, node->output_count());
return true;
}
+ bool visit(luci::CircleNonMaxSuppressionV4 *node) final
+ {
+ store_outputs(node, 2);
+ return true;
+ }
+
+ bool visit(luci::CircleNonMaxSuppressionV5 *node) final
+ {
+ store_outputs(node, 3);
+ return true;
+ }
+
bool visit(luci::CircleSplit *node) final
{
store_outputs(node, uint32_t(node->num_split()));
@@ -183,6 +241,12 @@ public:
return true;
}
+ bool visit(luci::CircleUnique *node) final
+ {
+ store_outputs(node, 2);
+ return true;
+ }
+
bool visit(luci::CircleWhile *node) final
{
store_outputs(node, node->output_count());
@@ -237,16 +301,26 @@ flatbuffers::Offset<Vector<int32_t>> encodeShape(FlatBufferBuilder &builder,
const ShapeDescription &shape)
{
assert(shape._rank_known && "unknown number of dimensions is not supported");
- return builder.CreateVector(shape._dims);
+
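+ // Unknown dimensions are stored as -1 in ShapeDescription; encode them as 1
+ // here and keep the -1 markers for shape_signature (see encodeShapeSignature).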
+ std::vector<int32_t> encoded_shape;
+ encoded_shape.resize(shape._dims.size());
+ for (uint32_t i = 0; i < shape._dims.size(); ++i)
+ encoded_shape.at(i) = shape._dims.at(i) == -1 ? 1 : shape._dims.at(i);
+
+ return builder.CreateVector(encoded_shape);
}
flatbuffers::Offset<Vector<int32_t>> encodeShapeSignature(FlatBufferBuilder &builder,
- const ShapeSignature &shape_signature)
+ const ShapeDescription &shape)
{
- if (shape_signature.rank() == 0)
- return 0;
+ assert(shape._rank_known && "unknown number of dimensions is not supported");
+
+ // shape_signature is set if and only if at least one of the dimensions is unknown.
+ for (uint32_t i = 0; i < shape._dims.size(); ++i)
+ if (shape._dims.at(i) == -1)
+ return builder.CreateVector(shape._dims);
- return builder.CreateVector(shape_signature.as_vector());
+ return flatbuffers::Offset<Vector<int32_t>>();
}
flatbuffers::Offset<circle::Buffer> encodeOpBuffer(FlatBufferBuilder &builder)
@@ -343,14 +417,14 @@ encodeSparsityParameters(FlatBufferBuilder &builder, luci::SparsityParam *sparsi
// array_segments
auto circle_array_segments = to_circle_sparse_index_vector(builder, it.array_segments());
auto circle_array_segments_type =
- to_circle_sparse_index_vector_type(it.array_segments().type());
+ to_circle_sparse_index_vector_type(it.array_segments().type());
// array_indices
auto circle_array_indices = to_circle_sparse_index_vector(builder, it.array_indices());
auto circle_array_indices_type = to_circle_sparse_index_vector_type(it.array_indices().type());
auto dim_metadata = circle::CreateDimensionMetadata(
- builder, to_circle_dimensiontype(it.format()), it.dense_size(), circle_array_segments_type,
- circle_array_segments, circle_array_indices_type, circle_array_indices);
+ builder, to_circle_dimensiontype(it.format()), it.dense_size(), circle_array_segments_type,
+ circle_array_segments, circle_array_indices_type, circle_array_indices);
dim_metadata_vec.emplace_back(dim_metadata);
}
@@ -358,6 +432,18 @@ encodeSparsityParameters(FlatBufferBuilder &builder, luci::SparsityParam *sparsi
&sparsityparam->block_map, &dim_metadata_vec);
}
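+// Element-wise equality for two CircleConst nodes; callers must guarantee
+// matching dtype and size (asserted below).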
+template <loco::DataType DT> bool has_same_elements(luci::CircleConst *lhs, luci::CircleConst *rhs)
+{
+ assert(lhs->dtype() == DT);
+ assert(rhs->dtype() == DT);
+ assert(lhs->size<DT>() == rhs->size<DT>());
+
+ for (uint32_t i = 0; i < lhs->size<DT>(); ++i)
+ if (lhs->at<DT>(i) != rhs->at<DT>(i))
+ return false;
+ return true;
+}
+
bool has_same_values(luci::CircleConst *lhs, luci::CircleConst *rhs)
{
if (lhs->dtype() != rhs->dtype())
@@ -373,34 +459,31 @@ bool has_same_values(luci::CircleConst *lhs, luci::CircleConst *rhs)
switch (lhs->dtype())
{
case loco::DataType::FLOAT32:
- for (uint32_t i = 0; i < lhs->size<loco::DataType::FLOAT32>(); ++i)
- if (lhs->at<loco::DataType::FLOAT32>(i) != rhs->at<loco::DataType::FLOAT32>(i))
- return false;
- break;
+ return has_same_elements<loco::DataType::FLOAT32>(lhs, rhs);
+
+ case loco::DataType::S8:
+ return has_same_elements<loco::DataType::S8>(lhs, rhs);
+
+ case loco::DataType::S16:
+ return has_same_elements<loco::DataType::S16>(lhs, rhs);
case loco::DataType::S32:
- for (uint32_t i = 0; i < lhs->size<loco::DataType::S32>(); ++i)
- if (lhs->at<loco::DataType::S32>(i) != rhs->at<loco::DataType::S32>(i))
- return false;
- break;
+ return has_same_elements<loco::DataType::S32>(lhs, rhs);
case loco::DataType::S64:
- for (uint32_t i = 0; i < lhs->size<loco::DataType::S64>(); ++i)
- if (lhs->at<loco::DataType::S64>(i) != rhs->at<loco::DataType::S64>(i))
- return false;
- break;
+ return has_same_elements<loco::DataType::S64>(lhs, rhs);
+
+ case loco::DataType::U8:
+ return has_same_elements<loco::DataType::U8>(lhs, rhs);
case loco::DataType::BOOL:
- for (uint32_t i = 0; i < lhs->size<loco::DataType::BOOL>(); ++i)
- if (lhs->at<loco::DataType::BOOL>(i) != rhs->at<loco::DataType::BOOL>(i))
- return false;
- break;
+ return has_same_elements<loco::DataType::BOOL>(lhs, rhs);
default:
- return false;
+ break;
}
- return true;
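+ // Unhandled dtypes fall through to here and are conservatively reported as
+ // different, so their buffers are never deduplicated.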
+ return false;
}
uint32_t get_buffer_id(FlatBufferBuilder &builder, SerializedModelData &md, luci::CircleConst *node)
@@ -433,26 +516,28 @@ uint32_t get_buffer_id(FlatBufferBuilder &builder, SerializedModelData &md, luci
}
}
-void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder,
+void exportOpDefinedTensor(const CircleTensorInfo &info, FlatBufferBuilder &builder,
SerializedModelData &md, SerializedGraphData &gd)
{
// Create and register output tensor shape
flatbuffers::Offset<Vector<int32_t>> shape_offset;
+ flatbuffers::Offset<Vector<int32_t>> shape_signature_offset;
if (info.shape_status() == ShapeStatus::VALID)
+ {
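+ // encodeShapeSignature reuses the same ShapeDescription and returns a
+ // null offset when every dimension is static.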
shape_offset = encodeShape(builder, info.shape());
+ shape_signature_offset = encodeShapeSignature(builder, info.shape());
+ }
auto quantparam = encodeQuantizationParameters(builder, info.quantparam());
auto sparsityparam = encodeSparsityParameters(builder, info.sparsityparam());
- auto shape_signature_offset = encodeShapeSignature(builder, info.shape_signature());
-
auto buffer_id = get_buffer_id(builder, md, info.content());
auto name_offset = builder.CreateString(info.name());
auto tensor_offset =
- CreateTensor(builder, shape_offset, info.dtype(), buffer_id, name_offset, quantparam,
- /*is_variable*/ false, sparsityparam, shape_signature_offset);
+ CreateTensor(builder, shape_offset, info.dtype(), buffer_id, name_offset, quantparam,
+ /*is_variable*/ false, sparsityparam, shape_signature_offset);
gd._tensors.push_back(tensor_offset);
}
diff --git a/compiler/luci/export/src/Optimize.cpp b/compiler/luci/export/src/Optimize.cpp
index 036a4a2f9..e59f15204 100644
--- a/compiler/luci/export/src/Optimize.cpp
+++ b/compiler/luci/export/src/Optimize.cpp
@@ -17,9 +17,8 @@
#include "Optimize.h"
#include "ProgressReporter.h"
-#include <luci/Pass/ShapeInferencePass.h>
-#include <luci/Pass/ShapeSignatureInferencePass.h>
-#include <luci/Pass/TypeInferencePass.h>
+#include <luci/Pass/CircleShapeInferencePass.h>
+#include <luci/Pass/CircleTypeInferencePass.h>
#include <logo/Phase.h>
@@ -33,9 +32,8 @@ void optimize(loco::Graph *g)
logo::Phase phase;
{
// prepare type and shape before optimization
- phase.emplace_back(std::make_unique<TypeInferencePass>());
- phase.emplace_back(std::make_unique<ShapeInferencePass>());
- phase.emplace_back(std::make_unique<ShapeSignatureInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
// TODO add more optimization passes (with a knob)
}
diff --git a/compiler/luci/export/src/ProgressReporter.h b/compiler/luci/export/src/ProgressReporter.h
index e91f42592..5d55bcd07 100644
--- a/compiler/luci/export/src/ProgressReporter.h
+++ b/compiler/luci/export/src/ProgressReporter.h
@@ -28,7 +28,7 @@ class ProgressReporter : public logo::PhaseEventListener
{
public:
ProgressReporter(loco::Graph *graph, logo::PhaseStrategy strategy)
- : _graph{graph}, _strategy{strategy}
+ : _graph{graph}, _strategy{strategy}
{
// DO NOTHING
}
diff --git a/compiler/luci/export/src/SerializedData.h b/compiler/luci/export/src/SerializedData.h
index c41f50edd..df71e5c21 100644
--- a/compiler/luci/export/src/SerializedData.h
+++ b/compiler/luci/export/src/SerializedData.h
@@ -48,6 +48,37 @@ struct OpCode
}
};
+class CircleExportMetadata
+{
+public:
+ void add_source_table(uint32_t source_id, std::string origin_name)
+ {
+ // A model with multiple subgraphs may have different origin_names for the
+ // same source_id. However, as profiling does not yet consider multiple
+ // subgraphs, such cases are ignored for now and will be supported
+ // correctly in the future.
+ _source_table.emplace(source_id, origin_name);
+ }
+
+ void add_op_table(uint32_t node_id, uint32_t source_id)
+ {
+ // A model with multiple subgraphs may have duplicated node ids. For now,
+ // as profiling does not yet consider multiple subgraphs, such cases are
+ // ignored and will be supported in the future.
+ if (_op_table.find(node_id) == _op_table.end())
+ _op_table.emplace(node_id, std::set<uint32_t>());
+ _op_table.at(node_id).emplace(source_id);
+ }
+
+public:
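+ // Serialize the tables into the byte buffers stored as circle Metadata;
+ // the import-side decoders (CircleImportMetadata) read them back.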
+ const std::vector<uint8_t> encoded_source_table(void);
+ const std::vector<uint8_t> encoded_op_table(void);
+
+private:
+ std::map<uint32_t, std::string> _source_table;
+ std::map<uint32_t, std::set<uint32_t>> _op_table;
+};
+
} // namespace luci
namespace std
@@ -86,6 +117,7 @@ struct SerializedModelData final
std::unordered_map<OpCode, uint32_t> _operator_codes;
std::vector<flatbuffers::Offset<circle::Buffer>> _buffers;
+ CircleExportMetadata _metadata;
// This is used for removing buffers with same values
std::map<luci::CircleConst *, uint32_t> _cached_buffer_id;
diff --git a/compiler/luci/export/src/TypeBridge.cpp b/compiler/luci/export/src/TypeBridge.cpp
deleted file mode 100644
index 9ccd52376..000000000
--- a/compiler/luci/export/src/TypeBridge.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "TypeBridge.h"
-
-#include "CircleExporterUtils.h"
-
-#include <luci/IR/CircleNodes.h>
-#include <luci/IR/CircleNodeVisitor.h>
-#include <luci/Service/CircleTypeInference.h>
-#include <luci/Service/CircleShapeInference.h>
-
-#include <loco/Service/TypeInference.h>
-#include <loco/Service/ShapeInference.h>
-
-namespace
-{
-
-/**
- * @brief CopySelector will return condition of copy shape/type inference to node
- */
-struct CopySelector final : public luci::CircleNodeVisitor<bool>
-{
- // return false(don't copy) for nodes that provides shape/type from nature
- bool visit(const luci::CircleInput *) final { return false; }
- bool visit(const luci::CircleConst *) final { return false; }
-
- // default is copy attributes
- bool visit(const luci::CircleNode *) { return true; }
-};
-
-} // namespace
-
-namespace luci
-{
-
-loco::TensorShape node_shape(CircleNode *node)
-{
- loco::TensorShape shape;
-
- shape.rank(node->rank());
- for (uint32_t r = 0; r < node->rank(); ++r)
- {
- shape.dim(r) = loco::Dimension(node->dim(r).value());
- }
- return shape;
-}
-
-loco::DataType node_dtype(CircleNode *node) { return node->dtype(); }
-
-void copy_shape_dtype(loco::Graph *graph)
-{
- /**
- * @note We will iterate all the nodes in the graph to include dangle nodes
- */
- auto nodes = graph->nodes();
- for (uint32_t n = 0; n < nodes->size(); ++n)
- {
- auto node = loco::must_cast<luci::CircleNode *>(nodes->at(n));
-
- CopySelector cs;
- if (node->accept(&cs))
- {
- // NOTE not all nodes have infered shape/dtype: multiple outs may not be
- // visited when outputs are not used
- // TODO fix shape inference traversal
- // NOTE when loco supports multiple outputs in nature this issue should be
- // resolved also
-
- if (loco::dtype_known(node))
- {
- node->dtype(loco::dtype_get(node));
- }
-
- if (loco::shape_known(node))
- {
- auto shape = loco::shape_get(node).as<loco::TensorShape>();
- node->rank(shape.rank());
- for (uint32_t r = 0; r < shape.rank(); ++r)
- {
- node->dim(r) = loco::Dimension(shape.dim(r).value());
- }
-
- // ShapeStatus should be update only when the status was UNDEFINED
- if (node->shape_status() == ShapeStatus::UNDEFINED)
- node->shape_status(ShapeStatus::VALID);
- }
- }
- }
-}
-
-} // namespace luci
diff --git a/compiler/luci/import/CMakeLists.txt b/compiler/luci/import/CMakeLists.txt
index 2ae00b837..642751ca6 100644
--- a/compiler/luci/import/CMakeLists.txt
+++ b/compiler/luci/import/CMakeLists.txt
@@ -6,6 +6,7 @@ add_library(luci_import SHARED ${SOURCES})
target_include_directories(luci_import PRIVATE src)
target_include_directories(luci_import PUBLIC include)
target_link_libraries(luci_import PUBLIC luci_lang)
+target_link_libraries(luci_import PUBLIC luci_profile)
target_link_libraries(luci_import PUBLIC mio_circle)
target_link_libraries(luci_import PRIVATE luci_env)
target_link_libraries(luci_import PRIVATE luci_log)
diff --git a/compiler/luci/import/include/luci/Import/CircleReader.h b/compiler/luci/import/include/luci/Import/CircleReader.h
index 8e210dd77..b9697fb86 100644
--- a/compiler/luci/import/include/luci/Import/CircleReader.h
+++ b/compiler/luci/import/include/luci/Import/CircleReader.h
@@ -23,7 +23,6 @@
#include <luci/IR/AttrPadding.h>
#include <luci/IR/CircleNode.h>
#include <luci/IR/CircleQuantParam.h>
-#include <luci/IR/CircleShapeSignature.h>
#include <luci/IR/SparsityParam.h>
#include <loco.h>
@@ -64,6 +63,7 @@ private:
using CircleTensors_t = std::vector<std::unique_ptr<circle::TensorT>>;
using CircleOperators_t = std::vector<std::unique_ptr<circle::OperatorT>>;
using CircleOperatorCodes_t = std::vector<std::unique_ptr<circle::OperatorCodeT>>;
+ using CircleMetadata_t = std::vector<std::unique_ptr<circle::MetadataT>>;
using CircleSubGraphsPtr_t = flatbuffers::Vector<flatbuffers::Offset<circle::SubGraph>>;
using CircleTensorsPtr_t = flatbuffers::Vector<flatbuffers::Offset<circle::Tensor>>;
@@ -79,6 +79,8 @@ public:
const std::vector<int32_t> &inputs() const { return _current_subgraph->inputs; }
const std::vector<int32_t> &outputs() const { return _current_subgraph->outputs; }
const std::string &name() const { return _current_subgraph->name; }
+ const circle::DataFormat &data_format() const { return _current_subgraph->data_format; }
+ const CircleMetadata_t &metadata() const { return _model->metadata; }
const CircleTensorsPtr_t *tensors_ptr() const { return _tensors_ptr; }
diff --git a/compiler/luci/import/include/luci/Import/GraphBuilder.h b/compiler/luci/import/include/luci/Import/GraphBuilder.h
index 548264dac..0db612652 100644
--- a/compiler/luci/import/include/luci/Import/GraphBuilder.h
+++ b/compiler/luci/import/include/luci/Import/GraphBuilder.h
@@ -33,7 +33,13 @@ class GraphBuilder : public GraphBuilderBase
public:
virtual ~GraphBuilder() = default;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+ // common validate method checking the expected number of inputs and a single output
+ bool validate(const ValidateArgs &args, size_t input_cnt) const
+ {
+ return (args.op.inputs.size() == input_cnt && args.op.outputs.size() == 1);
+ }
+
+ CircleNode *build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
private:
virtual CircleNode *build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/include/luci/Import/GraphBuilderBase.h b/compiler/luci/import/include/luci/Import/GraphBuilderBase.h
index a0cd008e0..ddd4445cd 100644
--- a/compiler/luci/import/include/luci/Import/GraphBuilderBase.h
+++ b/compiler/luci/import/include/luci/Import/GraphBuilderBase.h
@@ -19,6 +19,8 @@
#include "GraphBuilderContext.h"
+#include <luci/IR/CircleNode.h>
+
#include <mio/circle/schema_generated.h>
namespace luci
@@ -38,7 +40,7 @@ struct GraphBuilderBase
};
virtual bool validate(const ValidateArgs &) const = 0;
- virtual void build(const circle::OperatorT &op, GraphBuilderContext *context) const = 0;
+ virtual CircleNode *build(const circle::OperatorT &op, GraphBuilderContext *context) const = 0;
virtual ~GraphBuilderBase() = default;
};
diff --git a/compiler/luci/import/include/luci/Import/GraphBuilderContext.h b/compiler/luci/import/include/luci/Import/GraphBuilderContext.h
index 72e237abc..1673df43d 100644
--- a/compiler/luci/import/include/luci/Import/GraphBuilderContext.h
+++ b/compiler/luci/import/include/luci/Import/GraphBuilderContext.h
@@ -71,7 +71,7 @@ class GraphBuilderContext
public:
GraphBuilderContext(loco::Graph *g, CircleReader *reader, IndexNodeFinder *nodefinder,
IndexTensorOutputs *tensoroutputs)
- : _g(g), _reader(reader), _indexnodefinder(nodefinder), _indextensoroutputs(tensoroutputs)
+ : _g(g), _reader(reader), _indexnodefinder(nodefinder), _indextensoroutputs(tensoroutputs)
{
// DO NOTHING
}
diff --git a/compiler/luci/import/include/luci/Import/GraphBuilderMultiOutput.h b/compiler/luci/import/include/luci/Import/GraphBuilderMultiOutput.h
new file mode 100644
index 000000000..6e8791b62
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/GraphBuilderMultiOutput.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_GRAPH_BUILDER_MULTI_OUTPUT_H__
+#define __LUCI_IMPORT_GRAPH_BUILDER_MULTI_OUTPUT_H__
+
+#include "GraphBuilderContext.h"
+#include "GraphBuilderBase.h"
+
+#include <mio/circle/schema_generated.h>
+
+namespace luci
+{
+
+/**
+ * @brief Base of graph builders for operators with multiple outputs (e.g., CircleIfGraphBuilder)
+ */
+class GraphBuilderMultiOutput : public GraphBuilderBase
+{
+public:
+ virtual ~GraphBuilderMultiOutput() = default;
+
+ CircleNode *build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+
+protected:
+ struct BuildNodeArgs
+ {
+ BuildNodeArgs(const circle::OperatorT &o, GraphBuilderContext *c,
+ const std::vector<CircleNode *> &i)
+ : op(o), context(c), input_nodes(i)
+ {
+ }
+
+ const circle::OperatorT &op;
+ GraphBuilderContext *context;
+ const std::vector<CircleNode *> &input_nodes;
+ };
+
+ struct BuildOutArgs
+ {
+ BuildOutArgs(CircleNode *nd, uint32_t n) : node(nd), index(n) {}
+
+ CircleNode *node;
+ uint32_t index;
+ };
+
+private:
+ virtual CircleNode *build_node(const BuildNodeArgs &) const = 0;
+ virtual CircleNode *build_out(const BuildOutArgs &) const = 0;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_GRAPH_BUILDER_MULTI_OUTPUT_H__
diff --git a/compiler/luci/import/include/luci/Import/Nodes.h b/compiler/luci/import/include/luci/Import/Nodes.h
index 28741064e..b084c7dbc 100644
--- a/compiler/luci/import/include/luci/Import/Nodes.h
+++ b/compiler/luci/import/include/luci/Import/Nodes.h
@@ -27,6 +27,7 @@
#include "Nodes/CircleBatchToSpaceND.h"
#include "Nodes/CircleBCQFullyConnected.h"
#include "Nodes/CircleBCQGather.h"
+#include "Nodes/CircleBidirectionalSequenceLSTM.h"
#include "Nodes/CircleCast.h"
#include "Nodes/CircleCeil.h"
#include "Nodes/CircleConcatenation.h"
@@ -42,6 +43,7 @@
#include "Nodes/CircleEqual.h"
#include "Nodes/CircleExp.h"
#include "Nodes/CircleExpandDims.h"
+#include "Nodes/CircleFakeQuant.h"
#include "Nodes/CircleFill.h"
#include "Nodes/CircleFloor.h"
#include "Nodes/CircleFloorDiv.h"
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleBidirectionalSequenceLSTM.h b/compiler/luci/import/include/luci/Import/Nodes/CircleBidirectionalSequenceLSTM.h
new file mode 100644
index 000000000..491517268
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleBidirectionalSequenceLSTM.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_BIDIRECTIONALSEQUENCE_LSTM_H__
+#define __LUCI_IMPORT_OP_CIRCLE_BIDIRECTIONALSEQUENCE_LSTM_H__
+
+#include "luci/Import/GraphBuilderMultiOutput.h"
+
+namespace luci
+{
+
+class CircleBidirectionalSequenceLSTMGraphBuilder : public GraphBuilderMultiOutput
+{
+public:
+ bool validate(const ValidateArgs &args) const final;
+
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_BIDIRECTIONALSEQUENCE_LSTM_H__
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleCustom.h b/compiler/luci/import/include/luci/Import/Nodes/CircleCustom.h
index 65745be4b..f0d7e303d 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleCustom.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleCustom.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_CUSTOM_H__
#define __LUCI_IMPORT_OP_CIRCLE_CUSTOM_H__
-#include "luci/Import/GraphBuilder.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleCustomGraphBuilder : public GraphBuilderBase
+class CircleCustomGraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleFakeQuant.h b/compiler/luci/import/include/luci/Import/Nodes/CircleFakeQuant.h
new file mode 100644
index 000000000..9d9f7b07b
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleFakeQuant.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_FAKE_QUANT_H__
+#define __LUCI_IMPORT_OP_CIRCLE_FAKE_QUANT_H__
+
+#include "luci/Import/GraphBuilder.h"
+
+namespace luci
+{
+
+class CircleFakeQuantGraphBuilder : public GraphBuilder
+{
+public:
+ bool validate(const ValidateArgs &args) const final;
+
+private:
+ CircleNode *build_node(const circle::OperatorT &op, const std::vector<CircleNode *> &inputs,
+ loco::Graph *graph) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_FAKE_QUANT_H__
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleIf.h b/compiler/luci/import/include/luci/Import/Nodes/CircleIf.h
index 8faf09cae..94052f5be 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleIf.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleIf.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_IF_H__
#define __LUCI_IMPORT_OP_CIRCLE_IF_H__
-#include "luci/Import/GraphBuilderBase.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleIfGraphBuilder : public GraphBuilderBase
+class CircleIfGraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h b/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h
index f193aae35..4e8388b3e 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_NON_MAX_SUPPRESSION_V4_H__
#define __LUCI_IMPORT_OP_CIRCLE_NON_MAX_SUPPRESSION_V4_H__
-#include "luci/Import/GraphBuilderBase.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleNonMaxSuppressionV4GraphBuilder : public GraphBuilderBase
+class CircleNonMaxSuppressionV4GraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV5.h b/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV5.h
index 62be0758e..4120a30eb 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV5.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV5.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_NON_MAX_SUPPRESSION_V5_H__
#define __LUCI_IMPORT_OP_CIRCLE_NON_MAX_SUPPRESSION_V5_H__
-#include "luci/Import/GraphBuilderBase.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleNonMaxSuppressionV5GraphBuilder : public GraphBuilderBase
+class CircleNonMaxSuppressionV5GraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleSplit.h b/compiler/luci/import/include/luci/Import/Nodes/CircleSplit.h
index 3395e40fd..5b45c9a9e 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleSplit.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleSplit.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_SPLIT_H__
#define __LUCI_IMPORT_OP_CIRCLE_SPLIT_H__
-#include "luci/Import/GraphBuilderBase.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleSplitGraphBuilder : public GraphBuilderBase
+class CircleSplitGraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleSplitV.h b/compiler/luci/import/include/luci/Import/Nodes/CircleSplitV.h
index 3e53df362..de712f90c 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleSplitV.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleSplitV.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_SPLIT_V_H__
#define __LUCI_IMPORT_OP_CIRCLE_SPLIT_V_H__
-#include "luci/Import/GraphBuilderBase.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleSplitVGraphBuilder : public GraphBuilderBase
+class CircleSplitVGraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleTopKV2.h b/compiler/luci/import/include/luci/Import/Nodes/CircleTopKV2.h
index 8ec3f3311..b4ad97130 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleTopKV2.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleTopKV2.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_TOPK_V2_H__
#define __LUCI_IMPORT_OP_CIRCLE_TOPK_V2_H__
-#include "luci/Import/GraphBuilderBase.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleTopKV2GraphBuilder : public GraphBuilderBase
+class CircleTopKV2GraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h b/compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h
index ed5b5035d..40e75ec73 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_UNIQUE_H__
#define __LUCI_IMPORT_OP_CIRCLE_UNIQUE_H__
-#include "luci/Import/GraphBuilderBase.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleUniqueGraphBuilder : public GraphBuilderBase
+class CircleUniqueGraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleUnpack.h b/compiler/luci/import/include/luci/Import/Nodes/CircleUnpack.h
index f1a21de22..0b623655f 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleUnpack.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleUnpack.h
@@ -17,17 +17,19 @@
#ifndef __LUCI_IMPORT_OP_CIRCLE_UNPACK_H__
#define __LUCI_IMPORT_OP_CIRCLE_UNPACK_H__
-#include "luci/Import/GraphBuilderBase.h"
+#include "luci/Import/GraphBuilderMultiOutput.h"
namespace luci
{
-class CircleUnpackGraphBuilder : public GraphBuilderBase
+class CircleUnpackGraphBuilder : public GraphBuilderMultiOutput
{
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+private:
+ CircleNode *build_node(const BuildNodeArgs &) const final;
+ CircleNode *build_out(const BuildOutArgs &) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleWhile.h b/compiler/luci/import/include/luci/Import/Nodes/CircleWhile.h
index 68c56b3c6..69d23f823 100644
--- a/compiler/luci/import/include/luci/Import/Nodes/CircleWhile.h
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleWhile.h
@@ -27,7 +27,7 @@ class CircleWhileGraphBuilder : public GraphBuilderBase
public:
bool validate(const ValidateArgs &args) const final;
- void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+ CircleNode *build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
};
} // namespace luci
diff --git a/compiler/luci/import/src/CircleImportMetadata.cpp b/compiler/luci/import/src/CircleImportMetadata.cpp
new file mode 100644
index 000000000..f68f3301a
--- /dev/null
+++ b/compiler/luci/import/src/CircleImportMetadata.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleImportMetadata.h"
+
+#include <vector>
+
+namespace
+{
+
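+// Reassembles a little-endian uint32_t from four consecutive bytes at 'idx'.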
+uint32_t read_u32(const std::vector<uint8_t> &buffer, uint32_t idx)
+{
+ uint32_t val = 0;
+ val += (buffer.at(idx + 0) << 0 * 8);
+ val += (buffer.at(idx + 1) << 1 * 8);
+ val += (buffer.at(idx + 2) << 2 * 8);
+ val += (buffer.at(idx + 3) << 3 * 8);
+ return val;
+}
+
+} // namespace
+
+namespace
+{
+
+// 'source_table' is decoded to std::map<uint32_t, std::string> format.
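+// Expected binary layout, as consumed below:
+//   [entry_count : u32]
+//   entry_count * { [source_id : u32] [name_len : u32] [name bytes, incl. trailing '\0'] }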
+const std::map<uint32_t, std::string>
+decoded_source_table(const std::vector<uint8_t> &source_table_data)
+{
+ std::map<uint32_t, std::string> source_id_name_map;
+ uint32_t idx = 0;
+
+ if (source_table_data.size() < 4)
+ throw std::runtime_error("Source table decode error : invalid entry number");
+
+ uint32_t entry_number = read_u32(source_table_data, idx);
+ idx += sizeof(uint32_t);
+
+ while (idx < source_table_data.size())
+ {
+ if (idx + 2 * sizeof(uint32_t) > source_table_data.size())
+ throw std::runtime_error("Source table decode error : invalid entry item");
+
+ uint32_t id = read_u32(source_table_data, idx);
+ idx += sizeof(uint32_t);
+
+ uint32_t length = read_u32(source_table_data, idx);
+ idx += sizeof(uint32_t);
+
+ if (idx + sizeof(char) * length > source_table_data.size())
+ throw std::runtime_error("Source table decode error : invalid entry data");
+
+ // The last character of the name is '\0'.
+ // However, as std::string does not use '\0' to find the end of a string,
+ // we drop that character and do not include it in the string.
+ std::string origin_name;
+ for (uint32_t j = 0; j < length - 1; ++j)
+ origin_name += source_table_data.at(idx + j);
+ assert(source_table_data.at(idx + length - 1) == '\0');
+ idx += sizeof(char) * length;
+
+ if (source_id_name_map.insert({id, origin_name}).second == false)
+ throw std::runtime_error("Source table decode error : duplicated origin ID");
+ }
+
+ if (idx != source_table_data.size())
+ throw std::runtime_error("Source table decode error : data size invalid");
+
+ if (source_id_name_map.size() != entry_number)
+ throw std::runtime_error("Source table decode error : result size mismatch");
+
+ return source_id_name_map;
+}
+
+// 'op_table' is decoded to std::map<uint32_t, std::set<uint32_t>> format.
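+// Expected binary layout, as consumed below:
+//   [entry_count : u32]
+//   entry_count * { [node_id : u32] [source_count : u32] [source_id : u32] * source_count }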
+const std::map<uint32_t, std::set<uint32_t>>
+decoded_op_table(const std::vector<uint8_t> &op_table_data)
+{
+ std::map<uint32_t, std::set<uint32_t>> node_source_ids_map;
+ uint32_t idx = 0;
+
+ if (op_table_data.size() < 4)
+ throw std::runtime_error("Op table decode error : invalid entry number");
+
+ uint32_t entry_number = read_u32(op_table_data, idx);
+ idx += sizeof(uint32_t);
+
+ while (idx < op_table_data.size())
+ {
+ if (idx + 2 * sizeof(uint32_t) > op_table_data.size())
+ throw std::runtime_error("Op table decode error : invalid entry item");
+
+ uint32_t id = read_u32(op_table_data, idx);
+ idx += sizeof(uint32_t);
+
+ uint32_t node_num = read_u32(op_table_data, idx);
+ idx += sizeof(uint32_t);
+
+ if (idx + sizeof(uint32_t) * node_num > op_table_data.size())
+ throw std::runtime_error("Source table decode error : invalid entry data");
+
+ std::set<uint32_t> source_ids;
+ for (uint32_t j = 0; j < node_num; ++j)
+ {
+ uint32_t origin = read_u32(op_table_data, idx);
+ idx += sizeof(uint32_t);
+
+ source_ids.insert(origin);
+ }
+
+ if (node_source_ids_map.insert({id, source_ids}).second == false)
+ throw std::runtime_error("Op table decode error : duplicated origin ID");
+ }
+
+ if (idx != op_table_data.size())
+ throw std::runtime_error("Op table decode error : data size invalid");
+
+ if (node_source_ids_map.size() != entry_number)
+ throw std::runtime_error("Op table decode error : entry number invalid");
+
+ return node_source_ids_map;
+}
+
+} // namespace
+
+namespace luci
+{
+
+CircleImportMetadata::CircleImportMetadata(const luci::CircleReader &reader)
+{
+ const auto &metadata = reader.metadata();
+ for (uint32_t i = 0; i < metadata.size(); ++i)
+ {
+ const circle::MetadataT &meta = *metadata[i];
+
+ assert(meta.buffer < reader.buffers().size());
+ const std::vector<uint8_t> &buffer = reader.buffers()[meta.buffer]->data;
+
+ if (meta.name.compare("ONE_op_table") == 0)
+ _op_table = decoded_op_table(buffer);
+ else if (meta.name.compare("ONE_source_table") == 0)
+ _source_table = decoded_source_table(buffer);
+ }
+}
+
+const OriginTable CircleImportMetadata::origin_table(void)
+{
+ OriginTable origin_table;
+
+ if (_op_table.size() > 0 && _source_table.size() > 0)
+ {
+ for (auto &kv : _op_table)
+ {
+ const auto node_id = kv.first;
+ const auto &source_ids = kv.second;
+
+ std::vector<std::shared_ptr<CircleNodeOrigin>> origins;
+ for (auto source_id : source_ids)
+ {
+ const auto source_name = _source_table.at(source_id);
+ origins.push_back(single_origin(source_id, source_name));
+ }
+
+ auto origin = composite_origin(origins);
+ origin_table.emplace(node_id, origin);
+ }
+ }
+
+ return origin_table;
+}
+
+} // namespace luci
diff --git a/compiler/luci/import/src/CircleImportMetadata.h b/compiler/luci/import/src/CircleImportMetadata.h
new file mode 100644
index 000000000..80176db94
--- /dev/null
+++ b/compiler/luci/import/src/CircleImportMetadata.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_CIRCLE_IMPORT_METADATA_H__
+#define __LUCI_CIRCLE_IMPORT_METADATA_H__
+
+#include "luci/Import/CircleReader.h"
+
+#include <luci/Profile/CircleNodeOrigin.h>
+
+#include <map>
+#include <set>
+#include <string>
+
+namespace luci
+{
+
+using OriginTable = std::map<uint32_t, std::shared_ptr<CircleNodeOrigin>>;
+
+class CircleImportMetadata
+{
+public:
+ CircleImportMetadata() = delete;
+
+ CircleImportMetadata(const luci::CircleReader &reader);
+
+public:
+ /**
+ * @brief Create origin table using _source_table and _op_table in CircleImportMetadata
+ * @note For creating the origin table, both _op_table and _source_table must exist.
+ * If either of them is missing, an empty table is returned.
+ */
+ const OriginTable origin_table(void);
+
+private:
+ // Decoded metadata is stored in these tables
+ std::map<uint32_t, std::string> _source_table;
+ std::map<uint32_t, std::set<uint32_t>> _op_table;
+};
+
+} // namespace luci
+
+#endif // __LUCI_CIRCLE_IMPORT_METADATA_H__
diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp
index b33c920b1..861c1bbe3 100644
--- a/compiler/luci/import/src/CircleReader.cpp
+++ b/compiler/luci/import/src/CircleReader.cpp
@@ -190,19 +190,19 @@ luci_sparse_index_vector(const circle::SparseIndexVectorUnion &sparse_index_vect
case circle::SparseIndexVector_Int32Vector:
{
const auto const_vec_ptr =
- static_cast<const void *>(&(sparse_index_vector.AsInt32Vector()->values));
+ static_cast<const void *>(&(sparse_index_vector.AsInt32Vector()->values));
return SparseIndexVector{SparseIndexVectorType::I32, const_vec_ptr};
}
case circle::SparseIndexVector_Uint16Vector:
{
const auto const_vec_ptr =
- static_cast<const void *>(&(sparse_index_vector.AsUint16Vector()->values));
+ static_cast<const void *>(&(sparse_index_vector.AsUint16Vector()->values));
return SparseIndexVector{SparseIndexVectorType::U16, const_vec_ptr};
}
case circle::SparseIndexVector_Uint8Vector:
{
const auto const_vec_ptr =
- static_cast<const void *>(&(sparse_index_vector.AsUint8Vector()->values));
+ static_cast<const void *>(&(sparse_index_vector.AsUint8Vector()->values));
return SparseIndexVector{SparseIndexVectorType::U8, const_vec_ptr};
}
default:
@@ -262,15 +262,19 @@ void copy_tensor_attributes(const circle::TensorT &tensor, CircleNode *node)
node->name(tensor_name(tensor));
node->dtype(luci_datatype(tensor.type));
+ assert(tensor.shape_signature.size() == 0 ||
+ tensor.shape_signature.size() == tensor.shape.size());
+
std::vector<int32_t> dims = tensor.shape; // in NHWC
node->rank(dims.size());
for (uint32_t r = 0; r < dims.size(); ++r)
{
- node->dim(r) = loco::Dimension(dims[r]);
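+ // A -1 in shape_signature marks a dynamic dimension: leave the Dimension
+ // unset instead of storing the placeholder value from 'shape'.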
+ if (tensor.shape_signature.size() > 0 && tensor.shape_signature.at(r) == -1)
+ node->dim(r).unset();
+ else
+ node->dim(r).set(dims[r]);
}
- node->shape_signature(tensor.shape_signature);
-
const auto *quantization = tensor.quantization.get();
if (quantization != nullptr)
{
diff --git a/compiler/luci/import/src/GraphBuilder.cpp b/compiler/luci/import/src/GraphBuilder.cpp
index 80a9f986a..356501c2f 100644
--- a/compiler/luci/import/src/GraphBuilder.cpp
+++ b/compiler/luci/import/src/GraphBuilder.cpp
@@ -21,7 +21,7 @@
namespace luci
{
-void GraphBuilder::build(const circle::OperatorT &op, GraphBuilderContext *context) const
+CircleNode *GraphBuilder::build(const circle::OperatorT &op, GraphBuilderContext *context) const
{
LOGGER(l);
@@ -47,7 +47,11 @@ void GraphBuilder::build(const circle::OperatorT &op, GraphBuilderContext *conte
else
{
// If there is no tensor, insert CircleOutputExclude.
- input_nodes.push_back(context->graph()->nodes()->create<luci::CircleOutputExclude>());
+ auto *node = context->graph()->nodes()->create<luci::CircleOutputExclude>();
+ // CircleOutputExclude doesn't need a type, but since all nodes must have a type,
+ // a dummy type is inserted.
+ node->dtype(loco::DataType::FLOAT32);
+ input_nodes.push_back(node);
}
}
@@ -73,6 +77,8 @@ void GraphBuilder::build(const circle::OperatorT &op, GraphBuilderContext *conte
{
context->nodefinder()->enroll(outputs[0], node);
}
+
+ return node;
}
} // namespace luci
diff --git a/compiler/luci/import/src/GraphBuilderMultiOutput.cpp b/compiler/luci/import/src/GraphBuilderMultiOutput.cpp
new file mode 100644
index 000000000..9b42e997e
--- /dev/null
+++ b/compiler/luci/import/src/GraphBuilderMultiOutput.cpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/GraphBuilderMultiOutput.h"
+
+#include <luci/Log.h>
+
+namespace luci
+{
+
+CircleNode *GraphBuilderMultiOutput::build(const circle::OperatorT &op,
+ GraphBuilderContext *context) const
+{
+ LOGGER(l);
+
+ assert(context != nullptr);
+
+ const std::vector<int32_t> &inputs = op.inputs;
+ const std::vector<int32_t> &outputs = op.outputs;
+ const auto &tensors = context->reader()->tensors();
+ const auto &opcodes = context->reader()->opcodes();
+ auto tensors_ptr = context->reader()->tensors_ptr();
+ assert(tensors_ptr != nullptr);
+
+ std::vector<CircleNode *> input_nodes;
+ for (const int32_t input_tensor_index : inputs)
+ {
+ if (input_tensor_index >= 0)
+ {
+ auto input = context->nodefinder()->node(input_tensor_index);
+ if (input == nullptr)
+ INFO(l) << "[luci] Warning: input node is null " << input_tensor_index << std::endl;
+ input_nodes.push_back(input);
+ }
+ else
+ {
+ // If there is no tensor, insert CircleOutputExclude.
+ auto *node = context->graph()->nodes()->create<luci::CircleOutputExclude>();
+ // CircleOutputExclude doesn't need a type, but since all nodes must have a type,
+ // a dummy type is inserted.
+ node->dtype(loco::DataType::FLOAT32);
+ input_nodes.push_back(node);
+ }
+ }
+
+ BuildNodeArgs bna(op, context, input_nodes);
+ auto *node = build_node(bna);
+
+ uint32_t output_count = outputs.size();
+ assert(output_count > 0);
+ {
+ // Let's use attributes from output 0 for this node
+ const circle::TensorT &output_tensor = *tensors[outputs[0]];
+ node->name(tensor_name(output_tensor));
+ node->dtype(luci_datatype(output_tensor.type));
+
+ // mark operator version
+ node->op_version(opcodes[op.opcode_index].get()->version);
+
+ // NOTE We don't set quantization on the multiple-output node itself but on its virtual outputs
+ }
+
+ // Create virtual outputs of Virtual Output node(s)
+ for (uint32_t n = 0; n < output_count; ++n)
+ {
+ const circle::TensorT &output_tensor = *tensors[outputs[n]];
+
+ BuildOutArgs boa(node, n);
+ auto *nodeout = build_out(boa);
+
+ copy_tensor_attributes(output_tensor, nodeout);
+ // mark shape_status
+ if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
+ nodeout->shape_status(ShapeStatus::NOSHAPE);
+ else
+ nodeout->shape_status(ShapeStatus::VALID);
+
+ context->nodefinder()->enroll(outputs[n], nodeout);
+ }
+
+ return node;
+}
+
+} // namespace luci
diff --git a/compiler/luci/import/src/GraphBuilderRegistry.cpp b/compiler/luci/import/src/GraphBuilderRegistry.cpp
index d598d30f4..7f98aab78 100644
--- a/compiler/luci/import/src/GraphBuilderRegistry.cpp
+++ b/compiler/luci/import/src/GraphBuilderRegistry.cpp
@@ -37,6 +37,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
CIRCLE_NODE(BATCH_TO_SPACE_ND, CircleBatchToSpaceNDGraphBuilder); // 37
CIRCLE_NODE(BCQ_FULLY_CONNECTED, CircleBCQFullyConnectedGraphBuilder); // 253
CIRCLE_NODE(BCQ_GATHER, CircleBCQGatherGraphBuilder); // 252
+ CIRCLE_NODE(BIDIRECTIONAL_SEQUENCE_LSTM, CircleBidirectionalSequenceLSTMGraphBuilder); // 52
CIRCLE_NODE(CAST, CircleCastGraphBuilder); // 53
CIRCLE_NODE(CEIL, CircleCeilGraphBuilder); // 104
CIRCLE_NODE(CUSTOM, CircleCustomGraphBuilder); // 32
@@ -51,6 +52,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
CIRCLE_NODE(EQUAL, CircleEqualGraphBuilder); // 71
CIRCLE_NODE(EXP, CircleExpGraphBuilder); // 47
CIRCLE_NODE(EXPAND_DIMS, CircleExpandDimsGraphBuilder); // 70
+ CIRCLE_NODE(FAKE_QUANT, CircleFakeQuantGraphBuilder); // 80
CIRCLE_NODE(FILL, CircleFillGraphBuilder); // 94
CIRCLE_NODE(FLOOR, CircleFloorGraphBuilder); // 8
CIRCLE_NODE(FLOOR_DIV, CircleFloorDivGraphBuilder); // 90
@@ -155,9 +157,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
// BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_RNN = 35,
// BuiltinOperator_BIDIRECTIONAL_SEQUENCE_RNN = 46,
// BuiltinOperator_DELEGATE = 51,
- // BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM = 52,
// BuiltinOperator_ARG_MAX = 56,
- // BuiltinOperator_FAKE_QUANT = 80,
// BuiltinOperator_QUANTIZE = 114,
// BuiltinOperator_HARD_SWISH = 117,
// BuiltinOperator_DENSIFY = 124,
diff --git a/compiler/luci/import/src/Importer.cpp b/compiler/luci/import/src/Importer.cpp
index ab89f3587..193afffcb 100644
--- a/compiler/luci/import/src/Importer.cpp
+++ b/compiler/luci/import/src/Importer.cpp
@@ -15,6 +15,7 @@
*/
#include "luci/Importer.h"
+#include "CircleImportMetadata.h"
#include "PostImport.h"
#include "luci/Import/GraphBuilder.h"
@@ -25,6 +26,8 @@
#include <luci/IR/Module.h>
#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeID.h>
+#include <luci/Profile/CircleNodeOrigin.h>
#include <luci/Log.h>
#include <luci/LogHelper.h>
@@ -50,6 +53,7 @@ void convert_graph(const luci::GraphBuilderSource &source, luci::CircleReader &r
const auto &tensors = reader.tensors();
auto tensors_ptr = reader.tensors_ptr();
assert(tensors_ptr != nullptr);
+ auto circle_metadata = std::make_unique<luci::CircleImportMetadata>(reader);
// build a cache to identify if a tensor is output of an operator
// if this is set, we should not create a CircleConst for this tensor
@@ -96,12 +100,20 @@ void convert_graph(const luci::GraphBuilderSource &source, luci::CircleReader &r
// Data type
graph_input->dtype(input_node->dtype());
+ assert(tensor.shape_signature.size() == 0 ||
+ tensor.shape_signature.size() == tensor.shape.size());
+
// Shape of GraphInput
auto input_shape = std::make_unique<loco::TensorShape>();
const std::vector<int32_t> &input_dims = tensor.shape; // in NHWC
input_shape->rank(input_dims.size());
for (uint32_t r = 0; r < input_dims.size(); ++r)
- input_shape->dim(r) = loco::Dimension(input_dims[r]);
+ {
+ if (tensor.shape_signature.size() > 0 && tensor.shape_signature.at(r) == -1)
+ input_shape->dim(r).unset();
+ else
+ input_shape->dim(r).set(input_dims[r]);
+ }
graph_input->shape(std::move(input_shape));
}
@@ -117,6 +129,7 @@ void convert_graph(const luci::GraphBuilderSource &source, luci::CircleReader &r
// Note that operators in model are stored in execution order. This means that when importing
// an operator, its input operators have already been imported. We exploit this fact to set up
// node's inputs right after creating the node.
+ auto origin_table = circle_metadata->origin_table();
for (uint32_t i = 0; i < operators.size(); ++i)
{
const circle::OperatorT &op = *operators[i];
@@ -130,7 +143,12 @@ void convert_graph(const luci::GraphBuilderSource &source, luci::CircleReader &r
throw oops::UserExn("Invalid operator", reader.opcode_name(op));
}
- builder->build(op, &gb_context);
+ auto built_op = builder->build(op, &gb_context);
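+ // Tag the node with its operator index and attach a profiling origin:
+ // taken from the decoded metadata when available, otherwise the node
+ // becomes its own single origin.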
+ set_node_id(built_op, i);
+ if (origin_table.find(i) != origin_table.end())
+ add_origin(built_op, origin_table.at(i));
+ else
+ add_origin(built_op, luci::single_origin(i, built_op->name()));
}
else
{
@@ -169,19 +187,28 @@ void convert_graph(const luci::GraphBuilderSource &source, luci::CircleReader &r
// set the graph output name and node object
auto graph_output = graph->outputs()->create();
std::string tname = luci::tensor_name(tensor);
- graph_output->name("output_" + tname);
+ assert(tname.length() > 0);
+ graph_output->name(tname);
luci::copy_tensor_attributes(tensor, output_node);
// Set GraphInputOutputIndex for graph
output_node->index(graph_output->index());
+ assert(tensor.shape_signature.size() == 0 ||
+ tensor.shape_signature.size() == tensor.shape.size());
+
// Shape of Output
auto output_shape = std::make_unique<loco::TensorShape>();
const std::vector<int32_t> &output_dims = tensor.shape; // in NHWC
output_shape->rank(output_dims.size());
for (uint32_t r = 0; r < output_dims.size(); ++r)
- output_shape->dim(r) = loco::Dimension(output_dims[r]);
+ {
+ if (tensor.shape_signature.size() > 0 && tensor.shape_signature.at(r) == -1)
+ output_shape->dim(r).unset();
+ else
+ output_shape->dim(r).set(output_dims[r]);
+ }
graph_output->shape(std::move(output_shape));
// Data type
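The other strand of this Importer.cpp change is profile metadata: each successfully built operator receives its operator index as node ID via set_node_id, plus an origin. If the circle file carries import metadata, origin_table maps operator index i to the recorded origins; otherwise a fresh single origin is synthesized from the index and node name. A compact restatement of the lookup above (assuming the table's mapped type matches what luci::single_origin returns):

  auto it = origin_table.find(i);
  add_origin(built_op, it != origin_table.end() ? it->second
                                                : luci::single_origin(i, built_op->name()));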
diff --git a/compiler/luci/import/src/Nodes/CircleAbs.cpp b/compiler/luci/import/src/Nodes/CircleAbs.cpp
index 3556dc7fa..2a1601a21 100644
--- a/compiler/luci/import/src/Nodes/CircleAbs.cpp
+++ b/compiler/luci/import/src/Nodes/CircleAbs.cpp
@@ -24,11 +24,8 @@ namespace luci
{
bool CircleAbsGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
// TODO Support type check
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleAbsGraphBuilder::build_node(const circle::OperatorT &,
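This is the first of many validate() bodies in this patch collapsed into a shared GraphBuilder::validate(args, input_count) helper. Judging from the checks it replaces throughout the diff (several sites below, e.g. CircleCeil and CircleDiv, previously tested both inputs.size() and outputs.size() == 1), the helper presumably amounts to:

  bool GraphBuilder::validate(const ValidateArgs &args, size_t input_cnt) const
  {
    return args.op.inputs.size() == input_cnt && args.op.outputs.size() == 1;
  }

This is a sketch inferred from the call sites; the actual definition lives outside this excerpt.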
diff --git a/compiler/luci/import/src/Nodes/CircleAdd.cpp b/compiler/luci/import/src/Nodes/CircleAdd.cpp
index b767d4af2..94cbdf081 100644
--- a/compiler/luci/import/src/Nodes/CircleAdd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleAdd.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleAddGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleAddGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleArgMax.cpp b/compiler/luci/import/src/Nodes/CircleArgMax.cpp
index 10e8516f4..fd8a84289 100644
--- a/compiler/luci/import/src/Nodes/CircleArgMax.cpp
+++ b/compiler/luci/import/src/Nodes/CircleArgMax.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleArgMaxGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleArgMaxGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleArgMin.cpp b/compiler/luci/import/src/Nodes/CircleArgMin.cpp
index 5ff534dbb..63ca8db03 100644
--- a/compiler/luci/import/src/Nodes/CircleArgMin.cpp
+++ b/compiler/luci/import/src/Nodes/CircleArgMin.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleArgMinGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleArgMinGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp b/compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp
index ad011f71f..a351cf5e7 100644
--- a/compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp
@@ -23,10 +23,7 @@ namespace luci
bool CircleAveragePool2DGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleAveragePool2DGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp b/compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp
index 16ecebd5c..4c86399ce 100644
--- a/compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp
+++ b/compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleBCQFullyConnectedGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 5)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 5);
}
CircleNode *CircleBCQFullyConnectedGraphBuilder::build_node(const circle::OperatorT &op,
@@ -43,15 +40,6 @@ CircleNode *CircleBCQFullyConnectedGraphBuilder::build_node(const circle::Operat
node->bias(inputs.at(3));
node->weights_clusters(inputs.at(4));
- // TODO Find and move to appropriate place for setting optional input
- if (auto bias = dynamic_cast<luci::CircleOutputExclude *>(node->bias()))
- {
- // bias is not used for type inference, but node itself should have a type
- bias->dtype(loco::DataType::FLOAT32);
-
- // bias is not used for shape inference
- }
-
const auto *options = op.builtin_options.AsBCQFullyConnectedOptions();
node->weights_hidden_size(options->weights_hidden_size);
node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
diff --git a/compiler/luci/import/src/Nodes/CircleBCQGather.cpp b/compiler/luci/import/src/Nodes/CircleBCQGather.cpp
index 464f1ac18..ee1358197 100644
--- a/compiler/luci/import/src/Nodes/CircleBCQGather.cpp
+++ b/compiler/luci/import/src/Nodes/CircleBCQGather.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleBCQGatherGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 4)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 4);
}
CircleNode *CircleBCQGatherGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp b/compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp
index 330775691..390719061 100644
--- a/compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp
+++ b/compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp
@@ -23,10 +23,7 @@ namespace luci
bool CircleBatchMatMulGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleBatchMatMulGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleBidirectionalSequenceLSTM.cpp b/compiler/luci/import/src/Nodes/CircleBidirectionalSequenceLSTM.cpp
new file mode 100644
index 000000000..f8bdcff72
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleBidirectionalSequenceLSTM.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleBidirectionalSequenceLSTM.h"
+
+#include <luci/IR/Nodes/CircleBidirectionalSequenceLSTM.h>
+#include <luci/IR/Nodes/CircleBidirectionalSequenceLSTMOut.h>
+
+#include <loco.h>
+
+namespace luci
+{
+
+bool CircleBidirectionalSequenceLSTMGraphBuilder::validate(const ValidateArgs &args) const
+{
+ if (args.op.inputs.size() != 48)
+ return false;
+ if (args.op.outputs.size() != 2)
+ return false;
+
+ return true;
+}
+
+CircleNode *CircleBidirectionalSequenceLSTMGraphBuilder::build_node(const BuildNodeArgs &bna) const
+{
+ auto *node = bna.context->graph()->nodes()->create<CircleBidirectionalSequenceLSTM>();
+ auto &inputs = bna.input_nodes;
+ node->input(inputs.at(0));
+ node->fw_input_to_input_weights(inputs.at(1)); // Optional
+ node->fw_input_to_cell_weights(inputs.at(2));
+ node->fw_input_to_forget_weights(inputs.at(3));
+ node->fw_input_to_output_weights(inputs.at(4));
+ node->fw_recurrent_to_input_weights(inputs.at(5)); // Optional
+ node->fw_recurrent_to_cell_weights(inputs.at(6));
+ node->fw_recurrent_to_forget_weights(inputs.at(7));
+ node->fw_recurrent_to_output_weights(inputs.at(8));
+ node->fw_cell_to_input_weights(inputs.at(9)); // Optional
+ node->fw_cell_to_forget_weights(inputs.at(10)); // Optional
+ node->fw_cell_to_output_weights(inputs.at(11)); // Optional
+ node->fw_input_gate_bias(inputs.at(12)); // Optional
+ node->fw_forget_gate_bias(inputs.at(13));
+ node->fw_cell_gate_bias(inputs.at(14));
+ node->fw_output_gate_bias(inputs.at(15));
+ node->fw_projection_weights(inputs.at(16)); // Optional
+ node->fw_projection_bias(inputs.at(17)); // Optional
+ node->bw_input_to_input_weights(inputs.at(18)); // Optional
+ node->bw_input_to_cell_weights(inputs.at(19));
+ node->bw_input_to_forget_weights(inputs.at(20));
+ node->bw_input_to_output_weights(inputs.at(21));
+ node->bw_recurrent_to_input_weights(inputs.at(22)); // Optional
+ node->bw_recurrent_to_cell_weights(inputs.at(23));
+ node->bw_recurrent_to_forget_weights(inputs.at(24));
+ node->bw_recurrent_to_output_weights(inputs.at(25));
+ node->bw_cell_to_input_weights(inputs.at(26)); // Optional
+ node->bw_cell_to_forget_weights(inputs.at(27)); // Optional
+ node->bw_cell_to_output_weights(inputs.at(28)); // Optional
+ node->bw_input_gate_bias(inputs.at(29)); // Optional
+ node->bw_forget_gate_bias(inputs.at(30));
+ node->bw_cell_gate_bias(inputs.at(31));
+ node->bw_output_gate_bias(inputs.at(32));
+ node->bw_projection_weights(inputs.at(33)); // Optional
+ node->bw_projection_bias(inputs.at(34)); // Optional
+ node->fw_activation_state(inputs.at(35));
+ node->fw_cell_state(inputs.at(36));
+ node->bw_activation_state(inputs.at(37));
+ node->bw_cell_state(inputs.at(38));
+
+ node->auxillary_input(inputs.at(39)); // Optional
+ node->fw_auxillary_input_to_input_weights(inputs.at(40)); // Optional
+ node->fw_auxillary_input_to_forget_weights(inputs.at(41)); // Optional
+ node->fw_auxillary_input_to_cell_weights(inputs.at(42)); // Optional
+ node->fw_auxillary_input_to_output_weights(inputs.at(43)); // Optional
+ node->bw_auxillary_input_to_input_weights(inputs.at(44)); // Optional
+ node->bw_auxillary_input_to_forget_weights(inputs.at(45)); // Optional
+ node->bw_auxillary_input_to_cell_weights(inputs.at(46)); // Optional
+ node->bw_auxillary_input_to_output_weights(inputs.at(47)); // Optional
+
+ const auto *options = bna.op.builtin_options.AsBidirectionalSequenceLSTMOptions();
+ node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
+ node->cell_clip(options->cell_clip);
+ node->proj_clip(options->proj_clip);
+ node->merge_outputs(options->merge_outputs);
+ node->time_major(options->time_major);
+ node->asymmetric_quantize_inputs(options->asymmetric_quantize_inputs);
+
+ return node;
+}
+
+CircleNode *CircleBidirectionalSequenceLSTMGraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleBidirectionalSequenceLSTMOut>();
+
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
+
+ return nodeout;
+}
+
+} // namespace luci
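The 48-input wiring above mirrors the TFLite BidirectionalSequenceLSTM operand order: index 0 is the sequence input, 1-17 the forward weights, biases and projection, 18-34 their backward counterparts, 35-38 the activation and cell states, and 39-47 the auxiliary inputs. Operands marked Optional may be absent in the model; as the CircleOutputExclude handling removed from CircleBCQFullyConnected above suggests, absent operands are represented by placeholder CircleOutputExclude nodes rather than null pointers, which is why the builder can wire all 48 slots unconditionally.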
diff --git a/compiler/luci/import/src/Nodes/CircleCast.cpp b/compiler/luci/import/src/Nodes/CircleCast.cpp
index 7bdb63044..3e8c08bfa 100644
--- a/compiler/luci/import/src/Nodes/CircleCast.cpp
+++ b/compiler/luci/import/src/Nodes/CircleCast.cpp
@@ -30,14 +30,13 @@ bool CircleCastGraphBuilder::validate(const ValidateArgs &args) const
{
LOGGER(l);
+ if (!GraphBuilder::validate(args, 1))
+ return false;
+
auto settings = luci::UserSettings::settings();
const auto &inputs = args.op.inputs;
const auto &outputs = args.op.outputs;
- if (inputs.size() != 1)
- return false;
- if (outputs.size() != 1)
- return false;
// NOTE real models do have type mismatch
const auto *options = args.op.builtin_options.AsCastOptions();
diff --git a/compiler/luci/import/src/Nodes/CircleCeil.cpp b/compiler/luci/import/src/Nodes/CircleCeil.cpp
index 2e1aaa295..d439f41cd 100644
--- a/compiler/luci/import/src/Nodes/CircleCeil.cpp
+++ b/compiler/luci/import/src/Nodes/CircleCeil.cpp
@@ -25,16 +25,8 @@ namespace luci
bool CircleCeilGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- if (inputs.size() != 1)
- return false;
- if (outputs.size() != 1)
- return false;
-
// TODO dtype check
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleCeilGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleConv2D.cpp b/compiler/luci/import/src/Nodes/CircleConv2D.cpp
index 9516ef16a..8cbecdc00 100644
--- a/compiler/luci/import/src/Nodes/CircleConv2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleConv2D.cpp
@@ -28,10 +28,7 @@ namespace luci
bool CircleConv2DGraphBuilder::validate(const ValidateArgs &args) const
{
// Circle Conv2D may omit the bias, but we do not support that case
- if (args.op.inputs.size() != 3)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 3);
}
CircleNode *CircleConv2DGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleCos.cpp b/compiler/luci/import/src/Nodes/CircleCos.cpp
index 27d60c62c..9705202ee 100644
--- a/compiler/luci/import/src/Nodes/CircleCos.cpp
+++ b/compiler/luci/import/src/Nodes/CircleCos.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleCosGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleCosGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleCustom.cpp b/compiler/luci/import/src/Nodes/CircleCustom.cpp
index d541ee87b..01ac3e2a0 100644
--- a/compiler/luci/import/src/Nodes/CircleCustom.cpp
+++ b/compiler/luci/import/src/Nodes/CircleCustom.cpp
@@ -27,62 +27,39 @@ bool CircleCustomGraphBuilder::validate(const ValidateArgs &) const
return true;
}
-void CircleCustomGraphBuilder::build(const circle::OperatorT &op,
- GraphBuilderContext *context) const
+CircleNode *CircleCustomGraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
+ uint32_t input_count = bna.op.inputs.size();
+ uint32_t output_count = bna.op.outputs.size();
- auto graph = context->graph();
+ auto *node = bna.context->graph()->nodes()->create<CircleCustom>(input_count, output_count);
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
+ for (uint32_t idx = 0; idx < input_count; ++idx)
+ {
+ node->inputs(idx, bna.input_nodes[idx]);
+ }
- // Create CircleCustom
- const auto &opcodes = context->reader()->opcodes();
- const uint32_t opcode_index = op.opcode_index;
+ const auto &opcodes = bna.context->reader()->opcodes();
+ const uint32_t opcode_index = bna.op.opcode_index;
const circle::OperatorCodeT &opcode = *opcodes[opcode_index];
- auto *node = graph->nodes()->create<CircleCustom>(inputs.size());
- uint32_t input_idx = 0;
- for (const int32_t input_tensor_index : inputs)
- {
- node->inputs(input_idx++, context->nodefinder()->node(input_tensor_index));
- }
- node->custom_options(std::vector<uint8_t>{op.custom_options.begin(), op.custom_options.end()});
+ node->custom_options(
+ std::vector<uint8_t>{bna.op.custom_options.begin(), bna.op.custom_options.end()});
node->custom_code(opcode.custom_code);
- // Operator version of custom is always 1, so do nothing
- uint32_t output_count = outputs.size();
+ // NOTE Operator version of custom is always 1
- assert(output_count > 0);
- {
- // Let's use attributes from output 0 for this node
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
- node->dtype(luci_datatype(output_tensor.type));
- }
-
- // Create virtual outputs of Custom
- for (uint32_t n = 0; n < output_count; ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
+ return node;
+}
- auto *nodeout = graph->nodes()->create<CircleCustomOut>();
- copy_tensor_attributes(output_tensor, nodeout);
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
+CircleNode *CircleCustomGraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleCustomOut>();
- nodeout->input(node);
- nodeout->index(n);
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ return nodeout;
}
} // namespace luci
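CircleCustom is the first of several multi-output builders in this patch (CircleIf and both NonMaxSuppression variants follow) that stop overriding build() wholesale. The boilerplate deleted here (naming the node after output 0, copying tensor attributes, marking shape_status, enrolling each virtual output with the nodefinder) has presumably moved into a shared base build() that drives the process roughly like:

  CircleNode *node = build_node(bna);        // subclass creates the op and wires inputs
  for (uint32_t n = 0; n < output_count; ++n)
  {
    CircleNode *out = build_out({node, n});  // subclass creates the virtual output
    // base: copy_tensor_attributes, shape_status, nodefinder enroll
  }

Again a sketch reconstructed from the removed code, not the actual base implementation.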
diff --git a/compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp b/compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp
index 49d31bb99..49eb30a83 100644
--- a/compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp
+++ b/compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp
@@ -27,17 +27,13 @@ namespace luci
bool CircleDepthToSpaceGraphBuilder::validate(const ValidateArgs &args) const
{
+ if (!GraphBuilder::validate(args, 1))
+ return false;
+
const auto &inputs = args.op.inputs;
const auto &outputs = args.op.outputs;
const auto *options = args.op.builtin_options.AsDepthToSpaceOptions();
-
- if (inputs.size() != 1)
- return false;
-
- if (outputs.size() != 1)
- return false;
-
const auto &tensors = args.reader.tensors();
if (tensors[outputs[0]]->type != tensors[inputs.at(0)]->type)
diff --git a/compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp b/compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp
index 53f85f2f5..727487c6a 100644
--- a/compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp
@@ -32,6 +32,32 @@ bool CircleDepthwiseConv2DGraphBuilder::validate(const ValidateArgs &args) const
if (args.op.outputs.size() != 1)
return false;
+ const auto &tensors = args.reader.tensors();
+
+ // input shape
+ const auto &input = tensors.at(args.op.inputs.at(0));
+ const auto &input_shape = input->shape;
+
+ // input shape must be rank 4
+ if (input_shape.size() != 4)
+ return false;
+
+ // filter shape
+ const auto &filter = tensors.at(args.op.inputs.at(1));
+ const auto &filter_shape = filter->shape;
+
+ // filter shape must be rank 4
+ if (filter_shape.size() != 4)
+ return false;
+
+ // multiplier
+ const auto *options = args.op.builtin_options.AsDepthwiseConv2DOptions();
+ const auto &multiplier = options->depth_multiplier;
+
+ // the filter is laid out as [1, H, W, C*M] where M is the depth multiplier.
+ if (filter_shape.at(3) != input_shape.at(3) * multiplier)
+ return false;
+
return true;
}
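To make the new multiplier check concrete (hypothetical numbers): an input of shape [1, H, W, 8] with depth_multiplier = 3 requires a filter whose last dimension is 8 * 3 = 24, i.e. a filter shape of [1, Hf, Wf, 24]; anything else now fails validation instead of importing a malformed graph.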
diff --git a/compiler/luci/import/src/Nodes/CircleDequantize.cpp b/compiler/luci/import/src/Nodes/CircleDequantize.cpp
index 1936da97c..3db546bd0 100644
--- a/compiler/luci/import/src/Nodes/CircleDequantize.cpp
+++ b/compiler/luci/import/src/Nodes/CircleDequantize.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleDequantizeGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleDequantizeGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleDiv.cpp b/compiler/luci/import/src/Nodes/CircleDiv.cpp
index 615c224d7..7ea1afd95 100644
--- a/compiler/luci/import/src/Nodes/CircleDiv.cpp
+++ b/compiler/luci/import/src/Nodes/CircleDiv.cpp
@@ -23,13 +23,7 @@ namespace luci
bool CircleDivGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleDivGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleElu.cpp b/compiler/luci/import/src/Nodes/CircleElu.cpp
index 919e95ee4..461da9517 100644
--- a/compiler/luci/import/src/Nodes/CircleElu.cpp
+++ b/compiler/luci/import/src/Nodes/CircleElu.cpp
@@ -25,14 +25,11 @@ namespace luci
bool CircleEluGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
- if (outputs.size() != 1)
- return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor = tensors.at(inputs.at(0));
diff --git a/compiler/luci/import/src/Nodes/CircleEqual.cpp b/compiler/luci/import/src/Nodes/CircleEqual.cpp
index 1db33b8ac..4909692b4 100644
--- a/compiler/luci/import/src/Nodes/CircleEqual.cpp
+++ b/compiler/luci/import/src/Nodes/CircleEqual.cpp
@@ -25,13 +25,10 @@ namespace luci
bool CircleEqualGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
-
- if (inputs.size() != 2)
- {
+ if (!GraphBuilder::validate(args, 2))
return false;
- }
+ const auto &inputs = args.op.inputs;
const auto &tensors = args.reader.tensors();
return tensors[inputs.at(0)]->type == tensors[inputs.at(1)]->type;
diff --git a/compiler/luci/import/src/Nodes/CircleExp.cpp b/compiler/luci/import/src/Nodes/CircleExp.cpp
index 2c031d6b3..64f18fbd4 100644
--- a/compiler/luci/import/src/Nodes/CircleExp.cpp
+++ b/compiler/luci/import/src/Nodes/CircleExp.cpp
@@ -25,10 +25,10 @@ namespace luci
bool CircleExpGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
// input type check
const auto &tensors = args.reader.tensors();
const auto &tensor = tensors.at(inputs.at(0));
diff --git a/compiler/luci/import/src/Nodes/CircleExpandDims.cpp b/compiler/luci/import/src/Nodes/CircleExpandDims.cpp
index ab537c710..ee0fbdc7e 100644
--- a/compiler/luci/import/src/Nodes/CircleExpandDims.cpp
+++ b/compiler/luci/import/src/Nodes/CircleExpandDims.cpp
@@ -25,13 +25,10 @@ namespace luci
bool CircleExpandDimsGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
-
- if (inputs.size() != 2)
- {
+ if (!GraphBuilder::validate(args, 2))
return false;
- }
+ const auto &inputs = args.op.inputs;
const auto &tensors = args.reader.tensors();
return tensors[inputs.at(1)]->type == circle::TensorType_INT32;
diff --git a/compiler/luci/import/src/Nodes/CircleFakeQuant.cpp b/compiler/luci/import/src/Nodes/CircleFakeQuant.cpp
new file mode 100644
index 000000000..7cf40b225
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleFakeQuant.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleFakeQuant.h"
+
+#include <luci/IR/Nodes/CircleFakeQuant.h>
+#include <luci/IR/Nodes/CircleOutput.h>
+
+#include <loco.h>
+#include <oops/UserExn.h>
+
+namespace luci
+{
+
+bool CircleFakeQuantGraphBuilder::validate(const ValidateArgs &args) const
+{
+ return GraphBuilder::validate(args, 1);
+}
+
+CircleNode *CircleFakeQuantGraphBuilder::build_node(const circle::OperatorT &op,
+ const std::vector<CircleNode *> &inputs,
+ loco::Graph *graph) const
+{
+ auto *node = graph->nodes()->create<CircleFakeQuant>();
+ node->inputs(inputs.at(0));
+
+ const auto *options = op.builtin_options.AsFakeQuantOptions();
+ node->min(options->min);
+ node->max(options->max);
+ node->num_bits(options->num_bits);
+ node->narrow_range(options->narrow_range);
+
+ return node;
+}
+
+} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleFill.cpp b/compiler/luci/import/src/Nodes/CircleFill.cpp
index 95d5b876b..9aacddcbe 100644
--- a/compiler/luci/import/src/Nodes/CircleFill.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFill.cpp
@@ -23,13 +23,7 @@ namespace luci
bool CircleFillGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleFillGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleFloor.cpp b/compiler/luci/import/src/Nodes/CircleFloor.cpp
index ce756b3b1..9651259c7 100644
--- a/compiler/luci/import/src/Nodes/CircleFloor.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFloor.cpp
@@ -25,16 +25,8 @@ namespace luci
bool CircleFloorGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- if (inputs.size() != 1)
- return false;
- if (outputs.size() != 1)
- return false;
-
// TODO dtype check
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleFloorGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleFloorDiv.cpp b/compiler/luci/import/src/Nodes/CircleFloorDiv.cpp
index 55f385d60..ce329326a 100644
--- a/compiler/luci/import/src/Nodes/CircleFloorDiv.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFloorDiv.cpp
@@ -25,19 +25,11 @@ namespace luci
bool CircleFloorDivGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- {
+ if (!GraphBuilder::validate(args, 2))
return false;
- }
-
- if (outputs.size() != 1)
- {
- return false;
- }
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor_in_0 = tensors.at(inputs.at(0));
const auto &tensor_in_1 = tensors.at(inputs.at(1));
diff --git a/compiler/luci/import/src/Nodes/CircleFloorMod.cpp b/compiler/luci/import/src/Nodes/CircleFloorMod.cpp
index 2101e417e..d8420a43c 100644
--- a/compiler/luci/import/src/Nodes/CircleFloorMod.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFloorMod.cpp
@@ -25,13 +25,10 @@ namespace luci
bool CircleFloorModGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- if (inputs.size() != 2)
- return false;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
const auto &tensors = args.reader.tensors();
const auto &tensor_in_0 = tensors.at(inputs.at(0));
const auto &tensor_in_1 = tensors.at(inputs.at(1));
diff --git a/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp b/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp
index 17293ad7a..58750d79a 100644
--- a/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp
@@ -27,10 +27,7 @@ namespace luci
bool CircleFullyConnectedGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 3)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 3);
}
CircleNode *CircleFullyConnectedGraphBuilder::build_node(const circle::OperatorT &op,
@@ -42,15 +39,6 @@ CircleNode *CircleFullyConnectedGraphBuilder::build_node(const circle::OperatorT
node->weights(inputs.at(1));
node->bias(inputs.at(2)); // bias is optional
- // TODO Find and move to appropriate place for setting optional input
- if (auto bias = dynamic_cast<luci::CircleOutputExclude *>(node->bias()))
- {
- // bias is not used for type inference, but node itself should have a type
- bias->dtype(loco::DataType::FLOAT32);
-
- // bias is not used for shape inference
- }
-
const auto *options = op.builtin_options.AsFullyConnectedOptions();
node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
node->weights_format(luci_weights_format(options->weights_format));
diff --git a/compiler/luci/import/src/Nodes/CircleGather.cpp b/compiler/luci/import/src/Nodes/CircleGather.cpp
index 75447a38a..8317a3340 100644
--- a/compiler/luci/import/src/Nodes/CircleGather.cpp
+++ b/compiler/luci/import/src/Nodes/CircleGather.cpp
@@ -26,18 +26,14 @@ namespace luci
bool CircleGatherGraphBuilder::validate(const ValidateArgs &args) const
{
+ if (!GraphBuilder::validate(args, 2))
+ return false;
+
const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
const auto *options = args.op.builtin_options.AsGatherOptions();
int32_t axis = options->axis;
- if (inputs.size() != 2)
- return false;
-
- if (outputs.size() != 1)
- return false;
-
if (axis < 0)
axis += inputs.size();
diff --git a/compiler/luci/import/src/Nodes/CircleGatherNd.cpp b/compiler/luci/import/src/Nodes/CircleGatherNd.cpp
index 981adbf63..a4bb26a10 100644
--- a/compiler/luci/import/src/Nodes/CircleGatherNd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleGatherNd.cpp
@@ -27,15 +27,10 @@ namespace luci
bool CircleGatherNdGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- return false;
-
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
auto &indices_tensor = args.reader.tensors()[inputs.at(1)];
if (!(indices_tensor->type == circle::TensorType::TensorType_INT32 ||
diff --git a/compiler/luci/import/src/Nodes/CircleGreater.cpp b/compiler/luci/import/src/Nodes/CircleGreater.cpp
index 1ad0467e4..f9c00346c 100644
--- a/compiler/luci/import/src/Nodes/CircleGreater.cpp
+++ b/compiler/luci/import/src/Nodes/CircleGreater.cpp
@@ -30,17 +30,13 @@ bool CircleGreaterGraphBuilder::validate(const ValidateArgs &args) const
{
LOGGER(l);
+ if (!GraphBuilder::validate(args, 2))
+ return false;
+
auto settings = luci::UserSettings::settings();
const auto &inputs = args.op.inputs;
const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- return false;
-
- if (outputs.size() != 1)
- return false;
-
const auto &tensors = args.reader.tensors();
if (tensors[inputs.at(0)]->type != tensors[inputs.at(1)]->type)
diff --git a/compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp b/compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp
index 0ac63b017..e20038fd9 100644
--- a/compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp
+++ b/compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp
@@ -25,19 +25,11 @@ namespace luci
bool CircleGreaterEqualGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- {
+ if (!GraphBuilder::validate(args, 2))
return false;
- }
-
- if (outputs.size() != 1)
- {
- return false;
- }
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
if (tensors[inputs.at(0)]->type != tensors[inputs.at(1)]->type)
diff --git a/compiler/luci/import/src/Nodes/CircleIf.cpp b/compiler/luci/import/src/Nodes/CircleIf.cpp
index db9ffe1cd..ffdbf0b79 100644
--- a/compiler/luci/import/src/Nodes/CircleIf.cpp
+++ b/compiler/luci/import/src/Nodes/CircleIf.cpp
@@ -70,69 +70,34 @@ bool CircleIfGraphBuilder::validate(const ValidateArgs &args) const
* \- CircleIfOut --- Node ---
*/
-void CircleIfGraphBuilder::build(const circle::OperatorT &op, GraphBuilderContext *context) const
+CircleNode *CircleIfGraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
+ uint32_t input_count = bna.op.inputs.size() - 1;
+ uint32_t output_count = bna.op.outputs.size();
- auto graph = context->graph();
+ auto *node = bna.context->graph()->nodes()->create<CircleIf>(input_count, output_count);
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- const auto &opcodes = context->reader()->opcodes();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
-
- std::vector<CircleNode *> input_nodes;
- for (const int32_t input_tensor_index : inputs)
- {
- input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
- }
-
- uint32_t input_count = inputs.size() - 1;
- uint32_t output_count = outputs.size();
-
- // Create CircleIf
- CircleIf *node = graph->nodes()->create<CircleIf>(input_count, output_count);
-
- node->cond(input_nodes[0]);
+ node->cond(bna.input_nodes[0]);
for (uint32_t idx = 0; idx < input_count; ++idx)
{
- node->input(idx, input_nodes[idx + 1]);
+ node->input(idx, bna.input_nodes[idx + 1]);
}
- const auto *options = op.builtin_options.AsIfOptions();
+ const auto *options = bna.op.builtin_options.AsIfOptions();
node->then_branch(options->then_subgraph_index);
node->else_branch(options->else_subgraph_index);
- assert(outputs.size() > 0);
- {
- // Lets use name of output 0 as If name
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
- node->op_version(opcodes[op.opcode_index].get()->version);
-
- // NOTE We don't set quantization for If itself but to virtual outputs
- }
-
- // Create virtual outputs of If
- for (uint32_t n = 0; n < output_count; ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
+ return node;
+}
- auto *nodeout = graph->nodes()->create<CircleIfOut>();
- copy_tensor_attributes(output_tensor, nodeout);
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
+CircleNode *CircleIfGraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleIfOut>();
- nodeout->input(node);
- nodeout->index(n);
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ return nodeout;
}
} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp b/compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp
index 6349fd3b7..977b53406 100644
--- a/compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp
+++ b/compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp
@@ -25,12 +25,8 @@ namespace luci
bool CircleInstanceNormGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 3)
- return false;
-
// TODO check dtypes
-
- return true;
+ return GraphBuilder::validate(args, 3);
}
CircleNode *CircleInstanceNormGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleL2Normalize.cpp b/compiler/luci/import/src/Nodes/CircleL2Normalize.cpp
index e4fdc200c..7e1faedfb 100644
--- a/compiler/luci/import/src/Nodes/CircleL2Normalize.cpp
+++ b/compiler/luci/import/src/Nodes/CircleL2Normalize.cpp
@@ -25,20 +25,7 @@ namespace luci
bool CircleL2NormalizeGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 1)
- {
- return false;
- }
-
- if (outputs.size() != 1)
- {
- return false;
- }
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleL2NormalizeGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp b/compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp
index 202d9d6fb..849c7c5ed 100644
--- a/compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp
@@ -25,12 +25,8 @@ namespace luci
bool CircleL2Pool2DGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
// TODO check dtypes
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleL2Pool2DGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp b/compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp
index ad4979f39..880fa6428 100644
--- a/compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CircleLeakyReluGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleLeakyReluGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleLess.cpp b/compiler/luci/import/src/Nodes/CircleLess.cpp
index 506036908..f9b99bebe 100644
--- a/compiler/luci/import/src/Nodes/CircleLess.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLess.cpp
@@ -25,19 +25,11 @@ namespace luci
bool CircleLessGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- {
+ if (!GraphBuilder::validate(args, 2))
return false;
- }
-
- if (outputs.size() != 1)
- {
- return false;
- }
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor = tensors.at(inputs.at(0));
diff --git a/compiler/luci/import/src/Nodes/CircleLessEqual.cpp b/compiler/luci/import/src/Nodes/CircleLessEqual.cpp
index 9b4f934a5..bb1712137 100644
--- a/compiler/luci/import/src/Nodes/CircleLessEqual.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLessEqual.cpp
@@ -25,19 +25,11 @@ namespace luci
bool CircleLessEqualGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- {
+ if (!GraphBuilder::validate(args, 2))
return false;
- }
-
- if (outputs.size() != 1)
- {
- return false;
- }
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
if (tensors[inputs.at(0)]->type != tensors[inputs.at(1)]->type)
diff --git a/compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp b/compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp
index 0e32f62de..d03c47d12 100644
--- a/compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp
@@ -25,16 +25,12 @@ namespace luci
bool CircleLocalResponseNormalizationGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
// TODO do attribute checks
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleLocalResponseNormalizationGraphBuilder::build_node(
- const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
+ const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLocalResponseNormalization>();
node->input(inputs.at(0));
diff --git a/compiler/luci/import/src/Nodes/CircleLog.cpp b/compiler/luci/import/src/Nodes/CircleLog.cpp
index 346fc43bb..26b575070 100644
--- a/compiler/luci/import/src/Nodes/CircleLog.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLog.cpp
@@ -25,12 +25,10 @@ namespace luci
bool CircleLogGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 1)
- return false;
- if (args.op.outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
// input type check
// Must be one of bfloat16, half, float32, float64, complex64, complex128.
// Currently circle supports half(float16), float32, float64, complex64.
diff --git a/compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp b/compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp
index ef69e868a..4361db691 100644
--- a/compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp
@@ -25,12 +25,8 @@ namespace luci
bool CircleLogSoftmaxGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
// TODO do attribute checks
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleLogSoftmaxGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp b/compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp
index 7844da0f6..b13fc2735 100644
--- a/compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp
@@ -25,11 +25,11 @@ namespace luci
bool CircleLogicalAndGraphBuilder::validate(const ValidateArgs &args) const
{
- // Only BOOL type is allowed for inputs
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 2)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ // Only BOOL type is allowed for inputs
+ const auto &inputs = args.op.inputs;
const auto &tensors = args.reader.tensors();
for (auto input : inputs)
{
diff --git a/compiler/luci/import/src/Nodes/CircleLogicalNot.cpp b/compiler/luci/import/src/Nodes/CircleLogicalNot.cpp
index 3758642e4..f68218349 100644
--- a/compiler/luci/import/src/Nodes/CircleLogicalNot.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogicalNot.cpp
@@ -25,7 +25,7 @@ namespace luci
bool CircleLogicalNotGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
// Only BOOL type is allowed for the input
diff --git a/compiler/luci/import/src/Nodes/CircleLogicalOr.cpp b/compiler/luci/import/src/Nodes/CircleLogicalOr.cpp
index 1b87e6f9c..8c9023dd3 100644
--- a/compiler/luci/import/src/Nodes/CircleLogicalOr.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogicalOr.cpp
@@ -25,7 +25,7 @@ namespace luci
bool CircleLogicalOrGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
+ if (!GraphBuilder::validate(args, 2))
return false;
// Only BOOL type is allowed for inputs
diff --git a/compiler/luci/import/src/Nodes/CircleLogistic.cpp b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
index 9606e19cd..0f92a9bb4 100644
--- a/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
@@ -25,13 +25,11 @@ namespace luci
bool CircleLogisticGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 1)
- return false;
- const auto &outputs = args.op.outputs;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
if (tensors.at(inputs.at(0))->type != tensors.at(outputs[0])->type)
return false;
diff --git a/compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp b/compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp
index a4a21a8b7..590a07f2d 100644
--- a/compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp
@@ -25,15 +25,11 @@ namespace luci
bool CircleMatrixDiagGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 1)
- return false;
-
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor = tensors.at(inputs.at(0));
diff --git a/compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp b/compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp
index cf0313149..edd7d2ae2 100644
--- a/compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp
@@ -25,15 +25,11 @@ namespace luci
bool CircleMatrixSetDiagGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- return false;
-
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor = tensors.at(inputs.at(0));
diff --git a/compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp b/compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp
index 4bca0f40b..5c03fff18 100644
--- a/compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleMaxPool2DGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleMaxPool2DGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleMean.cpp b/compiler/luci/import/src/Nodes/CircleMean.cpp
index d8fa9a53d..7882f17fc 100644
--- a/compiler/luci/import/src/Nodes/CircleMean.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMean.cpp
@@ -23,10 +23,7 @@ namespace luci
bool CircleMeanGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleMeanGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleMirrorPad.cpp b/compiler/luci/import/src/Nodes/CircleMirrorPad.cpp
index e0ddd4c11..e40ce2249 100644
--- a/compiler/luci/import/src/Nodes/CircleMirrorPad.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMirrorPad.cpp
@@ -25,12 +25,8 @@ namespace luci
bool CircleMirrorPadGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
// TODO check others
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleMirrorPadGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleMul.cpp b/compiler/luci/import/src/Nodes/CircleMul.cpp
index e3c4a7ee5..28421f8c4 100644
--- a/compiler/luci/import/src/Nodes/CircleMul.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMul.cpp
@@ -23,13 +23,7 @@ namespace luci
bool CircleMulGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleMulGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleNeg.cpp b/compiler/luci/import/src/Nodes/CircleNeg.cpp
index a64a69560..9dd1458f4 100644
--- a/compiler/luci/import/src/Nodes/CircleNeg.cpp
+++ b/compiler/luci/import/src/Nodes/CircleNeg.cpp
@@ -24,11 +24,8 @@ namespace luci
{
bool CircleNegGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
// TODO Support type check
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleNegGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp b/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp
index a4ad4a53d..d3d69506b 100644
--- a/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp
+++ b/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp
@@ -61,63 +61,27 @@ bool CircleNonMaxSuppressionV4GraphBuilder::validate(const ValidateArgs &args) c
* We will create multiple NonMaxSuppressionV4Out nodes to emulate this
*/
-void CircleNonMaxSuppressionV4GraphBuilder::build(const circle::OperatorT &op,
- GraphBuilderContext *context) const
+CircleNode *CircleNonMaxSuppressionV4GraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
-
- auto graph = context->graph();
-
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- const auto &opcodes = context->reader()->opcodes();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
-
- std::vector<CircleNode *> input_nodes;
- for (const int32_t input_tensor_index : inputs)
- {
- input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
- }
-
- // Create CircleNonMaxSuppressionV4
- auto node = graph->nodes()->create<CircleNonMaxSuppressionV4>();
- node->boxes(input_nodes[0]);
- node->scores(input_nodes[1]);
- node->max_output_size(input_nodes[2]);
- node->iou_threshold(input_nodes[3]);
- node->score_threshold(input_nodes[4]);
-
- assert(outputs.size() == 2);
- {
- // Let's use name of output 0 as NonMaxSuppressionV4 name
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
- node->op_version(opcodes[op.opcode_index].get()->version);
-
- // NOTE We don't set quantization for NonMaxSuppressionV4 itself but to virtual outputs
- }
-
- // Create virtual outputs of NonMaxSuppressionV4
- for (size_t n = 0; n < outputs.size(); ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
-
- auto *nodeout = graph->nodes()->create<CircleNonMaxSuppressionV4Out>();
- copy_tensor_attributes(output_tensor, nodeout);
-
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
-
- nodeout->input(node);
- nodeout->index(n);
-
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ auto node = bna.context->graph()->nodes()->create<CircleNonMaxSuppressionV4>();
+
+ node->boxes(bna.input_nodes[0]);
+ node->scores(bna.input_nodes[1]);
+ node->max_output_size(bna.input_nodes[2]);
+ node->iou_threshold(bna.input_nodes[3]);
+ node->score_threshold(bna.input_nodes[4]);
+
+ return node;
+}
+
+CircleNode *CircleNonMaxSuppressionV4GraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleNonMaxSuppressionV4Out>();
+
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
+
+ return nodeout;
}
} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV5.cpp b/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV5.cpp
index 241dbf5ff..d797d4cb7 100644
--- a/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV5.cpp
+++ b/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV5.cpp
@@ -63,64 +63,28 @@ bool CircleNonMaxSuppressionV5GraphBuilder::validate(const ValidateArgs &args) c
* We will create multiple NonMaxSuppressionV5Out nodes to emulate this
*/
-void CircleNonMaxSuppressionV5GraphBuilder::build(const circle::OperatorT &op,
- GraphBuilderContext *context) const
+CircleNode *CircleNonMaxSuppressionV5GraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
-
- auto graph = context->graph();
-
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- const auto &opcodes = context->reader()->opcodes();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
-
- std::vector<CircleNode *> input_nodes;
- for (const int32_t input_tensor_index : inputs)
- {
- input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
- }
-
- // Create CircleNonMaxSuppressionV5
- auto node = graph->nodes()->create<CircleNonMaxSuppressionV5>();
- node->boxes(input_nodes[0]);
- node->scores(input_nodes[1]);
- node->max_output_size(input_nodes[2]);
- node->iou_threshold(input_nodes[3]);
- node->score_threshold(input_nodes[4]);
- node->soft_nms_sigma(input_nodes[5]);
-
- assert(outputs.size() == 3);
- {
- // Let's use name of output 0 as NonMaxSuppressionV5 name
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
- node->op_version(opcodes[op.opcode_index].get()->version);
-
- // NOTE We don't set quantization for NonMaxSuppressionV5 itself but to virtual outputs
- }
-
- // Create virtual outputs of NonMaxSuppressionV5
- for (size_t n = 0; n < outputs.size(); ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
-
- auto *nodeout = graph->nodes()->create<CircleNonMaxSuppressionV5Out>();
- copy_tensor_attributes(output_tensor, nodeout);
-
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
-
- nodeout->input(node);
- nodeout->index(n);
-
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ auto node = bna.context->graph()->nodes()->create<CircleNonMaxSuppressionV5>();
+
+ node->boxes(bna.input_nodes[0]);
+ node->scores(bna.input_nodes[1]);
+ node->max_output_size(bna.input_nodes[2]);
+ node->iou_threshold(bna.input_nodes[3]);
+ node->score_threshold(bna.input_nodes[4]);
+ node->soft_nms_sigma(bna.input_nodes[5]);
+
+ return node;
+}
+
+CircleNode *CircleNonMaxSuppressionV5GraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleNonMaxSuppressionV5Out>();
+
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
+
+ return nodeout;
}
} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleNotEqual.cpp b/compiler/luci/import/src/Nodes/CircleNotEqual.cpp
index 77e986de1..a0b8f9e4f 100644
--- a/compiler/luci/import/src/Nodes/CircleNotEqual.cpp
+++ b/compiler/luci/import/src/Nodes/CircleNotEqual.cpp
@@ -25,19 +25,11 @@ namespace luci
bool CircleNotEqualGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- {
+ if (!GraphBuilder::validate(args, 2))
return false;
- }
-
- if (outputs.size() != 1)
- {
- return false;
- }
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
if (tensors[inputs.at(0)]->type != tensors[inputs.at(1)]->type)
diff --git a/compiler/luci/import/src/Nodes/CircleOneHot.cpp b/compiler/luci/import/src/Nodes/CircleOneHot.cpp
index 69294e1ed..3952cc21a 100644
--- a/compiler/luci/import/src/Nodes/CircleOneHot.cpp
+++ b/compiler/luci/import/src/Nodes/CircleOneHot.cpp
@@ -26,17 +26,12 @@ namespace luci
bool CircleOneHotGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- const auto *options = args.op.builtin_options.AsOneHotOptions();
-
// Only 4 inputs (indices, depth, on_value, off_value) are accepted
- if (inputs.size() != 4)
- return false;
-
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 4))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto *options = args.op.builtin_options.AsOneHotOptions();
const auto &tensors = args.reader.tensors();
const auto &indices = tensors.at(inputs.at(0));
const auto &depth = tensors.at(inputs.at(1));
diff --git a/compiler/luci/import/src/Nodes/CirclePRelu.cpp b/compiler/luci/import/src/Nodes/CirclePRelu.cpp
index c07920f7c..7c81f04bb 100644
--- a/compiler/luci/import/src/Nodes/CirclePRelu.cpp
+++ b/compiler/luci/import/src/Nodes/CirclePRelu.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CirclePReluGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CirclePReluGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CirclePad.cpp b/compiler/luci/import/src/Nodes/CirclePad.cpp
index 999173b90..67dce6dee 100644
--- a/compiler/luci/import/src/Nodes/CirclePad.cpp
+++ b/compiler/luci/import/src/Nodes/CirclePad.cpp
@@ -25,12 +25,8 @@ namespace luci
bool CirclePadGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
// TODO do attribute checks
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CirclePadGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CirclePadV2.cpp b/compiler/luci/import/src/Nodes/CirclePadV2.cpp
index 493876e68..84a45722a 100644
--- a/compiler/luci/import/src/Nodes/CirclePadV2.cpp
+++ b/compiler/luci/import/src/Nodes/CirclePadV2.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CirclePadV2GraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 3)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 3);
}
CircleNode *CirclePadV2GraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CirclePow.cpp b/compiler/luci/import/src/Nodes/CirclePow.cpp
index def012614..1d2d41607 100644
--- a/compiler/luci/import/src/Nodes/CirclePow.cpp
+++ b/compiler/luci/import/src/Nodes/CirclePow.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CirclePowGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CirclePowGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleRange.cpp b/compiler/luci/import/src/Nodes/CircleRange.cpp
index 38dc44ed6..d3b5afc95 100644
--- a/compiler/luci/import/src/Nodes/CircleRange.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRange.cpp
@@ -24,11 +24,8 @@ namespace luci
{
bool CircleRangeGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 3)
- return false;
-
// TODO Support type check
- return true;
+ return GraphBuilder::validate(args, 3);
}
CircleNode *CircleRangeGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleRank.cpp b/compiler/luci/import/src/Nodes/CircleRank.cpp
index 12658b192..afebb9509 100644
--- a/compiler/luci/import/src/Nodes/CircleRank.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRank.cpp
@@ -24,13 +24,7 @@ namespace luci
{
bool CircleRankGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleRankGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleReduceAny.cpp b/compiler/luci/import/src/Nodes/CircleReduceAny.cpp
index 21a821951..13205dd7a 100644
--- a/compiler/luci/import/src/Nodes/CircleReduceAny.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReduceAny.cpp
@@ -23,13 +23,11 @@ namespace luci
bool CircleReduceAnyGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- if (inputs.size() != 2)
- return false;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor_0 = tensors.at(inputs.at(0));
const auto &tensor_1 = tensors.at(inputs.at(1));
diff --git a/compiler/luci/import/src/Nodes/CircleReduceProd.cpp b/compiler/luci/import/src/Nodes/CircleReduceProd.cpp
index 5f054586e..3549c1a18 100644
--- a/compiler/luci/import/src/Nodes/CircleReduceProd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReduceProd.cpp
@@ -23,12 +23,10 @@ namespace luci
bool CircleReduceProdGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 2)
- return false;
- if (args.op.outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
const auto &tensors = args.reader.tensors();
const auto &tensor_1 = tensors.at(inputs.at(1));
diff --git a/compiler/luci/import/src/Nodes/CircleRelu.cpp b/compiler/luci/import/src/Nodes/CircleRelu.cpp
index 8e1c32a3a..73b8ffee8 100644
--- a/compiler/luci/import/src/Nodes/CircleRelu.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRelu.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CircleReluGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleReluGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleRelu6.cpp b/compiler/luci/import/src/Nodes/CircleRelu6.cpp
index 0283d7350..ab957eda8 100644
--- a/compiler/luci/import/src/Nodes/CircleRelu6.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRelu6.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CircleRelu6GraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleRelu6GraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleReluN1To1.cpp b/compiler/luci/import/src/Nodes/CircleReluN1To1.cpp
index 7f517bc0d..4987f3be2 100644
--- a/compiler/luci/import/src/Nodes/CircleReluN1To1.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReluN1To1.cpp
@@ -25,15 +25,8 @@ namespace luci
bool CircleReluN1To1GraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
// TODO check dtypes
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleReluN1To1GraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleReshape.cpp b/compiler/luci/import/src/Nodes/CircleReshape.cpp
index 996ae9d20..401dff0fc 100644
--- a/compiler/luci/import/src/Nodes/CircleReshape.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReshape.cpp
@@ -30,6 +30,19 @@ bool CircleReshapeGraphBuilder::validate(const ValidateArgs &args) const
if (args.op.outputs.size() != 1)
return false;
+ // For two inputs, check that the shape input type is S32
+ if (args.op.inputs.size() == 2)
+ {
+ const auto &inputs = args.op.inputs;
+ const auto &tensors = args.reader.tensors();
+ const auto &tensor_in = tensors.at(inputs.at(1));
+
+ // NOTE fix this if there is any other case
+ // TensorFlow Lite and Circle only support S32
+ if (tensor_in->type != circle::TensorType::TensorType_INT32)
+ return false;
+ }
+
return true;
}
@@ -53,6 +66,7 @@ static CircleNode *create_shape_node(const std::vector<int32_t> &shape, loco::Gr
{
shape_node->at<loco::DataType::S32>(i) = shape[i];
}
+ shape_node->name("Reshape/shape");
return shape_node;
}
@@ -73,6 +87,7 @@ CircleNode *CircleReshapeGraphBuilder::build_node(const circle::OperatorT &op,
shape_node = graph->nodes()->create<CircleOutputDummy>();
shape_node->dtype(loco::DataType::S32);
shape_node->rank(0);
+ shape_node->name("Reshape/dummy");
}
}
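
Two details here generalize across the patch: the shape input of a two-input Reshape must be S32 (the only shape type TensorFlow Lite and Circle define), and every node synthesized during import now receives an explicit name. A sketch of how create_shape_node builds its constant under those conventions, with illustrative values (graph is assumed to be in scope):

    auto *shape_node = graph->nodes()->create<CircleConst>();
    shape_node->dtype(loco::DataType::S32);
    shape_node->rank(1);
    shape_node->dim(0) = 2;
    shape_node->size<loco::DataType::S32>(2);
    shape_node->at<loco::DataType::S32>(0) = 4;   // first output dimension
    shape_node->at<loco::DataType::S32>(1) = -1;  // -1: infer the remaining dimension
    shape_node->name("Reshape/shape");            // synthesized nodes get explicit names
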
diff --git a/compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp b/compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp
index 0fccb7b44..c751b245c 100644
--- a/compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp
+++ b/compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp
@@ -16,7 +16,6 @@
#include "luci/Import/Nodes/CircleResizeBilinear.h"
-#include <luci/IR/Nodes/CircleConst.h>
#include <luci/IR/Nodes/CircleResizeBilinear.h>
namespace luci
@@ -24,13 +23,7 @@ namespace luci
bool CircleResizeBilinearGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleResizeBilinearGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp b/compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp
index 324323f59..df7517fe9 100644
--- a/compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp
+++ b/compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp
@@ -16,7 +16,6 @@
#include "luci/Import/Nodes/CircleResizeNearestNeighbor.h"
-#include <luci/IR/Nodes/CircleConst.h>
#include <luci/IR/Nodes/CircleResizeNearestNeighbor.h>
namespace luci
@@ -24,17 +23,11 @@ namespace luci
bool CircleResizeNearestNeighborGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleResizeNearestNeighborGraphBuilder::build_node(
- const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
+ const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleResizeNearestNeighbor>();
node->input(inputs.at(0));
diff --git a/compiler/luci/import/src/Nodes/CircleReverseSequence.cpp b/compiler/luci/import/src/Nodes/CircleReverseSequence.cpp
index ad11d4c63..2fbb7a87c 100644
--- a/compiler/luci/import/src/Nodes/CircleReverseSequence.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReverseSequence.cpp
@@ -25,14 +25,11 @@ namespace luci
bool CircleReverseSequenceGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- return false;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor_in = tensors.at(inputs.at(0));
const auto &tensor_lengths = tensors.at(inputs.at(1));
diff --git a/compiler/luci/import/src/Nodes/CircleReverseV2.cpp b/compiler/luci/import/src/Nodes/CircleReverseV2.cpp
index e2e53bb4b..ca7653201 100644
--- a/compiler/luci/import/src/Nodes/CircleReverseV2.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReverseV2.cpp
@@ -25,14 +25,11 @@ namespace luci
bool CircleReverseV2GraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- return false;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor_in = tensors.at(inputs.at(0));
const auto &tensor_axis = tensors.at(inputs.at(1));
diff --git a/compiler/luci/import/src/Nodes/CircleRound.cpp b/compiler/luci/import/src/Nodes/CircleRound.cpp
index ad77f9f03..d13e0fafe 100644
--- a/compiler/luci/import/src/Nodes/CircleRound.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRound.cpp
@@ -25,14 +25,11 @@ namespace luci
bool CircleRoundGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 1)
- return false;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
// Must be one of the following types
// bfloat16, half (float16), float32, float64, complex64, complex128
// Currently, circle supports float16, float32, complex64
diff --git a/compiler/luci/import/src/Nodes/CircleRsqrt.cpp b/compiler/luci/import/src/Nodes/CircleRsqrt.cpp
index ae05fbbf9..a9ca90832 100644
--- a/compiler/luci/import/src/Nodes/CircleRsqrt.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRsqrt.cpp
@@ -25,10 +25,10 @@ namespace luci
bool CircleRsqrtGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
// Must be one of the following types
// bfloat16, half (float16), float32, float64, complex64, complex128
// Currently, circle supports float16, float32, complex64
@@ -36,6 +36,8 @@ bool CircleRsqrtGraphBuilder::validate(const ValidateArgs &args) const
const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
+ case circle::TensorType_UINT8:
+ case circle::TensorType_INT16:
case circle::TensorType_FLOAT16:
case circle::TensorType_FLOAT32:
case circle::TensorType_COMPLEX64:
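
Rsqrt's type whitelist grows to admit the quantized types. The surrounding switch is only partly visible in the hunk; after the change it presumably reads:

    switch (tensor->type)
    {
      case circle::TensorType_UINT8:  // quantized uint8
      case circle::TensorType_INT16:  // quantized int16
      case circle::TensorType_FLOAT16:
      case circle::TensorType_FLOAT32:
      case circle::TensorType_COMPLEX64:
        break;
      default:
        return false;
    }
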
diff --git a/compiler/luci/import/src/Nodes/CircleScatterNd.cpp b/compiler/luci/import/src/Nodes/CircleScatterNd.cpp
index 7f86aeb74..f8c175110 100644
--- a/compiler/luci/import/src/Nodes/CircleScatterNd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleScatterNd.cpp
@@ -25,10 +25,10 @@ namespace luci
bool CircleScatterNdGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 3)
+ if (!GraphBuilder::validate(args, 3))
return false;
+ const auto &inputs = args.op.inputs;
// indices must have the same type as shape
const auto &tensors = args.reader.tensors();
diff --git a/compiler/luci/import/src/Nodes/CircleSegmentSum.cpp b/compiler/luci/import/src/Nodes/CircleSegmentSum.cpp
index fb84e5d52..bfa333e8d 100644
--- a/compiler/luci/import/src/Nodes/CircleSegmentSum.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSegmentSum.cpp
@@ -25,13 +25,11 @@ namespace luci
bool CircleSegmentSumGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- if (inputs.size() != 2)
- return false;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor_in = tensors.at(inputs.at(0));
const auto &tensor_out = tensors.at(outputs[0]);
diff --git a/compiler/luci/import/src/Nodes/CircleSelect.cpp b/compiler/luci/import/src/Nodes/CircleSelect.cpp
index 1e649f1e0..36a5fa8a8 100644
--- a/compiler/luci/import/src/Nodes/CircleSelect.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSelect.cpp
@@ -25,13 +25,10 @@ namespace luci
bool CircleSelectGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- if (inputs.size() != 3)
- return false;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 3))
return false;
+ const auto &inputs = args.op.inputs;
const auto &tensors = args.reader.tensors();
const auto &tensor = tensors.at(inputs.at(0));
if (tensor->type != circle::TensorType_BOOL)
diff --git a/compiler/luci/import/src/Nodes/CircleSelectV2.cpp b/compiler/luci/import/src/Nodes/CircleSelectV2.cpp
index e6dd04de0..556c8fa33 100644
--- a/compiler/luci/import/src/Nodes/CircleSelectV2.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSelectV2.cpp
@@ -25,13 +25,10 @@ namespace luci
bool CircleSelectV2GraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- if (inputs.size() != 3)
- return false;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 3))
return false;
+ const auto &inputs = args.op.inputs;
const auto &tensors = args.reader.tensors();
const auto &condition = tensors.at(inputs.at(0));
if (condition->type != circle::TensorType_BOOL)
diff --git a/compiler/luci/import/src/Nodes/CircleShape.cpp b/compiler/luci/import/src/Nodes/CircleShape.cpp
index bd7dfc9d9..86c0bf59b 100644
--- a/compiler/luci/import/src/Nodes/CircleShape.cpp
+++ b/compiler/luci/import/src/Nodes/CircleShape.cpp
@@ -25,16 +25,8 @@ namespace luci
bool CircleShapeGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
- if (inputs.size() != 1)
- return false;
- if (outputs.size() != 1)
- return false;
-
// TODO check shape, dtype
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleShapeGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleSin.cpp b/compiler/luci/import/src/Nodes/CircleSin.cpp
index 4b245ef6b..22f461123 100644
--- a/compiler/luci/import/src/Nodes/CircleSin.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSin.cpp
@@ -25,12 +25,10 @@ namespace luci
bool CircleSinGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 1)
- return false;
- if (args.op.outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
// input type check
const auto &tensors = args.reader.tensors();
const auto &tensor = tensors.at(inputs.at(0));
diff --git a/compiler/luci/import/src/Nodes/CircleSlice.cpp b/compiler/luci/import/src/Nodes/CircleSlice.cpp
index 8601fbf21..4166040b3 100644
--- a/compiler/luci/import/src/Nodes/CircleSlice.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSlice.cpp
@@ -27,14 +27,8 @@ namespace luci
bool CircleSliceGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 3)
- return false;
- if (args.op.outputs.size() != 1)
- return false;
-
// TODO check shapes and types
-
- return true;
+ return GraphBuilder::validate(args, 3);
}
CircleNode *CircleSliceGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleSoftmax.cpp b/compiler/luci/import/src/Nodes/CircleSoftmax.cpp
index 0ef0b5418..e79914455 100644
--- a/compiler/luci/import/src/Nodes/CircleSoftmax.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSoftmax.cpp
@@ -25,12 +25,8 @@ namespace luci
bool CircleSoftmaxGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
// TODO do attribute checks
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleSoftmaxGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp b/compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp
index 8ccd55dc6..2152b65c9 100644
--- a/compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp
@@ -27,13 +27,8 @@ namespace luci
bool CircleSpaceToDepthGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 1)
- return false;
-
// TODO do attribute checks
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleSpaceToDepthGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleSparseToDense.cpp b/compiler/luci/import/src/Nodes/CircleSparseToDense.cpp
index ac756b1f3..ce0688bb9 100644
--- a/compiler/luci/import/src/Nodes/CircleSparseToDense.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSparseToDense.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleSparseToDenseGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 4)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 4);
}
CircleNode *CircleSparseToDenseGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleSplit.cpp b/compiler/luci/import/src/Nodes/CircleSplit.cpp
index 07b6cc939..d0a24aae3 100644
--- a/compiler/luci/import/src/Nodes/CircleSplit.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSplit.cpp
@@ -58,62 +58,27 @@ bool CircleSplitGraphBuilder::validate(const ValidateArgs &args) const
* \- CircleSplitOut --- FullyConnected ---
*/
-void CircleSplitGraphBuilder::build(const circle::OperatorT &op, GraphBuilderContext *context) const
+CircleNode *CircleSplitGraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
+ auto node = bna.context->graph()->nodes()->create<CircleSplit>();
- auto graph = context->graph();
+ node->split_dim(bna.input_nodes[0]);
+ node->input(bna.input_nodes[1]);
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- const auto &opcodes = context->reader()->opcodes();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
+ const auto *options = bna.op.builtin_options.AsSplitOptions();
+ node->num_split(options->num_splits);
- std::vector<CircleNode *> input_nodes;
- for (const int32_t input_tensor_index : inputs)
- {
- input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
- }
+ return node;
+}
- // Create CircleSplit
- auto node = graph->nodes()->create<CircleSplit>();
- node->split_dim(input_nodes[0]);
- node->input(input_nodes[1]);
+CircleNode *CircleSplitGraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleSplitOut>();
- const auto *options = op.builtin_options.AsSplitOptions();
- node->num_split(options->num_splits);
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
- assert(outputs.size() > 0);
- assert(int32_t(outputs.size()) == options->num_splits);
- {
- // Let's use name of output 0 as Split name
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
- node->op_version(opcodes[op.opcode_index].get()->version);
-
- // NOTE We don't set quantization for Split itself but to virtual outputs
- }
-
- // Create virtual outputs of Split
- for (int32_t n = 0; n < options->num_splits; ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
-
- auto *nodeout = graph->nodes()->create<CircleSplitOut>();
- copy_tensor_attributes(output_tensor, nodeout);
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
-
- nodeout->input(node);
- nodeout->index(n);
-
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ return nodeout;
}
} // namespace luci
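
With the hand-written build() gone, the imported shape of SPLIT stays the same: one CircleSplit feeding num_splits virtual CircleSplitOut nodes, each tagged with its index. A small helper like the following is the usual way consumers pick out a particular output (a sketch, not part of this patch; loco::succs is the standard successor query):

    luci::CircleSplitOut *split_out(luci::CircleSplit *split, int32_t index)
    {
      for (auto *succ : loco::succs(split))
      {
        auto *out = dynamic_cast<luci::CircleSplitOut *>(succ);
        if (out != nullptr && out->index() == index)
          return out;
      }
      return nullptr;
    }
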
diff --git a/compiler/luci/import/src/Nodes/CircleSplitV.cpp b/compiler/luci/import/src/Nodes/CircleSplitV.cpp
index 7c6e83e17..76cbf7046 100644
--- a/compiler/luci/import/src/Nodes/CircleSplitV.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSplitV.cpp
@@ -58,64 +58,30 @@ bool CircleSplitVGraphBuilder::validate(const ValidateArgs &args) const
* \- CircleSplitVOut --- FullyConnected ---
*/
-void CircleSplitVGraphBuilder::build(const circle::OperatorT &op,
- GraphBuilderContext *context) const
+CircleNode *CircleSplitVGraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
-
- auto graph = context->graph();
-
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- const auto &opcodes = context->reader()->opcodes();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
-
- std::vector<CircleNode *> input_nodes;
- for (const int32_t input_tensor_index : inputs)
- {
- input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
- }
-
- // Create CircleSplitV
- auto node = graph->nodes()->create<CircleSplitV>();
- node->input(input_nodes[0]);
- node->size_splits(input_nodes[1]);
- node->split_dim(input_nodes[2]);
-
- const auto *options = op.builtin_options.AsSplitVOptions();
+ auto node = bna.context->graph()->nodes()->create<CircleSplitV>();
+
+ node->input(bna.input_nodes[0]);
+ node->size_splits(bna.input_nodes[1]);
+ node->split_dim(bna.input_nodes[2]);
+
+ const auto *options = bna.op.builtin_options.AsSplitVOptions();
node->num_split(options->num_splits);
- assert(outputs.size() > 0);
- assert(int32_t(outputs.size()) == options->num_splits);
- {
- // Let's use name of output 0 as Split name
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
- node->op_version(opcodes[op.opcode_index].get()->version);
-
- // NOTE We don't set quantization for Split itself but to virtual outputs
- }
-
- // Create virtual outputs of Split
- for (int32_t n = 0; n < options->num_splits; ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
-
- auto *nodeout = graph->nodes()->create<CircleSplitVOut>();
- copy_tensor_attributes(output_tensor, nodeout);
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
-
- nodeout->input(node);
- nodeout->index(n);
-
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ assert(int32_t(bna.op.outputs.size()) == options->num_splits);
+
+ return node;
+}
+
+CircleNode *CircleSplitVGraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleSplitVOut>();
+
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
+
+ return nodeout;
}
} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleSqrt.cpp b/compiler/luci/import/src/Nodes/CircleSqrt.cpp
index c8beaee0d..b1fdf7996 100644
--- a/compiler/luci/import/src/Nodes/CircleSqrt.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSqrt.cpp
@@ -25,10 +25,7 @@ namespace luci
bool CircleSqrtGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleSqrtGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/Nodes/CircleSquare.cpp b/compiler/luci/import/src/Nodes/CircleSquare.cpp
index b5ba048d7..7ff2b84e6 100644
--- a/compiler/luci/import/src/Nodes/CircleSquare.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSquare.cpp
@@ -25,10 +25,10 @@ namespace luci
bool CircleSquareGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
// Must be one of the following types
// bfloat16, half (float16), float32, float64, complex64, complex128
// Currently, circle supports float16, float32, complex64
diff --git a/compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp b/compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp
index 6deae94c5..f4e193713 100644
--- a/compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp
@@ -25,15 +25,11 @@ namespace luci
bool CircleSquaredDifferenceGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- return false;
-
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
// Inputs must be one of the following types
// bfloat16, half(float16), float32, float64, int32, int64, complex64, complex128
const auto &tensors = args.reader.tensors();
diff --git a/compiler/luci/import/src/Nodes/CircleSqueeze.cpp b/compiler/luci/import/src/Nodes/CircleSqueeze.cpp
index 32792c266..d24d8166c 100644
--- a/compiler/luci/import/src/Nodes/CircleSqueeze.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSqueeze.cpp
@@ -16,7 +16,6 @@
#include "luci/Import/Nodes/CircleSqueeze.h"
-#include <luci/IR/Nodes/CircleConst.h>
#include <luci/IR/Nodes/CircleSqueeze.h>
namespace luci
@@ -24,13 +23,7 @@ namespace luci
bool CircleSqueezeGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleSqueezeGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleStridedSlice.cpp b/compiler/luci/import/src/Nodes/CircleStridedSlice.cpp
index 8f943a682..ca8259cac 100644
--- a/compiler/luci/import/src/Nodes/CircleStridedSlice.cpp
+++ b/compiler/luci/import/src/Nodes/CircleStridedSlice.cpp
@@ -27,14 +27,8 @@ namespace luci
bool CircleStridedSliceGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 4)
- return false;
- if (args.op.outputs.size() != 1)
- return false;
-
// TODO check shapes and types
-
- return true;
+ return GraphBuilder::validate(args, 4);
}
CircleNode *CircleStridedSliceGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleSub.cpp b/compiler/luci/import/src/Nodes/CircleSub.cpp
index 9acf83d40..c3978f218 100644
--- a/compiler/luci/import/src/Nodes/CircleSub.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSub.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CircleSubGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleSubGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleSum.cpp b/compiler/luci/import/src/Nodes/CircleSum.cpp
index bd3cb6239..e348a62d9 100644
--- a/compiler/luci/import/src/Nodes/CircleSum.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSum.cpp
@@ -23,10 +23,7 @@ namespace luci
bool CircleSumGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleSumGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleTanh.cpp b/compiler/luci/import/src/Nodes/CircleTanh.cpp
index 018f5701b..95625a0e4 100644
--- a/compiler/luci/import/src/Nodes/CircleTanh.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTanh.cpp
@@ -25,13 +25,11 @@ namespace luci
bool CircleTanhGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- if (inputs.size() != 1)
- return false;
- const auto &outputs = args.op.outputs;
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
if (tensors.at(inputs.at(0))->type != tensors.at(outputs[0])->type)
return false;
diff --git a/compiler/luci/import/src/Nodes/CircleTile.cpp b/compiler/luci/import/src/Nodes/CircleTile.cpp
index bc6f320ba..6da44130c 100644
--- a/compiler/luci/import/src/Nodes/CircleTile.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTile.cpp
@@ -25,15 +25,11 @@ namespace luci
bool CircleTileGraphBuilder::validate(const ValidateArgs &args) const
{
- auto inputs = args.op.inputs;
- auto outputs = args.op.outputs;
-
- if (inputs.size() != 2)
- return false;
-
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 2))
return false;
+ auto inputs = args.op.inputs;
+ auto outputs = args.op.outputs;
// Multiples (inputs.at(1)) must be one of the following types
// int32, int64
const auto &tensors = args.reader.tensors();
diff --git a/compiler/luci/import/src/Nodes/CircleTopKV2.cpp b/compiler/luci/import/src/Nodes/CircleTopKV2.cpp
index f0677de86..49f858798 100644
--- a/compiler/luci/import/src/Nodes/CircleTopKV2.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTopKV2.cpp
@@ -59,59 +59,24 @@ bool CircleTopKV2GraphBuilder::validate(const ValidateArgs &args) const
* \- CircleTopKV2Out --- FullyConnected ---
*/
-void CircleTopKV2GraphBuilder::build(const circle::OperatorT &op,
- GraphBuilderContext *context) const
+CircleNode *CircleTopKV2GraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
-
- auto graph = context->graph();
-
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- const auto &opcodes = context->reader()->opcodes();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
-
- std::vector<CircleNode *> input_nodes;
- for (const int32_t input_tensor_index : inputs)
- {
- input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
- }
-
- // Create CircleTopKV2
- auto node = graph->nodes()->create<CircleTopKV2>();
- node->input(input_nodes[0]);
- node->k(input_nodes[1]);
-
- assert(outputs.size() == 2);
- {
- // Let's use name of output 0 as TopKV2 name
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
- node->op_version(opcodes[op.opcode_index].get()->version);
-
- // NOTE We don't set quantization for TopKV2 itself but to virtual outputs
- }
-
- // Create virtual outputs of TopKV2
- for (size_t n = 0; n < outputs.size(); ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
-
- auto *nodeout = graph->nodes()->create<CircleTopKV2Out>();
- copy_tensor_attributes(output_tensor, nodeout);
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
-
- nodeout->input(node);
- nodeout->index(n);
-
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ auto node = bna.context->graph()->nodes()->create<CircleTopKV2>();
+
+ node->input(bna.input_nodes[0]);
+ node->k(bna.input_nodes[1]);
+
+ return node;
+}
+
+CircleNode *CircleTopKV2GraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleTopKV2Out>();
+
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
+
+ return nodeout;
}
} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleTranspose.cpp b/compiler/luci/import/src/Nodes/CircleTranspose.cpp
index cc3153085..01095239e 100644
--- a/compiler/luci/import/src/Nodes/CircleTranspose.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTranspose.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CircleTransposeGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 2)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 2);
}
CircleNode *CircleTransposeGraphBuilder::build_node(const circle::OperatorT &op,
diff --git a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
index c280faaf5..5a60e2f54 100644
--- a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
@@ -61,16 +61,15 @@ CircleNode *CircleTransposeConvGraphBuilder::build_node(const circle::OperatorT
node->filter(inputs.at(1));
node->outBackprop(inputs.at(2));
if (inputs.size() == 3)
- node->bias(graph->nodes()->create<CircleOutputExclude>());
- else
- node->bias(inputs.at(3));
-
- if (auto bias = dynamic_cast<luci::CircleOutputExclude *>(node->bias()))
{
- // CircleOutputExclude doesn't need a type, but since all nodes must have a type, a dummy type
- // is inserted.
+ auto *bias = graph->nodes()->create<CircleOutputExclude>();
+ // CircleOutputExclude doesn't need a type, but since all nodes must have a type,
+ // a dummy type is inserted.
bias->dtype(loco::DataType::FLOAT32);
+ node->bias(bias);
}
+ else
+ node->bias(inputs.at(3));
const auto *options = op.builtin_options.AsTransposeConvOptions();
node->padding(luci_padding(options->padding));
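
TransposeConv's bias is optional. The rewrite creates the CircleOutputExclude placeholder and types it within a single branch, instead of assigning first and fishing the placeholder back out with a dynamic_cast. The general recipe for an optional input, as a sketch (the helper name is illustrative):

    loco::Node *optional_input(const std::vector<luci::CircleNode *> &inputs,
                               uint32_t idx, loco::Graph *graph)
    {
      if (idx < inputs.size())
        return inputs.at(idx);
      // Substitute a placeholder when the model omits the input; give it a
      // dummy dtype so type inference does not trip on it.
      auto *excluded = graph->nodes()->create<luci::CircleOutputExclude>();
      excluded->dtype(loco::DataType::FLOAT32);
      return excluded;
    }
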
diff --git a/compiler/luci/import/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp b/compiler/luci/import/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
index c41cf4def..d9cc3f8d0 100644
--- a/compiler/luci/import/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
+++ b/compiler/luci/import/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
@@ -25,14 +25,11 @@ namespace luci
bool CircleUnidirectionalSequenceLSTMGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 24)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 24);
}
CircleNode *CircleUnidirectionalSequenceLSTMGraphBuilder::build_node(
- const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
+ const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleUnidirectionalSequenceLSTM>();
node->input(inputs.at(0));
@@ -59,16 +56,6 @@ CircleNode *CircleUnidirectionalSequenceLSTMGraphBuilder::build_node(
node->forget_layer_norm_coefficients(inputs.at(21)); // Optional
node->cell_layer_norm_coefficients(inputs.at(22)); // Optional
node->output_layer_norm_coefficients(inputs.at(23)); // Optional
- const std::vector<int32_t> optionals = {1, 5, 9, 10, 11, 12, 16, 17, 20, 21, 22, 23};
- for (auto optional : optionals)
- {
- if (auto inp = dynamic_cast<luci::CircleOutputExclude *>(node->arg(optional)))
- {
- // CircleOutputExclude doesn't need a type, but since all nodes must have a type, a dummy type
- // is inserted.
- inp->dtype(loco::DataType::FLOAT32);
- }
- }
const auto *options = op.builtin_options.AsUnidirectionalSequenceLSTMOptions();
node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
diff --git a/compiler/luci/import/src/Nodes/CircleUnique.cpp b/compiler/luci/import/src/Nodes/CircleUnique.cpp
index 5e79a2920..f6914c24a 100644
--- a/compiler/luci/import/src/Nodes/CircleUnique.cpp
+++ b/compiler/luci/import/src/Nodes/CircleUnique.cpp
@@ -35,55 +35,26 @@ bool CircleUniqueGraphBuilder::validate(const ValidateArgs &args) const
return true;
}
-void CircleUniqueGraphBuilder::build(const circle::OperatorT &op,
- GraphBuilderContext *context) const
+CircleNode *CircleUniqueGraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
+ auto node = bna.context->graph()->nodes()->create<CircleUnique>();
- auto graph = context->graph();
+ node->input(bna.input_nodes[0]);
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
+ const auto *options = bna.op.builtin_options.AsUniqueOptions();
+ node->idx_out_type(luci_datatype(options->idx_out_type));
- std::vector<CircleNode *> input_nodes;
- for (const int32_t input_tensor_index : inputs)
- {
- input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
- }
-
- // Create CircleUnique
- auto node = graph->nodes()->create<CircleUnique>();
- node->input(input_nodes[0]);
-
- const auto *options = op.builtin_options.AsUniqueOptions();
- node->output_type(luci_datatype(options->idx_out_type));
-
- assert(int32_t(outputs.size()) == 2);
- // Let's use name of output 0 as Unique name
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
-
- // Create virtual outputs of Unique
- for (int32_t n = 0; n < 2; ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
+ return node;
+}
- auto *nodeout = graph->nodes()->create<CircleUniqueOut>();
- copy_tensor_attributes(output_tensor, nodeout);
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
+CircleNode *CircleUniqueGraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleUniqueOut>();
- nodeout->input(node);
- nodeout->index(n);
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ return nodeout;
}
} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleUnpack.cpp b/compiler/luci/import/src/Nodes/CircleUnpack.cpp
index 9e7f3d3e1..9bfc76b57 100644
--- a/compiler/luci/import/src/Nodes/CircleUnpack.cpp
+++ b/compiler/luci/import/src/Nodes/CircleUnpack.cpp
@@ -88,64 +88,27 @@ bool CircleUnpackGraphBuilder::validate(const ValidateArgs &args) const
* \- CircleUnpackOut --- FullyConnected ---
*/
-void CircleUnpackGraphBuilder::build(const circle::OperatorT &op,
- GraphBuilderContext *context) const
+CircleNode *CircleUnpackGraphBuilder::build_node(const BuildNodeArgs &bna) const
{
- assert(context != nullptr);
+ auto node = bna.context->graph()->nodes()->create<CircleUnpack>();
- auto graph = context->graph();
+ node->value(bna.input_nodes[0]);
- const std::vector<int32_t> &inputs = op.inputs;
- const std::vector<int32_t> &outputs = op.outputs;
- const auto &tensors = context->reader()->tensors();
- const auto &opcodes = context->reader()->opcodes();
- auto tensors_ptr = context->reader()->tensors_ptr();
- assert(tensors_ptr != nullptr);
-
- // NOTE Unpack has only one input so running a loop is not necessary
- // This is provided as a reference for other Ops
- std::vector<CircleNode *> input_nodes;
- for (const int32_t input_tensor_index : inputs)
- {
- input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
- }
-
- // Create CircleUnpack
- CircleUnpack *node = graph->nodes()->create<CircleUnpack>();
- node->value(input_nodes[0]);
-
- const auto *options = op.builtin_options.AsUnpackOptions();
+ const auto *options = bna.op.builtin_options.AsUnpackOptions();
node->num(options->num);
node->axis(options->axis);
- assert(outputs.size() > 0);
- {
- // Let's use name of output 0 as Unpack name
- const circle::TensorT &output_tensor = *tensors[outputs[0]];
- node->name(tensor_name(output_tensor));
- node->op_version(opcodes[op.opcode_index].get()->version);
-
- // NOTE We don't set quantization for Unpack itself but to virtual outputs
- }
-
- // Create virtual outputs of Unpack
- for (int32_t n = 0; n < options->num; ++n)
- {
- const circle::TensorT &output_tensor = *tensors[outputs[n]];
+ return node;
+}
- auto *nodeout = graph->nodes()->create<CircleUnpackOut>();
- copy_tensor_attributes(output_tensor, nodeout);
- // mark shape_status
- if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
- nodeout->shape_status(ShapeStatus::NOSHAPE);
- else
- nodeout->shape_status(ShapeStatus::VALID);
+CircleNode *CircleUnpackGraphBuilder::build_out(const BuildOutArgs &boa) const
+{
+ auto *nodeout = boa.node->graph()->nodes()->create<CircleUnpackOut>();
- nodeout->input(node);
- nodeout->index(n);
+ nodeout->input(boa.node);
+ nodeout->index(boa.index);
- context->nodefinder()->enroll(outputs[n], nodeout);
- }
+ return nodeout;
}
} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleWhere.cpp b/compiler/luci/import/src/Nodes/CircleWhere.cpp
index f4c5f0c66..8e4f1a0c4 100644
--- a/compiler/luci/import/src/Nodes/CircleWhere.cpp
+++ b/compiler/luci/import/src/Nodes/CircleWhere.cpp
@@ -25,15 +25,11 @@ namespace luci
bool CircleWhereGraphBuilder::validate(const ValidateArgs &args) const
{
- const auto &inputs = args.op.inputs;
- const auto &outputs = args.op.outputs;
-
- if (inputs.size() != 1)
- return false;
-
- if (outputs.size() != 1)
+ if (!GraphBuilder::validate(args, 1))
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
const auto &tensors = args.reader.tensors();
const auto &tensor_condition = tensors.at(inputs.at(0));
const auto &tensor_out = tensors.at(outputs[0]);
diff --git a/compiler/luci/import/src/Nodes/CircleWhile.cpp b/compiler/luci/import/src/Nodes/CircleWhile.cpp
index aead25071..26147562f 100644
--- a/compiler/luci/import/src/Nodes/CircleWhile.cpp
+++ b/compiler/luci/import/src/Nodes/CircleWhile.cpp
@@ -58,7 +58,8 @@ bool CircleWhileGraphBuilder::validate(const ValidateArgs &args) const
* \- CircleWhileOut --- Node ---
*/
-void CircleWhileGraphBuilder::build(const circle::OperatorT &op, GraphBuilderContext *context) const
+CircleNode *CircleWhileGraphBuilder::build(const circle::OperatorT &op,
+ GraphBuilderContext *context) const
{
assert(context != nullptr);
@@ -118,6 +119,8 @@ void CircleWhileGraphBuilder::build(const circle::OperatorT &op, GraphBuilderCon
context->nodefinder()->enroll(outputs[n], nodeout);
}
+
+ return node;
}
} // namespace luci
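
Unlike the multiple-output ops above, While keeps its hand-written build(): its outputs are tied to the cond/body subgraph signatures rather than to a simple per-output loop. Only the signature changes, to conform to what is presumably the new builder interface:

    struct GraphBuilderBase
    {
      // Assumed declaration: build() now returns the created node instead of void.
      virtual CircleNode *build(const circle::OperatorT &op,
                                GraphBuilderContext *context) const = 0;
      virtual ~GraphBuilderBase() = default;
    };
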
diff --git a/compiler/luci/import/src/Nodes/CircleZerosLike.cpp b/compiler/luci/import/src/Nodes/CircleZerosLike.cpp
index e60424def..ddb05e8a4 100644
--- a/compiler/luci/import/src/Nodes/CircleZerosLike.cpp
+++ b/compiler/luci/import/src/Nodes/CircleZerosLike.cpp
@@ -25,13 +25,7 @@ namespace luci
bool CircleZerosLikeGraphBuilder::validate(const ValidateArgs &args) const
{
- if (args.op.inputs.size() != 1)
- return false;
-
- if (args.op.outputs.size() != 1)
- return false;
-
- return true;
+ return GraphBuilder::validate(args, 1);
}
CircleNode *CircleZerosLikeGraphBuilder::build_node(const circle::OperatorT &,
diff --git a/compiler/luci/import/src/PostImport.cpp b/compiler/luci/import/src/PostImport.cpp
index f436b48e8..63b16bb95 100644
--- a/compiler/luci/import/src/PostImport.cpp
+++ b/compiler/luci/import/src/PostImport.cpp
@@ -130,7 +130,10 @@ private:
namespace
{
/**
- * @brief ValidateNodeProp will validate inter graph connections for each Nodes
+ * @brief ValidateNodeProp will validate inter-graph connections for each node.
+ * @note Here, only loco::GraphInput and loco::GraphOutput are validated,
+ *       since this class is for checking inter-graph connections.
+ *       CircleNodes such as CircleInput and CircleOutput will be validated in later steps.
*/
class ValidateNodeProp final : public luci::CircleNodeMutableVisitor<void>
{
@@ -172,9 +175,19 @@ public:
auto then_graph_output = then_graph_outputs->at(then_out->index());
auto else_graph_output = else_graph_outputs->at(else_out->index());
- if (!(*then_graph_output->shape() == *else_graph_output->shape()))
+ if (then_graph_output->shape()->rank() != else_graph_output->shape()->rank())
{
- INTERNAL_EXN_V("CircleIf THEN and ELSE Graph Output shape mismatch ", idx);
+ INTERNAL_EXN_V("CircleIf THEN and ELSE Graph Output rank mismatch ", idx);
+ }
+ for (uint32_t i = 0; i < then_graph_output->shape()->rank(); ++i)
+ {
+ if (then_graph_output->shape()->dim(i).known() &&
+ else_graph_output->shape()->dim(i).known() &&
+ then_graph_output->shape()->dim(i).value() !=
+ else_graph_output->shape()->dim(i).value())
+ {
+ INTERNAL_EXN_V("CircleIf THEN and ELSE Graph Output dimension mismatch ", idx);
+ }
}
if (then_graph_output->dtype() != else_graph_output->dtype())
{
@@ -231,18 +244,20 @@ public:
auto cond_graph_input = cond_graph_inputs->at(cond_in->index());
auto body_graph_input = body_graph_inputs->at(body_in->index());
- if ((cond_in->rank() != body_in->rank()))
+ if (cond_graph_input->shape()->rank() != body_graph_input->shape()->rank())
{
- INTERNAL_EXN_V("CircleWhile COND input and BODY input shape mismatch ", idx);
+ INTERNAL_EXN_V("CircleWhile COND input and BODY input rank mismatch ", idx);
}
- if (cond_in->rank() > 0 && body_in->rank() > 0)
+ for (uint32_t i = 0; i < cond_graph_input->shape()->rank(); ++i)
{
- if (!(*cond_graph_input->shape() == *body_graph_input->shape()))
+ if (cond_graph_input->shape()->dim(i).known() &&
+ body_graph_input->shape()->dim(i).known() &&
+ cond_graph_input->shape()->dim(i).value() != body_graph_input->shape()->dim(i).value())
{
- INTERNAL_EXN_V("CircleWhile COND input and BODY input shape mismatch ", idx);
+ INTERNAL_EXN_V("CircleWhile COND input and BODY input dimension mismatch ", idx);
}
}
- if (cond_in->dtype() != body_in->dtype())
+ if (cond_graph_input->dtype() != body_graph_input->dtype())
{
INTERNAL_EXN_V("CircleWhile COND input and BODY input type mismatch ", idx);
}
@@ -257,18 +272,20 @@ public:
auto cond_graph_input = cond_graph_inputs->at(cond_in->index());
auto body_graph_output = body_graph_outputs->at(body_out->index());
- if ((cond_in->rank() != body_out->rank()))
+ if (cond_graph_input->shape()->rank() != body_graph_output->shape()->rank())
{
- INTERNAL_EXN_V("CircleWhile COND input and BODY output shape mismatch ", idx);
+ INTERNAL_EXN_V("CircleWhile COND input and BODY output rank mismatch ", idx);
}
- if (cond_in->rank() > 0 && body_out->rank() > 0)
+ for (uint32_t i = 0; i < cond_graph_input->shape()->rank(); ++i)
{
- if (!(*cond_graph_input->shape() == *body_graph_output->shape()))
+ if (cond_graph_input->shape()->dim(i).known() &&
+ body_graph_output->shape()->dim(i).known() &&
+ cond_graph_input->shape()->dim(i).value() != body_graph_output->shape()->dim(i).value())
{
- INTERNAL_EXN_V("CircleWhile COND input and BODY output shape mismatch ", idx);
+ INTERNAL_EXN_V("CircleWhile COND input and BODY output dimension mismatch ", idx);
}
}
- if (cond_in->dtype() != body_out->dtype())
+ if (cond_graph_input->dtype() != body_graph_output->dtype())
{
INTERNAL_EXN_V("CircleWhile COND input and BODY output type mismatch ", idx);
}
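
All three hunks replace exact shape equality with a rank check plus a dimension-by-dimension comparison that skips unknown dimensions, so graphs with dynamic dimensions are no longer rejected. Factored out, the predicate would look roughly like this (the helper is an illustration, not part of the patch):

    bool compatible_shapes(const loco::TensorShape &a, const loco::TensorShape &b)
    {
      if (a.rank() != b.rank())
        return false;
      for (uint32_t i = 0; i < a.rank(); ++i)
      {
        // Unknown dimensions act as wildcards: compare only when both
        // sides carry a known value.
        if (a.dim(i).known() && b.dim(i).known() && a.dim(i).value() != b.dim(i).value())
          return false;
      }
      return true;
    }

    // e.g. compatible_shapes(*cond_graph_input->shape(), *body_graph_input->shape())
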
diff --git a/compiler/luci/lang/CMakeLists.txt b/compiler/luci/lang/CMakeLists.txt
index 32d0a890d..c618fdd6f 100644
--- a/compiler/luci/lang/CMakeLists.txt
+++ b/compiler/luci/lang/CMakeLists.txt
@@ -7,6 +7,7 @@ target_include_directories(luci_lang PRIVATE src)
target_include_directories(luci_lang PUBLIC include)
target_link_libraries(luci_lang PUBLIC loco)
target_link_libraries(luci_lang PUBLIC oops)
+target_link_libraries(luci_lang PUBLIC nncc_coverage)
target_link_libraries(luci_lang PRIVATE logo)
target_link_libraries(luci_lang PRIVATE nncc_common)
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodeDecl.h b/compiler/luci/lang/include/luci/IR/CircleNodeDecl.h
index e6410d154..edec9d18b 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodeDecl.h
+++ b/compiler/luci/lang/include/luci/IR/CircleNodeDecl.h
@@ -20,7 +20,6 @@
#include <loco/IR/Dialect.h>
#include <loco/IR/Node.h>
#include <loco/IR/NodeMixins.h>
-#include <luci/IR/CircleShapeSignature.h>
#include <luci/IR/PropertyShapeStatus.h>
#include "CircleOpcode.h"
@@ -62,9 +61,6 @@ struct CircleNode : public loco::Node,
_sparsityparam = std::move(sparsityparam);
}
- const ShapeSignature &shape_signature(void) const { return _shape_signature; }
- void shape_signature(const ShapeSignature &ss) { _shape_signature = ss; }
-
ShapeStatus shape_status(void) const { return _shape_status; }
void shape_status(ShapeStatus ss) { _shape_status = ss; }
@@ -75,7 +71,6 @@ private:
NodeName _name;
std::unique_ptr<CircleQuantParam> _quantparam;
std::unique_ptr<SparsityParam> _sparsityparam;
- ShapeSignature _shape_signature;
ShapeStatus _shape_status{ShapeStatus::UNDEFINED};
int32_t _op_version = 1;
};
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodeImpl.h b/compiler/luci/lang/include/luci/IR/CircleNodeImpl.h
index a6b9488db..4b3178b9b 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodeImpl.h
+++ b/compiler/luci/lang/include/luci/IR/CircleNodeImpl.h
@@ -34,8 +34,10 @@ template <typename T> T CircleNode::accept(CircleNodeVisitorBase<T> *v) const
\
case CircleOpcode::OPCODE: \
return v->visit(dynamic_cast<const CLASS *>(this));
+#define CIRCLE_VNODE CIRCLE_NODE
#include "CircleNodes.lst"
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
default:
@@ -53,8 +55,10 @@ template <typename T> T CircleNode::accept(CircleNodeMutableVisitorBase<T> *v)
\
case CircleOpcode::OPCODE: \
return v->visit(dynamic_cast<CLASS *>(this));
+#define CIRCLE_VNODE CIRCLE_NODE
#include "CircleNodes.lst"
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
default:
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodeMixins.h b/compiler/luci/lang/include/luci/IR/CircleNodeMixins.h
new file mode 100644
index 000000000..3f8ab7d61
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/CircleNodeMixins.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLE_NODE_MIXINS_H__
+#define __LUCI_IR_CIRCLE_NODE_MIXINS_H__
+
+#include "luci/IR/AttrFusedActFunc.h"
+
+#include <loco/IR/Node.h>
+#include <loco/IR/NodeMixins.h>
+
+#include <memory>
+#include <vector>
+
+namespace luci
+{
+
+/// @brief Enumeration of mixin classes
+enum class CircleNodeTrait
+{
+ FusedActFunc,
+ Bias
+};
+
+template <CircleNodeTrait T> class CircleNodeMixin;
+
+template <> class CircleNodeMixin<CircleNodeTrait::FusedActFunc>
+{
+public:
+ CircleNodeMixin() = default;
+
+public:
+ FusedActFunc fusedActivationFunction() const { return _fused_act_fun; }
+ void fusedActivationFunction(FusedActFunc fused_act_fun) { _fused_act_fun = fused_act_fun; }
+
+private:
+ FusedActFunc _fused_act_fun = FusedActFunc::UNDEFINED;
+};
+
+/**
+ * @brief Mixin class for nodes that have a bias input
+ */
+template <> class CircleNodeMixin<CircleNodeTrait::Bias>
+{
+public:
+ CircleNodeMixin() = default;
+
+public:
+ virtual loco::Node *bias(void) const = 0; /// @brief get the input for bias.
+ virtual void bias(loco::Node *node) = 0; /// @brief set the input for bias.
+};
+
+/**
+ * @brief Nodes with a fixed number of inputs
+ *
+ * TODO Deprecate this class and use loco::FixedArity instead
+ */
+template <unsigned N, typename Base> class FixedArityNode : public Base
+{
+public:
+ FixedArityNode()
+ {
+ _args.resize(N);
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _args[n] = std::make_unique<loco::Use>(this);
+ }
+ }
+
+ virtual ~FixedArityNode() = default;
+
+public:
+ unsigned arity(void) const final { return N; }
+
+ loco::Node *arg(uint32_t n) const final { return _args.at(n)->node(); }
+
+ void drop(void) final
+ {
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _args.at(n)->node(nullptr);
+ }
+ }
+
+protected:
+ // This API allows inherited classes to access the "_args" field.
+ loco::Use *at(unsigned n) const { return _args.at(n).get(); }
+
+private:
+ std::vector<std::unique_ptr<loco::Use>> _args{};
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLE_NODE_MIXINS_H__
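
A typical operator declaration built from these pieces, as a sketch (CircleNodeImpl and the ADD opcode are assumed here, not defined in this file):

    class CircleAdd final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::ADD>>,
                            public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
    {
    public:
      // Named accessors delegate to the fixed-arity argument slots.
      loco::Node *x(void) const { return at(0)->node(); }
      void x(loco::Node *node) { at(0)->node(node); }

      loco::Node *y(void) const { return at(1)->node(); }
      void y(loco::Node *node) { at(1)->node(node); }
    };
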
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodeVisitor.h b/compiler/luci/lang/include/luci/IR/CircleNodeVisitor.h
index 43339fe84..599e4bcd9 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodeVisitor.h
+++ b/compiler/luci/lang/include/luci/IR/CircleNodeVisitor.h
@@ -33,8 +33,10 @@ template <typename T> struct CircleNodeVisitorBase
virtual ~CircleNodeVisitorBase() = default;
#define CIRCLE_NODE(OPCODE, CIRCLE_CLASS) virtual T visit(const CIRCLE_CLASS *) = 0;
+#define CIRCLE_VNODE CIRCLE_NODE
#include "CircleNodes.lst"
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
};
@@ -44,9 +46,11 @@ template <typename T> struct CircleNodeVisitor : public CircleNodeVisitorBase<T>
#define CIRCLE_NODE(OPCODE, CIRCLE_CLASS) \
virtual T visit(const CIRCLE_CLASS *node) { return visit(static_cast<const CircleNode *>(node)); }
+#define CIRCLE_VNODE CIRCLE_NODE
#include "CircleNodes.lst"
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
/// @brief Default fallback
@@ -61,9 +65,11 @@ template <typename T> struct CircleNodeMutableVisitorBase
virtual ~CircleNodeMutableVisitorBase() = default;
#define CIRCLE_NODE(OPCODE, CIRCLE_CLASS) virtual T visit(CIRCLE_CLASS *) = 0;
+#define CIRCLE_VNODE CIRCLE_NODE
#include "CircleNodes.lst"
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
};
@@ -73,9 +79,11 @@ template <typename T> struct CircleNodeMutableVisitor : public CircleNodeMutable
#define CIRCLE_NODE(OPCODE, CIRCLE_CLASS) \
virtual T visit(CIRCLE_CLASS *node) { return visit(static_cast<CircleNode *>(node)); }
+#define CIRCLE_VNODE CIRCLE_NODE
#include "CircleNodes.lst"
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
/// @brief Default fallback
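
Defining CIRCLE_VNODE as CIRCLE_NODE, as the visitors do above, treats virtual nodes exactly like real operators. The point of the split is that other consumers of CircleNodes.lst can now tell them apart; for instance, a hypothetical pass that only wants exportable opcodes could leave CIRCLE_VNODE empty:

    #include <iostream>

    void dump_real_opcodes(void)
    {
    #define CIRCLE_NODE(OPCODE, CIRCLE_CLASS) std::cout << #OPCODE << '\n';
    #define CIRCLE_VNODE(OPCODE, CIRCLE_CLASS) // virtual nodes skipped here
    #include "CircleNodes.lst"
    #undef CIRCLE_VNODE
    #undef CIRCLE_NODE
    }
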
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.h b/compiler/luci/lang/include/luci/IR/CircleNodes.h
index fde0b612b..69a82a7b9 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.h
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.h
@@ -25,6 +25,7 @@
#include "Nodes/CircleAveragePool2D.h"
#include "Nodes/CircleBatchMatMul.h"
#include "Nodes/CircleBatchToSpaceND.h"
+#include "Nodes/CircleBidirectionalSequenceLSTM.h"
#include "Nodes/CircleCast.h"
#include "Nodes/CircleCeil.h"
#include "Nodes/CircleConcatenation.h"
@@ -40,6 +41,7 @@
#include "Nodes/CircleEqual.h"
#include "Nodes/CircleExp.h"
#include "Nodes/CircleExpandDims.h"
+#include "Nodes/CircleFakeQuant.h"
#include "Nodes/CircleFill.h"
#include "Nodes/CircleFloor.h"
#include "Nodes/CircleFloorDiv.h"
@@ -134,6 +136,7 @@
// Virtual nodes
#include "Nodes/CircleInput.h"
#include "Nodes/CircleOutput.h"
+#include "Nodes/CircleBidirectionalSequenceLSTMOut.h"
#include "Nodes/CircleCustomOut.h"
#include "Nodes/CircleIfOut.h"
#include "Nodes/CircleNonMaxSuppressionV4Out.h"
@@ -150,15 +153,6 @@
namespace luci
{
-/**
- * @brief Set both CircleReshape's 2nd input as CircleConst, and newShape attribute
- * with same value
- * @note Shape inference for TFLReshape forces them to be same
- *
- * TODO find better place for this helper
- */
-void set_new_shape(CircleReshape *node, int32_t *base, uint32_t size);
-
/// @brief Link GraphOutput with CircleOutput node
void link(loco::GraphOutput *, CircleOutput *);
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
index b9d545893..b93fdc89d 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
@@ -2,6 +2,10 @@
#error "Define CIRCLE_NODE"
#endif // CIRCLE_NODE
+#ifndef CIRCLE_VNODE
+#error "Define CIRCLE_VNODE"
+#endif // CIRCLE_VNODE
+
//
// PLEASE SORT NODE DECLS IN ALPHABETICAL ORDER
//
@@ -18,7 +22,8 @@ CIRCLE_NODE(ARG_MAX, luci::CircleArgMax)
CIRCLE_NODE(ARG_MIN, luci::CircleArgMin)
CIRCLE_NODE(AVERAGE_POOL_2D, luci::CircleAveragePool2D)
CIRCLE_NODE(BATCH_TO_SPACE_ND, luci::CircleBatchToSpaceND)
-CIRCLE_NODE(BATCHMATMUL, luci::CircleBatchMatMul)
+CIRCLE_NODE(BATCH_MATMUL, luci::CircleBatchMatMul)
+CIRCLE_NODE(BIDIRECTIONAL_SEQUENCE_LSTM, luci::CircleBidirectionalSequenceLSTM)
CIRCLE_NODE(CAST, luci::CircleCast)
CIRCLE_NODE(CEIL, luci::CircleCeil)
CIRCLE_NODE(CONCATENATION, luci::CircleConcatenation)
@@ -33,6 +38,7 @@ CIRCLE_NODE(ELU, luci::CircleElu)
CIRCLE_NODE(EQUAL, luci::CircleEqual)
CIRCLE_NODE(EXP, luci::CircleExp)
CIRCLE_NODE(EXPAND_DIMS, luci::CircleExpandDims)
+CIRCLE_NODE(FAKE_QUANT, luci::CircleFakeQuant)
CIRCLE_NODE(FILL, luci::CircleFill)
CIRCLE_NODE(FLOOR, luci::CircleFloor)
CIRCLE_NODE(FLOOR_DIV, luci::CircleFloorDiv)
@@ -125,18 +131,19 @@ CIRCLE_NODE(BCQ_FULLY_CONNECTED, luci::CircleBCQFullyConnected)
CIRCLE_NODE(BCQ_GATHER, luci::CircleBCQGather)
CIRCLE_NODE(INSTANCE_NORM, luci::CircleInstanceNorm)
// Virtual node(s)
-CIRCLE_NODE(CIRCLECONST, luci::CircleConst)
-CIRCLE_NODE(CIRCLEINPUT, luci::CircleInput)
-CIRCLE_NODE(CIRCLEOUTPUT, luci::CircleOutput)
-CIRCLE_NODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy)
-CIRCLE_NODE(CIRCLEOUTPUTEXCLUDE, luci::CircleOutputExclude)
-CIRCLE_NODE(CIRCLECUSTOMOUT, luci::CircleCustomOut)
-CIRCLE_NODE(CIRCLEIFOUT, luci::CircleIfOut)
-CIRCLE_NODE(CIRCLENONMAXSUPPRESSIONV4OUT, luci::CircleNonMaxSuppressionV4Out)
-CIRCLE_NODE(CIRCLENONMAXSUPPRESSIONV5OUT, luci::CircleNonMaxSuppressionV5Out)
-CIRCLE_NODE(CIRCLESPLITOUT, luci::CircleSplitOut)
-CIRCLE_NODE(CIRCLESPLITVOUT, luci::CircleSplitVOut)
-CIRCLE_NODE(CIRCLETOPKV2OUT, luci::CircleTopKV2Out)
-CIRCLE_NODE(CIRCLEUNIQUEOUT, luci::CircleUniqueOut)
-CIRCLE_NODE(CIRCLEUNPACKOUT, luci::CircleUnpackOut)
-CIRCLE_NODE(CIRCLEWHILEOUT, luci::CircleWhileOut)
+CIRCLE_VNODE(CIRCLEBIDIRECTIONAL_SEQUENCE_LSTM_OUT, luci::CircleBidirectionalSequenceLSTMOut)
+CIRCLE_VNODE(CIRCLECONST, luci::CircleConst)
+CIRCLE_VNODE(CIRCLEINPUT, luci::CircleInput)
+CIRCLE_VNODE(CIRCLEOUTPUT, luci::CircleOutput)
+CIRCLE_VNODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy)
+CIRCLE_VNODE(CIRCLEOUTPUTEXCLUDE, luci::CircleOutputExclude)
+CIRCLE_VNODE(CIRCLECUSTOMOUT, luci::CircleCustomOut)
+CIRCLE_VNODE(CIRCLEIFOUT, luci::CircleIfOut)
+CIRCLE_VNODE(CIRCLENONMAXSUPPRESSIONV4OUT, luci::CircleNonMaxSuppressionV4Out)
+CIRCLE_VNODE(CIRCLENONMAXSUPPRESSIONV5OUT, luci::CircleNonMaxSuppressionV5Out)
+CIRCLE_VNODE(CIRCLESPLITOUT, luci::CircleSplitOut)
+CIRCLE_VNODE(CIRCLESPLITVOUT, luci::CircleSplitVOut)
+CIRCLE_VNODE(CIRCLETOPKV2OUT, luci::CircleTopKV2Out)
+CIRCLE_VNODE(CIRCLEUNIQUEOUT, luci::CircleUniqueOut)
+CIRCLE_VNODE(CIRCLEUNPACKOUT, luci::CircleUnpackOut)
+CIRCLE_VNODE(CIRCLEWHILEOUT, luci::CircleWhileOut)
diff --git a/compiler/luci/lang/include/luci/IR/CircleOpcode.h b/compiler/luci/lang/include/luci/IR/CircleOpcode.h
index 703b70da2..be3069f94 100644
--- a/compiler/luci/lang/include/luci/IR/CircleOpcode.h
+++ b/compiler/luci/lang/include/luci/IR/CircleOpcode.h
@@ -23,7 +23,9 @@ namespace luci
enum class CircleOpcode
{
#define CIRCLE_NODE(OPCODE, CLASS) OPCODE,
+#define CIRCLE_VNODE CIRCLE_NODE
#include "CircleNodes.lst"
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
};
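A self-contained sketch of what this expansion produces, shrunk to two list entries (the enum name is illustrative): every CIRCLE_NODE and CIRCLE_VNODE line becomes one enumerator, and the class argument is simply discarded.

    #define CIRCLE_NODE(OPCODE, CLASS) OPCODE,
    #define CIRCLE_VNODE CIRCLE_NODE
    enum class MiniOpcode
    {
      CIRCLE_NODE(ADD, luci::CircleAdd)            // real operator
      CIRCLE_VNODE(CIRCLECONST, luci::CircleConst) // virtual node, same expansion
    };
    #undef CIRCLE_VNODE
    #undef CIRCLE_NODE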
diff --git a/compiler/luci/lang/include/luci/IR/CircleShapeSignature.h b/compiler/luci/lang/include/luci/IR/CircleShapeSignature.h
deleted file mode 100644
index 18a260486..000000000
--- a/compiler/luci/lang/include/luci/IR/CircleShapeSignature.h
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LUCI_IR_SHAPE_SIGNATURE_H__
-#define __LUCI_IR_SHAPE_SIGNATURE_H__
-
-#include <stdint.h>
-#include <vector>
-
-namespace luci
-{
-
-class ShapeSignature
-{
-public:
- ShapeSignature() = default;
-
- ShapeSignature(const std::vector<int32_t> &shape_signature)
- {
- _shape_signature = shape_signature;
- }
-
-public:
- const std::vector<int32_t> &as_vector() const { return _shape_signature; }
-
- int32_t dim(uint32_t d) const { return _shape_signature.at(d); }
- int32_t &dim(uint32_t d) { return _shape_signature.at(d); }
-
- uint32_t rank(void) const { return _shape_signature.size(); }
- void rank(uint32_t rank) { _shape_signature.resize(rank); }
-
-private:
- std::vector<int32_t> _shape_signature{};
-};
-
-bool operator==(const ShapeSignature &lhs, const ShapeSignature &rhs);
-
-} // namespace luci
-
-#endif // __LUCI_IR_SHAPE_SIGNATURE_H__
diff --git a/compiler/luci/lang/src/DeadNodeQueryService.h b/compiler/luci/lang/include/luci/IR/DeadNodeQueryService.h
index d10696667..d10696667 100644
--- a/compiler/luci/lang/src/DeadNodeQueryService.h
+++ b/compiler/luci/lang/include/luci/IR/DeadNodeQueryService.h
diff --git a/compiler/luci/lang/include/luci/IR/LuciNodeMixins.h b/compiler/luci/lang/include/luci/IR/LuciNodeMixins.h
index c1bb0db11..2078495c6 100644
--- a/compiler/luci/lang/include/luci/IR/LuciNodeMixins.h
+++ b/compiler/luci/lang/include/luci/IR/LuciNodeMixins.h
@@ -17,90 +17,16 @@
#ifndef __LUCI_IR_LUCINODEMIXINS_H__
#define __LUCI_IR_LUCINODEMIXINS_H__
-#include "luci/IR/AttrFusedActFunc.h"
+// TODO remove this file after LuciNodeTrait and LuciNodeMixin are not used in backend
-#include <loco/IR/Node.h>
-#include <loco/IR/NodeMixins.h>
-
-#include <vector>
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
-/// @brief enumeration of mixin class
-enum class LuciNodeTrait
-{
- FusedActFunc,
- Bias
-};
-
-template <LuciNodeTrait T> class LuciNodeMixin;
-
-template <> class LuciNodeMixin<LuciNodeTrait::FusedActFunc>
-{
-public:
- LuciNodeMixin() = default;
-
-public:
- FusedActFunc fusedActivationFunction() const { return _fused_act_fun; }
- void fusedActivationFunction(FusedActFunc fused_act_fun) { _fused_act_fun = fused_act_fun; }
-
-private:
- FusedActFunc _fused_act_fun = FusedActFunc::UNDEFINED;
-};
-
-/**
- * @brief Mixin class for nodes that has a bias input
- */
-template <> class LuciNodeMixin<LuciNodeTrait::Bias>
-{
-public:
- LuciNodeMixin() = default;
-
-public:
- virtual loco::Node *bias(void) const = 0; /// @brief get the input for bias.
- virtual void bias(loco::Node *node) = 0; /// @brief set the input for bias.
-};
-
-/**
- * @brief Nodes with the fixed number of inputs
- *
- * TODO Deprecated this class, and use loco::FixedArity instead
- */
-template <unsigned N, typename Base> class FixedArityNode : public Base
-{
-public:
- FixedArityNode()
- {
- _args.resize(N);
- for (uint32_t n = 0; n < N; ++n)
- {
- _args[n] = std::make_unique<loco::Use>(this);
- }
- }
-
- virtual ~FixedArityNode() = default;
-
-public:
- unsigned arity(void) const final { return N; }
-
- loco::Node *arg(uint32_t n) const final { return _args.at(n)->node(); }
-
- void drop(void) final
- {
- for (uint32_t n = 0; n < N; ++n)
- {
- _args.at(n)->node(nullptr);
- }
- }
-
-protected:
- // This API allows inherited classes to access "_args" field.
- loco::Use *at(unsigned n) const { return _args.at(n).get(); }
+using LuciNodeTrait = CircleNodeTrait;
-private:
- std::vector<std::unique_ptr<loco::Use>> _args{};
-};
+template <LuciNodeTrait T> using LuciNodeMixin = CircleNodeMixin<T>;
} // namespace luci
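Since both spellings now name the same templates, backend code written against the Luci* names keeps compiling unchanged. This can be checked directly, assuming C++11 or later:

    #include <type_traits>
    #include "luci/IR/LuciNodeMixins.h"

    static_assert(std::is_same<luci::LuciNodeMixin<luci::LuciNodeTrait::FusedActFunc>,
                               luci::CircleNodeMixin<luci::CircleNodeTrait::FusedActFunc>>::value,
                  "the alias preserves the old API");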
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleAbs.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleAbs.h
index 45dba15bf..7a73f37cd 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleAbs.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleAbs.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleAdd.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleAdd.h
index f26eccd1a..92563de4c 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleAdd.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleAdd.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -30,7 +30,7 @@ namespace luci
* @brief ADD in Circle
*/
class CircleAdd final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::ADD>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
loco::Node *x(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleArgMax.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleArgMax.h
index dbc4b2b3a..c1e4631e4 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleArgMax.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleArgMax.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleArgMin.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleArgMin.h
index 8cb561983..b4d026201 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleArgMin.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleArgMin.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleAveragePool2D.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleAveragePool2D.h
index 0b43b40c8..4aa45c2d8 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleAveragePool2D.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleAveragePool2D.h
@@ -24,7 +24,7 @@
#include "luci/IR/AttrPadding.h"
#include "luci/IR/AttrStride.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -33,16 +33,14 @@ namespace luci
* @brief AVERAGE_POOL_2D in Circle
*/
class CircleAveragePool2D final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::AVERAGE_POOL_2D>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::AVERAGE_POOL_2D>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
- CircleAveragePool2D() : _padding(Padding::UNDEFINED) { /* empty */}
-
-public:
loco::Node *value(void) const { return at(0)->node(); }
void value(loco::Node *node) { at(0)->node(node); }
+public:
Padding padding() const { return _padding; }
void padding(Padding padding) { _padding = padding; }
@@ -53,7 +51,7 @@ public:
Stride *stride(void) { return &_stride; }
private:
- Padding _padding;
+ Padding _padding{Padding::UNDEFINED};
Stride _stride;
Filter _filter;
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleBCQFullyConnected.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleBCQFullyConnected.h
index 7d12d593a..4c164ebca 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleBCQFullyConnected.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleBCQFullyConnected.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -30,9 +30,9 @@ namespace luci
* @brief BCQ_FULLY_CONNECTED in Circle
*/
class CircleBCQFullyConnected final
- : public FixedArityNode<5, CircleNodeImpl<CircleOpcode::BCQ_FULLY_CONNECTED>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>,
- public LuciNodeMixin<LuciNodeTrait::Bias>
+ : public FixedArityNode<5, CircleNodeImpl<CircleOpcode::BCQ_FULLY_CONNECTED>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>,
+ public CircleNodeMixin<CircleNodeTrait::Bias>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
@@ -58,7 +58,7 @@ public:
}
private:
- int32_t _weights_hidden_size = 0;
+ int32_t _weights_hidden_size{0};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleBCQGather.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleBCQGather.h
index f7638261d..1a0bf4f19 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleBCQGather.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleBCQGather.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -51,8 +51,8 @@ public:
void input_hidden_size(int32_t input_hidden_size) { _input_hidden_size = input_hidden_size; }
private:
- int32_t _axis = 0;
- int32_t _input_hidden_size = 0;
+ int32_t _axis{0};
+ int32_t _input_hidden_size{0};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleBatchMatMul.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleBatchMatMul.h
index 19999924e..864b033ed 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleBatchMatMul.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleBatchMatMul.h
@@ -20,15 +20,15 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
/**
- * @brief BATCHMATMUL in Circle
+ * @brief BATCH_MATMUL in Circle
*/
-class CircleBatchMatMul final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::BATCHMATMUL>>
+class CircleBatchMatMul final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::BATCH_MATMUL>>
{
public:
loco::Node *x(void) const { return at(0)->node(); }
@@ -45,8 +45,8 @@ public:
void adj_y(bool arg) { _adj_y = arg; }
private:
- bool _adj_x = false;
- bool _adj_y = false;
+ bool _adj_x{false};
+ bool _adj_y{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleBatchToSpaceND.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleBatchToSpaceND.h
index 67c0a2102..80fa53b8e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleBatchToSpaceND.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleBatchToSpaceND.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief BATCH_TO_SPACE_ND in Circle
*/
class CircleBatchToSpaceND final
- : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::BATCH_TO_SPACE_ND>>
+ : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::BATCH_TO_SPACE_ND>>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleBidirectionalSequenceLSTM.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleBidirectionalSequenceLSTM.h
new file mode 100644
index 000000000..d16281b69
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleBidirectionalSequenceLSTM.h
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLEBIDIRECTIONALSEQUENCE_LSTM_H__
+#define __LUCI_IR_CIRCLEBIDIRECTIONALSEQUENCE_LSTM_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/AttrFusedActFunc.h"
+#include "luci/IR/CircleNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief BIDIRECTIONAL_SEQUENCE_LSTM in Circle
+ */
+class CircleBidirectionalSequenceLSTM final
+ : public FixedArityNode<48, CircleNodeImpl<CircleOpcode::BIDIRECTIONAL_SEQUENCE_LSTM>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
+{
+public:
+ loco::Node *input(void) const { return at(0)->node(); }
+ void input(loco::Node *node) { at(0)->node(node); }
+
+ loco::Node *fw_input_to_input_weights(void) const { return at(1)->node(); }
+ void fw_input_to_input_weights(loco::Node *node) { at(1)->node(node); }
+ loco::Node *fw_input_to_forget_weights(void) const { return at(2)->node(); }
+ void fw_input_to_forget_weights(loco::Node *node) { at(2)->node(node); }
+ loco::Node *fw_input_to_cell_weights(void) const { return at(3)->node(); }
+ void fw_input_to_cell_weights(loco::Node *node) { at(3)->node(node); }
+ loco::Node *fw_input_to_output_weights(void) const { return at(4)->node(); }
+ void fw_input_to_output_weights(loco::Node *node) { at(4)->node(node); }
+
+ loco::Node *fw_recurrent_to_input_weights(void) const { return at(5)->node(); }
+ void fw_recurrent_to_input_weights(loco::Node *node) { at(5)->node(node); }
+ loco::Node *fw_recurrent_to_forget_weights(void) const { return at(6)->node(); }
+ void fw_recurrent_to_forget_weights(loco::Node *node) { at(6)->node(node); }
+ loco::Node *fw_recurrent_to_cell_weights(void) const { return at(7)->node(); }
+ void fw_recurrent_to_cell_weights(loco::Node *node) { at(7)->node(node); }
+ loco::Node *fw_recurrent_to_output_weights(void) const { return at(8)->node(); }
+ void fw_recurrent_to_output_weights(loco::Node *node) { at(8)->node(node); }
+
+ loco::Node *fw_cell_to_input_weights(void) const { return at(9)->node(); }
+ void fw_cell_to_input_weights(loco::Node *node) { at(9)->node(node); }
+ loco::Node *fw_cell_to_forget_weights(void) const { return at(10)->node(); }
+ void fw_cell_to_forget_weights(loco::Node *node) { at(10)->node(node); }
+ loco::Node *fw_cell_to_output_weights(void) const { return at(11)->node(); }
+ void fw_cell_to_output_weights(loco::Node *node) { at(11)->node(node); }
+
+ loco::Node *fw_input_gate_bias(void) const { return at(12)->node(); }
+ void fw_input_gate_bias(loco::Node *node) { at(12)->node(node); }
+ loco::Node *fw_forget_gate_bias(void) const { return at(13)->node(); }
+ void fw_forget_gate_bias(loco::Node *node) { at(13)->node(node); }
+ loco::Node *fw_cell_gate_bias(void) const { return at(14)->node(); }
+ void fw_cell_gate_bias(loco::Node *node) { at(14)->node(node); }
+ loco::Node *fw_output_gate_bias(void) const { return at(15)->node(); }
+ void fw_output_gate_bias(loco::Node *node) { at(15)->node(node); }
+
+ loco::Node *fw_projection_weights(void) const { return at(16)->node(); }
+ void fw_projection_weights(loco::Node *node) { at(16)->node(node); }
+ loco::Node *fw_projection_bias(void) const { return at(17)->node(); }
+ void fw_projection_bias(loco::Node *node) { at(17)->node(node); }
+
+ loco::Node *bw_input_to_input_weights(void) const { return at(18)->node(); }
+ void bw_input_to_input_weights(loco::Node *node) { at(18)->node(node); }
+ loco::Node *bw_input_to_forget_weights(void) const { return at(19)->node(); }
+ void bw_input_to_forget_weights(loco::Node *node) { at(19)->node(node); }
+ loco::Node *bw_input_to_cell_weights(void) const { return at(20)->node(); }
+ void bw_input_to_cell_weights(loco::Node *node) { at(20)->node(node); }
+ loco::Node *bw_input_to_output_weights(void) const { return at(21)->node(); }
+ void bw_input_to_output_weights(loco::Node *node) { at(21)->node(node); }
+
+ loco::Node *bw_recurrent_to_input_weights(void) const { return at(22)->node(); }
+ void bw_recurrent_to_input_weights(loco::Node *node) { at(22)->node(node); }
+ loco::Node *bw_recurrent_to_forget_weights(void) const { return at(23)->node(); }
+ void bw_recurrent_to_forget_weights(loco::Node *node) { at(23)->node(node); }
+ loco::Node *bw_recurrent_to_cell_weights(void) const { return at(24)->node(); }
+ void bw_recurrent_to_cell_weights(loco::Node *node) { at(24)->node(node); }
+ loco::Node *bw_recurrent_to_output_weights(void) const { return at(25)->node(); }
+ void bw_recurrent_to_output_weights(loco::Node *node) { at(25)->node(node); }
+
+ loco::Node *bw_cell_to_input_weights(void) const { return at(26)->node(); }
+ void bw_cell_to_input_weights(loco::Node *node) { at(26)->node(node); }
+ loco::Node *bw_cell_to_forget_weights(void) const { return at(27)->node(); }
+ void bw_cell_to_forget_weights(loco::Node *node) { at(27)->node(node); }
+ loco::Node *bw_cell_to_output_weights(void) const { return at(28)->node(); }
+ void bw_cell_to_output_weights(loco::Node *node) { at(28)->node(node); }
+
+ loco::Node *bw_input_gate_bias(void) const { return at(29)->node(); }
+ void bw_input_gate_bias(loco::Node *node) { at(29)->node(node); }
+ loco::Node *bw_forget_gate_bias(void) const { return at(30)->node(); }
+ void bw_forget_gate_bias(loco::Node *node) { at(30)->node(node); }
+ loco::Node *bw_cell_gate_bias(void) const { return at(31)->node(); }
+ void bw_cell_gate_bias(loco::Node *node) { at(31)->node(node); }
+ loco::Node *bw_output_gate_bias(void) const { return at(32)->node(); }
+ void bw_output_gate_bias(loco::Node *node) { at(32)->node(node); }
+
+ loco::Node *bw_projection_weights(void) const { return at(33)->node(); }
+ void bw_projection_weights(loco::Node *node) { at(33)->node(node); }
+ loco::Node *bw_projection_bias(void) const { return at(34)->node(); }
+ void bw_projection_bias(loco::Node *node) { at(34)->node(node); }
+
+ loco::Node *fw_activation_state(void) const { return at(35)->node(); }
+ void fw_activation_state(loco::Node *node) { at(35)->node(node); }
+ loco::Node *fw_cell_state(void) const { return at(36)->node(); }
+ void fw_cell_state(loco::Node *node) { at(36)->node(node); }
+
+ loco::Node *bw_activation_state(void) const { return at(37)->node(); }
+ void bw_activation_state(loco::Node *node) { at(37)->node(node); }
+ loco::Node *bw_cell_state(void) const { return at(38)->node(); }
+ void bw_cell_state(loco::Node *node) { at(38)->node(node); }
+
+ loco::Node *auxillary_input(void) const { return at(39)->node(); }
+ void auxillary_input(loco::Node *node) { at(39)->node(node); }
+ loco::Node *fw_auxillary_input_to_input_weights(void) const { return at(40)->node(); }
+ void fw_auxillary_input_to_input_weights(loco::Node *node) { at(40)->node(node); }
+ loco::Node *fw_auxillary_input_to_forget_weights(void) const { return at(41)->node(); }
+ void fw_auxillary_input_to_forget_weights(loco::Node *node) { at(41)->node(node); }
+ loco::Node *fw_auxillary_input_to_cell_weights(void) const { return at(42)->node(); }
+ void fw_auxillary_input_to_cell_weights(loco::Node *node) { at(42)->node(node); }
+ loco::Node *fw_auxillary_input_to_output_weights(void) const { return at(43)->node(); }
+ void fw_auxillary_input_to_output_weights(loco::Node *node) { at(43)->node(node); }
+ loco::Node *bw_auxillary_input_to_input_weights(void) const { return at(44)->node(); }
+ void bw_auxillary_input_to_input_weights(loco::Node *node) { at(44)->node(node); }
+ loco::Node *bw_auxillary_input_to_forget_weights(void) const { return at(45)->node(); }
+ void bw_auxillary_input_to_forget_weights(loco::Node *node) { at(45)->node(node); }
+ loco::Node *bw_auxillary_input_to_cell_weights(void) const { return at(46)->node(); }
+ void bw_auxillary_input_to_cell_weights(loco::Node *node) { at(46)->node(node); }
+ loco::Node *bw_auxillary_input_to_output_weights(void) const { return at(47)->node(); }
+ void bw_auxillary_input_to_output_weights(loco::Node *node) { at(47)->node(node); }
+
+public:
+ float cell_clip(void) const { return _cell_clip; }
+ void cell_clip(float cell_clip) { _cell_clip = cell_clip; }
+ float proj_clip(void) const { return _proj_clip; }
+ void proj_clip(float proj_clip) { _proj_clip = proj_clip; }
+ bool merge_outputs(void) const { return _merge_outputs; }
+ void merge_outputs(bool merge_outputs) { _merge_outputs = merge_outputs; }
+ bool time_major(void) const { return _time_major; }
+ void time_major(bool time_major) { _time_major = time_major; }
+ bool asymmetric_quantize_inputs(void) const { return _asymmetric_quantize_inputs; }
+ void asymmetric_quantize_inputs(bool asymmetric_quantize_inputs)
+ {
+ _asymmetric_quantize_inputs = asymmetric_quantize_inputs;
+ }
+
+private:
+ float _cell_clip{0.0f};
+ float _proj_clip{0.0f};
+ bool _merge_outputs{false};
+ bool _time_major{false};
+ bool _asymmetric_quantize_inputs{false};
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLEBIDIRECTIONALSEQUENCE_LSTM_H__
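A usage sketch for the new node, assuming a loco::Graph and the weight nodes already exist elsewhere; only three of the 48 fixed inputs are wired here, the rest follow the same at(n)-backed setter pattern:

    #include <loco.h>
    #include "luci/IR/CircleNodes.h"

    luci::CircleBidirectionalSequenceLSTM *make_bilstm(loco::Graph *g, luci::CircleNode *input,
                                                       luci::CircleNode *fw_i2i,
                                                       luci::CircleNode *bw_i2i)
    {
      auto *lstm = g->nodes()->create<luci::CircleBidirectionalSequenceLSTM>();
      lstm->input(input);                      // input 0
      lstm->fw_input_to_input_weights(fw_i2i); // inputs 1..17: forward weights/biases
      lstm->bw_input_to_input_weights(bw_i2i); // inputs 18..34: backward counterparts
      lstm->merge_outputs(false);
      lstm->time_major(false);
      lstm->fusedActivationFunction(luci::FusedActFunc::NONE);
      return lstm;
    }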
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleBidirectionalSequenceLSTMOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleBidirectionalSequenceLSTMOut.h
new file mode 100644
index 000000000..fb2eb0831
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleBidirectionalSequenceLSTMOut.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLE_BIDIRECTIONAL_SEQUENCE_LSTM_OUT_H__
+#define __LUCI_IR_CIRCLE_BIDIRECTIONAL_SEQUENCE_LSTM_OUT_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/CircleNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief Virtual CIRCLEBIDIRECTIONAL_SEQUENCE_LSTM_OUT in Circle
+ */
+class CircleBidirectionalSequenceLSTMOut final
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEBIDIRECTIONAL_SEQUENCE_LSTM_OUT>>
+{
+public:
+ loco::Node *input(void) const { return at(0)->node(); }
+ void input(loco::Node *node) { at(0)->node(node); }
+
+public:
+ int32_t index(void) const { return _index; }
+ void index(int32_t index) { _index = index; }
+
+private:
+ int32_t _index{-1};
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLE_BIDIRECTIONAL_SEQUENCE_LSTM_OUT_H__
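Continuing the sketch above: each consumed result is read through a virtual out node that selects one output by index. The index meaning below is an assumption based on the two-output, merge_outputs == false case:

    auto *fw_out = g->nodes()->create<luci::CircleBidirectionalSequenceLSTMOut>();
    fw_out->input(lstm);
    fw_out->index(0); // assumed: 0 selects the forward output, 1 the backward one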
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleCast.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleCast.h
index 9a89d0b2b..0b793607f 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleCast.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleCast.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleCeil.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleCeil.h
index 8a8715dcf..3d7a7ebc7 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleCeil.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleCeil.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleConcatenation.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleConcatenation.h
index dea1a4613..2746a0a2e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleConcatenation.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleConcatenation.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
#include "luci/IR/VariadicArityNode.h"
#include <cassert>
@@ -33,12 +33,12 @@ namespace luci
* @brief CONCATENATION in Circle
*/
class CircleConcatenation final
- : public VariadicArityNode<CircleNodeImpl<CircleOpcode::CONCATENATION>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ : public VariadicArityNode<CircleNodeImpl<CircleOpcode::CONCATENATION>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
CircleConcatenation(uint32_t arity)
- : VariadicArityNode<CircleNodeImpl<CircleOpcode::CONCATENATION>>(arity)
+ : VariadicArityNode<CircleNodeImpl<CircleOpcode::CONCATENATION>>(arity)
{
// TODO Support when arity is 0
assert(arity >= 1);
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
index 250282049..e44363d14 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
#include <loco/IR/DataTypeTraits.h>
@@ -34,9 +34,6 @@ namespace luci
class CircleConst final : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CIRCLECONST>>
{
public:
- CircleConst() = default;
-
-public:
template <loco::DataType DT> uint32_t size(void) const;
template <loco::DataType DT> void size(uint32_t size);
template <loco::DataType DT> const typename loco::DataTypeImpl<DT>::Type &at(uint32_t n) const;
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleConv2D.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleConv2D.h
index 13657cee4..7c390940e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleConv2D.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleConv2D.h
@@ -24,7 +24,7 @@
#include "luci/IR/AttrStride.h"
#include "luci/IR/AttrDilation.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -33,8 +33,8 @@ namespace luci
* @brief CONV_2D in Circle
*/
class CircleConv2D final : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::CONV_2D>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>,
- public LuciNodeMixin<LuciNodeTrait::Bias>
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>,
+ public CircleNodeMixin<CircleNodeTrait::Bias>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
@@ -57,7 +57,7 @@ public:
Dilation *dilation(void) { return &_dilation; }
private:
- Padding _padding = Padding::UNDEFINED;
+ Padding _padding{Padding::UNDEFINED};
Stride _stride;
Dilation _dilation;
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleCos.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleCos.h
index 07ced620a..cff04906d 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleCos.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleCos.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleCustom.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleCustom.h
index 6c722b766..b21cc679f 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleCustom.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleCustom.h
@@ -29,19 +29,23 @@ namespace luci
class CircleCustom final : public VariadicArityNode<CircleNodeImpl<CircleOpcode::CUSTOM>>
{
public:
- CircleCustom(uint32_t arity) : VariadicArityNode<CircleNodeImpl<CircleOpcode::CUSTOM>>(arity)
+ CircleCustom(uint32_t arity, uint32_t out)
+ : VariadicArityNode<CircleNodeImpl<CircleOpcode::CUSTOM>>(arity), _output_count(out)
{
// TODO Support when arity is 0
assert(arity >= 1);
+ assert(out > 0);
}
public:
uint32_t numInputs(void) const { return arity(); }
+ uint32_t numOutputs(void) const { return _output_count; }
public:
Node *inputs(uint32_t index) const { return at(index)->node(); }
void inputs(uint32_t index, Node *node) { at(index)->node(node); }
+public:
const std::vector<uint8_t> &custom_options(void) const { return _custom_options; }
void custom_options(const std::vector<uint8_t> &custom_options)
{
@@ -54,6 +58,7 @@ public:
private:
std::vector<uint8_t> _custom_options;
std::string _custom_code;
+ uint32_t _output_count{0};
};
} // namespace luci
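The constructor now fixes the output count up front, so graph builders create the node and its virtual outputs together. A sketch, assuming g, in0 and in1 exist and that the custom_code setter mirrors the custom_options one shown above:

    auto *custom = g->nodes()->create<luci::CircleCustom>(2u /* arity */, 1u /* outputs */);
    custom->inputs(0, in0);
    custom->inputs(1, in1);
    custom->custom_code("MyCustomOp"); // hypothetical operator name

    auto *out0 = g->nodes()->create<luci::CircleCustomOut>();
    out0->input(custom);
    out0->index(0);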
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleCustomOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleCustomOut.h
index 36b8e4aed..91a89c151 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleCustomOut.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleCustomOut.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief Virtual CIRCLECUSTOMOUT in Circle
*/
class CircleCustomOut final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLECUSTOMOUT>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLECUSTOMOUT>>
{
public:
- CircleCustomOut() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleDepthToSpace.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleDepthToSpace.h
index e19282b97..85b567fb7 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleDepthToSpace.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleDepthToSpace.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,18 +29,18 @@ namespace luci
* @brief DEPTH_TO_SPACE in Circle
*/
class CircleDepthToSpace final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::DEPTH_TO_SPACE>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::DEPTH_TO_SPACE>>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
public:
- int block_size(void) const { return _block_size; }
- void block_size(int block_size) { _block_size = block_size; }
+ int32_t block_size(void) const { return _block_size; }
+ void block_size(int32_t block_size) { _block_size = block_size; }
private:
- int _block_size{0};
+ int32_t _block_size{0};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleDepthwiseConv2D.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleDepthwiseConv2D.h
index eb058cec1..046aa5908 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleDepthwiseConv2D.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleDepthwiseConv2D.h
@@ -25,7 +25,7 @@
#include "luci/IR/AttrPadding.h"
#include "luci/IR/AttrStride.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -34,9 +34,9 @@ namespace luci
* @brief DEPTHWISE_CONV_2D in Circle
*/
class CircleDepthwiseConv2D final
- : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::DEPTHWISE_CONV_2D>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>,
- public LuciNodeMixin<LuciNodeTrait::Bias>
+ : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::DEPTHWISE_CONV_2D>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>,
+ public CircleNodeMixin<CircleNodeTrait::Bias>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
@@ -62,9 +62,9 @@ public:
Dilation *dilation(void) { return &_dilation; }
private:
- Padding _padding = Padding::UNDEFINED;
+ Padding _padding{Padding::UNDEFINED};
Stride _stride;
- int32_t _depth_multiplier = 0;
+ int32_t _depth_multiplier{0};
Dilation _dilation;
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleDequantize.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleDequantize.h
index 847c5dfc5..c3ee44253 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleDequantize.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleDequantize.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleDiv.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleDiv.h
index 1d4d3a239..fcc3f427c 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleDiv.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleDiv.h
@@ -24,7 +24,7 @@
#include "luci/IR/AttrPadding.h"
#include "luci/IR/AttrStride.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -33,12 +33,9 @@ namespace luci
* @brief DIV in Circle
*/
class CircleDiv final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::DIV>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
- CircleDiv() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleElu.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleElu.h
index fbb2f3533..721edd9ae 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleElu.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleElu.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleElu final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::ELU>>
{
public:
- CircleElu() = default;
-
-public:
loco::Node *features(void) const { return at(0)->node(); }
void features(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleEqual.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleEqual.h
index 2087d097a..69697ac7e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleEqual.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleEqual.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleExp.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleExp.h
index 97aecb30a..b8a5d4561 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleExp.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleExp.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleExpandDims.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleExpandDims.h
index f70219614..15bfe6a29 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleExpandDims.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleExpandDims.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleExpandDims final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::EXPAND_DIMS>>
{
public:
- CircleExpandDims() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleFakeQuant.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleFakeQuant.h
new file mode 100644
index 000000000..9e3159685
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleFakeQuant.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLE_FAKE_QUANT_H__
+#define __LUCI_IR_CIRCLE_FAKE_QUANT_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/CircleNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief FAKE_QUANT in Circle
+ * @note 'inputs' came from TF.quantize.fake_quant_from_min_max_vars
+ */
+class CircleFakeQuant final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::FAKE_QUANT>>
+{
+public:
+ loco::Node *inputs(void) const { return at(0)->node(); }
+ void inputs(loco::Node *node) { at(0)->node(node); }
+
+public:
+ float min(void) const { return _min; }
+ void min(float min) { _min = min; }
+
+ float max(void) const { return _max; }
+ void max(float max) { _max = max; }
+
+ int32_t num_bits(void) const { return _num_bits; }
+ void num_bits(int32_t num_bits) { _num_bits = num_bits; }
+
+ bool narrow_range(void) const { return _narrow_range; }
+ void narrow_range(bool narrow_range) { _narrow_range = narrow_range; }
+
+private:
+ float _min{0.0f};
+ float _max{0.0f};
+ int32_t _num_bits{0};
+ bool _narrow_range{false};
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLE_FAKE_QUANT_H__
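A usage sketch, assuming g and a feature-producing node exist; the attribute names mirror the TF fake-quant parameters noted in the header, and the range values are illustrative:

    auto *fq = g->nodes()->create<luci::CircleFakeQuant>();
    fq->inputs(feature); // single data input
    fq->min(-6.0f);      // illustrative quantization range
    fq->max(6.0f);
    fq->num_bits(8);
    fq->narrow_range(false);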
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleFill.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleFill.h
index bfc65274a..183794d41 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleFill.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleFill.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleFloor.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleFloor.h
index 7e10547b6..ce6807e98 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleFloor.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleFloor.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleFloorDiv.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleFloorDiv.h
index ba9db010c..bf76e37b6 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleFloorDiv.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleFloorDiv.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleFloorMod.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleFloorMod.h
index 4d13717a0..1af0af758 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleFloorMod.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleFloorMod.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h
index 952befc87..2862cadb2 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleFullyConnected.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -30,9 +30,9 @@ namespace luci
* @brief FULLY_CONNECTED in Circle
*/
class CircleFullyConnected final
- : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::FULLY_CONNECTED>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>,
- public LuciNodeMixin<LuciNodeTrait::Bias>
+ : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::FULLY_CONNECTED>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>,
+ public CircleNodeMixin<CircleNodeTrait::Bias>
{
public:
enum class WeightsFormat
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleGather.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleGather.h
index 1e8c4982a..78fa2fc28 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleGather.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleGather.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -42,7 +42,7 @@ public:
void axis(int32_t axis) { _axis = axis; }
private:
- int32_t _axis = 0;
+ int32_t _axis{0};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleGatherNd.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleGatherNd.h
index 3423a8216..d6f34f1ea 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleGatherNd.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleGatherNd.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleGreater.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleGreater.h
index 040a4e338..a03b6c749 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleGreater.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleGreater.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleGreaterEqual.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleGreaterEqual.h
index 82bdab212..e435320b2 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleGreaterEqual.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleGreaterEqual.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief GREATER EQUAL in Circle
*/
class CircleGreaterEqual final
- : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::GREATER_EQUAL>>
+ : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::GREATER_EQUAL>>
{
public:
loco::Node *x(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleIf.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleIf.h
index 2f9eac211..1c037a406 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleIf.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleIf.h
@@ -34,7 +34,7 @@ class CircleIf final : public VariadicArityNode<CircleNodeImpl<CircleOpcode::IF>
{
public:
CircleIf(uint32_t arity, uint32_t out)
- : VariadicArityNode<CircleNodeImpl<CircleOpcode::IF>>(arity + 1), _output_count(out)
+ : VariadicArityNode<CircleNodeImpl<CircleOpcode::IF>>(arity + 1), _output_count(out)
{
assert(arity > 0);
assert(out > 0);
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleIfOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleIfOut.h
index 3654e943b..5adaaa447 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleIfOut.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleIfOut.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleIfOut final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEIFOUT>>
{
public:
- CircleIfOut() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleInput.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleInput.h
index 4a7d36a4e..e0be9aa6e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleInput.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleInput.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
#include <loco/IR/DataTypeTraits.h>
#include <loco/IR/GraphInputIndex.h>
@@ -35,16 +35,13 @@ namespace luci
class CircleInput final : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CIRCLEINPUT>>
{
public:
- CircleInput() = default;
-
-public:
void index(const loco::GraphInputIndex &index);
loco::GraphInputIndex index(void) const;
bool indexed(void) const { return _index != -1; }
private:
- int64_t _index = -1; // Uninitialized
+ int64_t _index{-1}; // Uninitialized
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleInstanceNorm.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleInstanceNorm.h
index db0faa05e..65c34194d 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleInstanceNorm.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleInstanceNorm.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -30,8 +30,8 @@ namespace luci
* @brief INSTANCE_NORM in Circle
*/
class CircleInstanceNorm final
- : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::INSTANCE_NORM>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::INSTANCE_NORM>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
/// @note Currently only support FLOAT32 as input node
@@ -44,11 +44,12 @@ public:
loco::Node *beta(void) const { return at(2)->node(); }
void beta(loco::Node *node) { at(2)->node(node); }
+public:
float epsilon() const { return _epsilon; }
void epsilon(float epsilon) { _epsilon = epsilon; }
private:
- float _epsilon = 1e-05;
+ float _epsilon{1e-05};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleL2Normalize.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleL2Normalize.h
index efa932d95..eb2b372ce 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleL2Normalize.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleL2Normalize.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -30,8 +30,8 @@ namespace luci
* @brief L2_NORMALIZATION in Circle
*/
class CircleL2Normalize final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::L2_NORMALIZATION>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::L2_NORMALIZATION>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
loco::Node *x(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleL2Pool2D.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleL2Pool2D.h
index 7c76ee5d0..624d29e9e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleL2Pool2D.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleL2Pool2D.h
@@ -24,7 +24,7 @@
#include "luci/IR/AttrPadding.h"
#include "luci/IR/AttrStride.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -33,15 +33,13 @@ namespace luci
* @brief L2_POOL_2D in Circle
*/
class CircleL2Pool2D final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::L2_POOL_2D>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
- CircleL2Pool2D() : _padding(Padding::UNDEFINED) { /* empty */}
-
-public:
loco::Node *value(void) const { return at(0)->node(); }
void value(loco::Node *node) { at(0)->node(node); }
+public:
Padding padding() const { return _padding; }
void padding(Padding padding) { _padding = padding; }
@@ -52,7 +50,7 @@ public:
Stride *stride(void) { return &_stride; }
private:
- Padding _padding;
+ Padding _padding{Padding::UNDEFINED};
Stride _stride;
Filter _filter;
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLeakyRelu.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLeakyRelu.h
index d6ac97fc0..c8e93af91 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLeakyRelu.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLeakyRelu.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,17 +31,15 @@ namespace luci
class CircleLeakyRelu final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::LEAKY_RELU>>
{
public:
- CircleLeakyRelu() = default;
-
-public:
loco::Node *features(void) const { return at(0)->node(); }
void features(loco::Node *node) { at(0)->node(node); }
+public:
float alpha() const { return _alpha; }
void alpha(float alpha) { _alpha = alpha; }
private:
- float _alpha = 0.2f;
+ float _alpha{0.2f};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLess.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLess.h
index cd6cf1872..7adf67842 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLess.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLess.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLessEqual.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLessEqual.h
index 4c7c6a49b..eb8962494 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLessEqual.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLessEqual.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLocalResponseNormalization.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLocalResponseNormalization.h
index 8ad2b40fd..4d324700e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLocalResponseNormalization.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLocalResponseNormalization.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief LOCAL_RESPONSE_NORMALIZATION in Circle
*/
class CircleLocalResponseNormalization final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::LOCAL_RESPONSE_NORMALIZATION>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::LOCAL_RESPONSE_NORMALIZATION>>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLog.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLog.h
index aeb13fed9..2cc57ce2d 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLog.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLog.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogSoftmax.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogSoftmax.h
index 5dfd2c1f9..b73ff7c2a 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogSoftmax.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogSoftmax.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalAnd.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalAnd.h
index 975f6dbc7..9943c71cd 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalAnd.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalAnd.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalNot.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalNot.h
index 749dbe518..369a3e7bf 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalNot.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalNot.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalOr.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalOr.h
index 570be57af..c54ec3ebf 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalOr.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogicalOr.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogistic.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogistic.h
index 8328cb328..1f95e0f77 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleLogistic.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleLogistic.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleLogistic final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::LOGISTIC>>
{
public:
- CircleLogistic() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixDiag.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixDiag.h
index dca6538c3..f8bf259f9 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixDiag.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixDiag.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixSetDiag.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixSetDiag.h
index c1f5f3023..76aeaff40 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixSetDiag.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleMatrixSetDiag.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief MATRIX_SET_DIAG in Circle
*/
class CircleMatrixSetDiag final
- : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::MATRIX_SET_DIAG>>
+ : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::MATRIX_SET_DIAG>>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleMaxPool2D.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleMaxPool2D.h
index 1eb6532ff..557240d54 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleMaxPool2D.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleMaxPool2D.h
@@ -24,7 +24,7 @@
#include "luci/IR/AttrPadding.h"
#include "luci/IR/AttrStride.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -33,15 +33,13 @@ namespace luci
* @brief MAX_POOL_2D in Circle
*/
class CircleMaxPool2D final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::MAX_POOL_2D>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
- CircleMaxPool2D() : _padding(Padding::UNDEFINED) { /* empty */}
-
-public:
loco::Node *value(void) const { return at(0)->node(); }
void value(loco::Node *node) { at(0)->node(node); }
+public:
Padding padding() const { return _padding; }
void padding(Padding padding) { _padding = padding; }
@@ -52,7 +50,7 @@ public:
Stride *stride(void) { return &_stride; }
private:
- Padding _padding;
+ Padding _padding{Padding::UNDEFINED};
Stride _stride;
Filter _filter;
};
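This hunk also shows the commit's other two mechanical changes: the LuciNodeMixin/LuciNodeTrait templates are renamed to CircleNodeMixin/CircleNodeTrait to match the renamed header, and constructor-body initialization of _padding moves to an in-class member initializer. A self-contained sketch, with stand-in types rather than the real luci ones, of why the two initialization styles are equivalent:

// Sketch only: both classes construct with _padding == Padding::UNDEFINED.
enum class Padding { UNDEFINED, SAME, VALID };

class PoolBefore
{
public:
  PoolBefore() : _padding(Padding::UNDEFINED) { /* empty */ }

private:
  Padding _padding;
};

class PoolAfter
{
private:
  Padding _padding{Padding::UNDEFINED}; // same state, no user-declared constructor
};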
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleMaximum.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleMaximum.h
index 6f789bc14..317cea308 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleMaximum.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleMaximum.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleMean.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleMean.h
index 7f8aeb5aa..f56e4f4c0 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleMean.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleMean.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -42,7 +42,7 @@ public:
void keep_dims(bool keep_dims) { _keep_dims = keep_dims; }
private:
- bool _keep_dims = false;
+ bool _keep_dims{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleMinimum.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleMinimum.h
index 79d5a6f17..959d9c93b 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleMinimum.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleMinimum.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleMirrorPad.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleMirrorPad.h
index 68db8f6f3..c69e8f7c1 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleMirrorPad.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleMirrorPad.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
#include "luci/IR/AttrMirrorPadMode.h"
namespace luci
@@ -32,9 +32,6 @@ namespace luci
class CircleMirrorPad final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::MIRROR_PAD>>
{
public:
- CircleMirrorPad() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleMul.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleMul.h
index 67e897170..85ed694b3 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleMul.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleMul.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -30,7 +30,7 @@ namespace luci
* @brief MUL in Circle
*/
class CircleMul final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::MUL>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
loco::Node *x(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleNeg.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleNeg.h
index 4149ac4a7..adea3fb83 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleNeg.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleNeg.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h
index 69f3368c0..b47404bb0 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief NON_MAX_SUPPRESSION_V4 in Circle
*/
class CircleNonMaxSuppressionV4 final
- : public FixedArityNode<5, CircleNodeImpl<CircleOpcode::NON_MAX_SUPPRESSION_V4>>
+ : public FixedArityNode<5, CircleNodeImpl<CircleOpcode::NON_MAX_SUPPRESSION_V4>>
{
public:
loco::Node *boxes(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h
index a24dc3e9c..7e6923b5e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief Virtual NONMAXSUPPRESSIONV4OUT in Circle
*/
class CircleNonMaxSuppressionV4Out final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLENONMAXSUPPRESSIONV4OUT>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLENONMAXSUPPRESSIONV4OUT>>
{
public:
- CircleNonMaxSuppressionV4Out() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5.h
index 52d682147..77086ede7 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief NON_MAX_SUPPRESSION_V5 in Circle
*/
class CircleNonMaxSuppressionV5 final
- : public FixedArityNode<6, CircleNodeImpl<CircleOpcode::NON_MAX_SUPPRESSION_V5>>
+ : public FixedArityNode<6, CircleNodeImpl<CircleOpcode::NON_MAX_SUPPRESSION_V5>>
{
public:
loco::Node *boxes(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5Out.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5Out.h
index 0c6989cc7..63d061f11 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5Out.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV5Out.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief Virtual NONMAXSUPPRESSIONV5OUT in Circle
*/
class CircleNonMaxSuppressionV5Out final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLENONMAXSUPPRESSIONV5OUT>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLENONMAXSUPPRESSIONV5OUT>>
{
public:
- CircleNonMaxSuppressionV5Out() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleNotEqual.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleNotEqual.h
index cca7a5e22..add6a0747 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleNotEqual.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleNotEqual.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleOneHot.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleOneHot.h
index 665e01d48..b3eb0f436 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleOneHot.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleOneHot.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -48,7 +48,7 @@ public:
void axis(int32_t axis) { _axis = axis; }
private:
- int32_t _axis = -1;
+ int32_t _axis{-1};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleOutput.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleOutput.h
index 67e55f1a1..eb02f824e 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleOutput.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleOutput.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
#include <loco/IR/GraphOutputIndex.h>
@@ -34,8 +34,6 @@ namespace luci
class CircleOutput final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEOUTPUT>>
{
public:
- CircleOutput() = default;
-
void index(const loco::GraphOutputIndex &index);
loco::GraphOutputIndex index(void) const;
@@ -46,7 +44,7 @@ public:
void from(loco::Node *node) { at(0)->node(node); }
private:
- int64_t _index = -1; // Uninitialized
+ int64_t _index{-1}; // Uninitialized
};
/**
@@ -54,7 +52,7 @@ private:
*/
// TODO remove CircleOutputDummy
class CircleOutputDummy final
- : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CIRCLEOUTPUTDUMMY>>
+ : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CIRCLEOUTPUTDUMMY>>
{
public:
CircleOutputDummy() = default;
@@ -64,7 +62,7 @@ public:
 * @brief CircleOutputExclude is used to specify nodes that are not exported
*/
class CircleOutputExclude final
- : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CIRCLEOUTPUTEXCLUDE>>
+ : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CIRCLEOUTPUTEXCLUDE>>
{
public:
CircleOutputExclude() = default;
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CirclePRelu.h b/compiler/luci/lang/include/luci/IR/Nodes/CirclePRelu.h
index 693777512..3c5559db2 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CirclePRelu.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CirclePRelu.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CirclePRelu final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::PRELU>>
{
public:
- CirclePRelu() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CirclePad.h b/compiler/luci/lang/include/luci/IR/Nodes/CirclePad.h
index 31599bda0..ede217789 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CirclePad.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CirclePad.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CirclePad final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::PAD>>
{
public:
- CirclePad() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h b/compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h
index 563cfd9a4..644e2bb27 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CirclePadV2 final : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::PADV2>>
{
public:
- CirclePadV2() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CirclePow.h b/compiler/luci/lang/include/luci/IR/Nodes/CirclePow.h
index 006e3dd86..40c5a829d 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CirclePow.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CirclePow.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CirclePow final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::POW>>
{
public:
- CirclePow() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleRange.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleRange.h
index 977a37a52..56f8a2eba 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleRange.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleRange.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleRank.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleRank.h
index ba6d67f69..034f251bc 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleRank.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleRank.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceAny.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceAny.h
index 0456be863..c64dbbdf8 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceAny.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceAny.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -42,7 +42,7 @@ public:
void keep_dims(bool keep_dims) { _keep_dims = keep_dims; }
private:
- bool _keep_dims = false;
+ bool _keep_dims{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMax.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMax.h
index 925c977e5..97cbecd08 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMax.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMax.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -42,7 +42,7 @@ public:
void keep_dims(bool keep_dims) { _keep_dims = keep_dims; }
private:
- bool _keep_dims = false;
+ bool _keep_dims{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMin.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMin.h
index fd789ae5e..33708928f 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMin.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceMin.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -42,7 +42,7 @@ public:
void keep_dims(bool keep_dims) { _keep_dims = keep_dims; }
private:
- bool _keep_dims = false;
+ bool _keep_dims{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceProd.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceProd.h
index b7d226255..3689ee532 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceProd.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleReduceProd.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -42,7 +42,7 @@ public:
void keep_dims(bool keep_dims) { _keep_dims = keep_dims; }
private:
- bool _keep_dims = false;
+ bool _keep_dims{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleRelu.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleRelu.h
index 91272d2bf..6148caa03 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleRelu.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleRelu.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleRelu final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::RELU>>
{
public:
- CircleRelu() = default;
-
-public:
loco::Node *features(void) const { return at(0)->node(); }
void features(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleRelu6.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleRelu6.h
index b4274ded9..0fa25e873 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleRelu6.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleRelu6.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleRelu6 final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::RELU6>>
{
public:
- CircleRelu6() = default;
-
-public:
loco::Node *features(void) const { return at(0)->node(); }
void features(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleReluN1To1.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleReluN1To1.h
index a5c5710c2..13c0d166f 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleReluN1To1.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleReluN1To1.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleReluN1To1 final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::RELU_N1_TO_1>>
{
public:
- CircleReluN1To1() = default;
-
-public:
loco::Node *features(void) const { return at(0)->node(); }
void features(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleReshape.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleReshape.h
index b13144f7e..090df4044 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleReshape.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleReshape.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,14 +31,11 @@ namespace luci
class CircleReshape final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::RESHAPE>>
{
public:
- CircleReshape() = default;
-
-public:
loco::Node *tensor(void) const { return at(0)->node(); }
void tensor(loco::Node *node) { at(0)->node(node); }
// NOTE shape is optional and can be CircleConst or any other type
- // and also can be CircleOutputDummy when reshape option does not exist
+ // and also should be CircleOutputDummy when reshape option does not exist
loco::Node *shape(void) const { return at(1)->node(); }
void shape(loco::Node *node) { at(1)->node(node); }
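The comment change from "can" to "should" records a convention rather than a mere possibility: when no reshape option exists, input 1 is expected to point at a CircleOutputDummy instead of being left unset. A minimal sketch of that wiring, assuming loco's graph-side node factory (g->nodes()->create<T>()) and the include paths from this diff:

#include <loco.h>                        // assumed umbrella header
#include "luci/IR/Nodes/CircleReshape.h" // assumed include path, per this diff
#include "luci/IR/Nodes/CircleOutput.h"  // declares CircleOutputDummy

void sketch(loco::Graph *g, luci::CircleReshape *reshape)
{
  // No reshape option present: use CircleOutputDummy for the optional shape
  // input, per the tightened comment, rather than leaving it dangling.
  auto *dummy = g->nodes()->create<luci::CircleOutputDummy>();
  reshape->shape(dummy);
}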
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleResizeBilinear.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleResizeBilinear.h
index 3c8223338..091916a2b 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleResizeBilinear.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleResizeBilinear.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,18 +29,16 @@ namespace luci
* @brief RESIZE_BILINEAR in Circle
*/
class CircleResizeBilinear final
- : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::RESIZE_BILINEAR>>
+ : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::RESIZE_BILINEAR>>
{
public:
- CircleResizeBilinear() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
loco::Node *size(void) const { return at(1)->node(); }
void size(loco::Node *node) { at(1)->node(node); }
+public:
bool align_corners() const { return _align_corners; }
void align_corners(bool value) { _align_corners = value; }
@@ -48,8 +46,8 @@ public:
void half_pixel_centers(bool value) { _half_pixel_centers = value; }
private:
- bool _align_corners = false;
- bool _half_pixel_centers = false;
+ bool _align_corners{false};
+ bool _half_pixel_centers{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleResizeNearestNeighbor.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleResizeNearestNeighbor.h
index dc32ebee7..ab880d767 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleResizeNearestNeighbor.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleResizeNearestNeighbor.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,23 +29,21 @@ namespace luci
* @brief RESIZE_NEAREST_NEIGHBOR in Circle
*/
class CircleResizeNearestNeighbor final
- : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::RESIZE_NEAREST_NEIGHBOR>>
+ : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::RESIZE_NEAREST_NEIGHBOR>>
{
public:
- CircleResizeNearestNeighbor() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
loco::Node *size(void) const { return at(1)->node(); }
void size(loco::Node *node) { at(1)->node(node); }
+public:
bool align_corners() const { return _align_corners; }
void align_corners(bool value) { _align_corners = value; }
private:
- bool _align_corners = false;
+ bool _align_corners{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleReverseSequence.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleReverseSequence.h
index b0766dd3e..5f089a768 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleReverseSequence.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleReverseSequence.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief REVERSE_SEQUENCE in Circle
*/
class CircleReverseSequence final
- : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::REVERSE_SEQUENCE>>
+ : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::REVERSE_SEQUENCE>>
{
public:
- CircleReverseSequence() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
@@ -42,15 +39,15 @@ public:
void seq_lengths(loco::Node *node) { at(1)->node(node); }
public:
- int seq_axis(void) const { return _seq_axis; }
- void seq_axis(int seq_axis) { _seq_axis = seq_axis; }
+ int32_t seq_axis(void) const { return _seq_axis; }
+ void seq_axis(int32_t seq_axis) { _seq_axis = seq_axis; }
- int batch_axis(void) const { return _batch_axis; }
- void batch_axis(int batch_axis) { _batch_axis = batch_axis; }
+ int32_t batch_axis(void) const { return _batch_axis; }
+ void batch_axis(int32_t batch_axis) { _batch_axis = batch_axis; }
private:
- int _seq_axis{0};
- int _batch_axis{0};
+ int32_t _seq_axis{0};
+ int32_t _batch_axis{0};
};
} // namespace luci
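Here the accessor and attribute types also change from int to int32_t. A minimal sketch of the rationale, assuming (as the Circle schema suggests) that these axes are serialized as 32-bit fields: int is only guaranteed to be at least 16 bits, so the fixed-width type documents the on-disk contract.

#include <cstdint>

// Sketch only, with a stand-in class: fixed-width accessors make the
// serialized attribute width explicit and platform-independent.
class ReverseSequenceLike
{
public:
  int32_t seq_axis(void) const { return _seq_axis; }
  void seq_axis(int32_t seq_axis) { _seq_axis = seq_axis; }

private:
  int32_t _seq_axis{0};
};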
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleReverseV2.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleReverseV2.h
index 71d9f65aa..96b6a793d 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleReverseV2.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleReverseV2.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleRound.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleRound.h
index 30296ce9e..e340266ed 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleRound.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleRound.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleRound final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::ROUND>>
{
public:
- CircleRound() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleRsqrt.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleRsqrt.h
index 873397bce..7907f326b 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleRsqrt.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleRsqrt.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleRsqrt final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::RSQRT>>
{
public:
- CircleRsqrt() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleScatterNd.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleScatterNd.h
index 9f93a0a80..fda3abafc 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleScatterNd.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleScatterNd.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSegmentSum.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSegmentSum.h
index 416d617b2..e7227e9ee 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSegmentSum.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSegmentSum.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleSegmentSum final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::SEGMENT_SUM>>
{
public:
- CircleSegmentSum() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSelect.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSelect.h
index 727647168..6f778d72d 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSelect.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSelect.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleSelect final : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::SELECT>>
{
public:
- CircleSelect() = default;
-
-public:
loco::Node *condition(void) const { return at(0)->node(); }
void condition(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSelectV2.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSelectV2.h
index 7ac3c0524..7969cc2aa 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSelectV2.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSelectV2.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleSelectV2 final : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::SELECT_V2>>
{
public:
- CircleSelectV2() = default;
-
-public:
loco::Node *condition(void) const { return at(0)->node(); }
void condition(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleShape.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleShape.h
index ff20ce684..903894dbd 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleShape.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleShape.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleShape final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::SHAPE>>
{
public:
- CircleShape() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSin.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSin.h
index 5624db253..25dc18b0d 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSin.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSin.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSlice.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSlice.h
index a2113643d..98556d7a6 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSlice.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSlice.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSoftmax.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSoftmax.h
index 7166a329b..d10cb1682 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSoftmax.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSoftmax.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToBatchND.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToBatchND.h
index 042ebffcd..ef715c6d0 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToBatchND.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToBatchND.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief SPACE_TO_BATCH_ND in Circle
*/
class CircleSpaceToBatchND final
- : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::SPACE_TO_BATCH_ND>>
+ : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::SPACE_TO_BATCH_ND>>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToDepth.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToDepth.h
index 420a4cb96..387e0d80f 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToDepth.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSpaceToDepth.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,18 +29,18 @@ namespace luci
* @brief SPACE_TO_DEPTH in Circle
*/
class CircleSpaceToDepth final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::SPACE_TO_DEPTH>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::SPACE_TO_DEPTH>>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
public:
- int block_size(void) const { return _block_size; }
- void block_size(int block_size) { _block_size = block_size; }
+ int32_t block_size(void) const { return _block_size; }
+ void block_size(int32_t block_size) { _block_size = block_size; }
private:
- int _block_size{0};
+ int32_t _block_size{0};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSparseToDense.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSparseToDense.h
index 7e80304b0..94a20c064 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSparseToDense.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSparseToDense.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief SPARSE_TO_DENSE in Circle
*/
class CircleSparseToDense final
- : public FixedArityNode<4, CircleNodeImpl<CircleOpcode::SPARSE_TO_DENSE>>
+ : public FixedArityNode<4, CircleNodeImpl<CircleOpcode::SPARSE_TO_DENSE>>
{
public:
loco::Node *indices(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSplit.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSplit.h
index 0eda19501..0cb953131 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSplit.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSplit.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitOut.h
index 6bf4a9fef..a507740e4 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitOut.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitOut.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleSplitOut final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLESPLITOUT>>
{
public:
- CircleSplitOut() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitV.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitV.h
index 1b7d55534..cb02cbbcf 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitV.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitV.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitVOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitVOut.h
index d3b2f1e5a..adf79f30c 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitVOut.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSplitVOut.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief Virtual CIRCLESPLITVOUT in Circle
*/
class CircleSplitVOut final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLESPLITVOUT>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLESPLITVOUT>>
{
public:
- CircleSplitVOut() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSqrt.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSqrt.h
index c96ca8498..b76bd1ad5 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSqrt.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSqrt.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleSqrt final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::SQRT>>
{
public:
- CircleSqrt() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSquare.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSquare.h
index a29edfe82..3f9228b3b 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSquare.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSquare.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleSquare final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::SQUARE>>
{
public:
- CircleSquare() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSquaredDifference.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSquaredDifference.h
index b5b39f920..355c9f3d3 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSquaredDifference.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSquaredDifference.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief SQUARED_DIFFERENCE in Circle
*/
class CircleSquaredDifference final
- : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::SQUARED_DIFFERENCE>>
+ : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::SQUARED_DIFFERENCE>>
{
public:
- CircleSquaredDifference() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSqueeze.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSqueeze.h
index f175f1411..ba71ff217 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSqueeze.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSqueeze.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleSqueeze final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::SQUEEZE>>
{
public:
- CircleSqueeze() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleStridedSlice.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleStridedSlice.h
index 98799fec1..6a4155ef1 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleStridedSlice.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleStridedSlice.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,7 +29,7 @@ namespace luci
* @brief STRIDED_SLICE in Circle
*/
class CircleStridedSlice final
- : public FixedArityNode<4, CircleNodeImpl<CircleOpcode::STRIDED_SLICE>>
+ : public FixedArityNode<4, CircleNodeImpl<CircleOpcode::STRIDED_SLICE>>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSub.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSub.h
index 08208f942..d9aaa44e5 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSub.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSub.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -30,12 +30,9 @@ namespace luci
* @brief SUB in Circle
*/
class CircleSub final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::SUB>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
- CircleSub() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleSum.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleSum.h
index 21faa76fe..a72e18f54 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleSum.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleSum.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleTanh.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleTanh.h
index f7444921f..2036a7301 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleTanh.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleTanh.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleTanh final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::TANH>>
{
public:
- CircleTanh() = default;
-
-public:
loco::Node *x(void) const { return at(0)->node(); }
void x(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleTile.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleTile.h
index 96e1f69c6..1ec2f5e82 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleTile.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleTile.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleTile final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::TILE>>
{
public:
- CircleTile() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2.h
index 3b2b5abb7..0bf78c3ee 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleTopKV2 final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::TOPK_V2>>
{
public:
- CircleTopKV2() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2Out.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2Out.h
index 5a6dd0c02..f1a6b4a41 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2Out.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleTopKV2Out.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief Virtual CIRCLETOPKV2OUT in Circle
*/
class CircleTopKV2Out final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLETOPKV2OUT>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLETOPKV2OUT>>
{
public:
- CircleTopKV2Out() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleTranspose.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleTranspose.h
index 095cd6746..72ce0738c 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleTranspose.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleTranspose.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,13 +31,7 @@ namespace luci
class CircleTranspose final : public FixedArityNode<2, CircleNodeImpl<CircleOpcode::TRANSPOSE>>
{
public:
- CircleTranspose() = default;
-
-public:
- /// @brief Get the input node to transpose
loco::Node *a(void) const { return at(0)->node(); }
-
- /// @brief Set the input node to transpose
void a(loco::Node *node) { at(0)->node(node); }
loco::Node *perm(void) const { return at(1)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleTransposeConv.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleTransposeConv.h
index e355102d6..5ae41c0c4 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleTransposeConv.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleTransposeConv.h
@@ -22,7 +22,7 @@
#include "luci/IR/AttrPadding.h"
#include "luci/IR/AttrStride.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -34,8 +34,8 @@ namespace luci
 * 'out' actually means 'out' and 'in' of this node.
*/
class CircleTransposeConv final
- : public FixedArityNode<4, CircleNodeImpl<CircleOpcode::TRANSPOSE_CONV>>,
- public LuciNodeMixin<LuciNodeTrait::Bias>
+ : public FixedArityNode<4, CircleNodeImpl<CircleOpcode::TRANSPOSE_CONV>>,
+ public CircleNodeMixin<CircleNodeTrait::Bias>
{
public:
loco::Node *inputSizes(void) const { return at(0)->node(); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h
index 4352b045b..faf0ec94d 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnidirectionalSequenceLSTM.h
@@ -21,7 +21,7 @@
#include "luci/IR/CircleOpcode.h"
#include "luci/IR/AttrFusedActFunc.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -30,8 +30,8 @@ namespace luci
* @brief UNIDIRECTIONAL_SEQUENCE_LSTM in Circle
*/
class CircleUnidirectionalSequenceLSTM final
- : public FixedArityNode<24, CircleNodeImpl<CircleOpcode::UNIDIRECTIONAL_SEQUENCE_LSTM>>,
- public LuciNodeMixin<LuciNodeTrait::FusedActFunc>
+ : public FixedArityNode<24, CircleNodeImpl<CircleOpcode::UNIDIRECTIONAL_SEQUENCE_LSTM>>,
+ public CircleNodeMixin<CircleNodeTrait::FusedActFunc>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
@@ -104,10 +104,10 @@ public:
}
private:
- float _cell_clip = 0.0f;
- float _proj_clip = 0.0f;
- bool _time_major = false;
- bool _asymmetric_quantize_inputs = false;
+ float _cell_clip{0.0f};
+ float _proj_clip{0.0f};
+ bool _time_major{false};
+ bool _asymmetric_quantize_inputs{false};
};
} // namespace luci
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h
index 719a72362..2dd48b2f9 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -36,7 +36,7 @@ public:
public:
loco::DataType idx_out_type(void) const { return _idx_out_type; }
- void output_type(loco::DataType ot) { _idx_out_type = ot; }
+ void idx_out_type(loco::DataType ot) { _idx_out_type = ot; }
private:
loco::DataType _idx_out_type{loco::DataType::S32};
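Unlike the surrounding renames, this hunk fixes an API asymmetry: the setter was spelled output_type() while its getter was idx_out_type(), so the pair did not form the getter/setter overload set used throughout these headers. A minimal usage sketch, assuming the include path from this diff and that CircleUnique is default-constructible like its siblings:

#include <cassert>
#include "luci/IR/Nodes/CircleUnique.h" // assumed include path, per this diff

void sketch(void)
{
  luci::CircleUnique unique_node;
  unique_node.idx_out_type(loco::DataType::S64);             // setter, formerly output_type()
  assert(unique_node.idx_out_type() == loco::DataType::S64); // matching getter
}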
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h
index f846403e0..233351860 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief Virtual CIRCLEUNIQUEOUT in Circle
*/
class CircleUniqueOut final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEUNIQUEOUT>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEUNIQUEOUT>>
{
public:
- CircleUniqueOut() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnpack.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnpack.h
index cb91d7e6a..fd0c66ce0 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnpack.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnpack.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleUnpack final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::UNPACK>>
{
public:
- CircleUnpack() = default;
-
-public:
loco::Node *value(void) const { return at(0)->node(); }
void value(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnpackOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnpackOut.h
index 6f24578a1..640d2f1bb 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnpackOut.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnpackOut.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -29,12 +29,9 @@ namespace luci
* @brief Virtual CIRCLEUNPACKOUT in Circle
*/
class CircleUnpackOut final
- : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEUNPACKOUT>>
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEUNPACKOUT>>
{
public:
- CircleUnpackOut() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleWhere.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleWhere.h
index 51eda3d6e..8895bcbbd 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleWhere.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleWhere.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
#include <cassert>
@@ -33,9 +33,6 @@ namespace luci
class CircleWhere final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::WHERE>>
{
public:
- CircleWhere() = default;
-
-public:
loco::Node *condition() const { return at(0)->node(); }
void condition(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleWhile.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleWhile.h
index 40ec96414..f4154d3ab 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleWhile.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleWhile.h
@@ -34,7 +34,7 @@ class CircleWhile final : public VariadicArityNode<CircleNodeImpl<CircleOpcode::
{
public:
CircleWhile(uint32_t arity, uint32_t out)
- : VariadicArityNode<CircleNodeImpl<CircleOpcode::WHILE>>(arity), _output_count(out)
+ : VariadicArityNode<CircleNodeImpl<CircleOpcode::WHILE>>(arity), _output_count(out)
{
assert(arity > 0);
assert(out > 0);
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleWhileOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleWhileOut.h
index cdf617848..98efc21e5 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleWhileOut.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleWhileOut.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,9 +31,6 @@ namespace luci
class CircleWhileOut final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEWHILEOUT>>
{
public:
- CircleWhileOut() = default;
-
-public:
loco::Node *input(void) const { return at(0)->node(); }
void input(loco::Node *node) { at(0)->node(node); }
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleZerosLike.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleZerosLike.h
index d3b6d272a..9302facd0 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleZerosLike.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleZerosLike.h
@@ -20,7 +20,7 @@
#include "luci/IR/CircleNodeDecl.h"
#include "luci/IR/CircleOpcode.h"
-#include "luci/IR/LuciNodeMixins.h"
+#include "luci/IR/CircleNodeMixins.h"
namespace luci
{
@@ -31,13 +31,7 @@ namespace luci
class CircleZerosLike final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::ZEROS_LIKE>>
{
public:
- CircleZerosLike() = default;
-
-public:
- /// @brief Get the input node
loco::Node *input(void) const { return at(0)->node(); }
-
- /// @brief Set the input node
void input(loco::Node *node) { at(0)->node(node); }
};
diff --git a/compiler/luci/lang/include/luci/IR/SparsityParam.h b/compiler/luci/lang/include/luci/IR/SparsityParam.h
index f471e5ef9..6cfff67e1 100644
--- a/compiler/luci/lang/include/luci/IR/SparsityParam.h
+++ b/compiler/luci/lang/include/luci/IR/SparsityParam.h
@@ -44,7 +44,7 @@ class SparseIndexVector
public:
SparseIndexVector() = default;
SparseIndexVector(const SparseIndexVectorType &type, const std::vector<int32_t> &sparse_index_vec)
- : _type{type}
+ : _type{type}
{
switch (type)
{
@@ -53,7 +53,7 @@ public:
case SparseIndexVectorType::I32:
{
_vec_ptr = static_cast<void *>(
- new std::vector<int32_t>(sparse_index_vec.begin(), sparse_index_vec.end()));
+ new std::vector<int32_t>(sparse_index_vec.begin(), sparse_index_vec.end()));
break;
}
case SparseIndexVectorType::U16:
@@ -90,21 +90,21 @@ public:
case SparseIndexVectorType::I32:
{
const std::vector<int32_t> *vec =
- static_cast<const std::vector<int32_t> *>(sparse_index_vec);
+ static_cast<const std::vector<int32_t> *>(sparse_index_vec);
_vec_ptr = static_cast<void *>(new std::vector<int32_t>(vec->begin(), vec->end()));
break;
}
case SparseIndexVectorType::U16:
{
const std::vector<uint16_t> *vec =
- static_cast<const std::vector<uint16_t> *>(sparse_index_vec);
+ static_cast<const std::vector<uint16_t> *>(sparse_index_vec);
_vec_ptr = static_cast<void *>(new std::vector<uint16_t>(vec->begin(), vec->end()));
break;
}
case SparseIndexVectorType::U8:
{
const std::vector<uint8_t> *vec =
- static_cast<const std::vector<uint8_t> *>(sparse_index_vec);
+ static_cast<const std::vector<uint8_t> *>(sparse_index_vec);
_vec_ptr = static_cast<void *>(new std::vector<uint8_t>(vec->begin(), vec->end()));
break;
}
@@ -114,12 +114,12 @@ public:
}
SparseIndexVector(const SparseIndexVector &sparse_index_vec)
- : SparseIndexVector(sparse_index_vec._type, sparse_index_vec._vec_ptr)
+ : SparseIndexVector(sparse_index_vec._type, sparse_index_vec._vec_ptr)
{
}
SparseIndexVector(SparseIndexVector &&sparse_index_vec)
- : _type{sparse_index_vec._type}, _vec_ptr{std::exchange(sparse_index_vec._vec_ptr, nullptr)}
+ : _type{sparse_index_vec._type}, _vec_ptr{std::exchange(sparse_index_vec._vec_ptr, nullptr)}
{
}
@@ -178,8 +178,8 @@ public:
const std::vector<uint16_t> *as_uint16_vector(void) const
{
return _type == SparseIndexVectorType::U16
- ? static_cast<const std::vector<uint16_t> *>(_vec_ptr)
- : nullptr;
+ ? static_cast<const std::vector<uint16_t> *>(_vec_ptr)
+ : nullptr;
}
const std::vector<uint8_t> *as_uint8_vector(void) const
{
@@ -202,8 +202,8 @@ public:
}
DimMetaData(DimensionType format, int32_t dense_size, const SparseIndexVector &array_segments,
const SparseIndexVector &array_indices)
- : _format{format}, _dense_size{dense_size}, _array_segments{array_segments},
- _array_indices{array_indices}
+ : _format{format}, _dense_size{dense_size}, _array_segments{array_segments}, _array_indices{
+ array_indices}
{
// DO NOTHING
}
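As a minimal usage sketch of the SparseIndexVector API whose formatting this hunk adjusts, assuming only the constructors and accessors visible above:

    #include <luci/IR/SparsityParam.h>

    #include <cassert>
    #include <vector>

    void sparse_index_vector_sketch(void)
    {
      // The constructor copies the int32 data into an internal
      // std::vector<uint16_t>, following the switch shown above.
      std::vector<int32_t> segments{0, 2, 4};
      luci::SparseIndexVector siv(luci::SparseIndexVectorType::U16, segments);

      // as_uint16_vector() hands back the payload only when the tag matches;
      // for any other tag it returns nullptr, as the diff shows.
      const std::vector<uint16_t> *u16 = siv.as_uint16_vector();
      assert(u16 != nullptr && u16->size() == 3);
      assert(siv.as_uint8_vector() == nullptr);
    }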
diff --git a/compiler/luci/lang/src/CircleDialect.cpp b/compiler/luci/lang/src/CircleDialect.cpp
index 42ca3c917..0d315fc55 100644
--- a/compiler/luci/lang/src/CircleDialect.cpp
+++ b/compiler/luci/lang/src/CircleDialect.cpp
@@ -15,6 +15,7 @@
*/
#include "luci/IR/CircleDialect.h"
+#include "luci/IR/DeadNodeQueryService.h"
#include "luci/IR/Nodes/CircleInput.h"
#include "luci/IR/Nodes/CircleOutput.h"
@@ -22,8 +23,6 @@
#include <loco/IR/GraphInputIndex.h>
#include <loco/IR/GraphOutputIndex.h>
-#include "DeadNodeQueryService.h"
-
#include <cassert>
#include <memory>
diff --git a/compiler/luci/lang/src/LuciNodeMixins.cpp b/compiler/luci/lang/src/CircleNodeMixins.cpp
index 660cbe1a5..f72178df5 100644
--- a/compiler/luci/lang/src/LuciNodeMixins.cpp
+++ b/compiler/luci/lang/src/CircleNodeMixins.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,5 +14,5 @@
* limitations under the License.
*/
-// This is to validate LuciNodeMixins.h
-#include "luci/IR/LuciNodeMixins.h"
+// This is to validate CircleNodeMixins.h
+#include "luci/IR/CircleNodeMixins.h"
diff --git a/compiler/luci/lang/src/CircleNodes.cpp b/compiler/luci/lang/src/CircleNodes.cpp
index c77c06861..2c2688c9e 100644
--- a/compiler/luci/lang/src/CircleNodes.cpp
+++ b/compiler/luci/lang/src/CircleNodes.cpp
@@ -23,31 +23,6 @@
namespace luci
{
-void set_new_shape(CircleReshape *node, int32_t *base, uint32_t size)
-{
- // Check node does not have both of new shape infos
- LUCI_ASSERT(node->shape() == nullptr, "node already has shape input");
- LUCI_ASSERT(node->newShape()->rank() == 0, "node already has newShape attribute");
-
- const loco::DataType S32 = loco::DataType::S32;
-
- // Set 2nd input as CircleConst
- auto const_shape_node = node->graph()->nodes()->create<CircleConst>();
- const_shape_node->rank(1);
- const_shape_node->dim(0) = size;
- const_shape_node->dtype(S32);
- const_shape_node->size<S32>(size);
- const_shape_node->shape_status(luci::ShapeStatus::VALID);
- for (uint32_t axis = 0; axis < size; ++axis)
- const_shape_node->at<S32>(axis) = base[axis];
- node->shape(const_shape_node);
-
- // Set newShape attribute
- node->newShape()->rank(size);
- for (uint32_t axis = 0; axis < size; ++axis)
- node->newShape()->dim(axis) = base[axis];
-}
-
void link(loco::GraphOutput *output, CircleOutput *node) { node->index(output->index()); }
CircleOutput *output_node(loco::Graph *g, const loco::GraphOutputIndex &index)
diff --git a/compiler/luci/lang/src/DeadNodeQueryService.cpp b/compiler/luci/lang/src/DeadNodeQueryService.cpp
index a22574c94..7dac08b5f 100644
--- a/compiler/luci/lang/src/DeadNodeQueryService.cpp
+++ b/compiler/luci/lang/src/DeadNodeQueryService.cpp
@@ -14,9 +14,8 @@
* limitations under the License.
*/
-#include "DeadNodeQueryService.h"
-
#include "luci/IR/CircleNodeVisitor.h"
+#include "luci/IR/DeadNodeQueryService.h"
#include <loco/IR/Graph.h>
diff --git a/compiler/luci/lang/src/Nodes/CircleBatchMatMul.test.cpp b/compiler/luci/lang/src/Nodes/CircleBatchMatMul.test.cpp
index d7712c8dd..3859d7fca 100644
--- a/compiler/luci/lang/src/Nodes/CircleBatchMatMul.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleBatchMatMul.test.cpp
@@ -26,7 +26,7 @@ TEST(CircleBatchMatMulTest, constructor)
luci::CircleBatchMatMul batchmatmul_node;
ASSERT_EQ(luci::CircleDialect::get(), batchmatmul_node.dialect());
- ASSERT_EQ(luci::CircleOpcode::BATCHMATMUL, batchmatmul_node.opcode());
+ ASSERT_EQ(luci::CircleOpcode::BATCH_MATMUL, batchmatmul_node.opcode());
ASSERT_EQ(nullptr, batchmatmul_node.x());
ASSERT_EQ(nullptr, batchmatmul_node.y());
diff --git a/compiler/luci/lang/src/Nodes/CircleBidrectionalSequenceLSTM.test.cpp b/compiler/luci/lang/src/Nodes/CircleBidrectionalSequenceLSTM.test.cpp
new file mode 100644
index 000000000..3f13422e5
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleBidrectionalSequenceLSTM.test.cpp
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleBidirectionalSequenceLSTM.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+TEST(CircleBidirectionalSequenceLSTMTest, constructor_P)
+{
+ luci::CircleBidirectionalSequenceLSTM trc_node;
+
+ ASSERT_EQ(luci::CircleDialect::get(), trc_node.dialect());
+ ASSERT_EQ(luci::CircleOpcode::BIDIRECTIONAL_SEQUENCE_LSTM, trc_node.opcode());
+
+ ASSERT_EQ(nullptr, trc_node.input());
+
+ ASSERT_EQ(nullptr, trc_node.fw_input_to_input_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_input_to_forget_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_input_to_cell_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_input_to_output_weights());
+
+ ASSERT_EQ(nullptr, trc_node.fw_recurrent_to_input_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_recurrent_to_forget_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_recurrent_to_cell_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_recurrent_to_output_weights());
+
+ ASSERT_EQ(nullptr, trc_node.fw_cell_to_input_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_cell_to_forget_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_cell_to_output_weights());
+
+ ASSERT_EQ(nullptr, trc_node.fw_input_gate_bias());
+ ASSERT_EQ(nullptr, trc_node.fw_forget_gate_bias());
+ ASSERT_EQ(nullptr, trc_node.fw_cell_gate_bias());
+ ASSERT_EQ(nullptr, trc_node.fw_output_gate_bias());
+
+ ASSERT_EQ(nullptr, trc_node.fw_projection_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_projection_bias());
+
+ ASSERT_EQ(nullptr, trc_node.bw_input_to_input_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_input_to_forget_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_input_to_cell_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_input_to_output_weights());
+
+ ASSERT_EQ(nullptr, trc_node.bw_recurrent_to_input_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_recurrent_to_forget_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_recurrent_to_cell_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_recurrent_to_output_weights());
+
+ ASSERT_EQ(nullptr, trc_node.bw_cell_to_input_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_cell_to_forget_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_cell_to_output_weights());
+
+ ASSERT_EQ(nullptr, trc_node.bw_input_gate_bias());
+ ASSERT_EQ(nullptr, trc_node.bw_forget_gate_bias());
+ ASSERT_EQ(nullptr, trc_node.bw_cell_gate_bias());
+ ASSERT_EQ(nullptr, trc_node.bw_output_gate_bias());
+
+ ASSERT_EQ(nullptr, trc_node.bw_projection_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_projection_bias());
+
+ ASSERT_EQ(nullptr, trc_node.fw_activation_state());
+ ASSERT_EQ(nullptr, trc_node.fw_cell_state());
+ ASSERT_EQ(nullptr, trc_node.bw_activation_state());
+ ASSERT_EQ(nullptr, trc_node.bw_cell_state());
+
+ ASSERT_EQ(nullptr, trc_node.auxillary_input());
+ ASSERT_EQ(nullptr, trc_node.fw_auxillary_input_to_input_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_auxillary_input_to_forget_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_auxillary_input_to_cell_weights());
+ ASSERT_EQ(nullptr, trc_node.fw_auxillary_input_to_output_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_auxillary_input_to_input_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_auxillary_input_to_forget_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_auxillary_input_to_cell_weights());
+ ASSERT_EQ(nullptr, trc_node.bw_auxillary_input_to_output_weights());
+
+ ASSERT_EQ(luci::FusedActFunc::UNDEFINED, trc_node.fusedActivationFunction());
+ ASSERT_EQ(0.f, trc_node.cell_clip());
+ ASSERT_EQ(0.f, trc_node.proj_clip());
+ ASSERT_EQ(false, trc_node.merge_outputs());
+ ASSERT_EQ(false, trc_node.time_major());
+ ASSERT_EQ(false, trc_node.asymmetric_quantize_inputs());
+}
+
+TEST(CircleBidirectionalSequenceLSTMTest, arity_NEG)
+{
+ luci::CircleBidirectionalSequenceLSTM trc_node;
+
+ ASSERT_NO_THROW(trc_node.arg(36));
+ ASSERT_THROW(trc_node.arg(48), std::out_of_range);
+}
+
+TEST(CircleBidirectionalSequenceLSTMTest, visit_mutable_NEG)
+{
+ struct TestVisitor final : public luci::CircleNodeMutableVisitor<void>
+ {
+ };
+
+ luci::CircleBidirectionalSequenceLSTM trc_node;
+
+ TestVisitor tv;
+ ASSERT_THROW(trc_node.accept(&tv), std::exception);
+}
+
+TEST(CircleBidirectionalSequenceLSTMTest, visit_NEG)
+{
+ struct TestVisitor final : public luci::CircleNodeVisitor<void>
+ {
+ };
+
+ luci::CircleBidirectionalSequenceLSTM trc_node;
+
+ TestVisitor tv;
+ ASSERT_THROW(trc_node.accept(&tv), std::exception);
+}
diff --git a/compiler/luci/lang/src/Nodes/CircleConst.test.cpp b/compiler/luci/lang/src/Nodes/CircleConst.test.cpp
new file mode 100644
index 000000000..a81f4b00d
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleConst.test.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleConst.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+TEST(CircleConstTest, constructor)
+{
+ luci::CircleConst const_node;
+
+ ASSERT_EQ(luci::CircleDialect::get(), const_node.dialect());
+ ASSERT_EQ(luci::CircleOpcode::CIRCLECONST, const_node.opcode());
+}
+
+TEST(CircleConstTest, dtype_size)
+{
+ luci::CircleConst const_node;
+
+ const_node.dtype(loco::DataType::S32);
+ const_node.size<loco::DataType::S32>(1);
+
+ ASSERT_EQ(loco::DataType::S32, const_node.dtype());
+ ASSERT_EQ(1, const_node.size<loco::DataType::S32>());
+}
+
+TEST(CircleConstTest, scalar)
+{
+ luci::CircleConst const_node;
+
+ const_node.dtype(loco::DataType::S32);
+ const_node.size<loco::DataType::S32>(1);
+ const_node.scalar<loco::DataType::S32>() = 1;
+
+ auto const &cs = const_node.scalar<loco::DataType::S32>();
+ ASSERT_EQ(1, cs);
+}
diff --git a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
index c07268cbf..76b70f38b 100644
--- a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
@@ -22,7 +22,7 @@
TEST(CircleCustomTest, constructor)
{
- luci::CircleCustom custom_node(2);
+ luci::CircleCustom custom_node(2, 1);
ASSERT_EQ(luci::CircleDialect::get(), custom_node.dialect());
ASSERT_EQ(luci::CircleOpcode::CUSTOM, custom_node.opcode());
@@ -33,18 +33,19 @@ TEST(CircleCustomTest, constructor)
ASSERT_EQ(2, custom_node.numInputs());
ASSERT_EQ(0, custom_node.custom_code().size());
+ ASSERT_EQ(1, custom_node.numOutputs());
}
TEST(CircleCustomTest, constructor_NEG)
{
- ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, "");
+ ASSERT_DEBUG_DEATH(luci::CircleCustom(0, 0), "");
SUCCEED();
}
TEST(CircleCustomTest, invalidIndex_NEG)
{
- luci::CircleCustom custom_node(2);
+ luci::CircleCustom custom_node(2, 1);
EXPECT_ANY_THROW(custom_node.arg(5));
}
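A minimal sketch of the updated constructor contract, grounded in the assertions above: input arity and output count are now separate arguments.

    #include <luci/IR/Nodes/CircleCustom.h>

    #include <cassert>

    void custom_ctor_sketch(void)
    {
      luci::CircleCustom custom_node(2 /* inputs */, 1 /* outputs */);
      assert(custom_node.numInputs() == 2);
      assert(custom_node.numOutputs() == 1);
    }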
diff --git a/compiler/luci/lang/src/Nodes/CircleFakeQuant.test.cpp b/compiler/luci/lang/src/Nodes/CircleFakeQuant.test.cpp
new file mode 100644
index 000000000..912e40570
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleFakeQuant.test.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleFakeQuant.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+TEST(CircleFakeQuantTest, constructor_P)
+{
+ luci::CircleFakeQuant fakequant;
+
+ ASSERT_EQ(fakequant.dialect(), luci::CircleDialect::get());
+ ASSERT_EQ(fakequant.opcode(), luci::CircleOpcode::FAKE_QUANT);
+
+ ASSERT_EQ(nullptr, fakequant.inputs());
+ ASSERT_EQ(0.0f, fakequant.min());
+ ASSERT_EQ(0.0f, fakequant.max());
+ ASSERT_EQ(0, fakequant.num_bits());
+ ASSERT_FALSE(fakequant.narrow_range());
+}
diff --git a/compiler/luci/logex/src/FormattedGraph.cpp b/compiler/luci/logex/src/FormattedGraph.cpp
index b2b9cb72b..f1337e3e6 100644
--- a/compiler/luci/logex/src/FormattedGraph.cpp
+++ b/compiler/luci/logex/src/FormattedGraph.cpp
@@ -146,7 +146,9 @@ std::string circle_opname(uint32_t opnum)
#define CIRCLE_NODE(OPCODE, CLASS) \
case luci::CircleOpcode::OPCODE: \
return prefix + #OPCODE;
+#define CIRCLE_VNODE CIRCLE_NODE
#include <luci/IR/CircleNodes.lst>
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
default:
break;
@@ -175,7 +177,9 @@ protected:
s.state(locop::NodeSummary::State::PartiallyKnown); \
return true; \
}
+#define CIRCLE_VNODE CIRCLE_NODE
#include <luci/IR/CircleNodes.lst>
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
protected:
@@ -205,6 +209,7 @@ private:
IMPLEMENT(luci::CircleAveragePool2D)
IMPLEMENT(luci::CircleBatchMatMul)
IMPLEMENT(luci::CircleBatchToSpaceND)
+ IMPLEMENT(luci::CircleBidirectionalSequenceLSTM)
IMPLEMENT(luci::CircleCast)
IMPLEMENT(luci::CircleCeil)
IMPLEMENT(luci::CircleConcatenation)
@@ -219,6 +224,7 @@ private:
IMPLEMENT(luci::CircleElu)
IMPLEMENT(luci::CircleExp)
IMPLEMENT(luci::CircleExpandDims)
+ IMPLEMENT(luci::CircleFakeQuant)
IMPLEMENT(luci::CircleFill)
IMPLEMENT(luci::CircleFloor)
IMPLEMENT(luci::CircleFloorDiv)
@@ -433,6 +439,96 @@ bool summary_node(const locop::SymbolTable *tbl, const luci::CircleBatchToSpaceN
return true;
}
+bool summary_node(const locop::SymbolTable *tbl, const luci::CircleBidirectionalSequenceLSTM *node,
+ locop::NodeSummary &s)
+{
+ s.args().append("input", tbl->lookup(node->input()));
+
+ s.args().append("fw_input_to_input_weights", tbl->lookup(node->fw_input_to_input_weights()));
+ s.args().append("fw_input_to_forget_weights", tbl->lookup(node->fw_input_to_forget_weights()));
+ s.args().append("fw_input_to_cell_weights", tbl->lookup(node->fw_input_to_cell_weights()));
+ s.args().append("fw_input_to_output_weights", tbl->lookup(node->fw_input_to_output_weights()));
+
+ s.args().append("fw_recurrent_to_input_weights",
+ tbl->lookup(node->fw_recurrent_to_input_weights()));
+ s.args().append("fw_recurrent_to_forget_weights",
+ tbl->lookup(node->fw_recurrent_to_forget_weights()));
+ s.args().append("fw_recurrent_to_cell_weights",
+ tbl->lookup(node->fw_recurrent_to_cell_weights()));
+ s.args().append("fw_recurrent_to_output_weights",
+ tbl->lookup(node->fw_recurrent_to_output_weights()));
+
+ s.args().append("fw_cell_to_input_weights", tbl->lookup(node->fw_cell_to_input_weights()));
+ s.args().append("fw_cell_to_forget_weights", tbl->lookup(node->fw_cell_to_forget_weights()));
+ s.args().append("fw_cell_to_output_weights", tbl->lookup(node->fw_cell_to_output_weights()));
+
+ s.args().append("fw_input_gate_bias", tbl->lookup(node->fw_input_gate_bias()));
+ s.args().append("fw_forget_gate_bias", tbl->lookup(node->fw_forget_gate_bias()));
+ s.args().append("fw_cell_gate_bias", tbl->lookup(node->fw_cell_gate_bias()));
+ s.args().append("fw_output_gate_bias", tbl->lookup(node->fw_output_gate_bias()));
+
+ s.args().append("fw_projection_weights", tbl->lookup(node->fw_projection_weights()));
+ s.args().append("fw_projection_bias", tbl->lookup(node->fw_projection_bias()));
+
+ s.args().append("bw_input_to_input_weights", tbl->lookup(node->bw_input_to_input_weights()));
+ s.args().append("bw_input_to_forget_weights", tbl->lookup(node->bw_input_to_forget_weights()));
+ s.args().append("bw_input_to_cell_weights", tbl->lookup(node->bw_input_to_cell_weights()));
+ s.args().append("bw_input_to_output_weights", tbl->lookup(node->bw_input_to_output_weights()));
+
+ s.args().append("bw_recurrent_to_input_weights",
+ tbl->lookup(node->bw_recurrent_to_input_weights()));
+ s.args().append("bw_recurrent_to_forget_weights",
+ tbl->lookup(node->bw_recurrent_to_forget_weights()));
+ s.args().append("bw_recurrent_to_cell_weights",
+ tbl->lookup(node->bw_recurrent_to_cell_weights()));
+ s.args().append("bw_recurrent_to_output_weights",
+ tbl->lookup(node->bw_recurrent_to_output_weights()));
+
+ s.args().append("bw_cell_to_input_weights", tbl->lookup(node->bw_cell_to_input_weights()));
+ s.args().append("bw_cell_to_forget_weights", tbl->lookup(node->bw_cell_to_forget_weights()));
+ s.args().append("bw_cell_to_output_weights", tbl->lookup(node->bw_cell_to_output_weights()));
+
+ s.args().append("bw_input_gate_bias", tbl->lookup(node->bw_input_gate_bias()));
+ s.args().append("bw_forget_gate_bias", tbl->lookup(node->bw_forget_gate_bias()));
+ s.args().append("bw_cell_gate_bias", tbl->lookup(node->bw_cell_gate_bias()));
+ s.args().append("bw_output_gate_bias", tbl->lookup(node->bw_output_gate_bias()));
+
+ s.args().append("bw_projection_weights", tbl->lookup(node->bw_projection_weights()));
+ s.args().append("bw_projection_bias", tbl->lookup(node->bw_projection_bias()));
+
+ s.args().append("fw_activation_state", tbl->lookup(node->fw_activation_state()));
+ s.args().append("fw_cell_state", tbl->lookup(node->fw_cell_state()));
+ s.args().append("bw_activation_state", tbl->lookup(node->bw_activation_state()));
+ s.args().append("bw_cell_state", tbl->lookup(node->bw_cell_state()));
+
+ s.args().append("auxillary_input", tbl->lookup(node->auxillary_input()));
+ s.args().append("fw_auxillary_input_to_input_weights",
+ tbl->lookup(node->fw_auxillary_input_to_input_weights()));
+ s.args().append("fw_auxillary_input_to_forget_weights",
+ tbl->lookup(node->fw_auxillary_input_to_forget_weights()));
+ s.args().append("fw_auxillary_input_to_cell_weights",
+ tbl->lookup(node->fw_auxillary_input_to_cell_weights()));
+ s.args().append("fw_auxillary_input_to_output_weights",
+ tbl->lookup(node->fw_auxillary_input_to_output_weights()));
+ s.args().append("bw_auxillary_input_to_input_weights",
+ tbl->lookup(node->bw_auxillary_input_to_input_weights()));
+ s.args().append("bw_auxillary_input_to_forget_weights",
+ tbl->lookup(node->bw_auxillary_input_to_forget_weights()));
+ s.args().append("bw_auxillary_input_to_cell_weights",
+ tbl->lookup(node->bw_auxillary_input_to_cell_weights()));
+ s.args().append("bw_auxillary_input_to_output_weights",
+ tbl->lookup(node->bw_auxillary_input_to_output_weights()));
+
+ s.args().append("cell_clip", to_str(node->cell_clip()));
+ s.args().append("proj_clip", to_str(node->proj_clip()));
+ s.args().append("merge_outputs", to_str(node->merge_outputs()));
+ s.args().append("time_major", to_str(node->time_major()));
+ s.args().append("asymmetric_quantize_inputs", to_str(node->asymmetric_quantize_inputs()));
+
+ s.state(locop::NodeSummary::State::Complete);
+ return true;
+}
+
bool summary_node(const locop::SymbolTable *tbl, const luci::CircleCast *node,
locop::NodeSummary &s)
{
@@ -521,6 +617,18 @@ bool summary_node(const locop::SymbolTable *tbl, const luci::CircleExpandDims *n
return true;
}
+bool summary_node(const locop::SymbolTable *tbl, const luci::CircleFakeQuant *node,
+ locop::NodeSummary &s)
+{
+ s.args().append("inputs", tbl->lookup(node->inputs()));
+ s.args().append("min", pepper::str(node->min()));
+ s.args().append("max", pepper::str(node->max()));
+ s.args().append("num_bits", pepper::str(node->num_bits()));
+ s.args().append("narrow_range", node->narrow_range() ? "true" : "false");
+ s.state(locop::NodeSummary::State::Complete);
+ return true;
+}
+
bool summary_node(const locop::SymbolTable *tbl, const luci::CircleFill *node,
locop::NodeSummary &s)
{
@@ -1189,7 +1297,9 @@ bool CircleNodeSummaryBuilderBase::build(const loco::Node *node, locop::NodeSumm
s.comments().append("Mem = " + ptr_to_str(node)); \
return summary(dynamic_cast<const CLASS *>(node), s); \
}
+#define CIRCLE_VNODE CIRCLE_NODE
#include <luci/IR/CircleNodes.lst>
+#undef CIRCLE_VNODE
#undef CIRCLE_NODE
return false;
@@ -1238,6 +1348,12 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleBatchToSpaceND *node,
return summary_node(tbl(), node, s);
}
+bool CircleNodeSummaryBuilder::summary(const luci::CircleBidirectionalSequenceLSTM *node,
+ locop::NodeSummary &s) const
+{
+ return summary_node(tbl(), node, s);
+}
+
bool CircleNodeSummaryBuilder::summary(const luci::CircleCast *node, locop::NodeSummary &s) const
{
return summary_node(tbl(), node, s);
@@ -1314,6 +1430,17 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleExpandDims *node,
return summary_node(tbl(), node, s);
}
+bool CircleNodeSummaryBuilder::summary(const luci::CircleFakeQuant *node,
+ locop::NodeSummary &s) const
+{
+ return summary_node(tbl(), node, s);
+}
+
+bool CircleNodeSummaryBuilder::summary(const luci::CircleFill *node, locop::NodeSummary &s) const
+{
+ return summary_node(tbl(), node, s);
+}
+
bool CircleNodeSummaryBuilder::summary(const luci::CircleFloor *node, locop::NodeSummary &s) const
{
return use_x(tbl(), node, s);
@@ -1331,11 +1458,6 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleFloorMod *node,
return use_xy(tbl(), node, s);
}
-bool CircleNodeSummaryBuilder::summary(const luci::CircleFill *node, locop::NodeSummary &s) const
-{
- return summary_node(tbl(), node, s);
-}
-
bool CircleNodeSummaryBuilder::summary(const luci::CircleFullyConnected *node,
locop::NodeSummary &s) const
{
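The CIRCLE_VNODE additions rely on the X-macro pattern: a single include of CircleNodes.lst expands into one case label or one summary method per entry. A self-contained sketch of the same technique; NODES_LST and the generated function names are invented for illustration:

    #include <cstdio>

    // Stand-in for luci/IR/CircleNodes.lst: builtin (NODE) and virtual (VNODE) entries.
    #define NODES_LST \
      NODE(ADD)       \
      VNODE(CIRCLEINPUT)

    // Generate one function per entry; virtual nodes reuse the same expansion,
    // mirroring "#define CIRCLE_VNODE CIRCLE_NODE" above.
    #define NODE(OP) \
      void summary_##OP(void) { std::printf("%s\n", #OP); }
    #define VNODE NODE
    NODES_LST
    #undef VNODE
    #undef NODE
    #undef NODES_LST

    int main(void)
    {
      summary_ADD();         // prints "ADD"
      summary_CIRCLEINPUT(); // prints "CIRCLEINPUT"
      return 0;
    }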
diff --git a/compiler/luci/partition/CMakeLists.txt b/compiler/luci/partition/CMakeLists.txt
new file mode 100644
index 000000000..838642b6e
--- /dev/null
+++ b/compiler/luci/partition/CMakeLists.txt
@@ -0,0 +1,29 @@
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+list(REMOVE_ITEM SOURCES ${TESTS})
+
+add_library(luci_partition SHARED ${SOURCES})
+target_include_directories(luci_partition PRIVATE src)
+target_include_directories(luci_partition PUBLIC include)
+target_link_libraries(luci_partition PUBLIC luci_lang)
+target_link_libraries(luci_partition PRIVATE luci_service)
+target_link_libraries(luci_partition PRIVATE luci_log)
+target_link_libraries(luci_partition PRIVATE luci_logex)
+target_link_libraries(luci_partition PRIVATE mio_circle)
+target_link_libraries(luci_partition PRIVATE nncc_common)
+target_link_libraries(luci_partition PRIVATE oops)
+
+install(TARGETS luci_partition DESTINATION lib)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(luci_partition_test ${TESTS})
+target_include_directories(luci_partition_test PRIVATE src)
+target_link_libraries(luci_partition_test luci_lang)
+target_link_libraries(luci_partition_test luci_partition)
+target_link_libraries(luci_partition_test luci_testhelper)
+target_link_libraries(luci_partition_test luci_service)
diff --git a/compiler/luci/partition/README.md b/compiler/luci/partition/README.md
new file mode 100644
index 000000000..40a46bc56
--- /dev/null
+++ b/compiler/luci/partition/README.md
@@ -0,0 +1,4 @@
+# luci-partition
+
+`luci-partition` provides partitioning of a model into two or more sub-models,
+plus a connection configuration, such that the partitioned models produce the
+same computational results as the original.
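For a concrete picture, a partition description in the style of the `*.part` files under compiler/circle-part-value-test/parts might look as follows; treat the exact keys as an assumption drawn from those files:

    [partition]
    backends=cpu,acl_cl
    default=cpu
    comply=opcode

    [OPCODE]
    DIV=acl_cl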
diff --git a/compiler/luci/partition/include/luci/Partition.h b/compiler/luci/partition/include/luci/Partition.h
new file mode 100644
index 000000000..cf90e448b
--- /dev/null
+++ b/compiler/luci/partition/include/luci/Partition.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_H__
+#define __LUCI_PARTITION_H__
+
+#include <luci/IR/Module.h>
+
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace luci
+{
+
+/**
+ * @brief PartitionTable holds partition information
+ */
+struct PartitionTable
+{
+ std::vector<std::string> groups;
+ std::string default_group;
+
+ // assign by opcode name: OPCODENAME=group
+ std::unordered_map<std::string /* OPCODENAME */, std::string /* group */> byopcodes;
+
+ // TODO add assign by OP name
+};
+
+/**
+ * @brief PartedModule holds partitioned module and group name
+ */
+struct PartedModule
+{
+ std::unique_ptr<Module> module;
+ // group name used to partition this module
+ std::string group;
+
+ // unique name(filename) of this module
+ std::string name;
+};
+
+struct PartedModules
+{
+ std::vector<PartedModule> pmodules;
+
+ // TODO add connections ?
+};
+
+/**
+ * @brief Method to do partitioning of a Module with a PartitionTable to produce PartedModules
+ */
+PartedModules apply(Module *module, const PartitionTable &partition);
+
+} // namespace luci
+
+#endif // __LUCI_PARTITION_H__
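A minimal usage sketch for the API above, assuming a valid luci::Module pointer; the group names and the opcode assignment are illustrative only:

    #include <luci/Partition.h>

    luci::PartedModules partition_sketch(luci::Module *module)
    {
      luci::PartitionTable table;
      table.groups = {"cpu", "npu"};      // illustrative group names
      table.default_group = "cpu";
      table.byopcodes["CONV_2D"] = "npu"; // assign by opcode name: OPCODENAME=group

      // apply() splits the module; each PartedModule carries one sub-model
      // plus its group and unique name.
      return luci::apply(module, table);
    }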
diff --git a/compiler/luci/partition/src/CircleOpCode.cpp b/compiler/luci/partition/src/CircleOpCode.cpp
new file mode 100644
index 000000000..86694fa40
--- /dev/null
+++ b/compiler/luci/partition/src/CircleOpCode.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleOpCode.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+
+#include <mio/circle/schema_generated.h>
+
+namespace
+{
+
+using namespace luci;
+using namespace circle;
+
+class QueryOpCode final : public CircleNodeVisitor<BuiltinOperator>
+{
+public:
+// NOTE only circle operator may have BuiltinOperator_XXX
+#define CIRCLE_NODE(OPCODE, CIRCLE_CLASS) \
+ BuiltinOperator visit(const CIRCLE_CLASS *) final { return BuiltinOperator_##OPCODE; }
+#define CIRCLE_VNODE(OPCODE, CIRCLE_CLASS)
+
+#include "luci/IR/CircleNodes.lst"
+#undef CIRCLE_VNODE
+#undef CIRCLE_NODE
+
+ // NOTE only builtin operators should be called (NOT virtual nodes)
+};
+
+class QueryCircleName final : public luci::CircleNodeVisitor<const char *>
+{
+public:
+// NOTE provide names for circle virtual nodes
+#define CIRCLE_NODE(OPCODE, CIRCLE_CLASS)
+#define CIRCLE_VNODE(OPCODE, CIRCLE_CLASS) \
+ const char *visit(const CIRCLE_CLASS *) final { return #OPCODE; }
+
+#include "luci/IR/CircleNodes.lst"
+#undef CIRCLE_VNODE
+#undef CIRCLE_NODE
+
+ // default is null
+ const char *visit(const luci::CircleNode *) final { return nullptr; }
+};
+
+} // namespace
+
+namespace luci
+{
+
+std::string opcode_name(const CircleNode *node)
+{
+ QueryCircleName qcn;
+ auto cname = node->accept(&qcn);
+ if (cname != nullptr)
+ return std::string(cname);
+
+ QueryOpCode qoc;
+ auto opcode = node->accept(&qoc);
+ auto name = circle::EnumNameBuiltinOperator(opcode);
+ return std::string(name);
+}
+
+} // namespace luci
diff --git a/compiler/luci/lang/src/CircleShapeSignature.cpp b/compiler/luci/partition/src/CircleOpCode.h
index 970000203..d17b09261 100644
--- a/compiler/luci/lang/src/CircleShapeSignature.cpp
+++ b/compiler/luci/partition/src/CircleOpCode.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,21 +14,18 @@
* limitations under the License.
*/
-#include "luci/IR/CircleShapeSignature.h"
+#ifndef __LUCI_PARTITION_CIRCLE_OP_CODE_H__
+#define __LUCI_PARTITION_CIRCLE_OP_CODE_H__
-namespace luci
-{
+#include <luci/IR/CircleNode.h>
-bool operator==(const ShapeSignature &lhs, const ShapeSignature &rhs)
-{
- if (lhs.rank() != rhs.rank())
- return false;
+#include <string>
- for (uint32_t i = 0; i < lhs.rank(); ++i)
- if (lhs.dim(i) != rhs.dim(i))
- return false;
+namespace luci
+{
- return true;
-}
+std::string opcode_name(const CircleNode *node);
} // namespace luci
+
+#endif // __LUCI_PARTITION_CIRCLE_OP_CODE_H__
diff --git a/compiler/luci/partition/src/CircleOpCode.test.cpp b/compiler/luci/partition/src/CircleOpCode.test.cpp
new file mode 100644
index 000000000..d2524a2ef
--- /dev/null
+++ b/compiler/luci/partition/src/CircleOpCode.test.cpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleOpCode.h"
+
+// NOTE any node will do for testing
+#include <luci/IR/Nodes/CircleSqrt.h>
+
+#include <gtest/gtest.h>
+
+TEST(CircleOpCodeTest, name)
+{
+ auto g = loco::make_graph();
+ auto node = g->nodes()->create<luci::CircleSqrt>();
+
+ auto name = luci::opcode_name(node);
+ ASSERT_EQ(name, "SQRT");
+}
diff --git a/compiler/luci/partition/src/ConnectNode.cpp b/compiler/luci/partition/src/ConnectNode.cpp
new file mode 100644
index 000000000..336be7c57
--- /dev/null
+++ b/compiler/luci/partition/src/ConnectNode.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+#include <oops/UserExn.h>
+
+namespace luci
+{
+
+void clone_connect(const luci::CircleNode *node, luci::CloneContext &clonecontext)
+{
+ ConnectNode cn(clonecontext);
+ node->accept(&cn);
+}
+
+luci::CircleNode *ConnectNode::find_clone(const luci::CircleNode *node)
+{
+ auto it = _clonecontext.find(node);
+ if (it == _clonecontext.end())
+ throw oops::UserExn("Invalid node in ConnectNode");
+ return it->second;
+}
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/ConnectNode.h b/compiler/luci/partition/src/ConnectNode.h
new file mode 100644
index 000000000..017c587e5
--- /dev/null
+++ b/compiler/luci/partition/src/ConnectNode.h
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_CONNECT_NODE_H__
+#define __LUCI_PARTITION_CONNECT_NODE_H__
+
+#include <luci/IR/CircleNode.h>
+#include <luci/IR/CircleNodeVisitor.h>
+
+namespace luci
+{
+
+/**
+ * @note MapNode2Clone is used as a map from original node to cloned node
+ * to find input of a cloned node
+ *       to find the inputs of a cloned node
+ * (Original) (Clone)
+ *
+ * [A] [A']
+ * | [B] | [B']
+ * | | | |
+ * \ / \ /
+ * [C] [C']
+ *
+ * From the view of [C'] we need to find [A'] and [B']. Since we know [C]
+ * from [C'], we can take the inputs of [C], namely [A] and [B], and then
+ * look up [A]->[A'] and [B]->[B'] in the map.
+ */
+using MapNode2Clone = std::map<const CircleNode * /* ORG */, CircleNode * /* CLONE */>;
+
+struct CloneContext
+{
+ std::pair<MapNode2Clone::iterator, bool> emplace(const CircleNode *org, CircleNode *clone)
+ {
+ return node2clone.emplace(org, clone);
+ }
+ MapNode2Clone::iterator find(const CircleNode *org) { return node2clone.find(org); }
+ MapNode2Clone::iterator end(void) { return node2clone.end(); }
+
+ MapNode2Clone node2clone;
+};
+
+class ConnectNode final : public luci::CircleNodeVisitor<void>
+{
+public:
+  ConnectNode(luci::CloneContext &clonecontext) : _clonecontext(clonecontext) {}
+
+public:
+ // void visit(const luci::CircleAbs *) final;
+ void visit(const luci::CircleAdd *) final;
+ // void visit(const luci::CircleAddN *) final;
+ // void visit(const luci::CircleArgMax *) final;
+ // void visit(const luci::CircleArgMin *) final;
+ // void visit(const luci::CircleAveragePool2D *) final;
+ // void visit(const luci::CircleBatchMatMul *) final;
+ // void visit(const luci::CircleBatchToSpaceND *) final;
+ // void visit(const luci::CircleCast *) final;
+ // void visit(const luci::CircleCeil *) final;
+ // void visit(const luci::CircleConcatenation *) final;
+ void visit(const luci::CircleConst *) final;
+ // void visit(const luci::CircleConv2D *) final;
+ // void visit(const luci::CircleCos *) final;
+ // void visit(const luci::CircleCustom *) final;
+ // void visit(const luci::CircleDepthToSpace *) final;
+ // void visit(const luci::CircleDepthwiseConv2D *) final;
+ // void visit(const luci::CircleDequantize *) final;
+ void visit(const luci::CircleDiv *) final;
+ // void visit(const luci::CircleElu *) final;
+ // void visit(const luci::CircleEqual *) final;
+ // void visit(const luci::CircleExp *) final;
+ // void visit(const luci::CircleExpandDims *) final;
+ // void visit(const luci::CircleFakeQuant *) final;
+ // void visit(const luci::CircleFill *) final;
+ // void visit(const luci::CircleFloor *) final;
+ // void visit(const luci::CircleFloorDiv *) final;
+ // void visit(const luci::CircleFloorMod *) final;
+ // void visit(const luci::CircleFullyConnected *) final;
+ // void visit(const luci::CircleGather *) final;
+ // void visit(const luci::CircleGatherNd *) final;
+ // void visit(const luci::CircleGreater *) final;
+ // void visit(const luci::CircleGreaterEqual *) final;
+ // void visit(const luci::CircleIf *) final;
+ // void visit(const luci::CircleL2Normalize *) final;
+ // void visit(const luci::CircleL2Pool2D *) final;
+ // void visit(const luci::CircleLeakyRelu *) final;
+ // void visit(const luci::CircleLess *) final;
+ // void visit(const luci::CircleLessEqual *) final;
+ // void visit(const luci::CircleLocalResponseNormalization *) final;
+ // void visit(const luci::CircleLog *) final;
+ // void visit(const luci::CircleLogicalAnd *) final;
+ // void visit(const luci::CircleLogicalNot *) final;
+ // void visit(const luci::CircleLogicalOr *) final;
+ // void visit(const luci::CircleLogistic *) final;
+ // void visit(const luci::CircleLogSoftmax *) final;
+ // void visit(const luci::CircleMatrixDiag *) final;
+ // void visit(const luci::CircleMatrixSetDiag *) final;
+ // void visit(const luci::CircleMaximum *) final;
+ // void visit(const luci::CircleMaxPool2D *) final;
+ void visit(const luci::CircleMean *) final;
+ // void visit(const luci::CircleMinimum *) final;
+ // void visit(const luci::CircleMirrorPad *) final;
+ void visit(const luci::CircleMul *) final;
+ // void visit(const luci::CircleNeg *) final;
+ // void visit(const luci::CircleNonMaxSuppressionV4 *) final;
+ // void visit(const luci::CircleNonMaxSuppressionV5 *) final;
+ // void visit(const luci::CircleNotEqual *) final;
+ // void visit(const luci::CircleOneHot *) final;
+ // void visit(const luci::CirclePack *) final;
+ // void visit(const luci::CirclePad *) final;
+ // void visit(const luci::CirclePadV2 *) final;
+ void visit(const luci::CirclePow *) final;
+ // void visit(const luci::CirclePRelu *) final;
+ // void visit(const luci::CircleRange *) final;
+ // void visit(const luci::CircleRank *) final;
+ // void visit(const luci::CircleReduceAny *) final;
+ // void visit(const luci::CircleReduceMax *) final;
+ // void visit(const luci::CircleReduceMin *) final;
+ // void visit(const luci::CircleReduceProd *) final;
+ // void visit(const luci::CircleRelu *) final;
+ // void visit(const luci::CircleRelu6 *) final;
+ // void visit(const luci::CircleReluN1To1 *) final;
+ // void visit(const luci::CircleReshape *) final;
+ // void visit(const luci::CircleResizeBilinear *) final;
+ // void visit(const luci::CircleResizeNearestNeighbor *) final;
+ // void visit(const luci::CircleReverseSequence *) final;
+ // void visit(const luci::CircleReverseV2 *) final;
+ // void visit(const luci::CircleRound *) final;
+ void visit(const luci::CircleRsqrt *) final;
+ // void visit(const luci::CircleScatterNd *) final;
+ // void visit(const luci::CircleSegmentSum *) final;
+ // void visit(const luci::CircleSelect *) final;
+ // void visit(const luci::CircleSelectV2 *) final;
+ // void visit(const luci::CircleShape *) final;
+ // void visit(const luci::CircleSin *) final;
+ // void visit(const luci::CircleSlice *) final;
+ // void visit(const luci::CircleSoftmax *) final;
+ // void visit(const luci::CircleSpaceToBatchND *) final;
+ // void visit(const luci::CircleSpaceToDepth *) final;
+ // void visit(const luci::CircleSparseToDense *) final;
+ // void visit(const luci::CircleSplit *) final;
+ // void visit(const luci::CircleSplitV *) final;
+ void visit(const luci::CircleSqrt *) final;
+ // void visit(const luci::CircleSquare *) final;
+ void visit(const luci::CircleSquaredDifference *) final;
+ // void visit(const luci::CircleSqueeze *) final;
+ // void visit(const luci::CircleStridedSlice *) final;
+ void visit(const luci::CircleSub *) final;
+ // void visit(const luci::CircleSum *) final;
+ // void visit(const luci::CircleTanh *) final;
+ // void visit(const luci::CircleTile *) final;
+ // void visit(const luci::CircleTopKV2 *) final;
+ // void visit(const luci::CircleTranspose *) final;
+ // void visit(const luci::CircleTransposeConv *) final;
+ // void visit(const luci::CircleUnidirectionalSequenceLSTM *) final;
+ // void visit(const luci::CircleUnique *) final;
+ // void visit(const luci::CircleUnpack *) final;
+ // void visit(const luci::CircleWhere *) final;
+ // void visit(const luci::CircleWhile *) final;
+ // void visit(const luci::CircleZerosLike *) final;
+
+ // Circle Only
+ // void visit(const luci::CircleBCQFullyConnected *) final;
+ // void visit(const luci::CircleBCQGather *) final;
+ // void visit(const luci::CircleInstanceNorm *) final;
+
+ // Virtual
+ // void visit(const luci::CircleCustomOut *) final;
+ // void visit(const luci::CircleIfOut *) final;
+ // void visit(const luci::CircleInput *) final;
+ // void visit(const luci::CircleNonMaxSuppressionV4Out *) final;
+ // void visit(const luci::CircleNonMaxSuppressionV5Out *) final;
+ // void visit(const luci::CircleOutput *) final;
+ // void visit(const luci::CircleOutputDummy *) final;
+ // void visit(const luci::CircleOutputExclude *) final;
+ // void visit(const luci::CircleSplitOut *) final;
+ // void visit(const luci::CircleSplitVOut *) final;
+ // void visit(const luci::CircleTopKV2Out *) final;
+ // void visit(const luci::CircleUniqueOut *) final;
+ // void visit(const luci::CircleUnpackOut *) final;
+ // void visit(const luci::CircleWhileOut *) final;
+
+public:
+ luci::CircleNode *find_clone(const luci::CircleNode *node);
+
+protected:
+ luci::CloneContext &_clonecontext;
+};
+
+/**
+ * @brief Connect a cloned node to the clones of its input nodes
+ */
+void clone_connect(const luci::CircleNode *node, luci::CloneContext &clonecontext);
+
+} // namespace luci
+
+#endif // __LUCI_PARTITION_CONNECT_NODE_H__
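A reduced, self-contained sketch of the lookup scheme described in the MapNode2Clone note, with plain pointers standing in for CircleNode and std::runtime_error standing in for oops::UserExn:

    #include <map>
    #include <stdexcept>

    struct Node { Node *input = nullptr; };
    using Map = std::map<const Node *, Node *>;

    // Mirrors ConnectNode::find_clone(): an unregistered node is an error.
    Node *find_clone(const Map &m, const Node *org)
    {
      auto it = m.find(org);
      if (it == m.end())
        throw std::runtime_error("Invalid node in ConnectNode");
      return it->second;
    }

    // From clone [C'] we only know the original [C]; the input [A] of [C]
    // is resolved to [A'] through the map and wired into [C'].
    void connect(const Map &m, const Node *c, Node *c_clone)
    {
      c_clone->input = find_clone(m, c->input);
    }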
diff --git a/compiler/stdex/include/stdex/Memory.h b/compiler/luci/partition/src/ConnectNode.test.cpp
index 86751f073..a2009c654 100644
--- a/compiler/stdex/include/stdex/Memory.h
+++ b/compiler/luci/partition/src/ConnectNode.test.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,16 +14,6 @@
* limitations under the License.
*/
-#ifndef __STDEX_MEMORY_H__
-#define __STDEX_MEMORY_H__
+#include "ConnectNode.test.h"
-#include <memory>
-
-namespace stdex
-{
-
-using std::make_unique;
-
-} // namespace stdex
-
-#endif // __STDEX_MEMORY_H__
+// This file validates "ConnectNode.test.h". Please DO NOT remove this file.
diff --git a/compiler/luci/partition/src/ConnectNode.test.h b/compiler/luci/partition/src/ConnectNode.test.h
new file mode 100644
index 000000000..f7333ff99
--- /dev/null
+++ b/compiler/luci/partition/src/ConnectNode.test.h
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CONNECT_NODE_TEST_H__
+#define __CONNECT_NODE_TEST_H__
+
+#include "ConnectNode.h"
+
+#include <luci/Service/CircleNodeClone.h>
+#include <luci/test/TestIOGraph.h>
+
+#include <loco/IR/Graph.h>
+
+#include <initializer_list>
+#include <memory>
+#include <stdexcept>
+#include <vector>
+
+namespace luci
+{
+namespace test
+{
+
+template <unsigned N> class TestIsOGraph : public TestIsGraphlet<N>, public TestOGraphlet
+{
+public:
+ TestIsOGraph() = default;
+
+public:
+ virtual void init(const std::initializer_list<ShapeU32> shape_in, const ShapeU32 shape_out)
+ {
+ if (shape_in.size() != N)
+ throw std::runtime_error("Failed to init TestIsOGraph");
+
+ TestIsGraphlet<N>::init(TestIsGraphlet<N>::g(), shape_in);
+ TestOGraphlet::init(TestIsGraphlet<N>::g(), shape_out);
+ }
+};
+
+template <class T> class NodeGraphletT
+{
+public:
+ virtual void init(loco::Graph *g)
+ {
+ _node = g->nodes()->create<T>();
+ _node->dtype(loco::DataType::S32);
+ _node->name("node");
+ }
+
+ T *node(void) const { return _node; }
+
+protected:
+ T *_node{nullptr};
+};
+
+template <class T> class NodeIsGraphletT
+{
+public:
+ virtual void init(loco::Graph *g, uint32_t n)
+ {
+ _node = g->nodes()->create<T>(n);
+ _node->dtype(loco::DataType::S32);
+ _node->name("node");
+ }
+
+ T *node(void) const { return _node; }
+
+protected:
+ T *_node{nullptr};
+};
+
+/**
+ * @brief ConnectionTestHelper provides a common framework for testing
+ *        connections of cloned CircleNodes
+ */
+class ConnectionTestHelper
+{
+public:
+ ConnectionTestHelper() { _graph_clone = loco::make_graph(); }
+
+public:
+ template <unsigned N> void prepare_inputs(TestIsOGraph<N> *isograph)
+ {
+ assert(N == isograph->num_inputs());
+
+ for (uint32_t i = 0; i < N; ++i)
+ {
+ auto *input = _graph_clone->nodes()->create<luci::CircleInput>();
+ luci::copy_common_attributes(isograph->input(i), input);
+ _clonectx.emplace(isograph->input(i), input);
+ _inputs.push_back(input);
+ }
+ }
+
+ /**
+ * @note prepare_inputs_miss is for negative testing
+ */
+ template <unsigned N> void prepare_inputs_miss(TestIsOGraph<N> *isograph)
+ {
+ assert(N == isograph->num_inputs());
+
+ for (uint32_t i = 0; i < N; ++i)
+ {
+ auto *input = _graph_clone->nodes()->create<luci::CircleInput>();
+ luci::copy_common_attributes(isograph->input(i), input);
+ if (i != 0)
+ _clonectx.emplace(isograph->input(i), input);
+ _inputs.push_back(input);
+ }
+ }
+
+ void clone_connect(luci::CircleNode *node, luci::CircleNode *clone)
+ {
+ _clonectx.emplace(node, clone);
+
+ luci::clone_connect(node, _clonectx);
+ }
+
+public:
+ loco::Graph *graph_clone(void) { return _graph_clone.get(); }
+
+ luci::CircleNode *inputs(uint32_t idx) { return _inputs.at(idx); }
+
+protected:
+ luci::CloneContext _clonectx;
+ std::vector<luci::CircleInput *> _inputs;
+ std::unique_ptr<loco::Graph> _graph_clone; // graph for clones
+};
+
+} // namespace test
+} // namespace luci
+
+#endif // __CONNECT_NODE_TEST_H__
diff --git a/compiler/luci/partition/src/Nodes/CircleAdd.cpp b/compiler/luci/partition/src/Nodes/CircleAdd.cpp
new file mode 100644
index 000000000..d393997e9
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleAdd.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleAdd *node)
+{
+ auto *cloned = loco::must_cast<luci::CircleAdd *>(cn->find_clone(node));
+
+ luci::CircleNode *x = loco::must_cast<luci::CircleNode *>(node->x());
+ luci::CircleNode *y = loco::must_cast<luci::CircleNode *>(node->y());
+
+ cloned->x(cn->find_clone(x));
+ cloned->y(cn->find_clone(y));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleAdd *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleAdd.test.cpp b/compiler/luci/partition/src/Nodes/CircleAdd.test.cpp
new file mode 100644
index 000000000..e457b83d2
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleAdd.test.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+#include "ConnectNode.test.h"
+
+#include <luci/Service/CircleNodeClone.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class NodeGraphlet : public NodeGraphletT<luci::CircleAdd>
+{
+public:
+ NodeGraphlet() = default;
+
+public:
+ void init(loco::Graph *g) override
+ {
+ NodeGraphletT<luci::CircleAdd>::init(g);
+
+ _node->fusedActivationFunction(luci::FusedActFunc::RELU);
+ }
+};
+
+class TestNodeGraph : public TestIsOGraph<2>, public NodeGraphlet
+{
+public:
+ TestNodeGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIsOGraph<2>::init({shape, shape}, shape);
+ NodeGraphlet::init(g());
+
+ node()->x(input(0));
+ node()->y(input(1));
+
+ output()->from(node());
+ }
+};
+
+} // namespace
+
+TEST(ConnectNodeTest, connect_Add)
+{
+ TestNodeGraph tng;
+ tng.init({2, 3});
+
+ ConnectionTestHelper cth;
+ cth.prepare_inputs(&tng);
+
+ auto *node = tng.node();
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleAdd *>(node));
+
+ auto *clone = luci::clone_node(node, cth.graph_clone());
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleAdd *>(clone));
+
+ cth.clone_connect(node, clone);
+
+ ASSERT_EQ(2, clone->arity());
+ ASSERT_EQ(cth.inputs(0), clone->arg(0));
+ ASSERT_EQ(cth.inputs(1), clone->arg(1));
+}
+
+TEST(ConnectNodeTest, connect_Add_NEG)
+{
+ TestNodeGraph tng;
+ tng.init({2, 3});
+
+ ConnectionTestHelper cth;
+ cth.prepare_inputs_miss(&tng);
+
+ auto *node = tng.node();
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleAdd *>(node));
+
+ auto *clone = luci::clone_node(node, cth.graph_clone());
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleAdd *>(clone));
+
+ EXPECT_ANY_THROW(cth.clone_connect(node, clone));
+}
diff --git a/compiler/luci/service/src/Nodes/CircleInput.cpp b/compiler/luci/partition/src/Nodes/CircleConst.cpp
index 24eab7bd6..118cd8de2 100644
--- a/compiler/luci/service/src/Nodes/CircleInput.cpp
+++ b/compiler/luci/partition/src/Nodes/CircleConst.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,14 +14,14 @@
* limitations under the License.
*/
-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "ConnectNode.h"
namespace luci
{
-ShapeSignature ssinf::Algorithm::visit(const luci::CircleInput *node)
+void ConnectNode::visit(const luci::CircleConst *)
{
- return node->shape_signature();
+ // Nothing to do
}
} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleDiv.cpp b/compiler/luci/partition/src/Nodes/CircleDiv.cpp
new file mode 100644
index 000000000..480338542
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleDiv.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleDiv *node)
+{
+ auto *cloned = loco::must_cast<luci::CircleDiv *>(cn->find_clone(node));
+
+ luci::CircleNode *x = loco::must_cast<luci::CircleNode *>(node->x());
+ luci::CircleNode *y = loco::must_cast<luci::CircleNode *>(node->y());
+
+ cloned->x(cn->find_clone(x));
+ cloned->y(cn->find_clone(y));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleDiv *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleDiv.test.cpp b/compiler/luci/partition/src/Nodes/CircleDiv.test.cpp
new file mode 100644
index 000000000..226932337
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleDiv.test.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+#include "ConnectNode.test.h"
+
+#include <luci/Service/CircleNodeClone.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class NodeGraphlet : public NodeGraphletT<luci::CircleDiv>
+{
+public:
+ NodeGraphlet() = default;
+
+public:
+ void init(loco::Graph *g) override
+ {
+ NodeGraphletT<luci::CircleDiv>::init(g);
+
+ _node->fusedActivationFunction(luci::FusedActFunc::RELU);
+ }
+};
+
+class TestNodeGraph : public TestIsOGraph<2>, public NodeGraphlet
+{
+public:
+ TestNodeGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIsOGraph<2>::init({shape, shape}, shape);
+ NodeGraphlet::init(g());
+
+ node()->x(input(0));
+ node()->y(input(1));
+
+ output()->from(node());
+ }
+};
+
+} // namespace
+
+TEST(ConnectNodeTest, connect_Div)
+{
+ TestNodeGraph tng;
+ tng.init({2, 3});
+
+ ConnectionTestHelper cth;
+ cth.prepare_inputs(&tng);
+
+ auto *node = tng.node();
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleDiv *>(node));
+
+ auto *clone = luci::clone_node(node, cth.graph_clone());
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleDiv *>(clone));
+
+ cth.clone_connect(node, clone);
+
+ ASSERT_EQ(2, clone->arity());
+ ASSERT_EQ(cth.inputs(0), clone->arg(0));
+ ASSERT_EQ(cth.inputs(1), clone->arg(1));
+}
+
+TEST(ConnectNodeTest, connect_Div_NEG)
+{
+ TestNodeGraph tng;
+ tng.init({2, 3});
+
+ ConnectionTestHelper cth;
+ cth.prepare_inputs_miss(&tng);
+
+ auto *node = tng.node();
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleDiv *>(node));
+
+ auto *clone = luci::clone_node(node, cth.graph_clone());
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleDiv *>(clone));
+
+ EXPECT_ANY_THROW(cth.clone_connect(node, clone));
+}
diff --git a/compiler/luci/partition/src/Nodes/CircleMean.cpp b/compiler/luci/partition/src/Nodes/CircleMean.cpp
new file mode 100644
index 000000000..b634e5838
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleMean.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleMean *node)
+{
+ auto *cloned = loco::must_cast<luci::CircleMean *>(cn->find_clone(node));
+
+ luci::CircleNode *input = loco::must_cast<luci::CircleNode *>(node->input());
+ luci::CircleNode *reduction_indices =
+ loco::must_cast<luci::CircleNode *>(node->reduction_indices());
+
+ cloned->input(cn->find_clone(input));
+ cloned->reduction_indices(cn->find_clone(reduction_indices));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleMean *node) { connect(this, node); }
+
+} // namespace luci
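Note: unlike the binary ops above, CircleMean wires input() and reduction_indices() rather than x()/y(), and this commit adds no CircleMean.test.cpp. A minimal sketch of such a test, following the same ConnectionTestHelper pattern as the Add/Div/Mul tests (hypothetical, not part of this commit; in practice reduction_indices would usually be a CircleConst holding S32 axis indices rather than a second graph input):

    namespace
    {

    using namespace luci::test;

    class MeanNodeGraphlet : public NodeGraphletT<luci::CircleMean>
    {
    public:
      MeanNodeGraphlet() = default;
    };

    class MeanTestNodeGraph : public TestIsOGraph<2>, public MeanNodeGraphlet
    {
    public:
      void init(const ShapeU32 shape)
      {
        TestIsOGraph<2>::init({shape, shape}, shape);
        MeanNodeGraphlet::init(g());

        // CircleMean connects input/reduction_indices instead of x/y
        node()->input(input(0));
        node()->reduction_indices(input(1));

        output()->from(node());
      }
    };

    } // namespace

    TEST(ConnectNodeTest, connect_Mean_sketch)
    {
      MeanTestNodeGraph tng;
      tng.init({2, 3});

      ConnectionTestHelper cth;
      cth.prepare_inputs(&tng);

      auto *clone = luci::clone_node(tng.node(), cth.graph_clone());
      cth.clone_connect(tng.node(), clone);

      ASSERT_EQ(2, clone->arity());
      ASSERT_EQ(cth.inputs(0), clone->arg(0));
      ASSERT_EQ(cth.inputs(1), clone->arg(1));
    }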
diff --git a/compiler/luci/partition/src/Nodes/CircleMul.cpp b/compiler/luci/partition/src/Nodes/CircleMul.cpp
new file mode 100644
index 000000000..2cd2b4038
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleMul.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleMul *node)
+{
+ auto *cloned = loco::must_cast<luci::CircleMul *>(cn->find_clone(node));
+
+ luci::CircleNode *x = loco::must_cast<luci::CircleNode *>(node->x());
+ luci::CircleNode *y = loco::must_cast<luci::CircleNode *>(node->y());
+
+ cloned->x(cn->find_clone(x));
+ cloned->y(cn->find_clone(y));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleMul *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleMul.test.cpp b/compiler/luci/partition/src/Nodes/CircleMul.test.cpp
new file mode 100644
index 000000000..99cf0824d
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleMul.test.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+#include "ConnectNode.test.h"
+
+#include <luci/Service/CircleNodeClone.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class NodeGraphlet : public NodeGraphletT<luci::CircleMul>
+{
+public:
+ NodeGraphlet() = default;
+
+public:
+ void init(loco::Graph *g) override
+ {
+ NodeGraphletT<luci::CircleMul>::init(g);
+
+ _node->fusedActivationFunction(luci::FusedActFunc::RELU);
+ }
+};
+
+class TestNodeGraph : public TestIsOGraph<2>, public NodeGraphlet
+{
+public:
+ TestNodeGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIsOGraph<2>::init({shape, shape}, shape);
+ NodeGraphlet::init(g());
+
+ node()->x(input(0));
+ node()->y(input(1));
+
+ output()->from(node());
+ }
+};
+
+} // namespace
+
+TEST(ConnectNodeTest, connect_Mul)
+{
+ TestNodeGraph tng;
+ tng.init({2, 3});
+
+ ConnectionTestHelper cth;
+ cth.prepare_inputs(&tng);
+
+ auto *node = tng.node();
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleMul *>(node));
+
+ auto *clone = luci::clone_node(node, cth.graph_clone());
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleMul *>(clone));
+
+ cth.clone_connect(node, clone);
+
+ ASSERT_EQ(2, clone->arity());
+ ASSERT_EQ(cth.inputs(0), clone->arg(0));
+ ASSERT_EQ(cth.inputs(1), clone->arg(1));
+}
+
+TEST(ConnectNodeTest, connect_Mul_NEG)
+{
+ TestNodeGraph tng;
+ tng.init({2, 3});
+
+ ConnectionTestHelper cth;
+ cth.prepare_inputs_miss(&tng);
+
+ auto *node = tng.node();
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleMul *>(node));
+
+ auto *clone = luci::clone_node(node, cth.graph_clone());
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleMul *>(clone));
+
+ EXPECT_ANY_THROW(cth.clone_connect(node, clone));
+}
diff --git a/compiler/luci/partition/src/Nodes/CirclePow.cpp b/compiler/luci/partition/src/Nodes/CirclePow.cpp
new file mode 100644
index 000000000..fb180ee69
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CirclePow.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CirclePow *node)
+{
+ auto *cloned = loco::must_cast<luci::CirclePow *>(cn->find_clone(node));
+
+ luci::CircleNode *x = loco::must_cast<luci::CircleNode *>(node->x());
+ luci::CircleNode *y = loco::must_cast<luci::CircleNode *>(node->y());
+
+ cloned->x(cn->find_clone(x));
+ cloned->y(cn->find_clone(y));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CirclePow *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleRsqrt.cpp b/compiler/luci/partition/src/Nodes/CircleRsqrt.cpp
new file mode 100644
index 000000000..03e64aad0
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleRsqrt.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleRsqrt *node)
+{
+ auto *cloned = loco::must_cast<luci::CircleRsqrt *>(cn->find_clone(node));
+
+ luci::CircleNode *x = loco::must_cast<luci::CircleNode *>(node->x());
+
+ cloned->x(cn->find_clone(x));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleRsqrt *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleSqrt.cpp b/compiler/luci/partition/src/Nodes/CircleSqrt.cpp
new file mode 100644
index 000000000..f737aac8d
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleSqrt.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleSqrt *node)
+{
+ auto *cloned = loco::must_cast<luci::CircleSqrt *>(cn->find_clone(node));
+
+ luci::CircleNode *x = loco::must_cast<luci::CircleNode *>(node->x());
+
+ cloned->x(cn->find_clone(x));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleSqrt *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleSquaredDifference.cpp b/compiler/luci/partition/src/Nodes/CircleSquaredDifference.cpp
new file mode 100644
index 000000000..40dd31706
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleSquaredDifference.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleSquaredDifference *node)
+{
+ auto *cloned = loco::must_cast<luci::CircleSquaredDifference *>(cn->find_clone(node));
+
+ luci::CircleNode *x = loco::must_cast<luci::CircleNode *>(node->x());
+ luci::CircleNode *y = loco::must_cast<luci::CircleNode *>(node->y());
+
+ cloned->x(cn->find_clone(x));
+ cloned->y(cn->find_clone(y));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleSquaredDifference *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleSub.cpp b/compiler/luci/partition/src/Nodes/CircleSub.cpp
new file mode 100644
index 000000000..8ac294b7b
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleSub.cpp
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+namespace
+{
+
+void connect(luci::ConnectNode *cn, const luci::CircleSub *node)
+{
+ auto *cloned = loco::must_cast<luci::CircleSub *>(cn->find_clone(node));
+
+ luci::CircleNode *x = loco::must_cast<luci::CircleNode *>(node->x());
+ luci::CircleNode *y = loco::must_cast<luci::CircleNode *>(node->y());
+
+ cloned->x(cn->find_clone(x));
+ cloned->y(cn->find_clone(y));
+}
+
+} // namespace
+
+namespace luci
+{
+
+void ConnectNode::visit(const luci::CircleSub *node) { connect(this, node); }
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Nodes/CircleSub.test.cpp b/compiler/luci/partition/src/Nodes/CircleSub.test.cpp
new file mode 100644
index 000000000..7c0d83745
--- /dev/null
+++ b/compiler/luci/partition/src/Nodes/CircleSub.test.cpp
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ConnectNode.h"
+
+#include "ConnectNode.test.h"
+
+#include <luci/Service/CircleNodeClone.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class NodeGraphlet : public NodeGraphletT<luci::CircleSub>
+{
+public:
+ NodeGraphlet() = default;
+
+public:
+ void init(loco::Graph *g) override
+ {
+ NodeGraphletT<luci::CircleSub>::init(g);
+
+ _node->fusedActivationFunction(luci::FusedActFunc::RELU);
+ }
+};
+
+class TestNodeGraph : public TestIsOGraph<2>, public NodeGraphlet
+{
+public:
+ TestNodeGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIsOGraph<2>::init({shape, shape}, shape);
+ NodeGraphlet::init(g());
+
+ node()->x(input(0));
+ node()->y(input(1));
+
+ output()->from(node());
+ }
+};
+
+} // namespace
+
+TEST(ConnectNodeTest, connect_Sub)
+{
+ TestNodeGraph tng;
+ tng.init({2, 3});
+
+ ConnectionTestHelper cth;
+ cth.prepare_inputs(&tng);
+
+ auto *node = tng.node();
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleSub *>(node));
+
+ auto *clone = luci::clone_node(node, cth.graph_clone());
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleSub *>(clone));
+
+ cth.clone_connect(node, clone);
+
+ ASSERT_EQ(2, clone->arity());
+ ASSERT_EQ(cth.inputs(0), clone->arg(0));
+ ASSERT_EQ(cth.inputs(1), clone->arg(1));
+}
+
+TEST(ConnectNodeTest, connect_Sub_NEG)
+{
+ TestNodeGraph tng;
+ tng.init({2, 3});
+
+ ConnectionTestHelper cth;
+ cth.prepare_inputs_miss(&tng);
+
+ auto *node = tng.node();
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleSub *>(node));
+
+ auto *clone = luci::clone_node(node, cth.graph_clone());
+ ASSERT_NO_THROW(loco::must_cast<luci::CircleSub *>(clone));
+
+ EXPECT_ANY_THROW(cth.clone_connect(node, clone));
+}
diff --git a/compiler/luci/partition/src/Partition.cpp b/compiler/luci/partition/src/Partition.cpp
new file mode 100644
index 000000000..cc7106ca9
--- /dev/null
+++ b/compiler/luci/partition/src/Partition.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionIR.h"
+#include "PartitionIRDump.h"
+#include "PartitionPGroups.h"
+#include "PartitionMerge.h"
+#include "PartitionCleanup.h"
+#include "PartitionPModules.h"
+#include "PartitionPModulesDump.h"
+
+#include "luci/Partition.h"
+#include "luci/Log.h"
+
+#include <cassert>
+
+namespace luci
+{
+
+/**
+ * @brief This will return the partitioned modules (PartedModules) object
+ */
+PartedModules apply(Module *source, const PartitionTable &partition)
+{
+ assert(source != nullptr);
+
+ LOGGER(l);
+
+ auto pgroups = produce_pgroups(source, partition);
+ INFO(l) << "--- Partition Graph (1)------------------------";
+ INFO(l) << pgroups.get();
+
+ auto mpgroups = merge_pgroups(pgroups.get());
+ INFO(l) << "--- Partition Graph (2)------------------------";
+ INFO(l) << mpgroups.get();
+
+ remove_unused_inputoutputs(mpgroups.get(), source);
+ INFO(l) << "--- Partition Graph (3)------------------------";
+ INFO(l) << mpgroups.get();
+
+ auto pmodules = produce_pmodules(mpgroups.get());
+ INFO(l) << "--- Modules -----------------------------------";
+ INFO(l) << &pmodules;
+
+ return pmodules;
+}
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/Partition.test.cpp b/compiler/luci/partition/src/Partition.test.cpp
new file mode 100644
index 000000000..9e24c441c
--- /dev/null
+++ b/compiler/luci/partition/src/Partition.test.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Partition.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <luci/IR/Nodes/CircleSqrt.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class SqrtGraphlet
+{
+public:
+ SqrtGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 input_shape)
+ {
+ _sqrt = g->nodes()->create<luci::CircleSqrt>();
+ _sqrt->dtype(loco::DataType::S32);
+ _sqrt->name("sqrt");
+ }
+
+protected:
+ luci::CircleSqrt *_sqrt = nullptr;
+};
+
+class SqrtGraph : public TestIOGraph, public SqrtGraphlet
+{
+public:
+ SqrtGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIOGraph::init(shape, shape);
+ SqrtGraphlet::init(g(), shape);
+
+ _sqrt->x(input());
+
+ output()->from(_sqrt);
+ }
+};
+
+} // namespace
+
+TEST(PartitionTest, simple_apply)
+{
+ luci::Module module;
+
+ SqrtGraph g;
+ g.init({3, 3});
+ g.transfer_to(&module);
+
+ luci::PartitionTable pt;
+ pt.default_group = "A";
+
+ auto pms = apply(&module, pt);
+
+ ASSERT_EQ(1, pms.pmodules.size());
+
+ auto &pm = *pms.pmodules.begin();
+ ASSERT_NE(nullptr, pm.module->graph());
+}
diff --git a/compiler/luci/partition/src/PartitionCleanup.cpp b/compiler/luci/partition/src/PartitionCleanup.cpp
new file mode 100644
index 000000000..6545295df
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionCleanup.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionCleanup.h"
+
+#include "luci/Log.h"
+
+namespace
+{
+
+using CircleNodes = std::vector<luci::CircleNode *>;
+
+/**
+ * @note Outputs of the original source graph should be treated as used outputs
+ */
+void gather_graph_outputs(CircleNodes &nodes, const luci::Module *source)
+{
+ // graph outputs are treated as used
+ auto graph = source->graph();
+ for (uint32_t n = 0; n < graph->outputs()->size(); ++n)
+ {
+ auto output = luci::output_node(graph, n); // output is CircleOutput
+ assert(output != nullptr);
+
+ auto node = loco::must_cast<luci::CircleNode *>(output->from());
+
+ nodes.push_back(node);
+ }
+
+ // TODO add unused virtual outputs
+}
+
+/**
+ * @note If one PGroup requires an input, that input should be an output
+ * from another PGroup
+ */
+void gather_pgroups_outputs(CircleNodes &nodes, const luci::PGroups *pgroups)
+{
+ // input of a pgroup is used output
+ for (auto &pgroup : pgroups->pgroups)
+ {
+ for (auto input : pgroup->inputs)
+ {
+ nodes.push_back(input);
+ }
+ }
+}
+
+} // namespace
+
+namespace luci
+{
+
+void remove_unused_inputoutputs(luci::PGroups *pgroups, const luci::Module *source)
+{
+ assert(source != nullptr);
+ assert(pgroups != nullptr);
+
+ LOGGER(l);
+
+ // TODO support multiple subgraph
+ assert(source->size() == 1);
+
+ INFO(l) << "--- Cleanup unused inputs/outputs";
+
+ // remove input within same pgroup
+ for (auto &pgroup : pgroups->pgroups)
+ {
+ bool changed;
+ do
+ {
+ changed = false;
+ for (auto it = pgroup->inputs.begin(); it != pgroup->inputs.end(); ++it)
+ {
+ auto input = *it;
+ if (pgroups->pgroup_of(input) == pgroup.get())
+ {
+ INFO(l) << " Cleanup input " << input->name() << " from group " << pgroup->group;
+ pgroup->inputs.erase(it);
+ changed = true;
+ break;
+ }
+ // NOTE CircleConst can appear as an input type here, as it is registered
+ // as an input to some node and then (should be) merged.
+ // Remove this input if it is a CircleConst
+ if (dynamic_cast<CircleConst *>(input) != nullptr)
+ {
+ INFO(l) << " Cleanup CircleConst " << input->name() << " from group " << pgroup->group;
+ pgroup->inputs.erase(it);
+ changed = true;
+ break;
+ }
+ }
+ } while (changed);
+ }
+
+ // remove unused output(s)
+ // 'used_outputs' will hold actual used outputs for all PGroups
+ CircleNodes used_outputs;
+
+ gather_graph_outputs(used_outputs, source);
+ gather_pgroups_outputs(used_outputs, pgroups);
+
+ for (auto &pgroup : pgroups->pgroups)
+ {
+ bool changed;
+ do
+ {
+ changed = false;
+ for (auto it = pgroup->outputs.begin(); it != pgroup->outputs.end(); ++it)
+ {
+ auto output = *it;
+ auto oit = std::find(used_outputs.begin(), used_outputs.end(), output);
+ if (oit == used_outputs.end())
+ {
+ INFO(l) << " Cleanup output " << output->name() << " from group " << pgroup->group;
+ pgroup->outputs.erase(it);
+ changed = true;
+ break;
+ }
+ }
+ } while (changed);
+ }
+}
+
+} // namespace luci
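The two cleanup loops above use an erase-and-restart idiom: std::vector::erase invalidates the live iterator, so after each removal the inner for breaks out and the surrounding do/while restarts the scan from begin(). A self-contained illustration of the same idiom (generic, not luci-specific):

    #include <cassert>
    #include <vector>

    // Remove all negative values, restarting the scan after every erase
    // because 'it' is invalidated by vector::erase.
    void remove_negatives(std::vector<int> &v)
    {
      bool changed;
      do
      {
        changed = false;
        for (auto it = v.begin(); it != v.end(); ++it)
        {
          if (*it < 0)
          {
            v.erase(it); // 'it' is now invalid; do not keep iterating
            changed = true;
            break;
          }
        }
      } while (changed);
    }

    int main()
    {
      std::vector<int> v{1, -2, 3, -4};
      remove_negatives(v);
      assert(v.size() == 2);
      return 0;
    }

The usual single-pass alternative is it = v.erase(it); the restart form mirrors the code above and stays correct even when one removal affects what else should be removed.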
diff --git a/compiler/luci/partition/src/PartitionCleanup.h b/compiler/luci/partition/src/PartitionCleanup.h
new file mode 100644
index 000000000..f81b4a7cb
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionCleanup.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_CLEANUP_H__
+#define __LUCI_PARTITION_CLEANUP_H__
+
+#include "PartitionIR.h"
+
+#include <luci/IR/Module.h>
+
+namespace luci
+{
+
+/**
+ * @brief This will remove unused inputs/outputs in each pgroup of pgroups
+ */
+void remove_unused_inputoutputs(luci::PGroups *, const luci::Module *);
+
+} // namespace luci
+
+#endif // __LUCI_PARTITION_CLEANUP_H__
diff --git a/compiler/luci/partition/src/PartitionIR.cpp b/compiler/luci/partition/src/PartitionIR.cpp
new file mode 100644
index 000000000..ebd6b25fa
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionIR.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionIR.h"
+#include "CircleOpCode.h"
+
+#include "luci/Log.h"
+
+#include <cassert>
+#include <ostream>
+#include <iostream>
+
+namespace luci
+{
+
+std::unique_ptr<PGroups> PGroups::make_copy(void) const
+{
+ auto d_pgroups = std::make_unique<luci::PGroups>();
+
+ for (auto &s_pgroup : pgroups)
+ {
+ // make a copy of s_pgroup to d_pgroup
+ std::unique_ptr<luci::PGroup> d_pgroup = std::make_unique<luci::PGroup>();
+
+ d_pgroup->group = s_pgroup->group;
+ d_pgroup->id = s_pgroup->id;
+
+ for (auto &pnode : s_pgroup->pnodes)
+ {
+ auto pnodec = std::make_unique<luci::PNode>();
+ pnodec->node = pnode->node;
+ pnodec->group = pnode->group;
+ pnodec->pgroup = d_pgroup.get();
+ d_pgroup->pnodes.push_back(std::move(pnodec));
+ }
+
+ for (auto &input : s_pgroup->inputs)
+ d_pgroup->inputs.push_back(input);
+
+ for (auto &output : s_pgroup->outputs)
+ d_pgroup->outputs.push_back(output);
+
+ // copy node2group (runs once per source pgroup; idempotent, could be hoisted out of this loop)
+ for (auto it = node2group.begin(); it != node2group.end(); ++it)
+ d_pgroups->node2group[it->first] = it->second;
+
+ // build id2pgroup
+ d_pgroups->id2pgroup[d_pgroup->id] = d_pgroup.get();
+
+ d_pgroups->pgroups.push_back(std::move(d_pgroup));
+ // note: d_pgroup is now nullptr as it's moved
+ }
+
+ return d_pgroups;
+}
+
+std::string PGroups::group_of(luci::CircleNode *node) const
+{
+ assert(node != nullptr);
+
+ LOGGER(l);
+
+ auto it = node2group.find(node);
+ if (it == node2group.end())
+ {
+ INFO(l) << "PGroups::group_of " << node << "(" << node->name() << ") not found" << std::endl;
+ return "";
+ }
+ return it->second;
+}
+
+const PGroup *PGroups::pgroup_of(luci::CircleNode *node) const
+{
+ assert(node != nullptr);
+
+ for (auto &pgroup : pgroups)
+ {
+ for (auto &pnode : pgroup->pnodes)
+ {
+ if (node == pnode->node)
+ return pgroup.get();
+ }
+ }
+ // node may be a graph input (CircleInput)
+ return nullptr;
+}
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/PartitionIR.h b/compiler/luci/partition/src/PartitionIR.h
new file mode 100644
index 000000000..852e38cc0
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionIR.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_IR_H__
+#define __LUCI_PARTITION_IR_H__
+
+#include <luci/IR/CircleNodes.h>
+
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace luci
+{
+
+struct PGroup;
+
+/**
+ * @brief Partition Node holding a CircleNode with its group name
+ * @note node just points to the source luci::CircleNode, NOT the cloned node.
+ *       CloneContext is used to find the cloned node from the source node
+ */
+struct PNode
+{
+ const luci::CircleNode *node = nullptr;
+ std::string group;
+
+ const PGroup *pgroup = nullptr;
+};
+
+/**
+ * @brief Partition Group with Partition Nodes of the same group and its I/O nodes
+ */
+struct PGroup
+{
+ std::vector<std::unique_ptr<PNode>> pnodes;
+ std::string group;
+ uint32_t id = 0;
+
+ // I/O while partitioning
+ std::vector<luci::CircleNode *> inputs;
+ std::vector<luci::CircleNode *> outputs;
+};
+
+struct PGroups
+{
+ std::vector<std::unique_ptr<PGroup>> pgroups;
+
+ // node2group is to find group key from source node
+ std::map<const luci::CircleNode *, std::string> node2group;
+
+ // id2pgroup is to find a PGroup* from its pgroup id
+ std::map<uint32_t, PGroup *> id2pgroup;
+
+ // default group key for reference
+ std::string default_group;
+
+public:
+ /**
+ * @brief return a copy of PGroups
+ */
+ std::unique_ptr<PGroups> make_copy(void) const;
+
+ /**
+ * @brief return group key of node, empty string if not found
+ */
+ std::string group_of(luci::CircleNode *node) const;
+
+ /**
+ * @brief return holding pgroup of node, nullptr if not found
+ */
+ const PGroup *pgroup_of(luci::CircleNode *node) const;
+};
+
+} // namespace luci
+
+#endif // __LUCI_PARTITION_IR_H__
diff --git a/compiler/luci/partition/src/PartitionIR.test.cpp b/compiler/luci/partition/src/PartitionIR.test.cpp
new file mode 100644
index 000000000..4c051a96d
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionIR.test.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionIR.h"
+
+// NOTE any node will do for testing
+#include <luci/IR/Nodes/CircleAdd.h>
+
+#include <gtest/gtest.h>
+
+#include <memory>
+
+TEST(PartitionIRTest, PNode_ctor)
+{
+ auto g = loco::make_graph();
+ auto node = g->nodes()->create<luci::CircleAdd>();
+
+ luci::PNode pnode;
+ pnode.node = node;
+
+ ASSERT_NE(nullptr, pnode.node);
+ ASSERT_EQ(nullptr, pnode.pgroup);
+}
+
+// TODO add more tests with luci::PNode
+
+TEST(PartitionIRTest, PGroup_ctor)
+{
+ auto g = loco::make_graph();
+ auto node = g->nodes()->create<luci::CircleAdd>();
+
+ luci::PGroup pgroup;
+ auto pnode = std::make_unique<luci::PNode>();
+ pnode->node = node;
+
+ pgroup.pnodes.push_back(std::move(pnode));
+
+ ASSERT_NE(pgroup.pnodes.end(), pgroup.pnodes.begin());
+ ASSERT_EQ(0, pgroup.inputs.size());
+ ASSERT_EQ(0, pgroup.outputs.size());
+}
+
+// TODO add more tests with luci::PGroup
+
+TEST(PartitionIRTest, PGroups_ctor)
+{
+ auto g = loco::make_graph();
+ auto node = g->nodes()->create<luci::CircleAdd>();
+
+ auto pnode = std::make_unique<luci::PNode>();
+ pnode->node = node;
+
+ auto pgroup = std::make_unique<luci::PGroup>();
+ pgroup->pnodes.push_back(std::move(pnode));
+
+ luci::PGroups pgroups;
+ pgroups.pgroups.push_back(std::move(pgroup));
+
+ ASSERT_NE(pgroups.pgroups.end(), pgroups.pgroups.begin());
+}
+
+// TODO add more tests with luci::PGroups
diff --git a/compiler/luci/partition/src/PartitionIRDump.cpp b/compiler/luci/partition/src/PartitionIRDump.cpp
new file mode 100644
index 000000000..4f2c26800
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionIRDump.cpp
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionIRDump.h"
+
+#include "CircleOpCode.h"
+
+#include <iostream>
+
+namespace luci
+{
+
+void dump(std::ostream &os, const PNode *pnode)
+{
+ os << "PNode: " << pnode->group << ", " << pnode->node << ":" << luci::opcode_name(pnode->node)
+ << ":" << pnode->node->name() << std::endl;
+}
+
+void dump(std::ostream &os, const PGroup *pgroup)
+{
+ os << "--- PGroup: " << pgroup->group << std::endl;
+ os << "Input(s): ";
+ for (auto &node_in : pgroup->inputs)
+ os << node_in->name() << " ";
+ os << std::endl;
+ for (auto &pnode : pgroup->pnodes)
+ {
+ dump(os, pnode.get());
+ }
+ os << "Output(s): ";
+ for (auto &node_out : pgroup->outputs)
+ os << node_out->name() << " ";
+ os << std::endl;
+}
+
+void dump(std::ostream &os, const PGroups *pgroups)
+{
+ for (auto &pgroup : pgroups->pgroups)
+ {
+ dump(os, pgroup.get());
+ }
+ os << "--- Node2Group items: " << std::endl;
+ for (auto it = pgroups->node2group.begin(); it != pgroups->node2group.end(); ++it)
+ {
+ auto node = it->first;
+ auto group = it->second;
+ os << " Node: " << node << "(" << node->name() << "): " << group << std::endl;
+ }
+}
+
+} // namespace luci
+
+std::ostream &operator<<(std::ostream &os, const luci::PGroups *pgroups)
+{
+ luci::dump(os, pgroups);
+ return os;
+}
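Since operator<< is declared at global scope for const luci::PGroups *, the dump works with any std::ostream, not only the INFO(l) logger stream used in Partition.cpp. A one-line usage sketch, given a std::unique_ptr<luci::PGroups> named pgroups:

    std::cout << pgroups.get(); // prints each PGroup with its I/O, then the Node2Group map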
diff --git a/compiler/luci/partition/src/PartitionIRDump.h b/compiler/luci/partition/src/PartitionIRDump.h
new file mode 100644
index 000000000..8a4b3f579
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionIRDump.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_IR_DUMP_H__
+#define __LUCI_PARTITION_IR_DUMP_H__
+
+#include "PartitionIR.h"
+
+#include <iostream>
+
+namespace luci
+{
+
+void dump(std::ostream &os, const PNode *pnode);
+void dump(std::ostream &os, const PGroup *pgroup);
+void dump(std::ostream &os, const PGroups *pgroups);
+
+} // namespace luci
+
+std::ostream &operator<<(std::ostream &os, const luci::PGroups *pgroups);
+
+#endif // __LUCI_PARTITION_IR_DUMP_H__
diff --git a/compiler/luci/partition/src/PartitionMerge.cpp b/compiler/luci/partition/src/PartitionMerge.cpp
new file mode 100644
index 000000000..038fc2a0c
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionMerge.cpp
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionMerge.h"
+
+#include <algorithm>
+
+namespace
+{
+
+/**
+ * @brief return true if an output of pgroup_i is one of the inputs of pgroup
+ */
+bool is_input_of(const luci::PGroup *pgroup_i, const luci::PGroup *pgroup)
+{
+ for (auto *output : pgroup_i->outputs)
+ {
+ for (auto *input : pgroup->inputs)
+ {
+ if (input == output)
+ return true;
+ }
+ }
+ return false;
+}
+
+/**
+ * @brief return true if there is only one input or all the inputs have the same group
+ * @note pgroups is used to find the group of each input
+ */
+bool is_input_same(const luci::PGroup *pgroup, const luci::PGroups *pgroups)
+{
+ assert(pgroups != nullptr);
+ assert(pgroup != nullptr);
+
+ const luci::PGroup *input_pgroup = nullptr;
+ std::string group;
+ for (auto &input : pgroup->inputs)
+ {
+ auto input_group = pgroups->group_of(input);
+ // NOTE: all the nodes should be registered and the returned group should be valid.
+ // produce_pgroups() should ensure this.
+ // assert here to find if there is any problem with this.
+ assert(not input_group.empty());
+ if (input_group.empty())
+ input_group = pgroups->default_group;
+
+ if (group.empty())
+ group = input_group;
+ else
+ {
+ if (group != input_group)
+ return false;
+ }
+ // if there are multiple inputs, all the inputs should be in the same pgroup
+ // https://github.com/Samsung/ONE/issues/6230#issuecomment-801618150
+ // https://github.com/Samsung/ONE/issues/6230#issuecomment-801680531
+ auto pgroup_input = pgroups->pgroup_of(input);
+ if (pgroup_input != nullptr)
+ {
+ if (input_pgroup == nullptr)
+ input_pgroup = pgroup_input;
+ else
+ {
+ if (input_pgroup != pgroup_input)
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/**
+ * @brief merge pgroup into pgroup_i
+ * @note an output of pgroup_i should be an input of pgroup
+ */
+void merge_into(luci::PGroup *pgroup, luci::PGroup *pgroup_i)
+{
+ for (auto &pnode : pgroup->pnodes)
+ {
+ // update pgroup for this pnode
+ pnode->pgroup = pgroup_i;
+ assert(pnode->group == pgroup_i->group);
+
+ // we don't need to add this in topological order:
+ // all the nodes will be created first then connection will be held
+ pgroup_i->pnodes.push_back(std::move(pnode));
+ // note: pnode is now nullptr as it's moved into pgroup_i->pnodes
+ }
+
+ for (auto &input : pgroup->inputs)
+ {
+ // add inputs of pgroup to pgroup_i if not member of pgroup_i
+ bool found_in_pgroup_i = false;
+ for (auto &pnode : pgroup_i->pnodes)
+ {
+ if (input == pnode->node)
+ {
+ found_in_pgroup_i = true;
+ break;
+ }
+ }
+ // skip if this input is already in the inputs
+ auto fit = std::find(pgroup_i->inputs.begin(), pgroup_i->inputs.end(), input);
+ if (fit != pgroup_i->inputs.end())
+ {
+ found_in_pgroup_i = true;
+ }
+ // note: forcing found_in_pgroup_i to false here (for testing) would leave
+ // unnecessary inputs behind
+ if (not found_in_pgroup_i)
+ {
+ // this node input may be in another pgroup
+ pgroup_i->inputs.push_back(input);
+ }
+ }
+ // add outputs of pgroup to pgroup_i outputs if not already present
+ for (auto &output : pgroup->outputs)
+ {
+ auto it = std::find(pgroup_i->outputs.begin(), pgroup_i->outputs.end(), output);
+ if (it == pgroup_i->outputs.end())
+ {
+ pgroup_i->outputs.push_back(output);
+ }
+ }
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * @brief This will merge pgroups with the same group value in topological order
+ */
+std::unique_ptr<luci::PGroups> merge_pgroups(const luci::PGroups *s_pgroups)
+{
+ // Make a copy of pgroups to apply merge action
+ // Q) do we really need a copy?
+ auto d_pgroups = s_pgroups->make_copy();
+
+ // Merge partition graphs
+ // - This is an initial implementation that works for limited networks
+ // - if A and B are in the same group and A is an input of B -> ... -> merge B into A
+ auto &pgroups = d_pgroups->pgroups;
+ bool changed;
+ do
+ {
+ changed = false;
+ for (auto &pgroup_i : pgroups)
+ {
+ bool merged = false;
+ for (auto it = pgroups.begin(); it != pgroups.end(); ++it)
+ {
+ auto &pgroup = *it;
+
+ // skip if same object
+ if (pgroup->id == pgroup_i->id)
+ continue;
+ // skip if different group
+ if (pgroup->group != pgroup_i->group)
+ continue;
+ // skip if not connected
+ if (!is_input_of(pgroup_i.get(), pgroup.get()))
+ continue;
+ // skip if there are multiple inputs but inputs differ in group
+ if (!is_input_same(pgroup.get(), d_pgroups.get()))
+ continue;
+ // TODO more conditions may need to be added
+
+ merge_into(pgroup.get(), pgroup_i.get());
+
+ auto eit = d_pgroups->id2pgroup.find(pgroup->id);
+ assert(eit != d_pgroups->id2pgroup.end());
+ d_pgroups->id2pgroup.erase(eit);
+
+ // remove merged pgroup from pgroups
+ pgroups.erase(it);
+
+ merged = true;
+ break;
+ }
+ if (merged)
+ {
+ changed = true;
+ break;
+ }
+ }
+ } while (changed);
+
+ return d_pgroups;
+}
+
+} // namespace luci
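To make the merge concrete: for a single-graph module Input -> Sqrt -> Sqrt -> Output where both ops fall into the default group "A", produce_pgroups yields two single-node PGroups, and merge_pgroups collapses them into one, since the first PGroup's output is an input of the second and all inputs agree on the group. A test-style sketch (hypothetical, not part of this commit; TwoSqrtGraph is built here like the SqrtGraph helper in PartitionPGroups.test.cpp, but with two chained CircleSqrt nodes):

    #include "PartitionMerge.h"
    #include "PartitionPGroups.h"

    #include <luci/test/TestIOGraph.h>
    #include <luci/IR/Nodes/CircleSqrt.h>

    #include <gtest/gtest.h>

    namespace
    {

    using namespace luci::test;

    class TwoSqrtGraph : public TestIOGraph
    {
    public:
      void init(const ShapeU32 shape)
      {
        TestIOGraph::init(shape, shape);

        // two chained Sqrt ops: both non-virtual, both in the default group
        _sqrt1 = g()->nodes()->create<luci::CircleSqrt>();
        _sqrt1->name("sqrt1");
        _sqrt1->x(input());

        _sqrt2 = g()->nodes()->create<luci::CircleSqrt>();
        _sqrt2->name("sqrt2");
        _sqrt2->x(_sqrt1);

        output()->from(_sqrt2);
      }

    protected:
      luci::CircleSqrt *_sqrt1 = nullptr;
      luci::CircleSqrt *_sqrt2 = nullptr;
    };

    } // namespace

    TEST(PartitionMergeTest, two_sqrt_same_group_sketch)
    {
      luci::Module module;

      TwoSqrtGraph g;
      g.init({3, 3});
      g.transfer_to(&module);

      luci::PartitionTable pt;
      pt.default_group = "A";

      auto pgroups = luci::produce_pgroups(&module, pt);
      ASSERT_EQ(2, pgroups->pgroups.size());

      // sqrt1's PGroup output feeds sqrt2's PGroup input and the groups match,
      // so the two PGroups merge into one
      auto merged = luci::merge_pgroups(pgroups.get());
      ASSERT_EQ(1, merged->pgroups.size());
    }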
diff --git a/compiler/luci/partition/src/PartitionMerge.h b/compiler/luci/partition/src/PartitionMerge.h
new file mode 100644
index 000000000..5c9fec2d2
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionMerge.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_MERGE_H__
+#define __LUCI_PARTITION_MERGE_H__
+
+#include "PartitionIR.h"
+
+#include <memory>
+
+namespace luci
+{
+
+std::unique_ptr<luci::PGroups> merge_pgroups(const luci::PGroups *s_pgroups);
+
+} // namespace luci
+
+#endif // __LUCI_PARTITION_MERGE_H__
diff --git a/compiler/luci/partition/src/PartitionPGroups.cpp b/compiler/luci/partition/src/PartitionPGroups.cpp
new file mode 100644
index 000000000..594ed6c40
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionPGroups.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionPGroups.h"
+#include "PartitionIR.h"
+#include "CircleOpCode.h"
+
+#include "luci/Partition.h"
+#include "luci/Log.h"
+#include "luci/LogHelper.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+
+#include <loco.h>
+
+namespace
+{
+
+class IsVirtualNode final : public luci::CircleNodeVisitor<bool>
+{
+public:
+ bool visit(const luci::CircleInput *) final { return true; }
+ bool visit(const luci::CircleOutput *) final { return true; }
+ // TODO add all virtual nodes
+
+ // default is false
+ bool visit(const luci::CircleNode *) final { return false; }
+};
+
+bool check_allocate_partition(const luci::CircleNode *node)
+{
+ IsVirtualNode query;
+ if (node->accept(&query))
+ return false;
+ /**
+ * @note About CircleConst
+ * CircleConst acts like a part of some CircleNode, and managing a CircleConst
+ * with multiple uses (references) is a bit difficult if it is used across
+ * different PGroups. So we treat it differently from other node types.
+ * https://github.com/Samsung/ONE/issues/6230#issuecomment-809802813
+ */
+ if (dynamic_cast<const luci::CircleConst *>(node) != nullptr)
+ return false;
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+std::unique_ptr<luci::PGroups> produce_pgroups(const luci::Module *source,
+ const luci::PartitionTable &partition)
+{
+ assert(source != nullptr);
+ // TODO support multiple subgraphs
+ assert(source->size() == 1);
+
+ LOGGER(l);
+
+ auto pgroups = std::make_unique<luci::PGroups>();
+
+ pgroups->default_group = partition.default_group;
+
+ // Create a PGroup per CircleNode: each PGroup will have one CircleNode
+ auto graph = source->graph();
+ auto nodes = graph->nodes();
+ for (uint32_t idx = 0; idx < nodes->size(); ++idx)
+ {
+ auto node = loco::must_cast<luci::CircleNode *>(nodes->at(idx));
+
+ // check if node is a normal node that we are interested in
+ if (check_allocate_partition(node))
+ {
+ auto opcodename = luci::opcode_name(node);
+ assert(!opcodename.empty());
+
+ auto group = partition.default_group;
+ auto it = partition.byopcodes.find(opcodename);
+ if (it != partition.byopcodes.end())
+ group = it->second;
+
+ INFO(l) << "Op: " << node->name() << ": " << opcodename << ", " << node << ", " << group
+ << std::endl;
+
+ auto pgroup = std::make_unique<luci::PGroup>();
+ pgroup->group = group;
+ pgroup->id = idx + 1;
+
+ auto pnode = std::make_unique<luci::PNode>();
+ pnode->node = node;
+ pnode->group = group;
+ pnode->pgroup = pgroup.get();
+
+ pgroup->pnodes.push_back(std::move(pnode));
+
+ // Set input of PGroup
+ for (uint32_t in = 0; in < node->arity(); ++in)
+ {
+ auto input = loco::must_cast<luci::CircleNode *>(node->arg(in));
+ // this input may be a CircleInput in the source graph
+ // --> not yet confident this is safe
+ pgroup->inputs.push_back(input);
+ }
+ // Set output of PGroup: node itself or multiple virtual outputs
+ // TODO support multiple virtual outputs
+ pgroup->outputs.push_back(node);
+
+ pgroups->node2group[node] = group;
+ pgroups->id2pgroup[pgroup->id] = pgroup.get();
+
+ pgroups->pgroups.push_back(std::move(pgroup));
+ }
+ else
+ {
+ INFO(l) << "Skip Op: " << node->name() << std::endl;
+ // record as default group
+ pgroups->node2group[node] = partition.default_group;
+ }
+ }
+
+ return pgroups;
+}
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/PartitionPGroups.h b/compiler/luci/partition/src/PartitionPGroups.h
new file mode 100644
index 000000000..998e11cbd
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionPGroups.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_PGROUPS_H__
+#define __LUCI_PARTITION_PGROUPS_H__
+
+#include "PartitionIR.h"
+
+#include "luci/Partition.h"
+
+#include <luci/IR/Module.h>
+
+namespace luci
+{
+
+/**
+ * @brief This will produce a PGroups from Module and PartitionTable.
+ * @note Each PGroup will hold one CircleNode and the partition key value as its group.
+ *       Only a single Graph in the Module is supported for now.
+ */
+std::unique_ptr<luci::PGroups> produce_pgroups(const luci::Module *source,
+ const luci::PartitionTable &partition);
+
+} // namespace luci
+
+#endif // __LUCI_PARTITION_PGROUPS_H__
diff --git a/compiler/luci/partition/src/PartitionPGroups.test.cpp b/compiler/luci/partition/src/PartitionPGroups.test.cpp
new file mode 100644
index 000000000..960f3cde9
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionPGroups.test.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionPGroups.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <luci/IR/Nodes/CircleSqrt.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class SqrtGraphlet
+{
+public:
+ SqrtGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 input_shape)
+ {
+ _sqrt = g->nodes()->create<luci::CircleSqrt>();
+ _sqrt->dtype(loco::DataType::S32);
+ _sqrt->name("sqrt");
+ }
+
+protected:
+ luci::CircleSqrt *_sqrt = nullptr;
+};
+
+class SqrtGraph : public TestIOGraph, public SqrtGraphlet
+{
+public:
+ SqrtGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIOGraph::init(shape, shape);
+ SqrtGraphlet::init(g(), shape);
+
+ _sqrt->x(input());
+
+ output()->from(_sqrt);
+ }
+};
+
+} // namespace
+
+TEST(PartitionPGroupsTest, simple_produce)
+{
+ luci::Module module;
+
+ SqrtGraph g;
+ g.init({3, 3});
+ g.transfer_to(&module);
+
+ luci::PartitionTable pt;
+ pt.default_group = "A";
+
+ auto pgs = produce_pgroups(&module, pt);
+
+ ASSERT_EQ(1, pgs->pgroups.size());
+}
diff --git a/compiler/luci/partition/src/PartitionPModules.cpp b/compiler/luci/partition/src/PartitionPModules.cpp
new file mode 100644
index 000000000..36f4d47a4
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionPModules.cpp
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionPModules.h"
+#include "ConnectNode.h"
+
+#include "luci/Service/CircleNodeClone.h"
+#include "luci/Log.h"
+
+#include <loco.h>
+
+namespace
+{
+
+void add_graph_input(loco::Graph *graph, luci::CircleInput *input_node)
+{
+ assert(graph != nullptr);
+ assert(input_node != nullptr);
+
+ auto graph_input = graph->inputs()->create();
+ graph_input->name(input_node->name());
+
+ // Set GraphInputOutputIndex for graph
+ input_node->index(graph_input->index());
+
+ // Data type
+ graph_input->dtype(input_node->dtype());
+
+ // Shape of GraphInput
+ auto input_shape = std::make_unique<loco::TensorShape>();
+ input_shape->rank(input_node->rank());
+ for (uint32_t r = 0; r < input_node->rank(); ++r)
+ {
+ if (input_node->dim(r).known())
+ input_shape->dim(r).set(input_node->dim(r).value());
+ }
+ graph_input->shape(std::move(input_shape));
+}
+
+void add_graph_output(loco::Graph *graph, luci::CircleOutput *output_node)
+{
+ assert(graph != nullptr);
+ assert(output_node != nullptr);
+
+ auto graph_output = graph->outputs()->create();
+ graph_output->name(output_node->name());
+
+ // Set GraphInputOutputIndex for graph
+ output_node->index(graph_output->index());
+
+ // Data type
+ graph_output->dtype(output_node->dtype());
+
+ // Shape of GraphOutput
+ auto output_shape = std::make_unique<loco::TensorShape>();
+ output_shape->rank(output_node->rank());
+ for (uint32_t r = 0; r < output_node->rank(); ++r)
+ {
+ if (output_node->dim(r).known())
+ output_shape->dim(r).set(output_node->dim(r).value());
+ }
+ graph_output->shape(std::move(output_shape));
+}
+
+/**
+ * @brief Build the contents of the given loco::Graph from pgroup
+ */
+void build_graph(loco::Graph *graph, const luci::PGroup *pgroup)
+{
+ LOGGER(l);
+
+ luci::CloneContext clonectx;
+
+ // add input node(s)
+ for (auto *input : pgroup->inputs)
+ {
+ auto *input_clone = graph->nodes()->create<luci::CircleInput>();
+ luci::copy_common_attributes(input, input_clone);
+
+ add_graph_input(graph, input_clone);
+ clonectx.emplace(input, input_clone);
+
+ INFO(l) << "MAP: "
+ << " input(" << input << ") -> " << input_clone << "(" << input_clone->name() << ")";
+ }
+
+ // add CircleConst for inputs
+ for (auto &pnode : pgroup->pnodes)
+ {
+ auto node = pnode->node;
+ uint32_t arity = node->arity();
+ for (uint32_t a = 0; a < arity; ++a)
+ {
+ auto in_a_const = dynamic_cast<luci::CircleConst *>(node->arg(a));
+ if (in_a_const != nullptr)
+ {
+ auto it = clonectx.find(in_a_const);
+ if (it == clonectx.end())
+ {
+ auto *clone = clone_node(in_a_const, graph);
+ clonectx.emplace(in_a_const, clone);
+
+ INFO(l) << "MAP: "
+ << " const(" << in_a_const << ") -> " << clone << "(" << clone->name() << ")";
+ }
+ }
+ }
+ }
+
+ // add nodes
+ for (auto &pnode : pgroup->pnodes)
+ {
+ auto *clone = clone_node(pnode->node, graph);
+ clonectx.emplace(pnode->node, clone);
+
+ INFO(l) << "MAP: "
+ << " node(" << pnode->node << ") -> " << clone << "(" << clone->name() << ")";
+ }
+ // connect nodes
+ for (auto &pnode : pgroup->pnodes)
+ {
+ clone_connect(pnode->node, clonectx);
+ }
+
+ // add output node(s)
+ for (auto *output : pgroup->outputs)
+ {
+ auto *output_clone = graph->nodes()->create<luci::CircleOutput>();
+ luci::copy_common_attributes(output, output_clone);
+ // note: we don't add output_clone to clonectx.
+ // logically, output is not used as an input to any other nodes.
+
+ auto it = clonectx.find(output);
+ assert(it != clonectx.end());
+ output_clone->from(it->second);
+
+ add_graph_output(graph, output_clone);
+
+ INFO(l) << "MAP: "
+ << "output(" << output << ") -> " << output_clone << "(" << output_clone->name() << ")"
+ << ": from " << it->second << "(" << it->second->name() << ")";
+ }
+}
+
+std::string make_name(const luci::PGroup *pgroup)
+{
+ auto &first_pnode = *pgroup->pnodes.begin();
+ auto *first_node = first_pnode->node;
+ std::string name = first_node->graph()->name();
+ name = name + "_" + pgroup->group;
+ return name;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * @brief Produce PartedModules, a list of luci::Module, from the given PGroups
+ */
+luci::PartedModules produce_pmodules(const luci::PGroups *pgroups)
+{
+ LOGGER(l);
+
+ luci::PartedModules pms;
+
+ for (auto &pgroup : pgroups->pgroups)
+ {
+ luci::PartedModule pm;
+ pm.module = std::make_unique<luci::Module>();
+ pm.group = pgroup->group;
+
+ auto graph = loco::make_graph();
+
+ auto graph_name = make_name(pgroup.get());
+ graph->name(graph_name);
+
+ INFO(l) << "--- Partition Graph build----------------------";
+ INFO(l) << "--- name: " << graph_name;
+ build_graph(graph.get(), pgroup.get());
+
+ pm.module->add(std::move(graph));
+ pms.pmodules.emplace_back(std::move(pm));
+ }
+
+ return pms;
+}
+
+} // namespace luci
diff --git a/compiler/luci/partition/src/PartitionPModules.h b/compiler/luci/partition/src/PartitionPModules.h
new file mode 100644
index 000000000..628ada56c
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionPModules.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_PMODULES_H__
+#define __LUCI_PARTITION_PMODULES_H__
+
+#include "PartitionIR.h"
+
+#include "luci/Partition.h"
+
+namespace luci
+{
+
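+// A usage sketch (mirrors PartitionPModules.test.cpp):
+//   auto pgroups = produce_pgroups(&module, partition_table);
+//   auto pmodules = produce_pmodules(pgroups.get());
+//   // each PartedModule in pmodules.pmodules owns one partitioned Module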
+luci::PartedModules produce_pmodules(const luci::PGroups *pgroups);
+
+} // namespace luci
+
+#endif // __LUCI_PARTITION_PMODULES_H__
diff --git a/compiler/luci/partition/src/PartitionPModules.test.cpp b/compiler/luci/partition/src/PartitionPModules.test.cpp
new file mode 100644
index 000000000..99c39e839
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionPModules.test.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionPModules.h"
+#include "PartitionPGroups.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <luci/IR/Nodes/CircleSqrt.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class SqrtGraphlet
+{
+public:
+ SqrtGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 input_shape)
+ {
+ _sqrt = g->nodes()->create<luci::CircleSqrt>();
+ _sqrt->dtype(loco::DataType::S32);
+ _sqrt->name("sqrt");
+ }
+
+protected:
+ luci::CircleSqrt *_sqrt = nullptr;
+};
+
+class SqrtGraph : public TestIOGraph, public SqrtGraphlet
+{
+public:
+ SqrtGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIOGraph::init(shape, shape);
+ SqrtGraphlet::init(g(), shape);
+
+ _sqrt->x(input());
+
+ output()->from(_sqrt);
+ }
+};
+
+} // namespace
+
+TEST(PartitionPModulesTest, simple_convert)
+{
+ luci::Module module;
+
+ SqrtGraph g;
+ g.init({3, 3});
+ g.transfer_to(&module);
+
+ luci::PartitionTable pt;
+ pt.default_group = "A";
+
+ auto pgs = produce_pgroups(&module, pt);
+ auto pms = produce_pmodules(pgs.get());
+
+ ASSERT_EQ(1, pms.pmodules.size());
+}
diff --git a/compiler/luci/partition/src/PartitionPModulesDump.cpp b/compiler/luci/partition/src/PartitionPModulesDump.cpp
new file mode 100644
index 000000000..ee50bc6fb
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionPModulesDump.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PartitionPModulesDump.h"
+
+#include "luci/LogHelper.h"
+
+#include <iostream>
+
+namespace luci
+{
+
+void dump(std::ostream &os, const PartedModule *pmodule)
+{
+ os << "--- PartedModule: " << pmodule->group << std::endl;
+ os << luci::fmt(pmodule->module->graph());
+}
+
+void dump(std::ostream &os, const PartedModules *pmodules)
+{
+ for (auto &pmodule : pmodules->pmodules)
+ {
+ dump(os, &pmodule);
+ }
+ os << std::endl;
+}
+
+} // namespace luci
+
+std::ostream &operator<<(std::ostream &os, const luci::PartedModules *pmodules)
+{
+ luci::dump(os, pmodules);
+ return os;
+}
diff --git a/compiler/luci/partition/src/PartitionPModulesDump.h b/compiler/luci/partition/src/PartitionPModulesDump.h
new file mode 100644
index 000000000..e77b235f4
--- /dev/null
+++ b/compiler/luci/partition/src/PartitionPModulesDump.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PARTITION_PMODULES_DUMP_H__
+#define __LUCI_PARTITION_PMODULES_DUMP_H__
+
+#include "luci/Partition.h"
+
+#include <iostream>
+
+namespace luci
+{
+
+void dump(std::ostream &os, const PartedModule *pmodule);
+void dump(std::ostream &os, const PartedModules *pmodules);
+
+} // namespace luci
+
+std::ostream &operator<<(std::ostream &os, const luci::PartedModules *pmodules);
+
+#endif // __LUCI_PARTITION_PMODULES_DUMP_H__
diff --git a/compiler/luci/pass/CMakeLists.txt b/compiler/luci/pass/CMakeLists.txt
index 2c5fb3407..2977fbed7 100644
--- a/compiler/luci/pass/CMakeLists.txt
+++ b/compiler/luci/pass/CMakeLists.txt
@@ -12,6 +12,7 @@ target_link_libraries(luci_pass PRIVATE luci_lang)
target_link_libraries(luci_pass PRIVATE luci_log)
target_link_libraries(luci_pass PRIVATE luci_service)
target_link_libraries(luci_pass PRIVATE luci_logex)
+target_link_libraries(luci_pass PRIVATE luci_profile)
target_link_libraries(luci_pass PRIVATE nncc_common)
target_link_libraries(luci_pass PRIVATE oops)
install(TARGETS luci_pass DESTINATION lib)
@@ -26,4 +27,5 @@ GTest_AddTest(luci_pass_test ${TESTS})
target_include_directories(luci_pass_test PRIVATE src)
target_link_libraries(luci_pass_test luci_pass)
target_link_libraries(luci_pass_test luci_lang)
+target_link_libraries(luci_pass_test luci_testhelper)
#target_link_libraries(luci_pass_test oops)
diff --git a/compiler/luci/pass/include/luci/CircleOptimizer.h b/compiler/luci/pass/include/luci/CircleOptimizer.h
index 906760e0a..1f5e1c8b9 100644
--- a/compiler/luci/pass/include/luci/CircleOptimizer.h
+++ b/compiler/luci/pass/include/luci/CircleOptimizer.h
@@ -35,6 +35,8 @@ public:
enum Algorithm
{
FuseAddWithTConv,
+ FuseBatchNormWithConv,
+ FuseBatchNormWithDwConv,
FuseBatchNormWithTConv,
FuseBCQ,
FuseInstanceNorm,
@@ -44,7 +46,11 @@ public:
QuantizeDequantizeWeights,
QuantizeWithMinMax,
Requantize,
+ FoldAddV2,
+ FoldCast,
FoldDequantize,
+ FoldSparseToDense,
+ ForwardReshapeToUnaryOp,
SparsifyTensorPass,
FusePreActivationBatchNorm,
MakeBatchNormGammaPositive,
@@ -53,6 +59,15 @@ public:
RemoveRedundantTranspose,
ReplaceMulAddWithDepthwiseConv,
SubstitutePackToReshape,
+ SubstituteSqueezeToReshape,
+ ConvertNCHWToNHWC,
+ RemoveUnnecessarySlice,
+ RemoveUnnecessaryStridedSlice,
+ RemoveUnnecessarySplit,
+ RemoveUnnecessaryReshape,
+ TransformMinMaxToRelu6Pass,
+ SubstituteTransposeToReshape,
+ RemoveRedundantReshape,
};
enum AlgorithmParameters
@@ -68,6 +83,10 @@ public:
Sparsify_format,
Sparsify_block_size,
Sparsify_block_map,
+
+ // convert NCHW to NHWC
+ NCHW_to_NHWC_preserve_input_shape,
+ NCHW_to_NHWC_preserve_output_shape,
};
virtual ~Options() = default;
diff --git a/compiler/luci/pass/include/luci/Pass/ShapeInferencePass.h b/compiler/luci/pass/include/luci/Pass/CircleShapeInferencePass.h
index e21ab4cce..21d6d09d6 100644
--- a/compiler/luci/pass/include/luci/Pass/ShapeInferencePass.h
+++ b/compiler/luci/pass/include/luci/Pass/CircleShapeInferencePass.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __LUCI_SHAPE_INFERENCE_PASS_H__
-#define __LUCI_SHAPE_INFERENCE_PASS_H__
+#ifndef __LUCI_CIRCLE_SHAPE_INFERENCE_PASS_H__
+#define __LUCI_CIRCLE_SHAPE_INFERENCE_PASS_H__
#include <loco.h>
@@ -25,12 +25,12 @@ namespace luci
{
/**
- * @brief Pass to infer shape of nodes
+ * @brief Pass to infer shape of circle nodes
*/
-class ShapeInferencePass : public luci::Pass
+class CircleShapeInferencePass : public luci::Pass
{
public:
- virtual const char *name(void) const { return "luci::ShapeInferencePass"; }
+ virtual const char *name(void) const { return "luci::CircleShapeInferencePass"; }
public:
bool run(luci::Module *m);
@@ -39,4 +39,4 @@ public:
} // namespace luci
-#endif //__LUCI_SHAPE_INFERENCE_PASS_H__
+#endif //__LUCI_CIRCLE_SHAPE_INFERENCE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/ConvertNCHWToNHWCPass.h b/compiler/luci/pass/include/luci/Pass/ConvertNCHWToNHWCPass.h
new file mode 100644
index 000000000..ba2392596
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/ConvertNCHWToNHWCPass.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_CONVERT_NCHW_TO_NHWC_PASS_H__
+#define __LUCI_CONVERT_NCHW_TO_NHWC_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to convert NCHW Ops to NHWC
+ *
+ * @details Find operators that use NCHW layout and make them use NHWC.
+ * Strictly speaking, it is impossible to distinguish whether
+ * an operator is using NCHW or NHWC without programmers' annotations.
+ * We therefore infer the data layout of each operator as far as possible,
+ * based on the assumptions described in the comments.
+ * Note that this Pass does not change the execution result even
+ * for the false-positive cases.
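+ *
+ * A usage sketch (hedged; the two flags map to the constructor below):
+ *   luci::ConvertNCHWToNHWCPass pass(true, false); // preserve_input, preserve_output
+ *   while (pass.run(graph))
+ *     ; // run() returns true as long as it keeps converting operators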
+ */
+struct ConvertNCHWToNHWCPass final : public logo::Pass
+{
+public:
+ ConvertNCHWToNHWCPass(bool preserve_input, bool preserve_output)
+ : _preserve_input(preserve_input), _preserve_output(preserve_output)
+ {
+ // Do nothing
+ }
+
+ ConvertNCHWToNHWCPass() = delete;
+
+ virtual ~ConvertNCHWToNHWCPass() = default;
+
+ const char *name(void) const final { return "luci::ConvertNCHWToNHWCPass"; }
+
+ bool run(loco::Graph *g) final;
+
+private:
+ bool _preserve_input = false;
+ bool _preserve_output = false;
+};
+
+} // namespace luci
+
+#endif // __LUCI_CONVERT_NCHW_TO_NHWC_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FoldAddV2Pass.h b/compiler/luci/pass/include/luci/Pass/FoldAddV2Pass.h
new file mode 100644
index 000000000..cd260b916
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FoldAddV2Pass.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FOLD_ADD_V2_PASS_H__
+#define __LUCI_FOLD_ADD_V2_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to fold AddV2 to a constant tensor
+ *
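+ * For example (a sketch of the folding): when both inputs of a custom
+ * AddV2 node are constant tensors, the node can be replaced with a single
+ * precomputed CircleConst.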
+ */
+struct FoldAddV2Pass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::FoldAddV2Pass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FOLD_ADD_V2_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FoldCastPass.h b/compiler/luci/pass/include/luci/Pass/FoldCastPass.h
new file mode 100644
index 000000000..5d7ce4ad3
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FoldCastPass.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FOLD_CAST_PASS_H__
+#define __LUCI_FOLD_CAST_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to fold Cast to a constant tensor
+ *
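+ * For example (sketch): a Cast whose input is a CircleConst can be
+ * replaced by a new CircleConst with values converted to the output dtype.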
+ */
+struct FoldCastPass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::FoldCastPass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FOLD_CAST_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FoldSparseToDensePass.h b/compiler/luci/pass/include/luci/Pass/FoldSparseToDensePass.h
new file mode 100644
index 000000000..00d2447a5
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FoldSparseToDensePass.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FOLD_SPARSE_TO_DENSE_PASS_H__
+#define __LUCI_FOLD_SPARSE_TO_DENSE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to fold SparseToDense to a constant tensor
+ *
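+ * For example (sketch): a SparseToDense whose indices, output shape,
+ * values and default value are all constant can be materialized as a
+ * dense CircleConst.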
+ */
+struct FoldSparseToDensePass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::FoldSparseToDensePass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FOLD_SPARSE_TO_DENSE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/ForwardReshapeToUnaryOpPass.h b/compiler/luci/pass/include/luci/Pass/ForwardReshapeToUnaryOpPass.h
new file mode 100644
index 000000000..4c308e531
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/ForwardReshapeToUnaryOpPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FORWARD_RESHAPE_TO_UNARYOP_PASS_H__
+#define __LUCI_FORWARD_RESHAPE_TO_UNARYOP_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to forward a Reshape so that it is applied after the UnaryOp.
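+ *
+ * For example (elementwise unary ops do not depend on the shape):
+ *   UnaryOp(Reshape(x))  -->  Reshape(UnaryOp(x))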
+ */
+struct ForwardReshapeToUnaryOpPass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::ForwardReshapeToUnaryOpPass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FORWARD_RESHAPE_TO_UNARYOP_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithConvPass.h b/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithConvPass.h
new file mode 100644
index 000000000..1ed85447b
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithConvPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FUSE_BATCH_NORM_WITH_CONV_PASS_H__
+#define __LUCI_FUSE_BATCH_NORM_WITH_CONV_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to fuse Batch Normalization into CircleConv2D
+ */
+struct FuseBatchNormWithConvPass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::FuseBatchNormWithConvPass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FUSE_BATCH_NORM_WITH_CONV_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithDwConvPass.h b/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithDwConvPass.h
new file mode 100644
index 000000000..32885c6b2
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithDwConvPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_FUSE_BATCH_NORM_WITH_DWCONV_PASS_H__
+#define __LUCI_FUSE_BATCH_NORM_WITH_DWCONV_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to fuse Batch Normalization into CircleDepthwiseConv2D
+ */
+struct FuseBatchNormWithDwConvPass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::FuseBatchNormWithDwConvPass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_FUSE_BATCH_NORM_WITH_DWCONV_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithTConv.h b/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithTConvPass.h
index d3e930a36..d3e930a36 100644
--- a/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithTConv.h
+++ b/compiler/luci/pass/include/luci/Pass/FuseBatchNormWithTConvPass.h
diff --git a/compiler/luci/pass/include/luci/Pass/MigrateLegacyShapeDtypePass.h b/compiler/luci/pass/include/luci/Pass/MigrateLegacyShapeDtypePass.h
deleted file mode 100644
index c0ebc4e5d..000000000
--- a/compiler/luci/pass/include/luci/Pass/MigrateLegacyShapeDtypePass.h
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LUCI_MIGRATE_LEGACY_SHAPE_DTYPE_PASS_H__
-#define __LUCI_MIGRATE_LEGACY_SHAPE_DTYPE_PASS_H__
-
-#include <loco.h>
-
-#include <luci/ModulePass.h>
-
-namespace luci
-{
-
-/**
- * @brief Pass to copy shape/dtype of loco to circle node
- *
- * CAUTION : This pass will be removed after refactoring is finished
- */
-class MigrateLegacyShapeDtypePass : public luci::Pass
-{
-public:
- virtual const char *name(void) const { return "luci::MigrateLegacyShapeDtypePass"; }
-
-public:
- bool run(luci::Module *m);
- bool run(loco::Graph *graph);
-};
-
-} // namespace luci
-
-#endif //__LUCI_MIGRATE_LEGACY_SHAPE_DTYPE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h b/compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h
index 713b88f9d..78e7323f9 100644
--- a/compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h
+++ b/compiler/luci/pass/include/luci/Pass/QuantizeDequantizeWeightsPass.h
@@ -34,7 +34,7 @@ class QuantizeDequantizeWeightsPass : public logo::Pass
public:
QuantizeDequantizeWeightsPass(loco::DataType input_dtype, loco::DataType output_dtype,
QuantizationGranularity granularity)
- : _input_dtype{input_dtype}, _output_dtype{output_dtype}, _granularity{granularity}
+ : _input_dtype{input_dtype}, _output_dtype{output_dtype}, _granularity{granularity}
{
// DO NOTHING
}
diff --git a/compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h b/compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h
index bb0d0ff40..9520910d5 100644
--- a/compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h
+++ b/compiler/luci/pass/include/luci/Pass/QuantizeWithMinMaxPass.h
@@ -34,7 +34,7 @@ class QuantizeWithMinMaxPass : public logo::Pass
public:
QuantizeWithMinMaxPass(loco::DataType input_dtype, loco::DataType output_dtype,
QuantizationGranularity granularity)
- : _input_dtype{input_dtype}, _output_dtype{output_dtype}, _granularity{granularity}
+ : _input_dtype{input_dtype}, _output_dtype{output_dtype}, _granularity{granularity}
{
// DO NOTHING
}
diff --git a/compiler/luci/pass/include/luci/Pass/RemoveRedundantReshapePass.h b/compiler/luci/pass/include/luci/Pass/RemoveRedundantReshapePass.h
new file mode 100644
index 000000000..458ffc094
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/RemoveRedundantReshapePass.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_REMOVE_REDUNDANT_RESHAPE_PASS_H__
+#define __LUCI_REMOVE_REDUNDANT_RESHAPE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to merge two consecutive Reshape nodes into a single Reshape node.
+ * @details As a Reshape operation only changes the shape, not the buffer,
+ * the former of two consecutive Reshape nodes is redundant and can be bypassed.
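+ *
+ * For example:
+ *   Reshape_B(Reshape_A(x, shape_a), shape_b)  -->  Reshape_B(x, shape_b)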
+ */
+struct RemoveRedundantReshapePass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::RemoveRedundantReshapePass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_REMOVE_REDUNDANT_RESHAPE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryReshapePass.h b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryReshapePass.h
new file mode 100644
index 000000000..8fca35e5b
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryReshapePass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_REMOVE_UNNECESSARY_RESHAPE_PASS_H__
+#define __LUCI_REMOVE_UNNECESSARY_RESHAPE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to remove an unnecessary Reshape node (input and output shapes are identical).
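+ *
+ * For example, reshaping a {1, 16, 16, 4} tensor to {1, 16, 16, 4} is a
+ * no-op, so the Reshape node is dropped and its input is used directly.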
+ */
+struct RemoveUnnecessaryReshapePass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::RemoveUnnecessaryReshapePass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_REMOVE_UNNECESSARY_RESHAPE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/RemoveUnnecessarySlicePass.h b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessarySlicePass.h
new file mode 100644
index 000000000..a3b0f2f8c
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessarySlicePass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_REMOVE_UNNECESSARY_SLICE_PASS_H__
+#define __LUCI_REMOVE_UNNECESSARY_SLICE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to remove an unnecessary Slice node (output is identical to input).
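+ *
+ * For example, a Slice with begin 0 in every dimension and size equal to
+ * the full input shape copies the whole tensor, so it can be removed.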
+ */
+struct RemoveUnnecessarySlicePass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::RemoveUnnecessarySlicePass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_REMOVE_UNNECESSARY_SLICE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/RemoveUnnecessarySplitPass.h b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessarySplitPass.h
new file mode 100644
index 000000000..0d9330fe7
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessarySplitPass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_REMOVE_UNNECESSARY_SPLIT_PASS_H__
+#define __LUCI_REMOVE_UNNECESSARY_SPLIT_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to remove an unnecessary Split node (e.g., one with a single split output).
+ */
+struct RemoveUnnecessarySplitPass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::RemoveUnnecessarySplitPass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_REMOVE_UNNECESSARY_SPLIT_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryStridedSlicePass.h b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryStridedSlicePass.h
new file mode 100644
index 000000000..0f6a61d43
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/RemoveUnnecessaryStridedSlicePass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_REMOVE_UNNECESSARY_STRIDED_SLICE_PASS_H__
+#define __LUCI_REMOVE_UNNECESSARY_STRIDED_SLICE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to remove an unnecessary StridedSlice node (output is identical to input).
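+ *
+ * For example, a StridedSlice that spans the whole input with stride 1
+ * returns the input unchanged, so the node can be removed.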
+ */
+struct RemoveUnnecessaryStridedSlicePass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::RemoveUnnecessaryStridedSlicePass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_REMOVE_UNNECESSARY_STRIDED_SLICE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/RequantizePass.h b/compiler/luci/pass/include/luci/Pass/RequantizePass.h
index 2442b24ea..c6c424f1b 100644
--- a/compiler/luci/pass/include/luci/Pass/RequantizePass.h
+++ b/compiler/luci/pass/include/luci/Pass/RequantizePass.h
@@ -33,7 +33,7 @@ class RequantizePass : public logo::Pass
{
public:
RequantizePass(loco::DataType input_dtype, loco::DataType output_dtype)
- : _input_dtype{input_dtype}, _output_dtype{output_dtype}
+ : _input_dtype{input_dtype}, _output_dtype{output_dtype}
{
// DO NOTHING
}
diff --git a/compiler/luci/pass/include/luci/Pass/SparsifyTensorPass.h b/compiler/luci/pass/include/luci/Pass/SparsifyTensorPass.h
index 41f43bf88..0ce142c55 100644
--- a/compiler/luci/pass/include/luci/Pass/SparsifyTensorPass.h
+++ b/compiler/luci/pass/include/luci/Pass/SparsifyTensorPass.h
@@ -35,8 +35,8 @@ public:
SparsifyTensorPass(const std::string &tensor_name, const std::vector<int32_t> &traversal_order,
const std::vector<DimensionType> &format,
const std::vector<int32_t> &block_size, const std::vector<int32_t> &block_map)
- : _tensor_name{tensor_name}, _traversal_order{traversal_order}, _format{format},
- _block_size{block_size}, _block_map{block_map}
+ : _tensor_name{tensor_name}, _traversal_order{traversal_order}, _format{format},
+ _block_size{block_size}, _block_map{block_map}
{
// DO NOTHING
}
diff --git a/compiler/luci/pass/include/luci/Pass/SubstituteSqueezeToReshapePass.h b/compiler/luci/pass/include/luci/Pass/SubstituteSqueezeToReshapePass.h
new file mode 100644
index 000000000..d8df6ac3f
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/SubstituteSqueezeToReshapePass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_SUBSTITUTE_SQUEEZE_TO_RESHAPE_PASS_H__
+#define __LUCI_SUBSTITUTE_SQUEEZE_TO_RESHAPE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to substitute a Squeeze node with a Reshape node under certain conditions.
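+ *
+ * For example (when the squeezed shape is statically known):
+ *   Squeeze(x of shape {1, 16, 1, 4})  -->  Reshape(x, {16, 4})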
+ */
+struct SubstituteSqueezeToReshapePass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::SubstituteSqueezeToReshapePass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_SUBSTITUTE_SQUEEZE_TO_RESHAPE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/SubstituteTransposeToReshapePass.h b/compiler/luci/pass/include/luci/Pass/SubstituteTransposeToReshapePass.h
new file mode 100644
index 000000000..ee708585a
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/SubstituteTransposeToReshapePass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_SUBSTITUTE_TRANSPOSE_TO_RESHAPE_PASS_H__
+#define __LUCI_SUBSTITUTE_TRANSPOSE_TO_RESHAPE_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to substitute a Transpose node that satisfies certain
+ * input shape conditions with a single Reshape node.
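+ *
+ * For example, permuting across dimensions of size 1 moves no data:
+ *   Transpose(x of shape {1, 16, 4}, perm {1, 0, 2})  -->  Reshape(x, {16, 1, 4})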
+ */
+struct SubstituteTransposeToReshapePass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::SubstituteTransposeToReshapePass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_SUBSTITUTE_TRANSPOSE_TO_RESHAPE_PASS_H__
diff --git a/compiler/luci/pass/include/luci/Pass/TransformMinMaxToRelu6Pass.h b/compiler/luci/pass/include/luci/Pass/TransformMinMaxToRelu6Pass.h
new file mode 100644
index 000000000..9ea39ee4e
--- /dev/null
+++ b/compiler/luci/pass/include/luci/Pass/TransformMinMaxToRelu6Pass.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_TRANSFORM_MIN_MAX_TO_RELU6_PASS_H__
+#define __LUCI_TRANSFORM_MIN_MAX_TO_RELU6_PASS_H__
+
+#include <logo/Pass.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to transform Maximum(Minimum(input, 6), 0) to Relu6
+ */
+struct TransformMinMaxToRelu6Pass final : public logo::Pass
+{
+ const char *name(void) const final { return "luci::TransformMinMaxToRelu6Pass"; }
+
+ bool run(loco::Graph *g) final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_TRANSFORM_MIN_MAX_TO_RELU6_PASS_H__
diff --git a/compiler/luci/pass/src/BatchNormPatternFinder.cpp b/compiler/luci/pass/src/BatchNormPatternFinder.cpp
new file mode 100644
index 000000000..c1a06bfda
--- /dev/null
+++ b/compiler/luci/pass/src/BatchNormPatternFinder.cpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BatchNormPatternFinder.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace luci
+{
+
+bool is_batchnorm_add(const luci::CircleAdd *add, luci::CircleMul *&mul, luci::CircleConst *&beta)
+{
+ auto x = loco::must_cast<luci::CircleNode *>(add->x());
+ auto y = loco::must_cast<luci::CircleNode *>(add->y());
+
+ luci::CircleMul *pred = nullptr;
+ luci::CircleConst *constant = nullptr;
+
+ if (x->opcode() == luci::CircleOpcode::CIRCLECONST && y->opcode() == luci::CircleOpcode::MUL)
+ {
+ pred = loco::must_cast<luci::CircleMul *>(y);
+ constant = loco::must_cast<luci::CircleConst *>(x);
+ }
+ else if (x->opcode() == luci::CircleOpcode::MUL && y->opcode() == luci::CircleOpcode::CIRCLECONST)
+ {
+ pred = loco::must_cast<luci::CircleMul *>(x);
+ constant = loco::must_cast<luci::CircleConst *>(y);
+ }
+ else
+ {
+ return false;
+ }
+
+ if (constant->rank() != 1)
+ return false;
+
+ auto channel_dim = constant->dim(0);
+ // Assumption: Layout is channel-last
+ if (!(channel_dim == add->dim(add->rank() - 1)))
+ return false;
+
+ mul = pred;
+ beta = constant;
+ return true;
+}
+
+bool is_batchnorm_add(const luci::CircleAdd *add)
+{
+ // for dummy mul and beta
+ luci::CircleMul *mul = nullptr;
+ luci::CircleConst *beta = nullptr;
+
+ return is_batchnorm_add(add, mul, beta);
+}
+
+bool is_batchnorm_mul(const luci::CircleMul *mul, luci::CircleNode *&pred_node,
+ luci::CircleConst *&gamma)
+{
+ auto x = dynamic_cast<luci::CircleConst *>(mul->x());
+ auto y = dynamic_cast<luci::CircleConst *>(mul->y());
+
+ luci::CircleNode *pred = nullptr;
+ luci::CircleConst *constant = nullptr;
+
+ if (x != nullptr && y == nullptr)
+ {
+ pred = loco::must_cast<luci::CircleNode *>(mul->y());
+ constant = x;
+ }
+ else if (x == nullptr && y != nullptr)
+ {
+ pred = loco::must_cast<luci::CircleNode *>(mul->x());
+ constant = y;
+ }
+ else
+ {
+ return false;
+ }
+
+ if (constant->rank() != 1)
+ return false;
+
+ auto channel_dim = constant->dim(0);
+ // Assumption: Layout is channel-last
+ if (!(channel_dim == mul->dim(mul->rank() - 1)))
+ return false;
+
+ pred_node = pred;
+ gamma = constant;
+ return true;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/BatchNormPatternFinder.h b/compiler/luci/pass/src/BatchNormPatternFinder.h
new file mode 100644
index 000000000..58cdbb464
--- /dev/null
+++ b/compiler/luci/pass/src/BatchNormPatternFinder.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_BATCH_NORM_PATTERN_FINDER_H__
+#define __LUCI_PASS_BATCH_NORM_PATTERN_FINDER_H__
+
+#include <luci/IR/CircleNodes.h>
+
+namespace luci
+{
+
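+// The decomposed BatchNorm pattern these helpers look for (a sketch):
+//   y = Add(Mul(x, gamma), beta)
+// where gamma and beta are rank-1 CircleConst nodes whose size matches the
+// channel (last) dimension of the Mul/Add output.
+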
+/**
+ * @brief Find the Mul-Add pattern of a decomposed BatchNorm and, on match,
+ * return the Mul node and the beta constant
+ */
+bool is_batchnorm_add(const luci::CircleAdd *add, luci::CircleMul *&mul, luci::CircleConst *&beta);
+
+/**
+ * @brief Find the Mul-Add pattern (overload that discards the matched nodes)
+ */
+bool is_batchnorm_add(const luci::CircleAdd *add);
+
+/**
+ * @brief Find the Mul pattern of a decomposed BatchNorm and, on match,
+ * return the predecessor node and the gamma constant
+ */
+bool is_batchnorm_mul(const luci::CircleMul *mul, luci::CircleNode *&pred_node,
+ luci::CircleConst *&gamma);
+
+} // namespace luci
+
+#endif // __LUCI_PASS_BATCH_NORM_PATTERN_FINDER_H__
diff --git a/compiler/luci/pass/src/BatchNormPatternFinder.test.cpp b/compiler/luci/pass/src/BatchNormPatternFinder.test.cpp
new file mode 100644
index 000000000..08e7fac1c
--- /dev/null
+++ b/compiler/luci/pass/src/BatchNormPatternFinder.test.cpp
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BatchNormPatternFinder.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace luci
+{
+namespace test
+{
+
+/**
+ * @brief Graphlet with Add and Const as beta from BatchNorm
+ */
+class AddBetaGraphlet
+{
+public:
+ AddBetaGraphlet() = default;
+
+ void init(loco::Graph *g, const ShapeU32 shape, luci::FusedActFunc actf)
+ {
+ _add = g->nodes()->create<luci::CircleAdd>();
+ _add_beta = g->nodes()->create<luci::CircleConst>();
+
+ _add->dtype(loco::DataType::FLOAT32);
+ _add_beta->dtype(loco::DataType::FLOAT32);
+
+ _add->fusedActivationFunction(actf);
+
+ assert(shape.size() > 0);
+ auto last_it = std::prev(shape.end(), 1);
+ auto channel_size = *last_it;
+
+ _add->shape(shape);
+ _add_beta->shape({channel_size});
+ _add_beta->size<loco::DataType::FLOAT32>(channel_size);
+ for (uint32_t i = 0; i < channel_size; i++)
+ _add_beta->at<loco::DataType::FLOAT32>(i) = i;
+
+ _add->name("add");
+ _add_beta->name("add_beta");
+ }
+
+public:
+ luci::CircleAdd *add() { return _add; }
+
+protected:
+ luci::CircleAdd *_add = nullptr;
+ luci::CircleConst *_add_beta = nullptr;
+};
+
+/**
+ * @brief Graphlet with Mul and Const as gamma from BatchNorm
+ */
+class MulGammaGraphlet
+{
+public:
+ MulGammaGraphlet() = default;
+
+ void init(loco::Graph *g, const ShapeU32 shape, luci::FusedActFunc actf)
+ {
+ _mul = g->nodes()->create<luci::CircleMul>();
+ _mul_gamma = g->nodes()->create<luci::CircleConst>();
+
+ _mul->dtype(loco::DataType::FLOAT32);
+ _mul_gamma->dtype(loco::DataType::FLOAT32);
+
+ _mul->fusedActivationFunction(actf);
+
+ assert(shape.size() > 0);
+ auto last_it = std::prev(shape.end(), 1);
+ auto channel_size = *last_it;
+
+ _mul->shape(shape);
+ _mul_gamma->shape({channel_size});
+ _mul_gamma->size<loco::DataType::FLOAT32>(channel_size);
+ for (uint32_t i = 0; i < channel_size; i++)
+ _mul_gamma->at<loco::DataType::FLOAT32>(i) = i;
+
+ _mul->name("mul");
+ _mul_gamma->name("mul_gamma");
+ }
+
+public:
+ luci::CircleMul *mul(void) { return _mul; }
+
+protected:
+ luci::CircleMul *_mul = nullptr;
+ luci::CircleConst *_mul_gamma = nullptr;
+};
+
+/**
+ * @brief Graph of Mul-Add pattern from BatchNorm
+ */
+class MulAddGraph : public TestIOGraph, public AddBetaGraphlet, public MulGammaGraphlet
+{
+public:
+ MulAddGraph() = default;
+
+ void init(const ShapeU32 shape_in, const ShapeU32 shape_out)
+ {
+ TestIOGraph::init(shape_in, shape_out);
+ MulGammaGraphlet::init(g(), shape_in, luci::FusedActFunc::NONE);
+ AddBetaGraphlet::init(g(), shape_out, luci::FusedActFunc::RELU);
+
+ // connect network
+ _mul->x(input());
+ _mul->y(_mul_gamma);
+ _add->x(_mul);
+ _add->y(_add_beta);
+ output()->from(_add);
+ }
+};
+
+/**
+ * @brief Graph of Add with Const
+ */
+class AddGraph : public TestIOGraph, public AddBetaGraphlet
+{
+public:
+ AddGraph() = default;
+
+ void init(const ShapeU32 shape_in, const ShapeU32 shape_out)
+ {
+ TestIOGraph::init(shape_in, shape_out);
+ AddBetaGraphlet::init(g(), shape_in, luci::FusedActFunc::RELU);
+
+ // connect network
+ _add->x(input());
+ _add->y(_add_beta);
+ output()->from(_add);
+ }
+};
+
+} // namespace test
+} // namespace luci
+
+class BatchNormPatternFinderMulAddTest : public ::testing::Test
+{
+public:
+ BatchNormPatternFinderMulAddTest() = default;
+
+protected:
+ luci::test::MulAddGraph _mag;
+};
+
+class BatchNormPatternFinderAddTest : public ::testing::Test
+{
+public:
+ BatchNormPatternFinderAddTest() = default;
+
+protected:
+ luci::test::AddGraph _ag;
+};
+
+TEST_F(BatchNormPatternFinderMulAddTest, is_batchnorm_add)
+{
+ _mag.init({1, 16, 16, 4}, {1, 16, 16, 4});
+
+ luci::CircleMul *mul = nullptr;
+ luci::CircleConst *beta = nullptr;
+
+ auto res = luci::is_batchnorm_add(_mag.add(), mul, beta);
+ ASSERT_TRUE(res);
+ ASSERT_NE(nullptr, mul);
+ ASSERT_NE(nullptr, beta);
+}
+
+TEST_F(BatchNormPatternFinderMulAddTest, is_batchnorm_add2)
+{
+ _mag.init({1, 16, 16, 4}, {1, 16, 16, 4});
+
+ auto res = luci::is_batchnorm_add(_mag.add());
+ ASSERT_TRUE(res);
+}
+
+TEST_F(BatchNormPatternFinderAddTest, is_batchnorm_add_NEG)
+{
+ _ag.init({1, 16, 16, 4}, {1, 16, 16, 4});
+
+ luci::CircleMul *mul = nullptr;
+ luci::CircleConst *beta = nullptr;
+
+ auto res = luci::is_batchnorm_add(_ag.add(), mul, beta);
+ ASSERT_FALSE(res);
+}
+
+TEST_F(BatchNormPatternFinderMulAddTest, is_batchnorm_mul)
+{
+ _mag.init({1, 16, 16, 4}, {1, 16, 16, 4});
+
+ luci::CircleNode *pred = nullptr;
+ luci::CircleConst *gamma = nullptr;
+
+ auto res = luci::is_batchnorm_mul(_mag.mul(), pred, gamma);
+ ASSERT_TRUE(res);
+ ASSERT_NE(nullptr, pred);
+ ASSERT_NE(nullptr, gamma);
+}
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index cc9fe481c..bddad34fa 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -16,16 +16,28 @@
#include "luci/CircleOptimizer.h"
+#include "luci/Pass/ConvertNCHWToNHWCPass.h"
+#include "luci/Pass/FoldAddV2Pass.h"
+#include "luci/Pass/FoldCastPass.h"
#include "luci/Pass/FoldDequantizePass.h"
+#include "luci/Pass/FoldSparseToDensePass.h"
+#include "luci/Pass/ForwardReshapeToUnaryOpPass.h"
#include "luci/Pass/FuseActivationFunctionPass.h"
#include "luci/Pass/FuseAddWithTConvPass.h"
-#include "luci/Pass/FuseBatchNormWithTConv.h"
+#include "luci/Pass/FuseBatchNormWithConvPass.h"
+#include "luci/Pass/FuseBatchNormWithDwConvPass.h"
+#include "luci/Pass/FuseBatchNormWithTConvPass.h"
#include "luci/Pass/FuseBCQPass.h"
#include "luci/Pass/FuseInstanceNormPass.h"
#include "luci/Pass/FusePreActivationBatchNormPass.h"
#include "luci/Pass/MakeBatchNormGammaPositivePass.h"
#include "luci/Pass/PropagateQuantParamPass.h"
+#include "luci/Pass/RemoveRedundantReshapePass.h"
#include "luci/Pass/RemoveRedundantTransposePass.h"
+#include "luci/Pass/RemoveUnnecessaryReshapePass.h"
+#include "luci/Pass/RemoveUnnecessarySlicePass.h"
+#include "luci/Pass/RemoveUnnecessaryStridedSlicePass.h"
+#include "luci/Pass/RemoveUnnecessarySplitPass.h"
#include "luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h"
#include "luci/Pass/ResolveCustomOpAddPass.h"
#include "luci/Pass/ResolveCustomOpBatchMatMulPass.h"
@@ -36,21 +48,22 @@
#include "luci/Pass/SparsifyTensorPass.h"
#include "luci/Pass/ShuffleWeightTo16x1Float32Pass.h"
#include "luci/Pass/SubstitutePackToReshapePass.h"
+#include "luci/Pass/SubstituteSqueezeToReshapePass.h"
+#include "luci/Pass/SubstituteTransposeToReshapePass.h"
+#include "luci/Pass/TransformMinMaxToRelu6Pass.h"
// TODO add more passes
-#include "luci/Pass/ShapeInferencePass.h"
-#include "luci/Pass/ShapeSignatureInferencePass.h"
-#include "luci/Pass/TypeInferencePass.h"
-
-// Following passes will be removed after refactoring is finished
-#include "luci/Pass/MigrateLegacyShapeDtypePass.h"
+#include "luci/Pass/CircleShapeInferencePass.h"
+#include "luci/Pass/CircleTypeInferencePass.h"
// logo passes
#include <logo/RemoveDeadNodeWithQueryPass.h>
#include "ModulePhase.h"
#include "ProgressReporter.h"
-#include "CircleOptimizerUtils.h"
+#include "helpers/Strings.h"
+
+#include "QuantizedModelVerifier.h"
#include <luci/IR/CircleNodes.h>
#include <logo/Phase.h>
@@ -61,20 +74,6 @@
namespace
{
-std::vector<int> parseIntFromCommadelimitedStr(std::string str)
-{
- std::vector<int> ret;
- std::istringstream is(str);
- for (uint32_t i; is >> i;)
- {
- assert(i != ',');
- ret.push_back(i);
- if (is.peek() == ',')
- is.ignore();
- }
- return ret;
-}
-
using namespace luci;
class OptimizeOptionsImpl final : public luci::CircleOptimizer::Options
@@ -138,13 +137,9 @@ void CircleOptimizer::optimize(luci::Module *m) const
{
luci::Phase phase;
- // Following passes will be deprecated after refactoring is finished.
- phase.emplace_back(std::make_unique<luci::MigrateLegacyShapeDtypePass>());
-
// Following passes are needed everytime when other passes create new node or modify some nodes.
- phase.emplace_back(std::make_unique<luci::ShapeInferencePass>());
- phase.emplace_back(std::make_unique<luci::ShapeSignatureInferencePass>());
- phase.emplace_back(std::make_unique<luci::TypeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
if (_options->query(Options::Algorithm::FuseBCQ))
{
@@ -164,13 +159,9 @@ void CircleOptimizer::optimize(loco::Graph *g) const
/* TRANSFORM DECLARATION BEGIN */
phase.emplace_back(std::make_unique<logo::RemoveDeadNodeWithQueryPass>());
- // Following passes will be deprecated after refactoring is finished.
- phase.emplace_back(std::make_unique<luci::MigrateLegacyShapeDtypePass>());
-
// Following passes are needed everytime when other passes create new node or modify some nodes.
- phase.emplace_back(std::make_unique<luci::TypeInferencePass>());
- phase.emplace_back(std::make_unique<luci::ShapeInferencePass>());
- phase.emplace_back(std::make_unique<luci::ShapeSignatureInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
if (_options->query(Options::Algorithm::ResolveCustomOpAdd))
{
@@ -188,6 +179,14 @@ void CircleOptimizer::optimize(loco::Graph *g) const
{
phase.emplace_back(std::make_unique<FuseInstanceNormPass>());
}
+ if (_options->query(Options::Algorithm::FuseBatchNormWithConv))
+ {
+ phase.emplace_back(std::make_unique<FuseBatchNormWithConvPass>());
+ }
+ if (_options->query(Options::Algorithm::FuseBatchNormWithDwConv))
+ {
+ phase.emplace_back(std::make_unique<FuseBatchNormWithDwConvPass>());
+ }
if (_options->query(Options::Algorithm::FuseBatchNormWithTConv))
{
phase.emplace_back(std::make_unique<FuseBatchNormWithTConvPass>());
@@ -200,10 +199,26 @@ void CircleOptimizer::optimize(loco::Graph *g) const
{
phase.emplace_back(std::make_unique<FuseActivationFunctionPass>());
}
+ if (_options->query(Options::Algorithm::FoldAddV2))
+ {
+ phase.emplace_back(std::make_unique<luci::FoldAddV2Pass>());
+ }
+ if (_options->query(Options::Algorithm::FoldCast))
+ {
+ phase.emplace_back(std::make_unique<luci::FoldCastPass>());
+ }
if (_options->query(Options::Algorithm::FoldDequantize))
{
phase.emplace_back(std::make_unique<luci::FoldDequantizePass>());
}
+ if (_options->query(Options::Algorithm::FoldSparseToDense))
+ {
+ phase.emplace_back(std::make_unique<luci::FoldSparseToDensePass>());
+ }
+ if (_options->query(Options::Algorithm::ForwardReshapeToUnaryOp))
+ {
+ phase.emplace_back(std::make_unique<luci::ForwardReshapeToUnaryOpPass>());
+ }
if (_options->query(Options::Algorithm::FusePreActivationBatchNorm))
{
phase.emplace_back(std::make_unique<luci::FusePreActivationBatchNormPass>());
@@ -216,6 +231,26 @@ void CircleOptimizer::optimize(loco::Graph *g) const
{
phase.emplace_back(std::make_unique<luci::ShuffleWeightTo16x1Float32Pass>());
}
+ if (_options->query(Options::Algorithm::RemoveUnnecessaryReshape))
+ {
+ phase.emplace_back(std::make_unique<luci::RemoveUnnecessaryReshapePass>());
+ }
+ if (_options->query(Options::Algorithm::RemoveUnnecessarySlice))
+ {
+ phase.emplace_back(std::make_unique<luci::RemoveUnnecessarySlicePass>());
+ }
+ if (_options->query(Options::Algorithm::RemoveUnnecessaryStridedSlice))
+ {
+ phase.emplace_back(std::make_unique<luci::RemoveUnnecessaryStridedSlicePass>());
+ }
+ if (_options->query(Options::Algorithm::RemoveUnnecessarySplit))
+ {
+ phase.emplace_back(std::make_unique<luci::RemoveUnnecessarySplitPass>());
+ }
+ if (_options->query(Options::Algorithm::RemoveRedundantReshape))
+ {
+ phase.emplace_back(std::make_unique<luci::RemoveRedundantReshapePass>());
+ }
if (_options->query(Options::Algorithm::RemoveRedundantTranspose))
{
phase.emplace_back(std::make_unique<luci::RemoveRedundantTransposePass>());
@@ -228,6 +263,28 @@ void CircleOptimizer::optimize(loco::Graph *g) const
{
phase.emplace_back(std::make_unique<luci::SubstitutePackToReshapePass>());
}
+ if (_options->query(Options::Algorithm::SubstituteSqueezeToReshape))
+ {
+ phase.emplace_back(std::make_unique<luci::SubstituteSqueezeToReshapePass>());
+ }
+ if (_options->query(Options::Algorithm::SubstituteTransposeToReshape))
+ {
+ phase.emplace_back(std::make_unique<luci::SubstituteTransposeToReshapePass>());
+ }
+ if (_options->query(Options::Algorithm::TransformMinMaxToRelu6Pass))
+ {
+ phase.emplace_back(std::make_unique<luci::TransformMinMaxToRelu6Pass>());
+ }
+ if (_options->query(Options::Algorithm::ConvertNCHWToNHWC))
+ {
+ bool preserve_input =
+ _options->param(Options::AlgorithmParameters::NCHW_to_NHWC_preserve_input_shape) == "true";
+ bool preserve_output =
+ _options->param(Options::AlgorithmParameters::NCHW_to_NHWC_preserve_output_shape) == "true";
+
+ phase.emplace_back(
+ std::make_unique<luci::ConvertNCHWToNHWCPass>(preserve_input, preserve_output));
+ }
/* TRANSFORM DECLARATION END */
@@ -275,7 +332,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
}
luci::QuantizeDequantizeWeightsPass fake_quantizer(
- str_to_dtype(input_dtype), str_to_dtype(output_dtype), str_to_granularity(granularity));
+ str_to_dtype(input_dtype), str_to_dtype(output_dtype), str_to_granularity(granularity));
fake_quantizer.run(g);
}
@@ -315,14 +372,19 @@ void CircleOptimizer::quantize(loco::Graph *g) const
phase.emplace_back(std::make_unique<luci::PropagateQuantParamPass>());
- phase.emplace_back(std::make_unique<luci::ShapeInferencePass>());
- phase.emplace_back(std::make_unique<luci::TypeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
phase.emplace_back(std::make_unique<logo::RemoveDeadNodeWithQueryPass>());
ProgressReporter prog(g, logo::PhaseStrategy::Saturate);
logo::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{g};
phase_runner.attach(&prog);
phase_runner.run(phase);
+
+ // Verify the type/granularity of the quantized model
+ luci::QuantizedModelVerifier verifier(str_to_dtype(output_dtype),
+ str_to_granularity(granularity));
+ verifier.verify(g);
}
// Requantize
@@ -349,8 +411,8 @@ void CircleOptimizer::quantize(loco::Graph *g) const
logo::Phase phase;
// Do Shape/Type inference
- phase.emplace_back(std::make_unique<luci::ShapeInferencePass>());
- phase.emplace_back(std::make_unique<luci::TypeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
ProgressReporter prog(g, logo::PhaseStrategy::Saturate);
logo::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{g};
@@ -364,13 +426,13 @@ void CircleOptimizer::sparsify(loco::Graph *g) const
{
std::string tensor_name = _options->param(Options::AlgorithmParameters::Sparsify_tensor_name);
  std::string str_traversal_order =
- _options->param(Options::AlgorithmParameters::Sparsify_traversal_order);
+ _options->param(Options::AlgorithmParameters::Sparsify_traversal_order);
std::string str_format = _options->param(Options::AlgorithmParameters::Sparsify_format);
std::string str_block_size = _options->param(Options::AlgorithmParameters::Sparsify_block_size);
std::string str_block_map = _options->param(Options::AlgorithmParameters::Sparsify_block_map);
// traversal order
-  std::vector<int32_t> traversal_order = parseIntFromCommadelimitedStr(str_traversal_order);
+  std::vector<int32_t> traversal_order = csv_to_vector<int32_t>(str_traversal_order);
// format
std::vector<DimensionType> format;
std::istringstream is(str_format);
@@ -385,9 +447,9 @@ void CircleOptimizer::sparsify(loco::Graph *g) const
is.ignore();
}
// block size
- std::vector<int32_t> block_size = parseIntFromCommadelimitedStr(str_block_size);
+ std::vector<int32_t> block_size = csv_to_vector<int32_t>(str_block_size);
// block map
- std::vector<int32_t> block_map = parseIntFromCommadelimitedStr(str_block_map);
+ std::vector<int32_t> block_map = csv_to_vector<int32_t>(str_block_map);
luci::SparsifyTensorPass sparsifier{tensor_name, traversal_order, format, block_size,
block_map};
diff --git a/compiler/luci/pass/src/CircleOptimizer.test.cpp b/compiler/luci/pass/src/CircleOptimizer.test.cpp
new file mode 100644
index 000000000..ca6dc77f3
--- /dev/null
+++ b/compiler/luci/pass/src/CircleOptimizer.test.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/CircleOptimizer.h"
+
+#include <gtest/gtest.h>
+
+using namespace luci;
+using Algorithms = luci::CircleOptimizer::Options::Algorithm;
+using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
+
+TEST(CircleOptimizerTest, optimize_algorithms)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+  // NOTE these are enabled to cover the corresponding code paths in this test
+ // TODO add more if needed
+ options->enable(Algorithms::FoldAddV2);
+ options->enable(Algorithms::FoldCast);
+ options->enable(Algorithms::FoldDequantize);
+ options->enable(Algorithms::FoldSparseToDense);
+ options->enable(Algorithms::FusePreActivationBatchNorm);
+ options->enable(Algorithms::MakeBatchNormGammaPositive);
+ options->enable(Algorithms::ShuffleWeightTo16x1Float32);
+ options->enable(Algorithms::RemoveUnnecessaryReshape);
+ options->enable(Algorithms::RemoveUnnecessarySlice);
+ options->enable(Algorithms::RemoveUnnecessarySplit);
+ options->enable(Algorithms::ReplaceMulAddWithDepthwiseConv);
+ options->enable(Algorithms::SubstituteTransposeToReshape);
+ options->enable(Algorithms::ConvertNCHWToNHWC);
+
+ o.optimize(&g);
+
+ SUCCEED();
+}
+
+TEST(CircleOptimizerTest, sparsify_simple)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::SparsifyTensorPass);
+ options->param(AlgorithmParameters::Sparsify_tensor_name, "dummy");
+ options->param(AlgorithmParameters::Sparsify_traversal_order, "dummy");
+ options->param(AlgorithmParameters::Sparsify_format, "ds");
+ options->param(AlgorithmParameters::Sparsify_block_size, "1,1");
+ options->param(AlgorithmParameters::Sparsify_block_map, "1,1");
+
+ o.sparsify(&g);
+
+ SUCCEED();
+}
+
+TEST(CircleOptimizerTest, quantize_quantdequant_simple)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::QuantizeDequantizeWeights);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+ options->param(AlgorithmParameters::Quantize_granularity, "layer");
+
+ o.quantize(&g);
+
+ SUCCEED();
+}
+
+TEST(CircleOptimizerTest, quantize_quantdequant_input_NEG)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::QuantizeDequantizeWeights);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "invalid");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+ options->param(AlgorithmParameters::Quantize_granularity, "layer");
+
+ EXPECT_THROW(o.quantize(&g), std::runtime_error);
+}
+
+TEST(CircleOptimizerTest, quantize_quantdequant_output_NEG)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::QuantizeDequantizeWeights);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "invalid");
+ options->param(AlgorithmParameters::Quantize_granularity, "layer");
+
+ EXPECT_THROW(o.quantize(&g), std::runtime_error);
+}
+
+TEST(CircleOptimizerTest, quantize_quantdequant_gran_NEG)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::QuantizeDequantizeWeights);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+ options->param(AlgorithmParameters::Quantize_granularity, "invalid");
+
+ EXPECT_THROW(o.quantize(&g), std::runtime_error);
+}
+
+TEST(CircleOptimizerTest, quantize_minmax_simple)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::QuantizeWithMinMax);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+ options->param(AlgorithmParameters::Quantize_granularity, "layer");
+
+ o.quantize(&g);
+
+ SUCCEED();
+}
+
+TEST(CircleOptimizerTest, quantize_minmax_input_NEG)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::QuantizeWithMinMax);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "invalid");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+ options->param(AlgorithmParameters::Quantize_granularity, "layer");
+
+ EXPECT_THROW(o.quantize(&g), std::runtime_error);
+}
+
+TEST(CircleOptimizerTest, quantize_minmax_output_NEG)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::QuantizeWithMinMax);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "invalid");
+ options->param(AlgorithmParameters::Quantize_granularity, "layer");
+
+ EXPECT_THROW(o.quantize(&g), std::runtime_error);
+}
+
+TEST(CircleOptimizerTest, quantize_minmax_gran_NEG)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::QuantizeWithMinMax);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "float32");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+ options->param(AlgorithmParameters::Quantize_granularity, "invalid");
+
+ EXPECT_THROW(o.quantize(&g), std::runtime_error);
+}
+
+TEST(CircleOptimizerTest, quantize_requant_simple)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::Requantize);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "int8");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+
+ o.quantize(&g);
+
+ SUCCEED();
+}
+
+TEST(CircleOptimizerTest, quantize_requant_input_NEG)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::Requantize);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "invalid");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "uint8");
+
+ EXPECT_THROW(o.quantize(&g), std::runtime_error);
+}
+
+TEST(CircleOptimizerTest, quantize_requant_output_NEG)
+{
+ loco::Graph g;
+ luci::CircleOptimizer o;
+
+ auto options = o.options();
+
+ options->enable(Algorithms::Requantize);
+ options->param(AlgorithmParameters::Quantize_input_dtype, "int8");
+ options->param(AlgorithmParameters::Quantize_output_dtype, "invalid");
+
+ EXPECT_THROW(o.quantize(&g), std::runtime_error);
+}
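
The tests above drive the optimizer solely through its public options API. As a minimal driver sketch (assuming nothing beyond the enable/param/optimize calls the tests exercise), a client would do:

    #include "luci/CircleOptimizer.h"

    #include <loco.h>

    // Sketch only: enable a single algorithm and run the optimizer.
    void fold_casts(loco::Graph *g)
    {
      luci::CircleOptimizer optimizer;
      auto options = optimizer.options();
      options->enable(luci::CircleOptimizer::Options::Algorithm::FoldCast);
      optimizer.optimize(g); // applies the enabled passes until saturation
    }
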
diff --git a/compiler/luci/pass/src/CircleOptimizerUtils.cpp b/compiler/luci/pass/src/CircleOptimizerUtils.cpp
index ffc372392..127573db4 100644
--- a/compiler/luci/pass/src/CircleOptimizerUtils.cpp
+++ b/compiler/luci/pass/src/CircleOptimizerUtils.cpp
@@ -16,74 +16,18 @@
#include "CircleOptimizerUtils.h"
-namespace luci
-{
-
-bool in_array(const std::string &str, const std::vector<std::string> &array)
-{
- return std::find(array.begin(), array.end(), str) != array.end();
-}
+#include <luci/IR/CircleNode.h>
-std::string to_string(const std::vector<std::string> &strings)
-{
- assert(!strings.empty());
-
- std::string res;
- for (unsigned int i = 0; i < strings.size() - 1; i++)
- res += strings[i] + ", ";
-
- res += strings[strings.size() - 1];
- return res;
-}
-
-std::string to_lower_case(std::string s)
-{
- std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); });
- return s;
-}
-
-loco::DataType str_to_dtype(const std::string &str)
+namespace luci
{
- if (to_lower_case(str).compare("uint8") == 0)
- return loco::DataType::U8;
- if (to_lower_case(str).compare("uint16") == 0)
- return loco::DataType::U16;
- if (to_lower_case(str).compare("uint32") == 0)
- return loco::DataType::U32;
- if (to_lower_case(str).compare("uint64") == 0)
- return loco::DataType::U64;
-
- if (to_lower_case(str).compare("int8") == 0)
- return loco::DataType::S8;
- if (to_lower_case(str).compare("int16") == 0)
- return loco::DataType::S16;
- if (to_lower_case(str).compare("int32") == 0)
- return loco::DataType::S32;
- if (to_lower_case(str).compare("int64") == 0)
- return loco::DataType::S64;
-
- if (to_lower_case(str).compare("float16") == 0)
- return loco::DataType::FLOAT16;
- if (to_lower_case(str).compare("float32") == 0)
- return loco::DataType::FLOAT32;
- if (to_lower_case(str).compare("float64") == 0)
- return loco::DataType::FLOAT64;
- if (to_lower_case(str).compare("bool") == 0)
- return loco::DataType::BOOL;
-
- return loco::DataType::Unknown;
-}
-
-QuantizationGranularity str_to_granularity(const std::string &str)
+bool has_dynamic_shape(const loco::Node *node)
{
- if (to_lower_case(str).compare("layer") == 0)
- return QuantizationGranularity::LayerWise;
-
- if (to_lower_case(str).compare("channel") == 0)
- return QuantizationGranularity::ChannelWise;
-
- throw std::runtime_error("Quantization granularity must be either 'layer' or 'channel'");
+ const auto circle_node = loco::must_cast<const luci::CircleNode *>(node);
+ for (uint32_t i = 0; i < circle_node->rank(); ++i)
+ if (!circle_node->dim(i).known())
+ return true;
+ return false;
}
} // namespace luci
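
With the string utilities moved out, has_dynamic_shape is the only helper left in this file. A hedged usage sketch follows (the guard mirrors how ConvertNCHWToNHWCPass uses it later in this commit; the function name is illustrative):

    #include <luci/IR/CircleNode.h>

    // Sketch only: a rewrite that bails out on dynamic-shaped nodes.
    bool try_static_rewrite(luci::CircleNode *node)
    {
      if (luci::has_dynamic_shape(node))
        return false; // at least one dimension is unknown
      // ... logic that relies on fully known dimensions ...
      return true;
    }
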
diff --git a/compiler/luci/pass/src/CircleOptimizerUtils.h b/compiler/luci/pass/src/CircleOptimizerUtils.h
index 7e577a05f..e04942bfa 100644
--- a/compiler/luci/pass/src/CircleOptimizerUtils.h
+++ b/compiler/luci/pass/src/CircleOptimizerUtils.h
@@ -17,25 +17,12 @@
#ifndef __LUCI_CIRCLE_OPTIMIZER_UTILS_H__
#define __LUCI_CIRCLE_OPTIMIZER_UTILS_H__
-#include "luci/Pass/QuantizeDequantizeWeightsPass.h"
-#include "luci/Pass/QuantizeWithMinMaxPass.h"
-
#include <loco.h>
-#include <algorithm>
-
namespace luci
{
-bool in_array(const std::string &, const std::vector<std::string> &);
-
-std::string to_string(const std::vector<std::string> &);
-
-std::string to_lower_case(std::string);
-
-loco::DataType str_to_dtype(const std::string &);
-
-QuantizationGranularity str_to_granularity(const std::string &);
+bool has_dynamic_shape(const loco::Node *node);
} // namespace luci
diff --git a/compiler/luci/pass/src/CircleShapeInferencePass.cpp b/compiler/luci/pass/src/CircleShapeInferencePass.cpp
new file mode 100644
index 000000000..ddab22421
--- /dev/null
+++ b/compiler/luci/pass/src/CircleShapeInferencePass.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "helpers/InferenceCandidates.h"
+
+#include "luci/Pass/CircleShapeInferencePass.h"
+
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco.h>
+
+namespace
+{
+
+bool is_same_shape(luci::CircleNode *node, loco::TensorShape shape)
+{
+ if (node->shape_status() != luci::ShapeStatus::VALID)
+ return false;
+
+ if (node->rank() != shape.rank())
+ return false;
+
+ for (uint32_t i = 0; i < node->rank(); ++i)
+ {
+ if (node->dim(i).known() != shape.dim(i).known())
+ return false;
+
+ if (node->dim(i).value() != shape.dim(i).value())
+ return false;
+ }
+
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool CircleShapeInferencePass::run(luci::Module *m)
+{
+ bool changed = false;
+
+ for (size_t g = 0; g < m->size(); ++g)
+ {
+ if (run(m->graph(g)))
+ changed = true;
+ }
+
+ return changed;
+}
+
+bool CircleShapeInferencePass::run(loco::Graph *g)
+{
+ luci::sinf::Rule shape_infer_rule;
+ bool changed = false;
+
+ for (auto node : inference_candidates(g))
+ {
+ loco::TensorShape shape;
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+
+ if (shape_infer_rule.infer(circle_node, shape) && !is_same_shape(circle_node, shape))
+ {
+ circle_node->rank(shape.rank());
+ for (uint32_t i = 0; i < shape.rank(); ++i)
+ circle_node->dim(i) = shape.dim(i);
+
+ circle_node->shape_status(luci::ShapeStatus::VALID);
+
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+} // namespace luci
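
Because run returns whether any shape changed, callers can iterate the pass to a fixed point; the tests below wrap exactly this loop in a lambda:

    // Sketch only: run shape inference to a fixed point.
    void infer_shapes(loco::Graph *g)
    {
      luci::CircleShapeInferencePass pass;
      while (pass.run(g)) // repeat until no node's shape changes
        ;
    }
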
diff --git a/compiler/luci/pass/src/CircleShapeInferencePass.test.cpp b/compiler/luci/pass/src/CircleShapeInferencePass.test.cpp
new file mode 100644
index 000000000..cb3f1fe5f
--- /dev/null
+++ b/compiler/luci/pass/src/CircleShapeInferencePass.test.cpp
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/CircleShapeInferencePass.h"
+
+#include <loco.h>
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+TEST(CircleShapeInferencePassTest, name)
+{
+ luci::CircleShapeInferencePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+/**
+ * This test checks that shape inference is performed in topological order.
+ *
+ * When perm() of "transpose1" is changed from "old_perm" to "new_perm"
+ * by some luci/Pass as in the diagram below, the shape_status of "transpose1"
+ * stays VALID even though its shape should change.
+ * If "transpose2" is visited before the shape of "transpose1" is updated,
+ * "transpose2" may read the stale shape of "relu", which has not been updated yet.
+ * The shape of "transpose2" would then become 3x5x5x1, causing an error at "conv2d".
+ *
+ * <Initial graph>
+ * 4x1x1x3
+ * [old_perm] ----------+ [filter] ----------+
+ * (0,2,1,3) | |
+ * | [bias] ----------+
+ * | |
+ * input ------> [transpose1] ------> [relu] ------> [conv2d] ------> output
+ * 1x5x5x3 1x5x5x3 1x5x5x3 1x5x5x4
+ *
+ *
+ * <Right after transformation>
+ * 4x1x1x3
+ * [new_perm] ----------+-----------------------------------+ [filter] ------+
+ * (3,2,1,0) | | |
+ * | | [bias] ------+
+ * | | |
+ * input ------> [transpose1] ------> [relu] ------> [transpose2] ------> [conv2d] ------> output
+ * 1x5x5x3 1x5x5x3 1x5x5x3 ? 1x5x5x4
+ *
+ *
+ * <Expected result>
+ * 4x1x1x3
+ * [new_perm] ----------+-----------------------------------+ [filter] ------+
+ * (3,2,1,0) | | |
+ * | | [bias] ------+
+ * | | |
+ * input ------> [transpose1] ------> [relu] ------> [transpose2] ------> [conv2d] ------> output
+ * 1x5x5x3 3x5x5x1 3x5x5x1 1x5x5x3 1x5x5x4
+ *
+ */
+TEST(CircleShapeInferencePassTest, original_node_change)
+{
+ luci::CircleShapeInferencePass pass;
+ auto g = loco::make_graph();
+
+  // Packed into a lambda so ASSERT_NO_THROW can check for exceptions
+ auto shape_inference_run = [&]() {
+ while (pass.run(g.get()) == true)
+ ;
+ };
+
+ // Create nodes to make relu traversed first
+ auto input = g->nodes()->create<luci::CircleInput>();
+ auto relu = g->nodes()->create<luci::CircleRelu>();
+ auto old_perm = g->nodes()->create<luci::CircleConst>();
+ auto transpose1 = g->nodes()->create<luci::CircleTranspose>();
+ auto filter = g->nodes()->create<luci::CircleConst>();
+ auto bias = g->nodes()->create<luci::CircleConst>();
+ auto conv2d = g->nodes()->create<luci::CircleConv2D>();
+ auto output = g->nodes()->create<luci::CircleOutput>();
+ auto new_perm = g->nodes()->create<luci::CircleConst>();
+ auto transpose2 = g->nodes()->create<luci::CircleTranspose>();
+
+ // Build up initial graph
+ auto graph_input = g->inputs()->create();
+ graph_input->shape({1, 5, 5, 3});
+
+ input->index(graph_input->index());
+ input->shape({1, 5, 5, 3});
+ input->shape_status(luci::ShapeStatus::VALID);
+
+ old_perm->dtype(loco::DataType::S32);
+ old_perm->size<loco::DataType::S32>(4);
+ old_perm->shape({4});
+ old_perm->at<loco::DataType::S32>(0) = 0;
+ old_perm->at<loco::DataType::S32>(1) = 2;
+ old_perm->at<loco::DataType::S32>(2) = 1;
+ old_perm->at<loco::DataType::S32>(3) = 3;
+ old_perm->shape_status(luci::ShapeStatus::VALID);
+
+ transpose1->a(input);
+ transpose1->perm(old_perm);
+
+ relu->features(transpose1);
+
+ filter->dtype(loco::DataType::FLOAT32);
+ filter->size<loco::DataType::FLOAT32>(4 * 1 * 1 * 3);
+ filter->shape({4, 1, 1, 3});
+ filter->shape_status(luci::ShapeStatus::VALID);
+
+ bias->dtype(loco::DataType::FLOAT32);
+ bias->size<loco::DataType::FLOAT32>(4);
+ bias->shape({4});
+ bias->shape_status(luci::ShapeStatus::VALID);
+
+ conv2d->input(relu);
+ conv2d->filter(filter);
+ conv2d->bias(bias);
+ conv2d->padding(luci::Padding::VALID);
+ conv2d->stride()->h(1);
+ conv2d->stride()->w(1);
+ conv2d->dilation()->h(1);
+ conv2d->dilation()->w(1);
+
+ output->from(conv2d);
+ auto graph_output = g->outputs()->create();
+ output->index(graph_output->index());
+ graph_output->shape({1, 5, 5, 4});
+
+ ASSERT_NO_THROW(shape_inference_run());
+
+ // Transform graph
+ new_perm->dtype(loco::DataType::S32);
+ new_perm->size<loco::DataType::S32>(4);
+ new_perm->shape({4});
+ new_perm->at<loco::DataType::S32>(0) = 3;
+ new_perm->at<loco::DataType::S32>(1) = 2;
+ new_perm->at<loco::DataType::S32>(2) = 1;
+ new_perm->at<loco::DataType::S32>(3) = 0;
+ new_perm->shape_status(luci::ShapeStatus::VALID);
+
+ transpose1->perm(new_perm);
+
+ transpose2->a(relu);
+ transpose2->perm(new_perm);
+
+ conv2d->input(transpose2);
+
+ ASSERT_NO_THROW(shape_inference_run());
+
+ // Check result of shape inference is correct
+ ASSERT_EQ(3, transpose1->dim(0).value());
+ ASSERT_EQ(5, transpose1->dim(1).value());
+ ASSERT_EQ(5, transpose1->dim(2).value());
+ ASSERT_EQ(1, transpose1->dim(3).value());
+
+ ASSERT_EQ(3, relu->dim(0).value());
+ ASSERT_EQ(5, relu->dim(1).value());
+ ASSERT_EQ(5, relu->dim(2).value());
+ ASSERT_EQ(1, relu->dim(3).value());
+
+ ASSERT_EQ(1, transpose2->dim(0).value());
+ ASSERT_EQ(5, transpose2->dim(1).value());
+ ASSERT_EQ(5, transpose2->dim(2).value());
+ ASSERT_EQ(3, transpose2->dim(3).value());
+
+ ASSERT_EQ(1, conv2d->dim(0).value());
+ ASSERT_EQ(5, conv2d->dim(1).value());
+ ASSERT_EQ(5, conv2d->dim(2).value());
+ ASSERT_EQ(4, conv2d->dim(3).value());
+
+ SUCCEED();
+}
+
+/**
+ * This test checks the case where an imported shape is wrong.
+ *
+ * Even though "concat1" has a wrong shape at first, the correct shape should be inferred.
+ *
+ * <Initial graph>
+ *
+ * 1x1x1x1
+ * input1 ------+ 8x7x6x5
+ * +-----> [concat1] ------+
+ * input2 ------+ (axis=3) | 1x1x2x3
+ * 1x1x1x2 +------> [concat2] ------> output
+ * | (axis=2)
+ * 1x1x1x3 |
+ * input3 ------------------------------+
+ *
+ *
+ * <Expected result>
+ *
+ * 1x1x1x1
+ * input1 ------+ 1x1x1x3
+ * +-----> [concat1] ------+
+ * input2 ------+ (axis=3) | 1x1x2x3
+ * 1x1x1x2 +------> [concat2] ------> output
+ * | (axis=2)
+ * 1x1x1x3 |
+ * input3 ------------------------------+
+ */
+TEST(CircleShapeInferencePassTest, wrong_imported_shape)
+{
+ luci::CircleShapeInferencePass pass;
+ auto g = loco::make_graph();
+
+  // Packed into a lambda so ASSERT_NO_THROW can check for exceptions
+ auto shape_inference_run = [&]() {
+ while (pass.run(g.get()) == true)
+ ;
+ };
+
+ // Create nodes to make concat2 traversed first
+ auto concat2 = g->nodes()->create<luci::CircleConcatenation>(2);
+ auto concat1 = g->nodes()->create<luci::CircleConcatenation>(2);
+ auto input1 = g->nodes()->create<luci::CircleInput>();
+ auto input2 = g->nodes()->create<luci::CircleInput>();
+ auto input3 = g->nodes()->create<luci::CircleInput>();
+
+ // Build up initial graph
+ auto graph_input1 = g->inputs()->create();
+ auto graph_input2 = g->inputs()->create();
+ auto graph_input3 = g->inputs()->create();
+ graph_input1->shape({1, 1, 1, 1});
+ graph_input2->shape({1, 1, 1, 2});
+  graph_input3->shape({1, 1, 1, 3});
+
+ input1->index(graph_input1->index());
+ input1->shape({1, 1, 1, 1});
+ input1->shape_status(luci::ShapeStatus::VALID);
+
+ input2->index(graph_input2->index());
+ input2->shape({1, 1, 1, 2});
+ input2->shape_status(luci::ShapeStatus::VALID);
+
+ input3->index(graph_input3->index());
+ input3->shape({1, 1, 1, 3});
+ input3->shape_status(luci::ShapeStatus::VALID);
+
+ concat1->values(0, input1);
+ concat1->values(1, input2);
+ concat1->axis(3);
+ concat1->shape({8, 7, 6, 5}); // Intentionally set wrong shape
+ concat1->shape_status(luci::ShapeStatus::VALID);
+
+ concat2->values(0, concat1);
+ concat2->values(1, input3);
+ concat2->axis(2);
+
+ auto output = g->nodes()->create<luci::CircleOutput>();
+ output->from(concat2);
+ auto graph_output = g->outputs()->create();
+ output->index(graph_output->index());
+ graph_output->shape({1, 1, 2, 3});
+
+ ASSERT_NO_THROW(shape_inference_run());
+
+ // Check result of shape inference is correct
+ ASSERT_EQ(1, concat1->dim(0).value());
+ ASSERT_EQ(1, concat1->dim(1).value());
+ ASSERT_EQ(1, concat1->dim(2).value());
+ ASSERT_EQ(3, concat1->dim(3).value());
+
+ ASSERT_EQ(1, concat2->dim(0).value());
+ ASSERT_EQ(1, concat2->dim(1).value());
+ ASSERT_EQ(2, concat2->dim(2).value());
+ ASSERT_EQ(3, concat2->dim(3).value());
+
+ SUCCEED();
+}
+
+/**
+ * This test checks that shapes are inferred even for virtual operations
+ * that are not used for the graph output.
+ *
+ * Although "split_out2" is not used for the graph output, its shape should be inferred.
+ *
+ * <Initial graph>
+ *
+ *
+ * 1x6 +----> [split_out1] ----> output
+ * input ------> [split] -----+
+ * (split_dim=1) +----> [split_out2]
+ * (num_split=2)
+ *
+ *
+ * <Expected result>
+ * 1x3 1x3
+ * 1x6 +----> [split_out1] ----> output
+ * input ------> [split] -----+
+ * (split_dim=1) +----> [split_out2]
+ * (num_split=2) 1x3
+ */
+TEST(CircleShapeInferencePassTest, not_used_virtual_op)
+{
+ luci::CircleShapeInferencePass pass;
+ auto g = loco::make_graph();
+
+  // Packed into a lambda so ASSERT_NO_THROW can check for exceptions
+ auto shape_inference_run = [&]() {
+ while (pass.run(g.get()) == true)
+ ;
+ };
+
+ // Create nodes
+ auto input = g->nodes()->create<luci::CircleInput>();
+ auto split = g->nodes()->create<luci::CircleSplit>();
+ auto split_out1 = g->nodes()->create<luci::CircleSplitOut>();
+ auto split_out2 = g->nodes()->create<luci::CircleSplitOut>();
+ auto split_dim = g->nodes()->create<luci::CircleConst>();
+
+ // Build up initial graph
+ auto graph_input1 = g->inputs()->create();
+ graph_input1->shape({1, 6});
+
+ input->index(graph_input1->index());
+ input->shape({1, 6});
+ input->shape_status(luci::ShapeStatus::VALID);
+
+ split_dim->dtype(loco::DataType::S32);
+ split_dim->size<loco::DataType::S32>(1);
+ split_dim->shape({1});
+ split_dim->at<loco::DataType::S32>(0) = 1;
+ split_dim->shape_status(luci::ShapeStatus::VALID);
+
+ split->split_dim(split_dim);
+ split->input(input);
+ split->num_split(2);
+
+ split_out1->input(split);
+ split_out1->index(0);
+
+ split_out2->input(split);
+ split_out2->index(1);
+
+ auto output = g->nodes()->create<luci::CircleOutput>();
+ output->from(split_out1);
+ auto graph_output = g->outputs()->create();
+ output->index(graph_output->index());
+ graph_output->shape({1, 3});
+
+ ASSERT_NO_THROW(shape_inference_run());
+
+ // Check result of shape inference is correct
+ ASSERT_EQ(1, split_out1->dim(0).value());
+ ASSERT_EQ(3, split_out1->dim(1).value());
+
+ ASSERT_EQ(1, split_out2->dim(0).value());
+ ASSERT_EQ(3, split_out2->dim(1).value());
+
+ SUCCEED();
+}
diff --git a/compiler/luci/pass/src/CircleTypeInferencePass.cpp b/compiler/luci/pass/src/CircleTypeInferencePass.cpp
index 67bd253e0..fb3755ffa 100644
--- a/compiler/luci/pass/src/CircleTypeInferencePass.cpp
+++ b/compiler/luci/pass/src/CircleTypeInferencePass.cpp
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#include "helpers/InferenceCandidates.h"
+
#include "luci/Pass/CircleTypeInferencePass.h"
#include <luci/Service/CircleTypeInference.h>
@@ -41,7 +43,7 @@ bool CircleTypeInferencePass::run(loco::Graph *g)
luci::tinf::Rule type_infer_rule;
bool changed = false;
- for (auto node : loco::postorder_traversal(loco::output_nodes(g)))
+ for (auto node : inference_candidates(g))
{
loco::DataType dtype;
auto circle_node = loco::must_cast<luci::CircleNode *>(node);
diff --git a/compiler/stdex/src/Queue.test.cpp b/compiler/luci/pass/src/CircleTypeInferencePass.test.cpp
index d76cd3ee6..415424a6f 100644
--- a/compiler/stdex/src/Queue.test.cpp
+++ b/compiler/luci/pass/src/CircleTypeInferencePass.test.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,19 +14,13 @@
* limitations under the License.
*/
-#include "stdex/Queue.h"
+#include "luci/Pass/CircleTypeInferencePass.h"
#include <gtest/gtest.h>
-TEST(QueueTest, take)
+TEST(CircleTypeInferencePassTest, name)
{
- std::queue<int> q;
-
- q.emplace(3);
- q.emplace(4);
- q.emplace(5);
-
- ASSERT_EQ(stdex::take(q), 3);
- ASSERT_EQ(stdex::take(q), 4);
- ASSERT_EQ(stdex::take(q), 5);
+ luci::CircleTypeInferencePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
}
diff --git a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
new file mode 100644
index 000000000..c9022f122
--- /dev/null
+++ b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.cpp
@@ -0,0 +1,698 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ConvertNCHWToNHWCPass.h"
+#include "CircleOptimizerUtils.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+#include <luci/Log.h>
+
+namespace
+{
+
+enum class DataFormat
+{
+ NCHW,
+ NHWC
+};
+
+/**
+ * @brief Set annotation for DataFormat (NCHW, NHWC)
+ *
+ * @note DataFormatAnnotation will live longer than this Pass (until the
+ * annotated loco::Node is erased). So, do not use large data in the
+ * annotation to avoid excessive memory usage.
+ */
+class DataFormatAnnotation final : public loco::NodeAnnotation
+{
+public:
+ DataFormatAnnotation(const DataFormat &format) : _format{format}
+ {
+ // DO NOTHING
+ }
+
+public:
+ const DataFormat &format(void) const { return _format; }
+
+private:
+ DataFormat _format;
+};
+
+void set_data_format(loco::Node *node, const DataFormat &format)
+{
+ node->annot(std::make_unique<DataFormatAnnotation>(format));
+}
+
+DataFormat get_data_format(loco::Node *node)
+{
+ assert(node->annot<DataFormatAnnotation>() != nullptr);
+ return node->annot<DataFormatAnnotation>()->format();
+}
+
+bool has_data_format(loco::Node *node) { return node->annot<DataFormatAnnotation>() != nullptr; }
+
+luci::CircleTranspose *create_4d_transpose(luci::CircleNode *node,
+ const std::vector<int32_t> indices)
+{
+ assert(indices.size() == 4);
+
+ auto name = node->name();
+ assert(name.length() > 0);
+
+ auto perm = node->graph()->nodes()->create<luci::CircleConst>();
+ perm->dtype(loco::DataType::S32);
+ perm->size<loco::DataType::S32>(4);
+ perm->rank(1);
+ perm->dim(0) = 4;
+ for (uint32_t i = 0; i < 4; i++)
+ perm->at<loco::DataType::S32>(i) = indices[i];
+ perm->shape_status(luci::ShapeStatus::VALID);
+
+ auto make_string = [](const std::vector<int32_t> &nums) {
+ std::string str;
+ for (auto num : nums)
+ {
+ if (str.length() > 0)
+ str += ".";
+ str += std::to_string(num);
+ }
+ return str;
+ };
+
+ auto str_indices = make_string(indices);
+
+ perm->name(name + "/Transpose_" + str_indices + "/perm");
+
+ auto trans = node->graph()->nodes()->create<luci::CircleTranspose>();
+ trans->perm(perm);
+ trans->name(name + "/Transpose_" + str_indices);
+ luci::add_origin(trans, luci::get_origin(node));
+
+ return trans;
+}
+
+int32_t nchw_axis_to_nhwc(int32_t axis)
+{
+ uint32_t pos_axis = axis >= 0 ? static_cast<uint32_t>(axis) : static_cast<uint32_t>(axis + 4);
+ static const uint32_t to_nhwc[4] = {0, 3, 1, 2};
+ if (pos_axis > 3)
+ throw std::runtime_error("Concat axis must be in range [-4, 4)");
+ return to_nhwc[pos_axis];
+}
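+
+// Worked example (illustrative): with to_nhwc = {0, 3, 1, 2}, NCHW axis 1
+// (channels) maps to NHWC axis 3; axis -3 normalizes to 1 and also maps to 3.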
+
+luci::CircleTranspose *create_post_transpose(luci::CircleNode *node)
+{
+ return create_4d_transpose(node, {0, 3, 1, 2});
+}
+
+luci::CircleTranspose *create_pre_transpose(luci::CircleNode *node)
+{
+ return create_4d_transpose(node, {0, 2, 3, 1});
+}
+
+uint32_t cal_offset(const loco::TensorShape &dimension, const uint32_t *indices)
+{
+ return indices[0] * dimension.dim(1).value() * dimension.dim(2).value() *
+ dimension.dim(3).value() +
+ indices[1] * dimension.dim(2).value() * dimension.dim(3).value() +
+ indices[2] * dimension.dim(3).value() + indices[3];
+}
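+
+// Worked example (illustrative): for a 1x16x4x4 tensor, indices {0, 2, 1, 3}
+// map to flat offset 0*16*4*4 + 2*4*4 + 1*4 + 3 = 39 in row-major order.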
+
+luci::CircleConst *create_NHWC_paddings(luci::CircleConst *paddings)
+{
+ // paddings shape is (4,2) (it was checked by is_NCHW)
+ assert(paddings != nullptr);
+ assert(paddings->rank() == 2);
+ assert(paddings->dim(0).value() == 4);
+ assert(paddings->dim(1).value() == 2);
+
+ // paddings for idx 0~3 are 0 (checked by is_NCHW)
+ assert(paddings->at<loco::DataType::S32>(0) == 0);
+ assert(paddings->at<loco::DataType::S32>(1) == 0);
+ assert(paddings->at<loco::DataType::S32>(2) == 0);
+ assert(paddings->at<loco::DataType::S32>(3) == 0);
+
+ auto name = paddings->name();
+ assert(name.length() > 0);
+
+ auto nhwc_paddings = paddings->graph()->nodes()->create<luci::CircleConst>();
+ nhwc_paddings->dtype(loco::DataType::S32);
+ nhwc_paddings->shape({4, 2});
+ nhwc_paddings->shape_status(luci::ShapeStatus::VALID);
+ nhwc_paddings->size<loco::DataType::S32>(4 * 2);
+ nhwc_paddings->name(name + "_NHWC");
+
+ for (uint32_t dim = 0; dim < 4; dim++)
+ {
+ for (uint32_t i = 0; i < 2; i++)
+ {
+ int32_t data = 0;
+
+ if (dim == 1)
+ {
+ // get third dimension (H in NCHW)
+ data = paddings->at<loco::DataType::S32>(2 * 2 + i);
+ }
+ else if (dim == 2)
+ {
+ // get fourth dimension (W in NCHW)
+ data = paddings->at<loco::DataType::S32>(3 * 2 + i);
+ }
+
+ nhwc_paddings->at<loco::DataType::S32>(dim * 2 + i) = data;
+ }
+ }
+ return nhwc_paddings;
+}
+
+luci::CircleConst *create_NHWC_from_NCHW(luci::CircleConst *constant)
+{
+ LOGGER(l);
+ assert(constant->rank() == 4);
+
+ // TODO: Support non-float types
+ if (constant->dtype() != loco::DataType::FLOAT32)
+ {
+ INFO(l) << "Non-float type constant: " << constant->name() << std::endl;
+ return nullptr;
+ }
+
+ loco::TensorShape nchw_dimension{constant->dim(0), constant->dim(1), constant->dim(2),
+ constant->dim(3)};
+ loco::TensorShape nhwc_dimension{constant->dim(0), constant->dim(2), constant->dim(3),
+ constant->dim(1)};
+
+ auto name = constant->name();
+ assert(name.length() > 0);
+
+ auto nhwc_const = constant->graph()->nodes()->create<luci::CircleConst>();
+ nhwc_const->dtype(constant->dtype());
+ nhwc_const->rank(4);
+ nhwc_const->dim(0).set(constant->dim(0).value());
+ nhwc_const->dim(1).set(constant->dim(2).value());
+ nhwc_const->dim(2).set(constant->dim(3).value());
+ nhwc_const->dim(3).set(constant->dim(1).value());
+ nhwc_const->shape_status(luci::ShapeStatus::VALID);
+ nhwc_const->size<loco::DataType::FLOAT32>(constant->size<loco::DataType::FLOAT32>());
+ nhwc_const->name(name + "_NHWC");
+
+ for (uint32_t n = 0; n < nchw_dimension.dim(0).value(); n++)
+ {
+ for (uint32_t c = 0; c < nchw_dimension.dim(1).value(); c++)
+ {
+ for (uint32_t h = 0; h < nchw_dimension.dim(2).value(); h++)
+ {
+ for (uint32_t w = 0; w < nchw_dimension.dim(3).value(); w++)
+ {
+ uint32_t nchw_indices[4] = {n, c, h, w};
+ uint32_t nhwc_indices[4] = {n, h, w, c};
+ auto data =
+ constant->at<loco::DataType::FLOAT32>(cal_offset(nchw_dimension, nchw_indices));
+ nhwc_const->at<loco::DataType::FLOAT32>(cal_offset(nhwc_dimension, nhwc_indices)) = data;
+ }
+ }
+ }
+ }
+ return nhwc_const;
+}
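+
+// Illustrative example: a (1, 16, 1, 1) NCHW constant (say, a per-channel
+// beta) becomes a (1, 1, 1, 16) NHWC constant; element (0, c, 0, 0) moves
+// to (0, 0, 0, c).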
+
+// NOTE Following conditions can be extended later
+//
+// Find PAD with an NCHW pattern described below
+// - Paddings shape : [4, 2]
+// - Paddings value : [[0, 0], [0, 0], [h_t, h_b], [w_t, w_b]]
+bool is_NCHW(const luci::CirclePad *node)
+{
+ const auto paddings = dynamic_cast<luci::CircleConst *>(node->paddings());
+ // Non-const paddings is not supported
+ if (paddings == nullptr)
+ return false;
+
+ if (paddings->rank() != 2)
+ return false;
+
+ if (paddings->dim(0).value() != 4 || paddings->dim(1).value() != 2)
+ return false;
+
+ // Only check the first two dimensions
+ for (uint32_t dim = 0; dim < 2; dim++)
+ {
+ for (uint32_t i = 0; i < 2; i++)
+ {
+ auto data = paddings->at<loco::DataType::S32>(dim * 2 + i);
+ if (data != 0)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+// NOTE Following conditions can be extended later
+//
+// Find MUL with an NCHW pattern described below
+// - Input (non-constant) shape : [N, C, H, W]
+// - Input (constant) shape : [1, C, 1, 1]
+// - Output shape : [N, C, H, W]
+bool is_NCHW_with_const(const luci::CircleMul *node, luci::CircleNode *&pred_node,
+ luci::CircleConst *&multiplier)
+{
+ auto x = dynamic_cast<luci::CircleConst *>(node->x());
+ auto y = dynamic_cast<luci::CircleConst *>(node->y());
+
+ if (x != nullptr && y == nullptr)
+ {
+ pred_node = loco::must_cast<luci::CircleNode *>(node->y());
+ multiplier = x;
+ }
+ else if (x == nullptr && y != nullptr)
+ {
+ pred_node = loco::must_cast<luci::CircleNode *>(node->x());
+ multiplier = y;
+ }
+ else
+ {
+    // Ignore unless exactly one input of MUL is a constant multiplier.
+ return false;
+ }
+
+ if (pred_node->rank() != 4)
+ return false;
+
+ const auto const_rank = multiplier->rank();
+ if (const_rank != 4)
+ return false;
+
+ for (uint32_t i = 0; i < const_rank; i++)
+ {
+ if (i != 1 && multiplier->dim(i).value() != 1)
+ return false;
+ }
+
+ const auto const_cdim = multiplier->dim(1);
+ const auto input_cdim = pred_node->dim(1);
+ const auto output_cdim = node->dim(1);
+
+ if (const_cdim == input_cdim && input_cdim == output_cdim)
+ return true;
+ else
+ return false;
+}
+
+// We assume an ADD with a const input is NCHW if
+// - Input shape is (N, C, H, W)
+// - Output shape is (N, C, H, W)
+// - Const shape is (1, C, 1, 1)
+// - Input, Output, and Const have the same C.
+bool is_NCHW_with_const(const luci::CircleAdd *node, luci::CircleNode *&pred_node,
+ luci::CircleConst *&beta)
+{
+ auto x = dynamic_cast<luci::CircleConst *>(node->x());
+ auto y = dynamic_cast<luci::CircleConst *>(node->y());
+
+ if (x != nullptr && y == nullptr)
+ {
+ pred_node = loco::must_cast<luci::CircleNode *>(node->y());
+ beta = x;
+ }
+ else if (x == nullptr && y != nullptr)
+ {
+ pred_node = loco::must_cast<luci::CircleNode *>(node->x());
+ beta = y;
+ }
+ else
+ {
+    // Ignore unless exactly one input of ADD is a constant.
+ return false;
+ }
+
+ if (pred_node->rank() != 4)
+ return false;
+
+ const auto const_rank = beta->rank();
+ if (const_rank != 4)
+ return false;
+
+ // Check the shape is (1, C, 1, 1)
+ for (uint32_t i = 0; i < const_rank; i++)
+ {
+ if (i == 1)
+ continue;
+
+ if (beta->dim(i).value() != 1)
+ return false;
+ }
+
+ const auto const_cdim = beta->dim(1);
+ const auto input_cdim = pred_node->dim(1);
+ const auto output_cdim = node->dim(1);
+
+ // Check Input, Output, Const have the same channel size
+ if (const_cdim == input_cdim && input_cdim == output_cdim)
+ return true;
+ else
+ return false;
+}
+
+template <class T> bool convert_unary_features(T *node)
+{
+ const auto pred_node = loco::must_cast<luci::CircleNode *>(node->features());
+ auto pre_trans = create_pre_transpose(node);
+ pre_trans->a(pred_node);
+ node->features(pre_trans);
+
+ // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+ auto post_trans = create_post_transpose(node);
+ loco::replace(node).with(post_trans);
+
+ post_trans->a(node);
+
+ return true;
+}
+
+class ConvertNCHWToNHWC final : public luci::CircleNodeMutableVisitor<bool>
+{
+ // Default
+ bool visit(luci::CircleNode *node)
+ {
+ throw std::runtime_error(node->name() + " is an unsupported operator.");
+ }
+
+ bool visit(luci::CircleInput *node)
+ {
+ const auto n = node->dim(0);
+ const auto c = node->dim(1);
+ const auto h = node->dim(2);
+ const auto w = node->dim(3);
+
+ node->dim(1) = h;
+ node->dim(2) = w;
+ node->dim(3) = c;
+
+ // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+    // Insert post-transpose
+ auto post_trans = create_post_transpose(node);
+ loco::replace(node).with(post_trans);
+
+ post_trans->a(node);
+
+ // Update graph input
+ auto graph_inputs = node->graph()->inputs();
+ auto graph_input = graph_inputs->at(node->index());
+ graph_input->shape({n, h, w, c});
+
+ return true;
+ }
+
+ bool visit(luci::CircleOutput *node)
+ {
+ // Insert pre-transpose
+ auto pre_trans = create_pre_transpose(node);
+ pre_trans->a(node->from());
+
+ node->from(pre_trans);
+
+ // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+ // Update graph output
+ const auto n = node->dim(0).value();
+ const auto c = node->dim(1).value();
+ const auto h = node->dim(2).value();
+ const auto w = node->dim(3).value();
+
+ auto graph_outputs = node->graph()->outputs();
+ auto graph_output = graph_outputs->at(node->index());
+ graph_output->shape({n, h, w, c});
+
+ return true;
+ }
+
+ bool visit(luci::CircleAdd *node)
+ {
+ luci::CircleNode *pred_node = nullptr;
+ luci::CircleConst *beta = nullptr;
+
+ if (is_NCHW_with_const(node, pred_node, beta))
+ {
+ auto pre_trans = create_pre_transpose(node);
+ pre_trans->a(pred_node);
+
+ auto nhwc_const = create_NHWC_from_NCHW(beta);
+ if (nhwc_const == nullptr)
+ return false;
+
+ node->x(pre_trans);
+ node->y(nhwc_const);
+ }
+ else if (beta == nullptr)
+ {
+ // Both inputs are not constant.
+ // In this case, we cannot distinguish NCHW from NHWC,
+ // so just insert Transpose Ops.
+ auto pre_trans_x = create_pre_transpose(node);
+ pre_trans_x->a(node->x());
+ node->x(pre_trans_x);
+
+ auto pre_trans_y = create_pre_transpose(node);
+ pre_trans_y->a(node->y());
+ node->y(pre_trans_y);
+ }
+ else
+ {
+ return false;
+ }
+
+ // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+ auto post_trans = create_post_transpose(node);
+ loco::replace(node).with(post_trans);
+
+ post_trans->a(node);
+ return true;
+ }
+
+ bool visit(luci::CircleConcatenation *node)
+ {
+ const auto num_values = node->numValues();
+ for (uint32_t i = 0; i < num_values; i++)
+ {
+ auto pred_node = loco::must_cast<luci::CircleNode *>(node->values(i));
+ auto pre_trans = create_pre_transpose(node);
+ pre_trans->a(pred_node);
+ node->values(i, pre_trans);
+ }
+
+ // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+ node->axis(nchw_axis_to_nhwc(node->axis()));
+
+ auto post_trans = create_post_transpose(node);
+ loco::replace(node).with(post_trans);
+
+ post_trans->a(node);
+
+ return true;
+ }
+
+ bool visit(luci::CircleLeakyRelu *node)
+ {
+ return convert_unary_features<luci::CircleLeakyRelu>(node);
+ }
+
+ bool visit(luci::CircleMul *node)
+ {
+ LOGGER(l);
+
+ luci::CircleNode *pred_node = nullptr;
+ luci::CircleConst *multiplier = nullptr;
+
+ if (is_NCHW_with_const(node, pred_node, multiplier))
+ {
+ auto pre_trans = create_pre_transpose(node);
+ pre_trans->a(pred_node);
+ node->x(pre_trans);
+
+ auto nhwc_const = create_NHWC_from_NCHW(multiplier);
+ node->y(nhwc_const);
+ }
+ else if (multiplier == nullptr)
+ {
+ // TODO : Implement this case.
+ INFO(l) << "Not yet implemented. Both inputs of MUL are non-const." << std::endl;
+ return false;
+ }
+ else
+ {
+ return false;
+ }
+
+ // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+ auto post_trans = create_post_transpose(node);
+ loco::replace(node).with(post_trans);
+
+ post_trans->a(node);
+ return true;
+ }
+
+ bool visit(luci::CircleNeg *node)
+ {
+ const auto pred_node = loco::must_cast<luci::CircleNode *>(node->x());
+ auto pre_trans = create_pre_transpose(node);
+ pre_trans->a(pred_node);
+ node->x(pre_trans);
+
+ // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+ auto post_trans = create_post_transpose(node);
+ loco::replace(node).with(post_trans);
+
+ post_trans->a(node);
+
+ return true;
+ }
+
+ bool visit(luci::CirclePad *node)
+ {
+ if (!is_NCHW(node))
+ return false;
+
+ const auto pred_node = loco::must_cast<luci::CircleNode *>(node->input());
+ auto pre_trans = create_pre_transpose(node);
+ pre_trans->a(pred_node);
+ node->input(pre_trans);
+
+ auto nchw_paddings = loco::must_cast<luci::CircleConst *>(node->paddings());
+ const auto nhwc_paddings = create_NHWC_paddings(nchw_paddings);
+ node->paddings(nhwc_paddings);
+
+ // Do shape inference for this node again.
+ node->shape_status(luci::ShapeStatus::UNDEFINED);
+
+ auto post_trans = create_post_transpose(node);
+ loco::replace(node).with(post_trans);
+
+ post_trans->a(node);
+
+ return true;
+ }
+
+ bool visit(luci::CircleRelu *node) { return convert_unary_features<luci::CircleRelu>(node); }
+
+ bool visit(luci::CircleRelu6 *node) { return convert_unary_features<luci::CircleRelu6>(node); }
+};
+
+} // namespace
+
+namespace luci
+{
+
+bool ConvertNCHWToNHWCPass::run(loco::Graph *g)
+{
+ LOGGER(l);
+ INFO(l) << "ConvertNCHWToNHWCPass Start" << std::endl;
+
+ // Annotate NCHW operators
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+ switch (circle_node->opcode())
+ {
+ // List of supported Ops
+ case luci::CircleOpcode::CIRCLEINPUT:
+ if (!_preserve_input && !has_data_format(node))
+ {
+ set_data_format(node, DataFormat::NCHW);
+ }
+ break;
+ case luci::CircleOpcode::CIRCLEOUTPUT:
+ if (!_preserve_output && !has_data_format(node))
+ {
+ set_data_format(node, DataFormat::NCHW);
+ }
+ break;
+ case luci::CircleOpcode::ADD:
+ case luci::CircleOpcode::CONCATENATION:
+ case luci::CircleOpcode::LEAKY_RELU:
+ case luci::CircleOpcode::MUL:
+ case luci::CircleOpcode::NEG:
+ case luci::CircleOpcode::PAD:
+ case luci::CircleOpcode::RELU:
+ case luci::CircleOpcode::RELU6:
+ if (!has_data_format(node))
+ {
+ set_data_format(node, DataFormat::NCHW);
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (!has_data_format(node))
+ {
+ // Unsupported Op
+ continue;
+ }
+ else if (get_data_format(node) == DataFormat::NHWC)
+ {
+ // Already converted to NHWC
+ continue;
+ }
+ else if (has_dynamic_shape(node))
+ {
+      // This pass only works for static-shaped nodes
+ INFO(l) << "Skip the node with a dynamic shape." << std::endl;
+ continue;
+ }
+ else
+ {
+ ConvertNCHWToNHWC converter;
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+ if (circle_node->rank() != 4)
+ continue;
+
+ if (circle_node->accept(&converter))
+ {
+ set_data_format(node, DataFormat::NHWC);
+ changed = true;
+ }
+ else
+ {
+ continue;
+ }
+ }
+ }
+
+ INFO(l) << "ConvertNCHWToNHWCPass End" << std::endl;
+ return changed;
+}
+
+} // namespace luci
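
Enabling the pass goes through the optimizer options added at the top of this commit; a sketch (parameter values are illustrative, and the aliases match those used in the tests):

    using Algorithms = luci::CircleOptimizer::Options::Algorithm;
    using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;

    luci::CircleOptimizer optimizer;
    auto options = optimizer.options();
    options->enable(Algorithms::ConvertNCHWToNHWC);
    // Optionally keep NCHW shapes at the graph boundary.
    options->param(AlgorithmParameters::NCHW_to_NHWC_preserve_input_shape, "true");
    options->param(AlgorithmParameters::NCHW_to_NHWC_preserve_output_shape, "true");
    optimizer.optimize(graph); // graph: loco::Graph* prepared by the caller
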
diff --git a/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp
new file mode 100644
index 000000000..831d5f89a
--- /dev/null
+++ b/compiler/luci/pass/src/ConvertNCHWToNHWCPass.test.cpp
@@ -0,0 +1,636 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <logo/Phase.h>
+
+#include "luci/Pass/ConvertNCHWToNHWCPass.h"
+#include "luci/Pass/CircleShapeInferencePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+/**
+ * Graph with a single Op (example: Add).
+ *
+ * BEFORE
+ * - All Ops including Input/Output are NCHW.
+ *
+ * [Input] [beta]
+ * | /
+ * [Add]
+ * |
+ * [Output]
+ *
+ * AFTER
+ * - All Ops including Input/Output are NHWC.
+ *
+ * [Input]
+ * |
+ * [Transpose]
+ * |
+ * [Transpose] [beta]
+ * | /
+ * [Add]
+ * |
+ * [Transpose]
+ * |
+ * [Transpose]
+ * |
+ * [Output]
+ */
+class SimpleGraph
+{
+public:
+ SimpleGraph() = default;
+
+public:
+ void init()
+ {
+ input = g.nodes()->create<luci::CircleInput>();
+ output = g.nodes()->create<luci::CircleOutput>();
+ input->name("input");
+ output->name("output");
+
+ auto graph_input = g.inputs()->create();
+ input->index(graph_input->index());
+ auto graph_output = g.outputs()->create();
+ output->index(graph_output->index());
+
+ graph_input->dtype(loco::DataType::FLOAT32);
+ input->dtype(loco::DataType::FLOAT32);
+ output->dtype(loco::DataType::FLOAT32);
+ graph_output->dtype(loco::DataType::FLOAT32);
+
+ uint32_t channel_size = 16;
+ graph_input->shape({1, channel_size, 4, 4});
+ input->shape({1, channel_size, 4, 4});
+ output->shape({1, channel_size, 4, 4});
+ graph_output->shape({1, channel_size, 4, 4});
+
+ auto graph_body = insertGraphBody(input);
+ output->from(graph_body);
+ }
+
+ virtual ~SimpleGraph() = default;
+
+protected:
+ virtual loco::Node *insertGraphBody(loco::Node *input) = 0;
+
+public:
+ loco::Graph g;
+ luci::CircleInput *input = nullptr;
+ luci::CircleOutput *output = nullptr;
+};
+
+class AddGraph final : public SimpleGraph
+{
+protected:
+ loco::Node *insertGraphBody(loco::Node *input) override
+ {
+ add = g.nodes()->create<luci::CircleAdd>();
+ beta = g.nodes()->create<luci::CircleConst>();
+
+ add->dtype(loco::DataType::FLOAT32);
+ beta->dtype(loco::DataType::FLOAT32);
+
+ uint32_t channel_size = 16;
+ add->shape({1, channel_size, 4, 4});
+ beta->shape({1, channel_size, 1, 1});
+
+ beta->size<loco::DataType::FLOAT32>(channel_size);
+ for (uint32_t i = 0; i < channel_size; i++)
+ {
+ beta->at<loco::DataType::FLOAT32>(i) = i;
+ }
+
+ add->x(input);
+ add->y(beta);
+
+ add->name("add");
+ beta->name("beta");
+
+ return add;
+ }
+
+public:
+ luci::CircleAdd *add = nullptr;
+ luci::CircleConst *beta = nullptr;
+};
+
+class ConcatenationGraph final : public SimpleGraph
+{
+protected:
+ loco::Node *insertGraphBody(loco::Node *input) override
+ {
+ concat = g.nodes()->create<luci::CircleConcatenation>(2);
+ concat->values(0, input);
+ concat->axis(1);
+
+ input2 = g.nodes()->create<luci::CircleConst>();
+ input2->dtype(loco::DataType::FLOAT32);
+ input2->shape({1, 16, 4, 4});
+ input2->size<loco::DataType::FLOAT32>(16 * 4 * 4);
+ for (uint32_t i = 0; i < 16 * 4 * 4; i++)
+ {
+ input2->at<loco::DataType::FLOAT32>(i) = i;
+ }
+ concat->values(1, input2);
+
+ concat->name("concat");
+ input2->name("input2");
+
+ return concat;
+ }
+
+public:
+ luci::CircleConcatenation *concat = nullptr;
+ luci::CircleConst *input2 = nullptr;
+};
+
+class LeakyReluGraph final : public SimpleGraph
+{
+protected:
+ loco::Node *insertGraphBody(loco::Node *input) override
+ {
+ leakyrelu = g.nodes()->create<luci::CircleLeakyRelu>();
+ leakyrelu->features(input);
+ leakyrelu->name("leakyrelu");
+
+ return leakyrelu;
+ }
+
+public:
+ luci::CircleLeakyRelu *leakyrelu = nullptr;
+};
+
+class MulGraph final : public SimpleGraph
+{
+protected:
+ loco::Node *insertGraphBody(loco::Node *input) override
+ {
+ mul = g.nodes()->create<luci::CircleMul>();
+ multiplier = g.nodes()->create<luci::CircleConst>();
+
+ mul->dtype(loco::DataType::FLOAT32);
+ multiplier->dtype(loco::DataType::FLOAT32);
+
+ uint32_t channel_size = 16;
+ mul->shape({1, channel_size, 4, 4});
+ multiplier->shape({1, channel_size, 1, 1});
+
+ multiplier->size<loco::DataType::FLOAT32>(channel_size);
+ for (uint32_t i = 0; i < channel_size; i++)
+ {
+ multiplier->at<loco::DataType::FLOAT32>(i) = i;
+ }
+
+ mul->x(input);
+ mul->y(multiplier);
+
+ mul->name("mul");
+ multiplier->name("multiplier");
+
+ return mul;
+ }
+
+public:
+ luci::CircleMul *mul = nullptr;
+ luci::CircleConst *multiplier = nullptr;
+};
+
+class NegGraph final : public SimpleGraph
+{
+protected:
+ loco::Node *insertGraphBody(loco::Node *input) override
+ {
+ neg = g.nodes()->create<luci::CircleNeg>();
+ neg->x(input);
+ neg->name("neg");
+
+ return neg;
+ }
+
+public:
+ luci::CircleNeg *neg = nullptr;
+};
+
+class PadGraph final : public SimpleGraph
+{
+protected:
+ loco::Node *insertGraphBody(loco::Node *input) override
+ {
+ pad = g.nodes()->create<luci::CirclePad>();
+ paddings = g.nodes()->create<luci::CircleConst>();
+
+ pad->dtype(loco::DataType::FLOAT32);
+ paddings->dtype(loco::DataType::S32);
+
+ uint32_t channel_size = 16;
+ pad->shape({1, channel_size, 4, 4});
+ paddings->shape({4, 2});
+
+ // paddings data (NCHW)
+ // [[0,0], [0,0], [1,1], [2,2]]
+ paddings->size<loco::DataType::S32>(8);
+ for (uint32_t dim = 0; dim < 4; dim++)
+ {
+ for (uint32_t i = 0; i < 2; i++)
+ {
+ int32_t data = 0;
+
+ if (dim == 2)
+ data = 1;
+ else if (dim == 3)
+ data = 2;
+
+ paddings->at<loco::DataType::S32>(dim * 2 + i) = data;
+ }
+ }
+
+ pad->input(input);
+ pad->paddings(paddings);
+
+ pad->name("pad");
+ paddings->name("paddings");
+
+ return pad;
+ }
+
+public:
+ luci::CirclePad *pad = nullptr;
+ luci::CircleConst *paddings = nullptr;
+};
+
+class ReluGraph final : public SimpleGraph
+{
+protected:
+ loco::Node *insertGraphBody(loco::Node *input) override
+ {
+ relu = g.nodes()->create<luci::CircleRelu>();
+ relu->features(input);
+ relu->name("Relu");
+
+ return relu;
+ }
+
+public:
+ luci::CircleRelu *relu = nullptr;
+};
+
+class Relu6Graph final : public SimpleGraph
+{
+protected:
+ loco::Node *insertGraphBody(loco::Node *input) override
+ {
+ relu6 = g.nodes()->create<luci::CircleRelu6>();
+ relu6->features(input);
+ relu6->name("relu6");
+
+ return relu6;
+ }
+
+public:
+ luci::CircleRelu6 *relu6 = nullptr;
+};
+
+void check_pre_trans(loco::Node *node)
+{
+ auto pre_trans = dynamic_cast<luci::CircleTranspose *>(node);
+ EXPECT_NE(nullptr, pre_trans);
+ auto pre_trans_perm = dynamic_cast<luci::CircleConst *>(pre_trans->perm());
+ EXPECT_NE(nullptr, pre_trans_perm);
+ EXPECT_EQ(1, pre_trans_perm->rank());
+ EXPECT_EQ(4, pre_trans_perm->dim(0).value());
+ EXPECT_EQ(loco::DataType::S32, pre_trans_perm->dtype());
+ EXPECT_EQ(0, pre_trans_perm->at<loco::DataType::S32>(0));
+ EXPECT_EQ(2, pre_trans_perm->at<loco::DataType::S32>(1));
+ EXPECT_EQ(3, pre_trans_perm->at<loco::DataType::S32>(2));
+ EXPECT_EQ(1, pre_trans_perm->at<loco::DataType::S32>(3));
+}
+
+void check_post_trans(loco::Node *node)
+{
+ auto post_trans = dynamic_cast<luci::CircleTranspose *>(node);
+ EXPECT_NE(nullptr, post_trans);
+ auto post_trans_perm = dynamic_cast<luci::CircleConst *>(post_trans->perm());
+ EXPECT_NE(nullptr, post_trans_perm);
+ EXPECT_EQ(1, post_trans_perm->rank());
+ EXPECT_EQ(4, post_trans_perm->dim(0).value());
+ EXPECT_EQ(loco::DataType::S32, post_trans_perm->dtype());
+ EXPECT_EQ(0, post_trans_perm->at<loco::DataType::S32>(0));
+ EXPECT_EQ(3, post_trans_perm->at<loco::DataType::S32>(1));
+ EXPECT_EQ(1, post_trans_perm->at<loco::DataType::S32>(2));
+ EXPECT_EQ(2, post_trans_perm->at<loco::DataType::S32>(3));
+}
+
+void run_phase(loco::Graph *g, bool preserve_input, bool preserve_output)
+{
+ logo::Phase phase;
+
+ // Default passes.
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+
+ // Pass to test
+ phase.emplace_back(
+ std::make_unique<luci::ConvertNCHWToNHWCPass>(preserve_input, preserve_output));
+
+ logo::PhaseRunner<logo::PhaseStrategy::Restart> phase_runner{g};
+ phase_runner.run(phase);
+}
+
+} // namespace
+
+TEST(ConvertNCHWToNHWCPassTest, name)
+{
+ luci::ConvertNCHWToNHWCPass pass(false, false);
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(ConvertNCHWToNHWC, Add)
+{
+ AddGraph g;
+ g.init();
+
+ run_phase(&g.g, false, false);
+
+ auto input_succs = loco::succs(g.input);
+ EXPECT_EQ(1, input_succs.size());
+ check_post_trans(*input_succs.begin());
+
+ check_pre_trans(g.add->x());
+
+ auto add_succs = loco::succs(g.add);
+ EXPECT_EQ(1, add_succs.size());
+ check_post_trans(*add_succs.begin());
+
+ uint32_t channel_size = 16;
+ auto new_beta = dynamic_cast<luci::CircleConst *>(g.add->y());
+ EXPECT_NE(nullptr, new_beta);
+ EXPECT_EQ(4, new_beta->rank());
+ EXPECT_EQ(1, new_beta->dim(0).value());
+ EXPECT_EQ(1, new_beta->dim(1).value());
+ EXPECT_EQ(1, new_beta->dim(2).value());
+ EXPECT_EQ(channel_size, new_beta->dim(3).value());
+
+ check_pre_trans(g.output->from());
+}
+
+TEST(ConvertNCHWToNHWC, Concatenation)
+{
+ ConcatenationGraph g;
+ g.init();
+
+ run_phase(&g.g, true, true);
+
+ check_pre_trans(g.concat->values(0));
+ check_pre_trans(g.concat->values(1));
+
+ auto concat_succs = loco::succs(g.concat);
+ EXPECT_EQ(1, concat_succs.size());
+ check_post_trans(*concat_succs.begin());
+
+ // Check concat shape, axis
+ EXPECT_EQ(1, g.concat->dim(0).value());
+ EXPECT_EQ(4, g.concat->dim(1).value());
+ EXPECT_EQ(4, g.concat->dim(2).value());
+ EXPECT_EQ(32, g.concat->dim(3).value());
+ EXPECT_EQ(3, g.concat->axis());
+}
+
+TEST(ConvertNCHWToNHWC, LeakyRelu)
+{
+ LeakyReluGraph g;
+ g.init();
+
+ run_phase(&g.g, true, true);
+
+ check_pre_trans(g.leakyrelu->features());
+
+ auto leakyrelu_succs = loco::succs(g.leakyrelu);
+ EXPECT_EQ(1, leakyrelu_succs.size());
+ check_post_trans(*leakyrelu_succs.begin());
+
+ // Check leakyrelu shape
+ EXPECT_EQ(1, g.leakyrelu->dim(0).value());
+ EXPECT_EQ(4, g.leakyrelu->dim(1).value());
+ EXPECT_EQ(4, g.leakyrelu->dim(2).value());
+ EXPECT_EQ(16, g.leakyrelu->dim(3).value());
+}
+
+TEST(ConvertNCHWToNHWC, Mul)
+{
+ MulGraph g;
+ g.init();
+
+ run_phase(&g.g, false, false);
+
+ auto input_succs = loco::succs(g.input);
+ EXPECT_EQ(1, input_succs.size());
+ check_post_trans(*input_succs.begin());
+
+ check_pre_trans(g.mul->x());
+
+ auto mul_succs = loco::succs(g.mul);
+ EXPECT_EQ(1, mul_succs.size());
+ check_post_trans(*mul_succs.begin());
+
+ uint32_t channel_size = 16;
+ auto new_multiplier = dynamic_cast<luci::CircleConst *>(g.mul->y());
+ EXPECT_NE(nullptr, new_multiplier);
+ EXPECT_EQ(4, new_multiplier->rank());
+ EXPECT_EQ(1, new_multiplier->dim(0).value());
+ EXPECT_EQ(1, new_multiplier->dim(1).value());
+ EXPECT_EQ(1, new_multiplier->dim(2).value());
+ EXPECT_EQ(channel_size, new_multiplier->dim(3).value());
+
+ check_pre_trans(g.output->from());
+}
+
+TEST(ConvertNCHWToNHWC, Neg)
+{
+ NegGraph g;
+ g.init();
+
+ run_phase(&g.g, true, true);
+
+ check_pre_trans(g.neg->x());
+
+ auto neg_succs = loco::succs(g.neg);
+ EXPECT_EQ(1, neg_succs.size());
+ check_post_trans(*neg_succs.begin());
+
+ // Check neg shape
+ EXPECT_EQ(1, g.neg->dim(0).value());
+ EXPECT_EQ(4, g.neg->dim(1).value());
+ EXPECT_EQ(4, g.neg->dim(2).value());
+ EXPECT_EQ(16, g.neg->dim(3).value());
+}
+
+TEST(ConvertNCHWToNHWC, Pad)
+{
+ PadGraph g;
+ g.init();
+
+ run_phase(&g.g, false, false);
+
+ auto input_succs = loco::succs(g.input);
+ EXPECT_EQ(1, input_succs.size());
+ check_post_trans(*input_succs.begin());
+
+ check_pre_trans(g.pad->input());
+
+ auto pad_succs = loco::succs(g.pad);
+ EXPECT_EQ(1, pad_succs.size());
+ check_post_trans(*pad_succs.begin());
+
+ auto new_paddings = dynamic_cast<luci::CircleConst *>(g.pad->paddings());
+ EXPECT_NE(nullptr, new_paddings);
+ EXPECT_EQ(2, new_paddings->rank());
+ EXPECT_EQ(4, new_paddings->dim(0).value());
+ EXPECT_EQ(2, new_paddings->dim(1).value());
+ EXPECT_EQ(0, new_paddings->at<loco::DataType::S32>(0));
+ EXPECT_EQ(0, new_paddings->at<loco::DataType::S32>(1));
+ EXPECT_EQ(1, new_paddings->at<loco::DataType::S32>(2));
+ EXPECT_EQ(1, new_paddings->at<loco::DataType::S32>(3));
+ EXPECT_EQ(2, new_paddings->at<loco::DataType::S32>(4));
+ EXPECT_EQ(2, new_paddings->at<loco::DataType::S32>(5));
+ EXPECT_EQ(0, new_paddings->at<loco::DataType::S32>(6));
+ EXPECT_EQ(0, new_paddings->at<loco::DataType::S32>(7));
+
+ check_pre_trans(g.output->from());
+}
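+
+// Note on the Pad expectations above (illustrative): converting to NHWC also
+// permutes the rows of the paddings constant. The NCHW paddings
+// [[0,0], [0,0], [1,1], [2,2]] become [[0,0], [1,1], [2,2], [0,0]] in NHWC,
+// i.e. the rows are reordered as (N, H, W, C).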
+
+TEST(ConvertNCHWToNHWC, Unknown_Shape_NEG)
+{
+ AddGraph g;
+ g.init();
+
+ // Unknown shape
+ g.input->dim(0).unset();
+ g.add->dim(0).unset();
+ g.output->dim(0).unset();
+
+ luci::ConvertNCHWToNHWCPass pass(false, false);
+ EXPECT_EQ(false, pass.run(&g.g));
+}
+
+TEST(ConvertNCHWToNHWC, Preserve_Input_Output)
+{
+ // Preserve input
+ {
+ AddGraph g;
+ g.init();
+
+ run_phase(&g.g, true, false);
+
+ // Check input shape
+ EXPECT_EQ(1, g.input->dim(0).value());
+ EXPECT_EQ(16, g.input->dim(1).value());
+ EXPECT_EQ(4, g.input->dim(2).value());
+ EXPECT_EQ(4, g.input->dim(3).value());
+
+ // Check output shape
+ EXPECT_EQ(1, g.output->dim(0).value());
+ EXPECT_EQ(4, g.output->dim(1).value());
+ EXPECT_EQ(4, g.output->dim(2).value());
+ EXPECT_EQ(16, g.output->dim(3).value());
+ }
+
+ // Preserve output
+ {
+ AddGraph g;
+ g.init();
+
+ run_phase(&g.g, false, true);
+
+ // Check input shape
+ EXPECT_EQ(1, g.input->dim(0).value());
+ EXPECT_EQ(4, g.input->dim(1).value());
+ EXPECT_EQ(4, g.input->dim(2).value());
+ EXPECT_EQ(16, g.input->dim(3).value());
+
+ // Check output shape
+ EXPECT_EQ(1, g.output->dim(0).value());
+ EXPECT_EQ(16, g.output->dim(1).value());
+ EXPECT_EQ(4, g.output->dim(2).value());
+ EXPECT_EQ(4, g.output->dim(3).value());
+ }
+
+ // Preserve both input and output
+ {
+ AddGraph g;
+ g.init();
+
+ run_phase(&g.g, true, true);
+
+ // Check input shape
+ EXPECT_EQ(1, g.input->dim(0).value());
+ EXPECT_EQ(16, g.input->dim(1).value());
+ EXPECT_EQ(4, g.input->dim(2).value());
+ EXPECT_EQ(4, g.input->dim(3).value());
+
+ // Check output shape
+ EXPECT_EQ(1, g.output->dim(0).value());
+ EXPECT_EQ(16, g.output->dim(1).value());
+ EXPECT_EQ(4, g.output->dim(2).value());
+ EXPECT_EQ(4, g.output->dim(3).value());
+ }
+}
+
+TEST(ConvertNCHWToNHWC, Relu)
+{
+ ReluGraph g;
+ g.init();
+
+ run_phase(&g.g, true, true);
+
+ check_pre_trans(g.relu->features());
+
+ auto relu_succs = loco::succs(g.relu);
+ EXPECT_EQ(1, relu_succs.size());
+ check_post_trans(*relu_succs.begin());
+
+ // Check relu shape
+ EXPECT_EQ(1, g.relu->dim(0).value());
+ EXPECT_EQ(4, g.relu->dim(1).value());
+ EXPECT_EQ(4, g.relu->dim(2).value());
+ EXPECT_EQ(16, g.relu->dim(3).value());
+}
+
+TEST(ConvertNCHWToNHWC, Relu6)
+{
+ Relu6Graph g;
+ g.init();
+
+ run_phase(&g.g, true, true);
+
+ check_pre_trans(g.relu6->features());
+
+ auto relu6_succs = loco::succs(g.relu6);
+ EXPECT_EQ(1, relu6_succs.size());
+ check_post_trans(*relu6_succs.begin());
+
+ // Check relu6 shape
+ EXPECT_EQ(1, g.relu6->dim(0).value());
+ EXPECT_EQ(4, g.relu6->dim(1).value());
+ EXPECT_EQ(4, g.relu6->dim(2).value());
+ EXPECT_EQ(16, g.relu6->dim(3).value());
+}
diff --git a/compiler/luci/pass/src/FoldAddV2Pass.cpp b/compiler/luci/pass/src/FoldAddV2Pass.cpp
new file mode 100644
index 000000000..20c1022f8
--- /dev/null
+++ b/compiler/luci/pass/src/FoldAddV2Pass.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldAddV2Pass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <iostream>
+
+namespace
+{
+
+bool same_shape(const luci::CircleConst *x, const luci::CircleConst *y)
+{
+ if (x->rank() != y->rank())
+ return false;
+
+ for (uint32_t i = 0; i < x->rank(); i++)
+ {
+ if (!(x->dim(i) == y->dim(i)))
+ return false;
+ }
+
+ return true;
+}
+
+/**
+ * Fold AddV2 to const if both inputs are const
+ **/
+template <loco::DataType T> bool fold_add_v2(luci::CircleCustom *add_v2)
+{
+ // This should hold for AddV2
+ if (add_v2->numInputs() != 2)
+ return false;
+
+ // Check first input is const
+ auto x = dynamic_cast<luci::CircleConst *>(add_v2->inputs(0));
+ if (not x)
+ return false;
+
+ // Check second input is const
+ auto y = dynamic_cast<luci::CircleConst *>(add_v2->inputs(1));
+ if (not y)
+ return false;
+
+ if (x->dtype() != y->dtype())
+ return false;
+
+ if (!same_shape(x, y))
+ return false;
+
+ auto name_x = x->name();
+ auto name_y = y->name();
+ assert(name_x.length() > 0);
+ assert(name_y.length() > 0);
+ auto constant = add_v2->graph()->nodes()->create<luci::CircleConst>();
+ constant->dtype(x->dtype());
+ constant->rank(x->rank());
+ for (uint32_t i = 0; i < x->rank(); i++)
+ constant->dim(i).set(x->dim(i).value());
+
+ const auto size = x->size<T>();
+ constant->size<T>(size);
+ for (uint32_t i = 0; i < size; i++)
+ constant->at<T>(i) = x->at<T>(i) + y->at<T>(i);
+
+ constant->shape_status(luci::ShapeStatus::VALID);
+ constant->name(name_x + ";" + name_y);
+
+ for (auto succ : loco::succs(add_v2))
+ {
+ auto custom_out = loco::must_cast<luci::CircleCustomOut *>(succ);
+ loco::replace(custom_out).with(constant);
+ }
+
+ return true;
+}
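+
+// Worked example (illustrative sketch, values assumed): given an AddV2
+// CircleCustom whose inputs are S64 constants x = {1, 2, 3} (named "x") and
+// y = {10, 20, 30} (named "y"), fold_add_v2<loco::DataType::S64> creates a
+// CircleConst named "x;y" holding {11, 22, 33} and redirects every
+// CircleCustomOut successor to it:
+//
+//   if (fold_add_v2<loco::DataType::S64>(add_v2))
+//     { /* graph outputs now consume the folded constant */ }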
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * Constant Folding for AddV2 Op
+ **/
+bool FoldAddV2Pass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto custom = dynamic_cast<luci::CircleCustom *>(node))
+ {
+ if (custom->custom_code() == "AddV2")
+ {
+ // TODO: Support more data types
+ if (custom->dtype() == loco::DataType::S64)
+ {
+ if (fold_add_v2<loco::DataType::S64>(custom))
+ changed = true;
+ }
+ }
+ }
+ }
+
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FoldAddV2Pass.test.cpp b/compiler/luci/pass/src/FoldAddV2Pass.test.cpp
new file mode 100644
index 000000000..438d7f077
--- /dev/null
+++ b/compiler/luci/pass/src/FoldAddV2Pass.test.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldAddV2Pass.h"
+#include "PassTestGraphs.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+/**
+ * Graph has an AddV2 Op with constant inputs
+ *
+ * BEFORE
+ *
+ * [CircleConst] [CircleConst]
+ * | |
+ * [CircleCustom (AddV2)]
+ * |
+ * [CircleCustomOut]
+ *
+ * AFTER
+ *
+ * [CircleConst]
+ */
+template <loco::DataType T> class FoldAddV2Test : public luci::ConstantFoldingAddTestGraph
+{
+public:
+ FoldAddV2Test(std::initializer_list<uint32_t> shape) : luci::ConstantFoldingAddTestGraph(shape, T)
+ {
+ _addV2 = _g.nodes()->create<luci::CircleCustom>(2, 1);
+ _x = _g.nodes()->create<luci::CircleConst>();
+ _y = _g.nodes()->create<luci::CircleConst>();
+ _addV2_out = _g.nodes()->create<luci::CircleCustomOut>();
+
+ _addV2->dtype(T);
+ _x->dtype(T);
+ _y->dtype(T);
+ _addV2_out->dtype(T);
+
+ _addV2->shape(shape);
+ _x->shape(shape);
+ _y->shape(shape);
+ _addV2_out->shape(shape);
+
+ uint32_t num_elems = 1;
+ for (auto dim = shape.begin(); dim != shape.end(); dim++)
+ num_elems *= *dim;
+
+ _x->size<T>(num_elems);
+ _y->size<T>(num_elems);
+
+ for (uint32_t i = 0; i < num_elems; i++)
+ {
+ _x->at<T>(i) = i + 1;
+ _y->at<T>(i) = i + 1;
+ }
+
+ _addV2->custom_code("AddV2");
+ _addV2->inputs(0, _x);
+ _addV2->inputs(1, _y);
+ _addV2_out->input(_addV2);
+
+ _addV2->name("addV2");
+ _x->name("x");
+ _y->name("y");
+ }
+
+ loco::Node *createFoldedPattern() override { return _addV2_out; }
+
+ virtual ~FoldAddV2Test() = default;
+
+protected:
+ luci::CircleCustom *_addV2 = nullptr;
+ luci::CircleCustomOut *_addV2_out = nullptr;
+ luci::CircleConst *_x = nullptr;
+ luci::CircleConst *_y = nullptr;
+};
+
+class FoldS64AddV2Test : public FoldAddV2Test<loco::DataType::S64>, public ::testing::Test
+{
+public:
+ FoldS64AddV2Test() : FoldAddV2Test<loco::DataType::S64>({3}) {}
+
+ virtual void SetUp() { init(); }
+};
+
+} // namespace
+
+TEST(FoldAddV2PassTest, name)
+{
+ luci::FoldAddV2Pass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST_F(FoldS64AddV2Test, fold_addV2)
+{
+ luci::FoldAddV2Pass pass;
+ while (pass.run(graph()))
+ ;
+
+ auto folded_const = getFoldedPattern();
+ EXPECT_NE(nullptr, folded_const);
+
+ // Check type, shape, values of folded const
+ EXPECT_EQ(loco::DataType::S64, folded_const->dtype());
+ EXPECT_EQ(1, folded_const->rank());
+ EXPECT_EQ(3, folded_const->dim(0).value());
+ EXPECT_EQ(2, folded_const->at<loco::DataType::S64>(0));
+ EXPECT_EQ(4, folded_const->at<loco::DataType::S64>(1));
+ EXPECT_EQ(6, folded_const->at<loco::DataType::S64>(2));
+}
+
+TEST_F(FoldS64AddV2Test, input_type_mismatch_NEG)
+{
+ _x->dtype(loco::DataType::S32);
+
+ luci::FoldAddV2Pass pass;
+ EXPECT_FALSE(pass.run(graph()));
+}
diff --git a/compiler/luci/pass/src/FoldCastPass.cpp b/compiler/luci/pass/src/FoldCastPass.cpp
new file mode 100644
index 000000000..00b86fe48
--- /dev/null
+++ b/compiler/luci/pass/src/FoldCastPass.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldCastPass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+
+luci::CircleConst *cast_const(luci::CircleConst *node, loco::DataType from_dtype,
+ loco::DataType to_dtype)
+{
+ assert(node->dtype() == from_dtype);
+
+ auto name = node->name();
+ assert(name.length() > 0);
+ auto constant = node->graph()->nodes()->create<luci::CircleConst>();
+ constant->dtype(to_dtype);
+ constant->rank(node->rank());
+ uint32_t num_elems = 1;
+ for (uint32_t i = 0; i < node->rank(); i++)
+ {
+ constant->dim(i).set(node->dim(i).value());
+ num_elems *= node->dim(i).value();
+ }
+
+ constant->shape_status(luci::ShapeStatus::VALID);
+
+ // TODO: Support more data types
+ if (from_dtype == loco::DataType::S64)
+ {
+ if (to_dtype == loco::DataType::S32)
+ {
+ constant->size<loco::DataType::S32>(num_elems);
+ for (uint32_t i = 0; i < num_elems; i++)
+ constant->at<loco::DataType::S32>(i) =
+ static_cast<int32_t>(node->at<loco::DataType::S64>(i));
+
+ constant->name(name + "_S32");
+ return constant;
+ }
+ return nullptr;
+ }
+
+ return nullptr;
+}
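+
+// Worked example (illustrative sketch): for a rank-1 S64 constant {1, 2, 3}
+// named "x", cast_const(const_x, S64, S32) returns a new S32 CircleConst
+// named "x_S32" holding {1, 2, 3}. Values outside the int32_t range are
+// narrowed by static_cast. Unsupported dtype pairs return nullptr, which
+// makes fold_cast below a no-op:
+//
+//   auto casted = cast_const(const_x, loco::DataType::S64, loco::DataType::S32);
+//   if (casted != nullptr)
+//     loco::replace(cast).with(casted);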
+
+/**
+ * Fold Cast to const if it has const input
+ **/
+bool fold_cast(luci::CircleCast *cast)
+{
+ // Check cast has const input
+ auto const_x = dynamic_cast<luci::CircleConst *>(cast->x());
+ if (not const_x)
+ return false;
+
+ const auto in_dtype = const_x->dtype();
+ const auto out_dtype = cast->dtype();
+
+ auto casted_const = cast_const(const_x, in_dtype, out_dtype);
+ if (not casted_const)
+ return false;
+
+ loco::replace(cast).with(casted_const);
+
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * Constant Folding for Cast Op
+ **/
+bool FoldCastPass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto cast = dynamic_cast<luci::CircleCast *>(node))
+ {
+ if (fold_cast(cast))
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FoldCastPass.test.cpp b/compiler/luci/pass/src/FoldCastPass.test.cpp
new file mode 100644
index 000000000..5911adf11
--- /dev/null
+++ b/compiler/luci/pass/src/FoldCastPass.test.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldCastPass.h"
+#include "PassTestGraphs.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+template <loco::DataType FromT, loco::DataType ToT>
+class FoldCastTest : public luci::ConstantFoldingAddTestGraph
+{
+public:
+ FoldCastTest(std::initializer_list<uint32_t> shape)
+ : luci::ConstantFoldingAddTestGraph(shape, ToT)
+ {
+ _cast = _g.nodes()->create<luci::CircleCast>();
+ _x = _g.nodes()->create<luci::CircleConst>();
+
+ _cast->dtype(ToT);
+ _x->dtype(FromT);
+
+ _cast->shape(shape);
+ _x->shape(shape);
+
+ uint32_t num_elems = 1;
+ for (auto dim = shape.begin(); dim != shape.end(); dim++)
+ num_elems *= *dim;
+
+ _x->size<FromT>(num_elems);
+ for (uint32_t i = 0; i < num_elems; i++)
+ _x->at<FromT>(i) = i + 1;
+
+ _cast->x(_x);
+
+ _cast->name("cast");
+ _x->name("x");
+ }
+
+ loco::Node *createFoldedPattern() override { return _cast; }
+
+protected:
+ luci::CircleCast *_cast = nullptr;
+ luci::CircleConst *_x = nullptr;
+};
+
+/**
+ * Graph that has a Cast Op with constant input
+ *
+ * BEFORE
+ *
+ * [CircleConst]
+ * |
+ * [Cast]
+ *
+ * AFTER
+ *
+ * [CircleConst]
+ *
+ */
+class FoldS64ToS32CastTest : public FoldCastTest<loco::DataType::S64, loco::DataType::S32>,
+ public ::testing::Test
+{
+public:
+ FoldS64ToS32CastTest() : FoldCastTest<loco::DataType::S64, loco::DataType::S32>({3}) {}
+
+ virtual void SetUp() { init(); }
+};
+
+} // namespace
+
+TEST(FoldCastPassTest, name)
+{
+ luci::FoldCastPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST_F(FoldS64ToS32CastTest, fold_cast_s64_to_s32)
+{
+ luci::FoldCastPass pass;
+ while (pass.run(graph()))
+ ;
+
+ auto folded_const = getFoldedPattern();
+ EXPECT_NE(nullptr, folded_const);
+
+ // Check type, shape, values of folded const
+ EXPECT_EQ(loco::DataType::S32, folded_const->dtype());
+ EXPECT_EQ(1, folded_const->rank());
+ EXPECT_EQ(3, folded_const->dim(0).value());
+ EXPECT_EQ(1, folded_const->at<loco::DataType::S32>(0));
+ EXPECT_EQ(2, folded_const->at<loco::DataType::S32>(1));
+ EXPECT_EQ(3, folded_const->at<loco::DataType::S32>(2));
+}
diff --git a/compiler/luci/pass/src/FoldDequantizePass.cpp b/compiler/luci/pass/src/FoldDequantizePass.cpp
index 01c04f478..3dd4f8cea 100644
--- a/compiler/luci/pass/src/FoldDequantizePass.cpp
+++ b/compiler/luci/pass/src/FoldDequantizePass.cpp
@@ -17,8 +17,7 @@
#include "luci/Pass/FoldDequantizePass.h"
#include <luci/IR/CircleNodes.h>
-
-#include <loco/Service/TypeInference.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace
{
@@ -51,6 +50,8 @@ luci::CircleConst *dequantized_const_node(luci::CircleConst *const_node)
throw std::runtime_error("Given constant node has no quantization parameter");
}
+ auto name = const_node->name();
+ assert(name.length() > 0);
auto g = const_node->graph();
auto new_const_node = g->nodes()->create<luci::CircleConst>();
@@ -64,6 +65,7 @@ luci::CircleConst *dequantized_const_node(luci::CircleConst *const_node)
}
new_const_node->size<loco::DataType::FLOAT32>(dim_size);
new_const_node->shape_status(luci::ShapeStatus::VALID);
+ new_const_node->name(name + "_DQ");
const int32_t q_dim = const_node->quantparam()->quantized_dimension;
const int32_t q_dim_value = const_node->dim(q_dim).value();
@@ -81,8 +83,8 @@ luci::CircleConst *dequantized_const_node(luci::CircleConst *const_node)
qd = 0;
new_const_node->at<loco::DataType::FLOAT32>(i) =
- (float)(const_node->at<loco::DataType::S8>(i) - const_node->quantparam()->zerop.at(qd)) *
- const_node->quantparam()->scale.at(qd);
+ (float)(const_node->at<loco::DataType::S8>(i) - const_node->quantparam()->zerop.at(qd)) *
+ const_node->quantparam()->scale.at(qd);
}
}
else
@@ -94,9 +96,9 @@ luci::CircleConst *dequantized_const_node(luci::CircleConst *const_node)
qd = 0;
new_const_node->at<loco::DataType::FLOAT32>(i) =
- (float)((int)const_node->at<loco::DataType::U8>(i) -
- const_node->quantparam()->zerop.at(qd)) *
- const_node->quantparam()->scale.at(qd);
+ (float)((int)const_node->at<loco::DataType::U8>(i) -
+ const_node->quantparam()->zerop.at(qd)) *
+ const_node->quantparam()->scale.at(qd);
}
}
@@ -192,6 +194,8 @@ bool FoldDequantizePass::run(loco::Graph *g)
if (replace_const_node(const_node_user, const_node))
{
loco::replace(dequant).with(const_node_user);
+ luci::add_origin(loco::must_cast<luci::CircleNode *>(const_node_user),
+ luci::get_origin(dequant));
changed = true;
}
}
diff --git a/compiler/luci/service/src/Nodes/CircleOutput.cpp b/compiler/luci/pass/src/FoldDequantizePass.test.cpp
index d4c8da2d8..d82a7bc87 100644
--- a/compiler/luci/service/src/Nodes/CircleOutput.cpp
+++ b/compiler/luci/pass/src/FoldDequantizePass.test.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,14 +14,13 @@
* limitations under the License.
*/
-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "luci/Pass/FoldDequantizePass.h"
-namespace luci
-{
+#include <gtest/gtest.h>
-ShapeSignature ssinf::Algorithm::visit(const luci::CircleOutput *node)
+TEST(FoldDequantizePassTest, name)
{
- return input_arg_signature(node, 0);
+ luci::FoldDequantizePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
}
-
-} // namespace luci
diff --git a/compiler/luci/pass/src/FoldSparseToDensePass.cpp b/compiler/luci/pass/src/FoldSparseToDensePass.cpp
new file mode 100644
index 000000000..0c6fc43ed
--- /dev/null
+++ b/compiler/luci/pass/src/FoldSparseToDensePass.cpp
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldSparseToDensePass.h"
+#include "CircleOptimizerUtils.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+
+/**
+ * Fold to const if
+ *
+ * 1. indices has 0-sized static shape such as [0]
+ * (i.e., output is filled with default value)
+ * 2. default_value: const scalar
+ * 3. output_shape: const
+ *
+ * TODO: Support more general patterns
+ **/
+template <loco::DataType IndexT, loco::DataType ValueT>
+bool fold_sparse_to_dense(luci::CircleSparseToDense *stod)
+{
+ const auto indices = loco::must_cast<luci::CircleNode *>(stod->indices());
+ const auto default_value = loco::must_cast<luci::CircleConst *>(stod->default_value());
+ const auto output_shape = loco::must_cast<luci::CircleConst *>(stod->output_shape());
+
+ bool has_zero = false;
+ for (uint32_t i = 0; i < indices->rank(); i++)
+ {
+ if (indices->dim(i).known() && indices->dim(i).value() == 0)
+ has_zero = true;
+ }
+ if (!has_zero)
+ return false;
+
+ if (default_value->rank() != 0 || default_value->size<ValueT>() != 1)
+ return false;
+
+ auto rank = output_shape->size<IndexT>();
+ std::vector<uint32_t> shape;
+ for (uint32_t i = 0; i < rank; i++)
+ {
+ auto dim = output_shape->at<IndexT>(i);
+ assert(dim >= 0 && dim <= std::numeric_limits<uint32_t>::max());
+ if (!(dim >= 0 && dim <= std::numeric_limits<uint32_t>::max()))
+ return false;
+
+ shape.push_back(dim);
+ }
+
+ auto name = stod->name();
+ assert(name.length() > 0);
+ auto constant = stod->graph()->nodes()->create<luci::CircleConst>();
+ constant->dtype(default_value->dtype());
+ constant->rank(rank);
+ uint32_t dim_size = 1;
+ for (uint32_t i = 0; i < rank; i++)
+ {
+ constant->dim(i).set(shape[i]);
+ dim_size *= shape[i];
+ }
+
+ constant->size<ValueT>(dim_size);
+ const auto value = default_value->scalar<ValueT>();
+ for (uint32_t i = 0; i < dim_size; i++)
+ constant->at<ValueT>(i) = value;
+
+ constant->shape_status(luci::ShapeStatus::VALID);
+ constant->name(name + "_D");
+
+ loco::replace(stod).with(constant);
+
+ return true;
+}
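+
+// Worked example (illustrative, matching the conditions above): for S64
+// indices of shape [0, 1], output_shape = {3} and scalar default_value 2,
+// fold_sparse_to_dense<loco::DataType::S64, loco::DataType::S64> replaces
+// the SparseToDense with a CircleConst of shape [3] holding {2, 2, 2}.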
+
+bool fold_sparse_to_dense(luci::CircleSparseToDense *stod)
+{
+ auto indices = loco::must_cast<luci::CircleNode *>(stod->indices());
+ auto default_value = dynamic_cast<luci::CircleConst *>(stod->default_value());
+ if (not default_value)
+ return false;
+
+ auto output_shape = dynamic_cast<luci::CircleConst *>(stod->output_shape());
+ if (not output_shape)
+ return false;
+
+ // Illegal input check
+ if (indices->dtype() != output_shape->dtype())
+ throw std::runtime_error("indices and output_shape of SparseToDense must have the same dtype");
+
+ // TODO: Support more data types
+ if (indices->dtype() == loco::DataType::S64)
+ {
+ if (default_value->dtype() == loco::DataType::S64)
+ {
+ return fold_sparse_to_dense<loco::DataType::S64, loco::DataType::S64>(stod);
+ }
+ }
+ return false;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * Constant Folding for SparseToDense Op
+ **/
+bool FoldSparseToDensePass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto stod = dynamic_cast<luci::CircleSparseToDense *>(node))
+ {
+ if (fold_sparse_to_dense(stod))
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/FoldSparseToDensePass.test.cpp b/compiler/luci/pass/src/FoldSparseToDensePass.test.cpp
new file mode 100644
index 000000000..7c6dcb033
--- /dev/null
+++ b/compiler/luci/pass/src/FoldSparseToDensePass.test.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FoldSparseToDensePass.h"
+#include "PassTestGraphs.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+/**
+ * Graph that has a SparseToDense Op with zero-sized indices
+ *
+ * BEFORE
+ * - shape of indices: [0,1]
+ * - output_shape: [3]
+ * - default_value: scalar 2
+ *
+ * [indices] [output_shape] [values] [default_value]
+ * | | | |
+ * +------[SparseToDense]------+
+ *
+ * AFTER
+ *
+ * [Const] (shape: [3], values: [2, 2, 2])
+ *
+ */
+class S64SparseToDenseZeroIndicesTest : public luci::ConstantFoldingAddTestGraph,
+ public ::testing::Test
+{
+public:
+ S64SparseToDenseZeroIndicesTest() : luci::ConstantFoldingAddTestGraph({3}, loco::DataType::S64) {}
+
+ virtual void SetUp() { init(); }
+
+ loco::Node *createFoldedPattern() override
+ {
+ _stod = _g.nodes()->create<luci::CircleSparseToDense>();
+ _indices = _g.nodes()->create<luci::CircleConst>();
+ _output_shape = _g.nodes()->create<luci::CircleConst>();
+ _values = _g.nodes()->create<luci::CircleConst>();
+ _default_value = _g.nodes()->create<luci::CircleConst>();
+
+ _stod->dtype(loco::DataType::S64);
+ _indices->dtype(loco::DataType::S64);
+ _output_shape->dtype(loco::DataType::S64);
+ _values->dtype(loco::DataType::S64);
+ _default_value->dtype(loco::DataType::S64);
+
+ _indices->shape({0, 1});
+ _output_shape->shape({1});
+ _values->shape({0});
+ _default_value->rank(0);
+
+ _indices->size<loco::DataType::S64>(0);
+ _output_shape->size<loco::DataType::S64>(1);
+ _output_shape->at<loco::DataType::S64>(0) = 3;
+ _values->size<loco::DataType::S64>(0);
+ _default_value->size<loco::DataType::S64>(1);
+ _default_value->at<loco::DataType::S64>(0) = 2;
+
+ _stod->indices(_indices);
+ _stod->output_shape(_output_shape);
+ _stod->values(_values);
+ _stod->default_value(_default_value);
+
+ _stod->name("stod");
+ _indices->name("indices");
+ _output_shape->name("output_shape");
+ _values->name("values");
+ _default_value->name("default_value");
+
+ return _stod;
+ }
+
+protected:
+ luci::CircleSparseToDense *_stod = nullptr;
+ luci::CircleConst *_indices = nullptr;
+ luci::CircleConst *_output_shape = nullptr;
+ luci::CircleConst *_values = nullptr;
+ luci::CircleConst *_default_value = nullptr;
+};
+
+} // namespace
+
+TEST(FoldSparseToDensePassTest, name)
+{
+ luci::FoldSparseToDensePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST_F(S64SparseToDenseZeroIndicesTest, fold_stod_with_zero_indices)
+{
+ luci::FoldSparseToDensePass pass;
+ while (pass.run(graph()))
+ ;
+
+ auto folded_const = getFoldedPattern();
+ EXPECT_NE(nullptr, folded_const);
+
+ // Check type, shape, values of folded const
+ EXPECT_EQ(loco::DataType::S64, folded_const->dtype());
+ EXPECT_EQ(1, folded_const->rank());
+ EXPECT_EQ(3, folded_const->dim(0).value());
+ EXPECT_EQ(2, folded_const->at<loco::DataType::S64>(0));
+ EXPECT_EQ(2, folded_const->at<loco::DataType::S64>(1));
+ EXPECT_EQ(2, folded_const->at<loco::DataType::S64>(2));
+}
+
+TEST_F(S64SparseToDenseZeroIndicesTest, illegal_input_NEG)
+{
+ _indices->dtype(loco::DataType::S32);
+
+ luci::FoldSparseToDensePass pass;
+ EXPECT_ANY_THROW(pass.run(graph()));
+}
diff --git a/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.cpp b/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.cpp
new file mode 100644
index 000000000..2c990f0a5
--- /dev/null
+++ b/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ForwardReshapeToUnaryOpPass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+#include <luci/Log.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+#include <luci/Service/CircleShapeInference.h>
+#include <luci/Service/Nodes/CircleConst.h>
+
+namespace
+{
+
+luci::CircleReshape *as_reshape(loco::Node *node)
+{
+ return dynamic_cast<luci::CircleReshape *>(node);
+}
+
+luci::CircleConst *clone_shape(luci::CircleReshape *reshape)
+{
+ const auto shape = dynamic_cast<luci::CircleConst *>(reshape->shape());
+ // only support CircleConst for now
+ if (shape == nullptr)
+ return nullptr;
+
+ // NOTE tflite and circle only support S32
+ // TODO check with assert() once import handles this
+ auto dtype = shape->dtype();
+ if (dtype != loco::DataType::S32)
+ return nullptr;
+
+ return luci::clone(shape);
+}
+
+void copy_shape(luci::CircleReshape *reshape, luci::CircleReshape *new_reshape)
+{
+ auto ns_rank = reshape->newShape()->rank();
+ new_reshape->newShape()->rank(ns_rank);
+ for (uint32_t r = 0; r < ns_rank; ++r)
+ new_reshape->newShape()->dim(r) = reshape->newShape()->dim(r);
+}
+
+bool forward_reshape(luci::CircleReshape *reshape, luci::CircleNeg *neg)
+{
+ assert(reshape != nullptr);
+ assert(neg != nullptr);
+
+ luci::CircleConst *cloned_shape = clone_shape(reshape);
+ if (cloned_shape == nullptr)
+ return false;
+
+ auto name = reshape->name();
+ assert(name.length() > 0);
+ loco::Graph *graph = neg->graph();
+ // create reshape placed after neg
+ luci::CircleReshape *new_reshape = graph->nodes()->create<luci::CircleReshape>();
+ copy_shape(reshape, new_reshape);
+ new_reshape->shape(cloned_shape);
+ new_reshape->name(name + "_C");
+ luci::add_origin(new_reshape, luci::get_origin(reshape));
+
+ // reconnect network
+ loco::replace(neg).with(new_reshape);
+ neg->x(reshape->tensor());
+ new_reshape->tensor(neg);
+
+ // Do shape inference for this node again.
+ neg->shape_status(luci::ShapeStatus::UNDEFINED);
+
+ return true;
+}
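+
+// Concrete sketch (shapes assumed for illustration): for
+//   input (2,2,2) -> Reshape(2,4) -> Neg
+// forward_reshape rewires the graph to
+//   input (2,2,2) -> Neg -> Reshape(2,4)
+// Because Neg is elementwise, the result is unchanged; the Neg's shape status
+// is reset to UNDEFINED so shape inference recomputes it as (2,2,2).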
+
+class ForwardReshape final : public luci::CircleNodeMutableVisitor<bool>
+{
+protected:
+ bool visit(luci::CircleNode *node)
+ {
+ LOGGER(l);
+ INFO(l) << "ForwardReshape: Unsupported operator: " << node->name() << std::endl;
+ return false;
+ }
+
+ bool visit(luci::CircleNeg *node)
+ {
+ auto reshape = as_reshape(node->x());
+ if (reshape == nullptr)
+ return false;
+ return forward_reshape(reshape, node);
+ }
+
+ // TODO add more unary operators
+};
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * BEFORE
+ * |
+ * [CircleNode] [CircleConst]
+ * | /
+ * [CircleReshape]
+ * / |
+ * [CircleNode] [(UnaryOp)]
+ * | | \
+ * | | [CircleNode]
+ * | | |
+ *
+ * UnaryOp: CircleNeg, ...
+ *
+ * AFTER
+ * |
+ * [CircleConst] [CircleNode]
+ * | / |
+ * [CircleReshape] [(UnaryOp)] [CircleConst]
+ * | | /
+ * [CircleNode] [CircleReshape]
+ * | | \
+ * | | [CircleNode]
+ * | | |
+ *
+ * Note: new [CircleReshape] after [(UnaryOp)] added
+ */
+bool ForwardReshapeToUnaryOpPass::run(loco::Graph *g)
+{
+ bool changed = false;
+ ForwardReshape forward;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+ if (circle_node->accept(&forward))
+ changed = true;
+ }
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.test.cpp b/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.test.cpp
new file mode 100644
index 000000000..2593a014c
--- /dev/null
+++ b/compiler/luci/pass/src/ForwardReshapeToUnaryOpPass.test.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ForwardReshapeToUnaryOpPass.h"
+#include "luci/Pass/CircleShapeInferencePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <luci/test/TestIOGraph.h>
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+namespace
+{
+
+using namespace luci::test;
+
+class ReshapeNegGraphlet
+{
+public:
+ ReshapeNegGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 shape_in, const ShapeU32 shape_out)
+ {
+ std::vector<uint32_t> shape_out_v = shape_out;
+
+ _reshape_shape = g->nodes()->create<luci::CircleConst>();
+ _reshape = g->nodes()->create<luci::CircleReshape>();
+ _neg = g->nodes()->create<luci::CircleNeg>();
+
+ _reshape_shape->dtype(loco::DataType::S32);
+ _reshape_shape->rank(1);
+ _reshape_shape->dim(0).set(shape_out_v.size());
+ _reshape_shape->shape_status(luci::ShapeStatus::VALID);
+ // values
+ const auto size = shape_out_v.size();
+ _reshape_shape->size<loco::DataType::S32>(size);
+ for (uint32_t i = 0; i < size; i++)
+ _reshape_shape->at<loco::DataType::S32>(i) = shape_out_v[i];
+
+ _reshape_shape->name("reshape_shape");
+ _reshape->name("reshape");
+ _neg->name("neg");
+ }
+
+protected:
+ luci::CircleReshape *_reshape = nullptr;
+ luci::CircleNeg *_neg = nullptr;
+ luci::CircleConst *_reshape_shape = nullptr;
+};
+
+class ForwardReshapeToNegGraph : public TestIOGraph, public ReshapeNegGraphlet
+{
+public:
+ ForwardReshapeToNegGraph() = default;
+
+public:
+ void init(const ShapeU32 shape_in, const ShapeU32 shape_out)
+ {
+ TestIOGraph::init(shape_in, shape_out);
+ ReshapeNegGraphlet::init(g(), shape_in, shape_out);
+
+ // connect network
+ _reshape->tensor(input());
+ _reshape->shape(_reshape_shape);
+ _neg->x(_reshape);
+
+ output()->from(_neg);
+ }
+};
+
+class ForwardReshapeToNegGraphTest : public ::testing::Test
+{
+public:
+ ForwardReshapeToNegGraphTest() = default;
+
+ void run_pass(void)
+ {
+ while (_pass.run(_graph.g()))
+ ;
+ }
+
+protected:
+ ForwardReshapeToNegGraph _graph;
+ luci::ForwardReshapeToUnaryOpPass _pass;
+};
+
+} // namespace
+
+TEST(ForwardReshapeToUnaryOpPassTest, name)
+{
+ luci::ForwardReshapeToUnaryOpPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST_F(ForwardReshapeToNegGraphTest, simple_forward)
+{
+ _graph.init({2, 2, 2}, {2, 4});
+
+ run_pass();
+
+ auto reshape = dynamic_cast<luci::CircleReshape *>(_graph.output()->from());
+ auto neg = dynamic_cast<luci::CircleNeg *>(_graph.output()->from());
+ ASSERT_NE(nullptr, reshape);
+ ASSERT_EQ(nullptr, neg);
+ neg = dynamic_cast<luci::CircleNeg *>(reshape->tensor());
+ ASSERT_NE(nullptr, neg);
+}
diff --git a/compiler/luci/pass/src/FuseActivationFunctionPass.cpp b/compiler/luci/pass/src/FuseActivationFunctionPass.cpp
index 844541d2d..66e341518 100644
--- a/compiler/luci/pass/src/FuseActivationFunctionPass.cpp
+++ b/compiler/luci/pass/src/FuseActivationFunctionPass.cpp
@@ -17,7 +17,9 @@
#include "luci/Pass/FuseActivationFunctionPass.h"
#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeMixins.h>
#include <luci/IR/CircleOpcode.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace luci
{
@@ -32,10 +34,15 @@ bool fuse_activation_function(luci::CircleNode *node)
return false;
auto node_with_fused_act =
- dynamic_cast<luci::LuciNodeMixin<luci::LuciNodeTrait::FusedActFunc> *>(pred_node);
+ dynamic_cast<luci::CircleNodeMixin<luci::CircleNodeTrait::FusedActFunc> *>(pred_node);
if (node_with_fused_act == nullptr)
return false;
+ // TODO remove this work-around
+ // This will skip fuse for concat as luci-interpreter doesn't support this yet
+ if (dynamic_cast<luci::CircleConcatenation *>(pred_node) != nullptr)
+ return false;
+
auto fused_act = node_with_fused_act->fusedActivationFunction();
luci::FusedActFunc target_func = luci::FusedActFunc::UNDEFINED;
@@ -76,6 +83,7 @@ bool fuse_activation_function(luci::CircleNode *node)
return false;
node_with_fused_act->fusedActivationFunction(target_func);
+ luci::add_origin(pred_node, luci::get_origin(node));
loco::replace(node).with(pred_node);
node->drop();
diff --git a/compiler/luci/pass/src/FuseActivationFunctionPass.test.cpp b/compiler/luci/pass/src/FuseActivationFunctionPass.test.cpp
index 226a303a1..56b414143 100644
--- a/compiler/luci/pass/src/FuseActivationFunctionPass.test.cpp
+++ b/compiler/luci/pass/src/FuseActivationFunctionPass.test.cpp
@@ -14,15 +14,19 @@
* limitations under the License.
*/
-#include "FuseActivationFunctionPassInternal.h"
+#include "luci/Pass/FuseActivationFunctionPass.h"
#include <luci/IR/CircleNodes.h>
+#include <luci/test/TestIOGraph.h>
+
#include <gtest/gtest.h>
namespace
{
+using namespace luci::test;
+
/**
* Simple graph for test
*
@@ -41,60 +45,148 @@ namespace
* [Conv2]
*
*/
-class SimpleGraph
+class ConvReluConvGraphlet
+{
+public:
+ ConvReluConvGraphlet() = default;
+
+ void init(loco::Graph *g)
+ {
+ _conv1 = g->nodes()->create<luci::CircleConv2D>();
+ _conv2 = g->nodes()->create<luci::CircleConv2D>();
+ _relu = g->nodes()->create<luci::CircleRelu>();
+ _conv1_f = g->nodes()->create<luci::CircleConst>();
+ _conv1_b = g->nodes()->create<luci::CircleConst>();
+ _conv2_f = g->nodes()->create<luci::CircleConst>();
+ _conv2_b = g->nodes()->create<luci::CircleConst>();
+
+ _conv1->fusedActivationFunction(luci::FusedActFunc::NONE);
+
+ _conv1->name("conv1");
+ _conv2->name("conv2");
+ _relu->name("relu");
+ _conv1_f->name("conv1f");
+ _conv1_b->name("conv1b");
+ _conv2_f->name("conv2f");
+ _conv2_b->name("conv2b");
+ }
+
+public:
+ luci::CircleRelu *relu() { return _relu; }
+ luci::CircleConv2D *conv1() { return _conv1; }
+ luci::CircleConv2D *conv2() { return _conv2; }
+
+protected:
+ luci::CircleConv2D *_conv1 = nullptr;
+ luci::CircleConv2D *_conv2 = nullptr;
+ luci::CircleRelu *_relu = nullptr;
+ luci::CircleConst *_conv1_f = nullptr;
+ luci::CircleConst *_conv1_b = nullptr;
+ luci::CircleConst *_conv2_f = nullptr;
+ luci::CircleConst *_conv2_b = nullptr;
+};
+
+class FuseActTestGraph : public TestIOGraph, public ConvReluConvGraphlet
{
public:
- SimpleGraph()
+ FuseActTestGraph() = default;
+
+ void init(void)
{
- conv1 = g.nodes()->create<luci::CircleConv2D>();
- conv2 = g.nodes()->create<luci::CircleConv2D>();
- relu = g.nodes()->create<luci::CircleRelu>();
+ TestIOGraph::init({1}, {1});
+ ConvReluConvGraphlet::init(g());
- conv1->fusedActivationFunction(luci::FusedActFunc::NONE);
+ _conv1->input(input());
+ _conv1->filter(_conv1_f);
+ _conv1->bias(_conv1_b);
- relu->features(conv1);
- conv2->input(relu);
+ _relu->features(_conv1);
+
+ _conv2->input(_relu);
+ _conv2->filter(_conv2_f);
+ _conv2->bias(_conv2_b);
+
+ output()->from(_conv2);
}
+};
+class ConvHasMultiSuccGraph : public TestIOGraph, public ConvReluConvGraphlet
+{
public:
- loco::Graph g;
- luci::CircleConv2D *conv1;
- luci::CircleConv2D *conv2;
- luci::CircleRelu *relu;
+ ConvHasMultiSuccGraph() = default;
+
+ void init(void)
+ {
+ TestIOGraph::init({1}, {1});
+ ConvReluConvGraphlet::init(g());
+
+ _conv1->input(input());
+ _conv1->filter(_conv1_f);
+ _conv1->bias(_conv1_b);
+
+ _relu->features(_conv1);
+
+ _conv2->input(_conv1);
+ _conv2->filter(_conv2_f);
+ _conv2->bias(_conv2_b);
+
+ output()->from(_relu); // We need to check from relu
+ }
};
+// TODO use ::testing::Test
+
} // namespace
+TEST(FuseActivationFunctionPassTest, name)
+{
+ luci::FuseActivationFunctionPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
TEST(FusePreActivationBatchNorm, fuse_activation_function)
{
- SimpleGraph g;
+ FuseActTestGraph g;
+ luci::FuseActivationFunctionPass pass;
- EXPECT_TRUE(luci::fuse_activation_function(g.relu));
+ g.init();
- EXPECT_EQ(g.conv1, g.conv2->input());
+ EXPECT_TRUE(pass.run(g.g()));
+ EXPECT_EQ(g.conv1(), g.conv2()->input());
}
TEST(FusePreActivationBatchNorm, fuse_activation_function_dup_relu)
{
- SimpleGraph g;
- g.conv1->fusedActivationFunction(luci::FusedActFunc::RELU);
+ FuseActTestGraph g;
+ luci::FuseActivationFunctionPass pass;
- EXPECT_TRUE(luci::fuse_activation_function(g.relu));
+ g.init();
+ g.conv1()->fusedActivationFunction(luci::FusedActFunc::RELU);
- EXPECT_EQ(g.conv1, g.conv2->input());
+ EXPECT_TRUE(pass.run(g.g()));
+ EXPECT_EQ(g.conv1(), g.conv2()->input());
}
-TEST(FusePreActivationBatchNorm, fuse_activation_function_NEG)
+TEST(FusePreActivationBatchNorm, fuse_activation_function_mulsucc_NEG)
{
- SimpleGraph g;
- g.conv2->input(g.conv1);
+ ConvHasMultiSuccGraph g;
+ luci::FuseActivationFunctionPass pass;
+
+ g.init();
- // Conv1 has multiple successors
- EXPECT_FALSE(luci::fuse_activation_function(g.relu));
+ // Relu input Conv2D has multiple successors
+ EXPECT_FALSE(pass.run(g.g()));
+}
+
+TEST(FusePreActivationBatchNorm, fuse_activation_function_tanh_NEG)
+{
+ FuseActTestGraph g;
+ luci::FuseActivationFunctionPass pass;
- g.conv2->input(g.relu);
- g.conv1->fusedActivationFunction(luci::FusedActFunc::TANH);
+ g.init();
+ g.conv1()->fusedActivationFunction(luci::FusedActFunc::TANH);
- // Conv1 already has activation function
- EXPECT_FALSE(luci::fuse_activation_function(g.relu));
+ // Relu input Conv2D already has activation function
+ EXPECT_FALSE(pass.run(g.g()));
}
diff --git a/compiler/luci/pass/src/FuseAddWithTConvPass.cpp b/compiler/luci/pass/src/FuseAddWithTConvPass.cpp
index bd7805f6a..2bca57014 100644
--- a/compiler/luci/pass/src/FuseAddWithTConvPass.cpp
+++ b/compiler/luci/pass/src/FuseAddWithTConvPass.cpp
@@ -17,20 +17,30 @@
#include "luci/Pass/FuseAddWithTConvPass.h"
#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace
{
/**
- * Fuse add to TCONV if possible
+ * Fuse Add to TransposeConv if possible
*
* BEFORE
- *
- * [CircleTransposeConv]
+ * |
+ * [CircleConst] [CircleTransposeConv]
+ * \ |
+ * [CircleAdd]
* |
- * [add]
+ *
* AFTER
+ * |
+ * [CircleConst] |
+ * \ |
+ * [CircleTransposeConv] [CircleAdd]
+ * |
+ * ([CircleRelu6])
+ * |
*
- * [CircleTransposeConv]
+ * Note: CircleRelu6 is inserted if Add activation is ReLU6
*/
bool fuse_add_with_tconv(luci::CircleTransposeConv *tconv)
{
@@ -81,9 +91,13 @@ bool fuse_add_with_tconv(luci::CircleTransposeConv *tconv)
if (add->fusedActivationFunction() == luci::FusedActFunc::RELU6)
{
+ auto name = addition->name();
+ assert(name.length() > 0);
// separate relu op from add op
auto relu = add->graph()->nodes()->create<luci::CircleRelu6>();
relu->features(tconv);
+ relu->name(name + "/Relu6");
+ luci::add_origin(relu, luci::get_origin(add));
// remove add node
replace(add).with(relu);
@@ -93,6 +107,9 @@ bool fuse_add_with_tconv(luci::CircleTransposeConv *tconv)
replace(add).with(tconv);
}
+ // set origin
+ luci::add_origin(tconv, luci::get_origin(add));
+
return true;
}
diff --git a/compiler/luci/pass/src/FuseAddWithTConvPass.test.cpp b/compiler/luci/pass/src/FuseAddWithTConvPass.test.cpp
new file mode 100644
index 000000000..8748d73ef
--- /dev/null
+++ b/compiler/luci/pass/src/FuseAddWithTConvPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseAddWithTConvPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(FuseAddWithTConvPassTest, name)
+{
+ luci::FuseAddWithTConvPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp
index c0583d848..09180d8c1 100644
--- a/compiler/luci/pass/src/FuseBCQPass.cpp
+++ b/compiler/luci/pass/src/FuseBCQPass.cpp
@@ -17,6 +17,7 @@
#include "luci/Pass/FuseBCQPass.h"
#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
#include <luci/Log.h>
#include <cassert>
@@ -111,7 +112,7 @@ template <> class BCQFuser<1>
{
public:
BCQFuser<1>(int32_t original_output_cnt, int32_t bundle_cnt)
- : _original_output_cnt{original_output_cnt}, _bundle_cnt{bundle_cnt}
+ : _original_output_cnt{original_output_cnt}, _bundle_cnt{bundle_cnt}
{
// Do nothing
}
@@ -133,7 +134,7 @@ public:
{
const auto prefix = (output_node->index() - (_original_output_cnt + 1)) / (_bundle_cnt);
const MetadataType metadata_type = static_cast<MetadataType>(
- (output_node->index() - (_original_output_cnt + 1)) % (_bundle_cnt));
+ (output_node->index() - (_original_output_cnt + 1)) % (_bundle_cnt));
const auto circle_node = loco::must_cast<luci::CircleNode *>(output_node->from());
add_BCQ_info_node(prefix, metadata_type, circle_node);
}
@@ -156,13 +157,18 @@ public:
if (prefix == -1 || !is_valid_prefix(prefix))
continue;
+ auto name = gather->name();
+ assert(name.length() > 0);
+
auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>();
+ luci::add_origin(bcq_gather, luci::get_origin(gather));
bcq_gather->op_version(1);
bcq_gather->input_scales(alpha(g, prefix));
bcq_gather->input_binary(packed_binary_code(g, prefix));
bcq_gather->indices(gather->indices());
bcq_gather->input_clusters(packed_clusters(g, prefix));
+ bcq_gather->name(name + "/BCQGather");
if (_do_w_x[prefix]->at<loco::DataType::BOOL>(0))
{
@@ -177,7 +183,7 @@ public:
bcq_gather->axis(axis_transpose);
const auto indices_rank =
- loco::must_cast<luci::CircleNode *>(gather->indices())->rank();
+ loco::must_cast<luci::CircleNode *>(gather->indices())->rank();
auto perm = g->nodes()->create<luci::CircleConst>();
perm->dtype(loco::DataType::S32);
@@ -188,10 +194,13 @@ public:
perm->at<loco::DataType::S32>(idx) = idx + 1;
perm->at<loco::DataType::S32>(indices_rank) = 0;
perm->shape_status(luci::ShapeStatus::VALID);
+ perm->name(name + "/Transpose/perm");
auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
+ luci::add_origin(output_transpose, luci::get_origin(gather));
output_transpose->a(bcq_gather);
output_transpose->perm(perm);
+ output_transpose->name(name + "/Transpose");
loco::replace(gather).with(output_transpose);
}
@@ -209,7 +218,11 @@ public:
if (prefix == -1 || !is_valid_prefix(prefix))
continue;
+ auto name = fully_connected->name();
+ assert(name.length() > 0);
+
auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
+ luci::add_origin(bcq_fc, luci::get_origin(fully_connected));
bcq_fc->op_version(1);
bcq_fc->weights_scales(alpha(g, prefix));
@@ -217,6 +230,7 @@ public:
bcq_fc->bias(fully_connected->bias());
bcq_fc->weights_clusters(packed_clusters(g, prefix));
bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction());
+ bcq_fc->name(name + "/BCQFullyConnected");
loco::Node *bcq_input = fully_connected->input();
@@ -231,18 +245,16 @@ public:
new_shape->rank(1);
new_shape->dim(0) = 2;
- auto batch_size = 1;
- for (uint32_t i = 0; i < original_input->rank() - 1; ++i)
- batch_size *= original_input->dim(i).value();
-
- new_shape->at<loco::DataType::S32>(0) = batch_size;
- new_shape->at<loco::DataType::S32>(1) =
- original_input->dim(original_input->rank() - 1).value();
+ new_shape->at<loco::DataType::S32>(0) = -1;
+ new_shape->at<loco::DataType::S32>(1) = weights->dim(1).value();
new_shape->shape_status(luci::ShapeStatus::VALID);
+ new_shape->name(name + "/Reshape/shape");
auto reshape = g->nodes()->create<luci::CircleReshape>();
+ luci::add_origin(reshape, luci::get_origin(fully_connected));
reshape->tensor(original_input);
reshape->shape(new_shape);
+ reshape->name(name + "/Reshape");
bcq_input = reshape;
}
@@ -258,23 +270,28 @@ public:
perm->at<loco::DataType::S32>(0) = 1;
perm->at<loco::DataType::S32>(1) = 0;
perm->shape_status(luci::ShapeStatus::VALID);
+ perm->name(name + "/Transpose/perm");
auto input_transpose = g->nodes()->create<luci::CircleTranspose>();
+ luci::add_origin(input_transpose, luci::get_origin(fully_connected));
input_transpose->a(bcq_input);
input_transpose->perm(perm);
+ input_transpose->name(name + "_input/Transpose");
bcq_fc->input(input_transpose);
auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
+ luci::add_origin(output_transpose, luci::get_origin(fully_connected));
output_transpose->a(bcq_fc);
output_transpose->perm(perm);
+ output_transpose->name(name + "_output/Transpose");
loco::replace(fully_connected).with(output_transpose);
return true;
}
else if (auto weights_as_input =
- dynamic_cast<luci::CircleConst *>(fully_connected->input()))
+ dynamic_cast<luci::CircleConst *>(fully_connected->input()))
{
auto prefix = get_prefix_of_const(weights_as_input);
if (prefix == -1 || !is_valid_prefix(prefix))
@@ -282,6 +299,9 @@ public:
assert(_do_w_x[prefix]->at<loco::DataType::BOOL>(0) == true);
+ auto name = weights_as_input->name();
+ assert(name.length() > 0);
+
auto perm = g->nodes()->create<luci::CircleConst>();
perm->dtype(loco::DataType::S32);
perm->size<loco::DataType::S32>(2);
@@ -290,12 +310,16 @@ public:
perm->at<loco::DataType::S32>(0) = 1;
perm->at<loco::DataType::S32>(1) = 0;
perm->shape_status(luci::ShapeStatus::VALID);
+ perm->name(name + "/Transpose/perm");
auto input_transpose = g->nodes()->create<luci::CircleTranspose>();
+ luci::add_origin(input_transpose, luci::get_origin(fully_connected));
input_transpose->a(fully_connected->weights());
input_transpose->perm(perm);
+ input_transpose->name(name + "/Transpose");
auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
+ luci::add_origin(bcq_fc, luci::get_origin(fully_connected));
assert(dynamic_cast<luci::CircleOutputExclude *>(fully_connected->bias()) != nullptr);
@@ -308,6 +332,8 @@ public:
bcq_fc->weights_hidden_size(weights_as_input->dim(1).value());
bcq_fc->input(input_transpose);
+ bcq_fc->name(name + "/BCQFullyConnected");
+
loco::replace(fully_connected).with(bcq_fc);
return true;
@@ -533,7 +559,7 @@ private:
new_beta->dim(1) = _packed_binary_code[prefix]->dim(1);
for (uint32_t i = 0; i < _packed_binary_code[prefix]->size<loco::DataType::S32>(); ++i)
new_beta->at<loco::DataType::S32>(i) =
- _packed_binary_code[prefix]->at<loco::DataType::S32>(i);
+ _packed_binary_code[prefix]->at<loco::DataType::S32>(i);
new_beta->shape_status(luci::ShapeStatus::VALID);
return new_beta;
@@ -556,9 +582,9 @@ private:
for (int i = 0; i < number_of_clusters; ++i)
{
packed_clusters->at<loco::DataType::S32>(i * 2) =
- qbits_of_clusters->at<loco::DataType::S32>(i);
+ qbits_of_clusters->at<loco::DataType::S32>(i);
packed_clusters->at<loco::DataType::S32>(i * 2 + 1) =
- size_of_clusters->at<loco::DataType::S32>(i);
+ size_of_clusters->at<loco::DataType::S32>(i);
}
return packed_clusters;
diff --git a/compiler/luci/pass/src/FuseBCQPass.test.cpp b/compiler/luci/pass/src/FuseBCQPass.test.cpp
new file mode 100644
index 000000000..73677affd
--- /dev/null
+++ b/compiler/luci/pass/src/FuseBCQPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseBCQPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(FuseBCQPassTest, name)
+{
+ luci::FuseBCQPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/FuseBatchNormWithConvPass.cpp b/compiler/luci/pass/src/FuseBatchNormWithConvPass.cpp
new file mode 100644
index 000000000..062da7058
--- /dev/null
+++ b/compiler/luci/pass/src/FuseBatchNormWithConvPass.cpp
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseBatchNormWithConvPass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+namespace
+{
+/**
+ * Fuse Mul-Add to Conv2D if possible.
+ *
+ * NOTE TF's BatchNormalization is converted to Mul and Add.
+ *
+ * BEFORE
+ * | [CircleConst]
+ * | / [CircleConst]
+ * | / /
+ * [CircleConv2D] [CircleConst]
+ * | /
+ * [CircleMul] [CircleConst]
+ * | /
+ * [CircleAdd]
+ * |
+ *
+ * AFTER
+ * | [CircleConst]
+ * +--------------+ / [CircleConst]
+ * | | / /
+ * | [CircleConv2D] [CircleConst]
+ * [CircleConst] | | /
+ * [CircleConst] \ | [CircleMul] [CircleConst]
+ * \ \ | | /
+ * [CircleConv2D] [CircleAdd]
+ * |
+ */
+bool fused_batch_norm_with_conv(luci::CircleAdd *add)
+{
+ luci::CircleMul *mul = nullptr;
+ luci::CircleConst *shift = nullptr;
+ if (auto add_lhs = dynamic_cast<luci::CircleMul *>(add->x()))
+ {
+ mul = add_lhs;
+ shift = dynamic_cast<luci::CircleConst *>(add->y());
+ }
+ else if (auto add_rhs = dynamic_cast<luci::CircleMul *>(add->y()))
+ {
+ mul = add_rhs;
+ shift = dynamic_cast<luci::CircleConst *>(add->x());
+ }
+
+ // If CircleMul is not found or constant operand of CircleAdd is not found,
+ // this pass cannot be applied.
+ if (mul == nullptr || shift == nullptr)
+ return false;
+
+ // If FusedActivationFunction of mul is not none, this pass cannot be applied.
+ if (mul->fusedActivationFunction() != luci::FusedActFunc::NONE)
+ return false;
+
+ // To apply this pass, shape of shift should be [1, 1, 1, out_channel].
+ if (shift->rank() != 4)
+ return false;
+ for (uint32_t i = 0; i < 3; ++i)
+ if (shift->dim(i).value() != 1)
+ return false;
+
+ luci::CircleConv2D *conv = nullptr;
+ luci::CircleConst *scale = nullptr;
+ if (auto mul_lhs = dynamic_cast<luci::CircleConv2D *>(mul->x()))
+ {
+ conv = mul_lhs;
+ scale = dynamic_cast<luci::CircleConst *>(mul->y());
+ }
+ else if (auto mul_rhs = dynamic_cast<luci::CircleConv2D *>(mul->y()))
+ {
+ conv = mul_rhs;
+ scale = dynamic_cast<luci::CircleConst *>(mul->x());
+ }
+
+ // If CircleConv2D is not found or constant operand of CircleMul is not found,
+ // this pass cannot be applied.
+ if (conv == nullptr || scale == nullptr)
+ return false;
+
+ // To apply this pass, shape of scale should be [1, 1, 1, out_channel].
+ if (scale->rank() != 4)
+ return false;
+ for (uint32_t i = 0; i < 3; ++i)
+ if (scale->dim(i).value() != 1)
+ return false;
+
+ // If FusedActivationFunction of conv is not none, this pass cannot be applied.
+ if (conv->fusedActivationFunction() != luci::FusedActFunc::NONE)
+ return false;
+
+ luci::CircleConst *filter = dynamic_cast<luci::CircleConst *>(conv->filter());
+ luci::CircleConst *bias = dynamic_cast<luci::CircleConst *>(conv->bias());
+
+ // If filter or bias of conv is not const, this pass cannot be applied.
+ if (filter == nullptr || bias == nullptr)
+ return false;
+
+ // If the dtype of filter differs from that of scale or shift, multiplication may be impossible.
+ if (filter->dtype() != scale->dtype())
+ return false;
+ if (filter->dtype() != shift->dtype())
+ return false;
+
+ // TODO Support more data types
+ if (filter->dtype() != loco::DataType::FLOAT32)
+ return false;
+
+ // Output channel dimensions should be the same. If not, this pass cannot be applied.
+ if (filter->dim(0).value() != scale->dim(3).value())
+ return false;
+ if (filter->dim(0).value() != shift->dim(3).value())
+ return false;
+
+ auto name = add->name();
+ assert(name.length() > 0);
+
+ luci::CircleConv2D *fused_conv = add->graph()->nodes()->create<luci::CircleConv2D>();
+ luci::CircleConst *fused_filter = add->graph()->nodes()->create<luci::CircleConst>();
+ luci::CircleConst *fused_bias = add->graph()->nodes()->create<luci::CircleConst>();
+
+ uint32_t filter_out_channel = filter->dim(0).value();
+ uint32_t filter_height = filter->dim(1).value();
+ uint32_t filter_width = filter->dim(2).value();
+ uint32_t filter_in_channel = filter->dim(3).value();
+
+ // Copy filter
+ fused_filter->dtype(filter->dtype());
+ fused_filter->size<loco::DataType::FLOAT32>(filter->size<loco::DataType::FLOAT32>());
+ fused_filter->rank(4);
+ fused_filter->dim(0).set(filter_out_channel);
+ fused_filter->dim(1).set(filter_height);
+ fused_filter->dim(2).set(filter_width);
+ fused_filter->dim(3).set(filter_in_channel);
+ fused_filter->shape_status(luci::ShapeStatus::VALID);
+ fused_filter->name(name + "/Conv2D/filter");
+
+ // Fuse scale to new filter
+ for (uint32_t c = 0; c < filter_out_channel; c++)
+ {
+ for (uint32_t h = 0; h < filter_height; h++)
+ {
+ for (uint32_t w = 0; w < filter_width; w++)
+ {
+ for (uint32_t b = 0; b < filter_in_channel; b++)
+ {
+ uint32_t offset = c * filter_height * filter_width * filter_in_channel +
+ h * filter_width * filter_in_channel + w * filter_in_channel + b;
+ fused_filter->at<loco::DataType::FLOAT32>(offset) =
+ filter->at<loco::DataType::FLOAT32>(offset) * scale->at<loco::DataType::FLOAT32>(c);
+ }
+ }
+ }
+ }
+
+ // Copy bias
+ assert(bias->rank() == 1);
+ assert(bias->dim(0).value() == filter_out_channel);
+ fused_bias->dtype(bias->dtype());
+ fused_bias->size<loco::DataType::FLOAT32>(bias->size<loco::DataType::FLOAT32>());
+ fused_bias->rank(1);
+ fused_bias->dim(0).set(filter_out_channel);
+ fused_bias->shape_status(luci::ShapeStatus::VALID);
+ fused_bias->name(name + "/Conv2D/bias");
+
+ // Fuse scale and shift to bias
+ for (uint32_t b = 0; b < filter_out_channel; ++b)
+ {
+ fused_bias->at<loco::DataType::FLOAT32>(b) =
+ bias->at<loco::DataType::FLOAT32>(b) * scale->at<loco::DataType::FLOAT32>(b) +
+ shift->at<loco::DataType::FLOAT32>(b);
+ }
+
+ // Set attributes of fused_conv
+ fused_conv->input(conv->input());
+ fused_conv->filter(fused_filter);
+ fused_conv->bias(fused_bias);
+ fused_conv->fusedActivationFunction(add->fusedActivationFunction());
+ fused_conv->padding(conv->padding());
+ fused_conv->stride()->h(conv->stride()->h());
+ fused_conv->stride()->w(conv->stride()->w());
+ fused_conv->dilation()->h(conv->dilation()->h());
+ fused_conv->dilation()->w(conv->dilation()->w());
+ fused_conv->name(name + "/Conv2D");
+ luci::add_origin(fused_conv, luci::composite_origin({luci::get_origin(add), luci::get_origin(mul),
+ luci::get_origin(conv)}));
+
+ replace(add).with(fused_conv);
+
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseBatchNormWithConvPass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto add = dynamic_cast<luci::CircleAdd *>(node))
+ {
+ if (fused_batch_norm_with_conv(add))
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+} // namespace luci
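
The core of the new pass is the per-channel fold: the Mul scale multiplies into the filter, and the Mul and Add together fold into the bias. A minimal sketch of that arithmetic over flattened buffers (OHWI layout as in CircleConv2D; hypothetical helper, illustrative only):

#include <cstdint>
#include <vector>

// Fold y = conv(x, W, bias) * scale + shift into a single convolution:
//   W'[c, ...] = W[c, ...] * scale[c]
//   bias'[c]   = bias[c] * scale[c] + shift[c]
void fold_batchnorm_into_conv(std::vector<float> &filter, std::vector<float> &bias,
                              const std::vector<float> &scale,
                              const std::vector<float> &shift,
                              uint32_t out_channels, uint32_t elems_per_channel)
{
  for (uint32_t c = 0; c < out_channels; ++c)
  {
    for (uint32_t k = 0; k < elems_per_channel; ++k)
      filter[c * elems_per_channel + k] *= scale[c];
    bias[c] = bias[c] * scale[c] + shift[c];
  }
}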
diff --git a/compiler/luci/pass/src/FuseBatchNormWithConvPass.test.cpp b/compiler/luci/pass/src/FuseBatchNormWithConvPass.test.cpp
new file mode 100644
index 000000000..96bc2bd35
--- /dev/null
+++ b/compiler/luci/pass/src/FuseBatchNormWithConvPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseBatchNormWithConvPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(FuseBatchNormWithConvPassTest, name)
+{
+ luci::FuseBatchNormWithConvPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/FuseBatchNormWithDwConvPass.cpp b/compiler/luci/pass/src/FuseBatchNormWithDwConvPass.cpp
new file mode 100644
index 000000000..8b2286f43
--- /dev/null
+++ b/compiler/luci/pass/src/FuseBatchNormWithDwConvPass.cpp
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseBatchNormWithDwConvPass.h"
+
+#include "helpers/NodeFiller.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+namespace
+{
+/**
+ * Fuse Mul-Add to DepthwiseConv2D if possible.
+ *
+ * NOTE TF's BatchNormalization is converted to Mul and Add.
+ *
+ * BEFORE
+ * | [CircleConst]
+ * | / [CircleConst]
+ * | / /
+ * [CircleDepthwiseConv2D] [CircleConst]
+ * | /
+ * [CircleMul] [CircleConst]
+ * | /
+ * [CircleAdd]
+ * |
+ *
+ * AFTER
+ * | [CircleConst]
+ * +-------------------------------------+ / [CircleConst]
+ * | | / /
+ * | [CircleDepthwiseConv2D] [CircleConst]
+ * | [CircleConst] | /
+ * | / [CircleConst] [CircleMul] [CircleConst]
+ * | / / | /
+ * [CircleDepthwiseConv2D] [CircleAdd]
+ * |
+ *
+ */
+
+/**
+ * @brief Check shape is [x] or [1, 1, 1, x]
+ */
+bool is_scale_shift_shape(luci::CircleConst *node)
+{
+ auto rank = node->rank();
+ if (rank != 1 && rank != 4)
+ return false;
+ for (uint32_t r = 0; r < rank - 1; ++r)
+ {
+ if (node->dim(r).value() != 1)
+ return false;
+ }
+ return true;
+}
+
+bool fused_batch_norm_with_dwconv(luci::CircleAdd *add)
+{
+ assert(add != nullptr);
+
+ // Find the pattern of CircleDepthwiseConv2D - CircleMul - CircleAdd
+ luci::CircleConst *scale = nullptr;
+ luci::CircleConst *shift = nullptr;
+ luci::CircleDepthwiseConv2D *dwconv = nullptr;
+ luci::CircleMul *mul = nullptr;
+ if (not luci::fill(&shift, &mul).with_commutative_args_of(add))
+ return false;
+ if (not luci::fill(&scale, &dwconv).with_commutative_args_of(mul))
+ return false;
+
+ // check scale and shift constant attributes
+ // scale and shift can be [x] or [1, 1, 1, x]
+ if (not is_scale_shift_shape(scale))
+ return false;
+ if (not is_scale_shift_shape(shift))
+ return false;
+
+ // check mul, add attributes
+ if (mul->dtype() != loco::DataType::FLOAT32)
+ return false;
+ if (mul->fusedActivationFunction() != luci::FusedActFunc::NONE)
+ return false;
+ if (add->dtype() != loco::DataType::FLOAT32)
+ return false;
+ // TODO support more Activations
+ if (add->fusedActivationFunction() != luci::FusedActFunc::NONE &&
+ add->fusedActivationFunction() != luci::FusedActFunc::RELU6)
+ return false;
+
+ // get weight of dwconv
+ auto filter = dynamic_cast<luci::CircleConst *>(dwconv->filter());
+ if (not filter)
+ return false;
+ if (filter->dtype() != loco::DataType::FLOAT32)
+ return false;
+ if (filter->rank() != 4)
+ return false;
+
+ // check attributes of dwconv
+ if (dwconv->fusedActivationFunction() != luci::FusedActFunc::NONE)
+ return false;
+ if (dwconv->depthMultiplier() < 0) // can this happen?
+ return false;
+
+ // get bias of dwconv
+ auto bias = dynamic_cast<luci::CircleConst *>(dwconv->bias());
+ if (not bias)
+ return false;
+ if (bias->dtype() != loco::DataType::FLOAT32)
+ return false;
+ if (bias->rank() != 1)
+ return false;
+
+ // The filter is represented as [1, H, W, C*M], where M is the depth multiplier.
+ auto filter_out_chn = filter->dim(3).value();
+ auto multiplier = static_cast<uint32_t>(dwconv->depthMultiplier());
+ auto srank = scale->rank(); // as rank can be 1 or 4
+ if (filter_out_chn != scale->dim(srank - 1).value() * multiplier)
+ return false;
+ srank = shift->rank();
+ if (filter_out_chn != shift->dim(srank - 1).value() * multiplier)
+ return false;
+ auto channel = filter_out_chn / multiplier;
+
+ auto name = add->name();
+ assert(name.length() > 0);
+
+ loco::Graph *graph = add->graph();
+ luci::CircleDepthwiseConv2D *fused_dwconv = graph->nodes()->create<luci::CircleDepthwiseConv2D>();
+ luci::CircleConst *fused_filter = graph->nodes()->create<luci::CircleConst>();
+ luci::CircleConst *fused_bias = graph->nodes()->create<luci::CircleConst>();
+
+ auto filter_in_chn = filter->dim(0).value();
+ auto filter_height = filter->dim(1).value();
+ auto filter_width = filter->dim(2).value();
+ assert(filter_in_chn == 1);
+
+ // Copy filter shape
+ fused_filter->dtype(filter->dtype());
+ fused_filter->size<loco::DataType::FLOAT32>(filter->size<loco::DataType::FLOAT32>());
+ fused_filter->rank(4);
+ fused_filter->dim(0).set(filter_in_chn);
+ fused_filter->dim(1).set(filter_height);
+ fused_filter->dim(2).set(filter_width);
+ fused_filter->dim(3).set(filter_out_chn);
+ fused_filter->shape_status(luci::ShapeStatus::VALID);
+ fused_filter->name(name + "/DepthwiseConv2D/filter");
+
+ // fused filter weight = filter weight * mul(scale) + add(shift)
+ for (uint32_t b = 0; b < filter_in_chn; b++)
+ {
+ for (uint32_t h = 0; h < filter_height; h++)
+ {
+ for (uint32_t w = 0; w < filter_width; w++)
+ {
+ for (uint32_t c = 0; c < filter_out_chn; c++)
+ {
+ uint32_t offset = b * filter_height * filter_width * filter_out_chn +
+ h * filter_width * filter_out_chn + w * filter_out_chn + c;
+ uint32_t chn = c / multiplier;
+ fused_filter->at<loco::DataType::FLOAT32>(offset) =
+ filter->at<loco::DataType::FLOAT32>(offset) * scale->at<loco::DataType::FLOAT32>(chn);
+ }
+ }
+ }
+ }
+
+ // Fuse bias with scale and shift
+ fused_bias->dtype(shift->dtype());
+ fused_bias->size<loco::DataType::FLOAT32>(shift->size<loco::DataType::FLOAT32>());
+ fused_bias->rank(1);
+ fused_bias->dim(0).set(channel);
+ fused_bias->shape_status(luci::ShapeStatus::VALID);
+ for (uint32_t c = 0; c < channel; ++c)
+ {
+ fused_bias->at<loco::DataType::FLOAT32>(c) =
+ bias->at<loco::DataType::FLOAT32>(c) * scale->at<loco::DataType::FLOAT32>(c) +
+ shift->at<loco::DataType::FLOAT32>(c);
+ }
+ fused_bias->name(name + "/DepthwiseConv2D/bias");
+
+ // set new dwconv properties
+ fused_dwconv->input(dwconv->input());
+ fused_dwconv->filter(fused_filter);
+ fused_dwconv->bias(fused_bias);
+ fused_dwconv->fusedActivationFunction(add->fusedActivationFunction());
+ fused_dwconv->padding(dwconv->padding());
+ fused_dwconv->stride()->h(dwconv->stride()->h());
+ fused_dwconv->stride()->w(dwconv->stride()->w());
+ fused_dwconv->depthMultiplier(dwconv->depthMultiplier());
+ fused_dwconv->dilation()->h(dwconv->dilation()->h());
+ fused_dwconv->dilation()->w(dwconv->dilation()->w());
+ fused_dwconv->name(name + "/DepthwiseConv2D");
+ luci::add_origin(fused_dwconv,
+ luci::composite_origin(
+ {luci::get_origin(add), luci::get_origin(mul), luci::get_origin(dwconv)}));
+
+ replace(add).with(fused_dwconv);
+
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseBatchNormWithDwConvPass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto add = dynamic_cast<luci::CircleAdd *>(node))
+ {
+ if (fused_batch_norm_with_dwconv(add))
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+} // namespace luci
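
The depthwise variant differs from the Conv2D fold mainly in indexing: the filter is laid out as [1, H, W, C*M], so output channel c reads its scale from input channel c / M, matching the chn = c / multiplier computation above. A sketch of that mapping (hypothetical helper over a flattened buffer):

#include <cstdint>
#include <vector>

// scale has C entries for a [1, H, W, C*M] depthwise filter, so channel c of
// the filter uses scale[c / multiplier].
void scale_dw_filter(std::vector<float> &filter, const std::vector<float> &scale,
                     uint32_t height, uint32_t width, uint32_t out_chn,
                     uint32_t multiplier)
{
  for (uint32_t h = 0; h < height; ++h)
    for (uint32_t w = 0; w < width; ++w)
      for (uint32_t c = 0; c < out_chn; ++c)
      {
        uint32_t offset = h * width * out_chn + w * out_chn + c;
        filter[offset] *= scale[c / multiplier];
      }
}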
diff --git a/compiler/luci/pass/src/FuseBatchNormWithDwConvPass.test.cpp b/compiler/luci/pass/src/FuseBatchNormWithDwConvPass.test.cpp
new file mode 100644
index 000000000..3030a7306
--- /dev/null
+++ b/compiler/luci/pass/src/FuseBatchNormWithDwConvPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseBatchNormWithDwConvPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(FuseBatchNormWithDwConvPassTest, name)
+{
+ luci::FuseBatchNormWithDwConvPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/FuseBatchNormWithTConv.cpp b/compiler/luci/pass/src/FuseBatchNormWithTConv.cpp
deleted file mode 100644
index 95ccd8176..000000000
--- a/compiler/luci/pass/src/FuseBatchNormWithTConv.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "luci/Pass/FuseBatchNormWithTConv.h"
-
-#include <luci/IR/CircleNodes.h>
-
-namespace
-{
-/**
- * NOTE TF's fusedBatchNorm is converted to mul and add of Circle.
- *
- * BEFORE
- *
- * [CircleTransposeConv]
- * |
- * [mul]
- * |
- * [add]
- * AFTER
- *
- * [CircleTransposeConv]
- */
-bool fused_batch_norm_with_tconv(luci::CircleTransposeConv *tconv)
-{
- // check whether it has bias or not. This optimization works only if it doesn't.
- auto bias = dynamic_cast<luci::CircleOutputExclude *>(tconv->bias());
- if (not bias)
- return false;
-
- // get weight of tconv
- auto filter = dynamic_cast<luci::CircleConst *>(tconv->filter());
- if (not filter)
- return false;
- if (filter->dtype() != loco::DataType::FLOAT32)
- return false;
-
- // get mul node
- auto tconv_output = loco::succs(tconv);
- assert(tconv_output.size() == 1);
- auto mul = dynamic_cast<luci::CircleMul *>(*tconv_output.begin());
- if (not mul)
- return false;
- if (mul->dtype() != loco::DataType::FLOAT32)
- return false;
-
- // get add node
- auto mul_output = loco::succs(mul);
- assert(mul_output.size() == 1);
- auto add = dynamic_cast<luci::CircleAdd *>(*mul_output.begin());
- if (not add)
- return false;
- if (add->dtype() != loco::DataType::FLOAT32)
- return false;
- if (add->fusedActivationFunction() != luci::FusedActFunc::NONE &&
- add->fusedActivationFunction() != luci::FusedActFunc::RELU6)
- return false;
-
- // get scale of batchnorm
- auto scale = dynamic_cast<luci::CircleConst *>(mul->y());
- if (not scale)
- return false;
-
- // scale dim(0) == tconv filter channel dim
- if (filter->rank() != 4)
- return false;
- auto filter_out_dim = filter->dim(0).value();
- if (scale->rank() != 1)
- return false;
- auto scale_dim = scale->dim(0).value();
- if (filter_out_dim != scale_dim)
- return false;
-
- // get shift of batchnorm
- auto shift = dynamic_cast<luci::CircleConst *>(add->y());
- if (not shift)
- return false;
-
- // shift dim(0) == tconv filter channel dim
- if (shift->rank() != 1)
- return false;
- auto shift_dim = shift->dim(0).value();
- if (filter_out_dim != shift_dim)
- return false;
-
- // filter weight = filter weight * mul(scale) + add(shift)
- uint32_t filter_height_dim = filter->dim(1).value();
- uint32_t filter_width_dim = filter->dim(2).value();
- uint32_t filter_in_dim = filter->dim(3).value();
- for (uint32_t c = 0; c < filter_out_dim; c++)
- {
- for (uint32_t h = 0; h < filter_height_dim; h++)
- {
- for (uint32_t w = 0; w < filter_width_dim; w++)
- {
- for (uint32_t b = 0; b < filter_in_dim; b++)
- {
- uint32_t offset = c * filter_height_dim * filter_width_dim * filter_in_dim +
- h * filter_width_dim * filter_in_dim + w * filter_in_dim + b;
- filter->at<loco::DataType::FLOAT32>(offset) *= scale->at<loco::DataType::FLOAT32>(c);
- }
- }
- }
- }
-
- // fuse shift with transposed conv
- tconv->bias(shift);
-
- if (add->fusedActivationFunction() == luci::FusedActFunc::RELU6)
- {
- // separate relu op from add op
- auto relu = add->graph()->nodes()->create<luci::CircleRelu6>();
- relu->features(tconv);
-
- // remove mul node
- replace(add).with(relu);
- }
- else
- {
- replace(add).with(tconv);
- }
-
- return true;
-}
-
-} // namespace
-
-namespace luci
-{
-
-bool FuseBatchNormWithTConvPass::run(loco::Graph *g)
-{
- bool changed = false;
- for (auto node : loco::active_nodes(loco::output_nodes(g)))
- {
- auto tconv = dynamic_cast<luci::CircleTransposeConv *>(node);
- if (not tconv)
- continue;
-
- changed |= fused_batch_norm_with_tconv(tconv);
- }
-
- return changed;
-}
-
-} // namespace luci
diff --git a/compiler/luci/pass/src/FuseBatchNormWithTConvPass.cpp b/compiler/luci/pass/src/FuseBatchNormWithTConvPass.cpp
new file mode 100644
index 000000000..337954960
--- /dev/null
+++ b/compiler/luci/pass/src/FuseBatchNormWithTConvPass.cpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseBatchNormWithTConvPass.h"
+
+#include "helpers/NodeFiller.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+namespace
+{
+/**
+ * Fuse Mul-Add to TransposeConv if possible.
+ *
+ * NOTE TF's BatchNormalization is converted to Mul and Add.
+ *
+ * BEFORE
+ * | [CircleOutputExclude]
+ * | / [CircleConst]
+ * | / /
+ * [CircleTransposeConv] [CircleConst]
+ * | /
+ * [CircleMul] [CircleConst]
+ * | /
+ * [CircleAdd]
+ * |
+ *
+ * AFTER
+ * | [CircleOutputExclude]
+ * +-------------------------------------+ / [CircleConst]
+ * | | / /
+ * | [CircleTransposeConv] [CircleConst]
+ * | [CircleConst] | /
+ * | / [CircleConst] [CircleMul] [CircleConst]
+ * | / / | /
+ * [CircleTransposeConv] [CircleAdd]
+ * |
+ * ([CircleRelu6])
+ * |
+ *
+ * Note: CircleRelu6 is inserted if Add activation is ReLU6
+ */
+bool fused_batch_norm_with_tconv(luci::CircleAdd *add)
+{
+ assert(add != nullptr);
+
+ // Find the pattern of CircleTransposeConv - CircleMul - CircleAdd
+ luci::CircleConst *scale = nullptr;
+ luci::CircleConst *shift = nullptr;
+ luci::CircleTransposeConv *tconv = nullptr;
+ luci::CircleMul *mul = nullptr;
+ if (not luci::fill(&shift, &mul).with_commutative_args_of(add))
+ return false;
+ if (not luci::fill(&scale, &tconv).with_commutative_args_of(mul))
+ return false;
+
+ // check scale and shift constant attributes
+ if (scale->rank() != 1)
+ return false;
+ if (shift->rank() != 1)
+ return false;
+ // check mul, add attributes
+ if (mul->dtype() != loco::DataType::FLOAT32)
+ return false;
+ if (add->dtype() != loco::DataType::FLOAT32)
+ return false;
+ if (add->fusedActivationFunction() != luci::FusedActFunc::NONE &&
+ add->fusedActivationFunction() != luci::FusedActFunc::RELU6)
+ return false;
+
+ // tconv bias should not be set
+ if (not dynamic_cast<luci::CircleOutputExclude *>(tconv->bias()))
+ return false;
+
+ // get weight of tconv
+ auto filter = dynamic_cast<luci::CircleConst *>(tconv->filter());
+ if (not filter)
+ return false;
+ if (filter->dtype() != loco::DataType::FLOAT32)
+ return false;
+ if (filter->rank() != 4)
+ return false;
+
+ auto filter_out_chn = filter->dim(0).value();
+ if (filter_out_chn != scale->dim(0).value())
+ return false;
+ if (filter_out_chn != shift->dim(0).value())
+ return false;
+
+ auto name = add->name();
+ assert(name.length() > 0);
+
+ loco::Graph *graph = add->graph();
+ luci::CircleTransposeConv *fused_tconv = graph->nodes()->create<luci::CircleTransposeConv>();
+ luci::CircleConst *fused_filter = graph->nodes()->create<luci::CircleConst>();
+ luci::CircleConst *fused_bias = graph->nodes()->create<luci::CircleConst>();
+
+ auto filter_height = filter->dim(1).value();
+ auto filter_width = filter->dim(2).value();
+ auto filter_in_chn = filter->dim(3).value();
+
+ // Copy filter shape
+ fused_filter->dtype(filter->dtype());
+ fused_filter->size<loco::DataType::FLOAT32>(filter->size<loco::DataType::FLOAT32>());
+ fused_filter->rank(4);
+ fused_filter->dim(0).set(filter_out_chn);
+ fused_filter->dim(1).set(filter_height);
+ fused_filter->dim(2).set(filter_width);
+ fused_filter->dim(3).set(filter_in_chn);
+ fused_filter->shape_status(luci::ShapeStatus::VALID);
+ fused_filter->name(name + "/TransposeConv/filter");
+
+ // fused filter weight = filter weight * mul(scale) + add(shift)
+ for (uint32_t c = 0; c < filter_out_chn; c++)
+ {
+ for (uint32_t h = 0; h < filter_height; h++)
+ {
+ for (uint32_t w = 0; w < filter_width; w++)
+ {
+ for (uint32_t b = 0; b < filter_in_chn; b++)
+ {
+ uint32_t offset = c * filter_height * filter_width * filter_in_chn +
+ h * filter_width * filter_in_chn + w * filter_in_chn + b;
+ fused_filter->at<loco::DataType::FLOAT32>(offset) =
+ filter->at<loco::DataType::FLOAT32>(offset) * scale->at<loco::DataType::FLOAT32>(c);
+ }
+ }
+ }
+ }
+
+ // Copy fused_bias from shift
+ fused_bias->dtype(shift->dtype());
+ fused_bias->size<loco::DataType::FLOAT32>(shift->size<loco::DataType::FLOAT32>());
+ fused_bias->rank(1);
+ fused_bias->dim(0).set(filter_out_chn);
+ fused_bias->shape_status(luci::ShapeStatus::VALID);
+ for (uint32_t c = 0; c < filter_out_chn; ++c)
+ {
+ fused_bias->at<loco::DataType::FLOAT32>(c) = shift->at<loco::DataType::FLOAT32>(c);
+ }
+ fused_bias->name(name + "/TransposeConv/bias");
+
+ // set new tconv properties
+ fused_tconv->inputSizes(tconv->inputSizes());
+ fused_tconv->filter(fused_filter);
+ fused_tconv->outBackprop(tconv->outBackprop());
+ fused_tconv->bias(fused_bias);
+ fused_tconv->padding(tconv->padding());
+ fused_tconv->stride()->h(tconv->stride()->h());
+ fused_tconv->stride()->w(tconv->stride()->w());
+ fused_tconv->name(name + "/TransposeConv");
+ luci::add_origin(fused_tconv,
+ luci::composite_origin(
+ {luci::get_origin(add), luci::get_origin(mul), luci::get_origin(tconv)}));
+
+ if (add->fusedActivationFunction() == luci::FusedActFunc::RELU6)
+ {
+ // separate relu op from add op
+ auto relu = add->graph()->nodes()->create<luci::CircleRelu6>();
+ relu->features(fused_tconv);
+ relu->name(name + "/Relu6");
+ luci::add_origin(relu, luci::get_origin(add));
+
+ replace(add).with(relu);
+ }
+ else
+ {
+ replace(add).with(fused_tconv);
+ }
+
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool FuseBatchNormWithTConvPass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto add = dynamic_cast<luci::CircleAdd *>(node))
+ {
+ if (fused_batch_norm_with_tconv(add))
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+} // namespace luci
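
Like the DwConv variant above, this pass matches the operands of the commutative Mul and Add through luci::fill(...).with_commutative_args_of(...) from helpers/NodeFiller.h. A rough standalone analogue of that matcher, sketched only to show the two-order check (the real helper is the one moved out of FuseInstanceNormPass below):

// Hypothetical sketch: match a commutative node's operands in either order.
// COMM_NODE is assumed to expose polymorphic x() and y() accessors.
template <class ARG1, class ARG2, class COMM_NODE>
bool match_commutative(const COMM_NODE *node, ARG1 **arg1, ARG2 **arg2)
{
  if (auto x = dynamic_cast<ARG1 *>(node->x()))
  {
    if (auto y = dynamic_cast<ARG2 *>(node->y()))
    {
      *arg1 = x;
      *arg2 = y;
      return true;
    }
  }
  if (auto y = dynamic_cast<ARG1 *>(node->y()))
  {
    if (auto x = dynamic_cast<ARG2 *>(node->x()))
    {
      *arg1 = y;
      *arg2 = x;
      return true;
    }
  }
  return false; // outputs are left untouched on failure
}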
diff --git a/compiler/luci/pass/src/FuseBatchNormWithTConvPass.test.cpp b/compiler/luci/pass/src/FuseBatchNormWithTConvPass.test.cpp
new file mode 100644
index 000000000..051100dc9
--- /dev/null
+++ b/compiler/luci/pass/src/FuseBatchNormWithTConvPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/FuseBatchNormWithTConvPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(FuseBatchNormWithTConvPassTest, name)
+{
+ luci::FuseBatchNormWithTConvPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/FuseInstanceNormPass.cpp b/compiler/luci/pass/src/FuseInstanceNormPass.cpp
index 237152f98..ab7baa1fa 100644
--- a/compiler/luci/pass/src/FuseInstanceNormPass.cpp
+++ b/compiler/luci/pass/src/FuseInstanceNormPass.cpp
@@ -15,105 +15,16 @@
*/
#include "luci/Pass/FuseInstanceNormPass.h"
+#include "helpers/NodeFiller.h"
#include "FuseInstanceNormPassInternal.h"
#include <luci/IR/CircleNodes.h>
-#include <loco/Service/ShapeInference.h>
+#include <luci/Profile/CircleNodeOrigin.h>
#include <cassert>
#include <set>
-// Helper to find commutative node's arguments
-namespace
-{
-
-/**
- * INTRODUCTION
- * Binary operation f(x,y) is 'commutative' when
- * f(x,y) == f(y,x) holds for all x, y.
- * For example, ADD, MUL and SQUARED_DIFFERENCE are commutative.
- * These helpers make it easy to find commutative arguments of a commutative node.
- *
- * HOW TO USE
- * COMM_NODE *node;
- * ARG_TYPE_1 *arg1;
- * ARG_TYPE_2 *arg2;
- *
- * bool ok = fill(&arg1, &arg2).with_commutative_args_of(node);
- *
- * Result
- * If 'node's commutative argument types are actually {ARG_TYPE_1, ARG_TYPE_2}
- * (as a set), 'arg1' and 'arg2' are set to the actual 'node's arguments with matching
- * type, and return value 'ok' is true.
- * Otherwise, 'arg1' and 'arg2' not changed, 'ok' is false.
- */
-
-template <class ARG_TYPE_1, class ARG_TYPE_2> class NodeFiller final
-{
-public:
- NodeFiller(ARG_TYPE_1 **arg_1, ARG_TYPE_2 **arg_2) : _arg_1(arg_1), _arg_2(arg_2)
- {
- // DO NOTHING
- }
-
- /**
- * @return true When 'node's argument types are 'ARG_TYPE_1' and 'ARG_TYPE_2'
- * In such case, it assigns '_arg_1' and '_arg_2' to actual arguments
- *
- * @return false When 'node's argument types are NOT matched with 'ARG_TYPE_*'
- * In such case, it does not amend '_arg_1' and '_arg_2'
- *
- * @require COMM_NODE has member x() and y()
- */
- template <class COMM_NODE> bool with_commutative_args_of(const COMM_NODE *node);
-
-private:
- ARG_TYPE_1 **_arg_1;
- ARG_TYPE_2 **_arg_2;
-};
-
-template <class ARG_TYPE_1, class ARG_TYPE_2>
-inline NodeFiller<ARG_TYPE_1, ARG_TYPE_2> fill(ARG_TYPE_1 **arg_1, ARG_TYPE_2 **arg_2)
-{
- return NodeFiller<ARG_TYPE_1, ARG_TYPE_2>{arg_1, arg_2};
-}
-
-template <class ARG_TYPE_1, class ARG_TYPE_2>
-template <class COMM_NODE>
-bool NodeFiller<ARG_TYPE_1, ARG_TYPE_2>::with_commutative_args_of(const COMM_NODE *node)
-{
- // Case 1) X == ARG_TYPE_1 / Y == ARG_TYPE_2
- {
- auto x = dynamic_cast<ARG_TYPE_1 *>(node->x());
- auto y = dynamic_cast<ARG_TYPE_2 *>(node->y());
-
- if (x && y)
- {
- *_arg_1 = x;
- *_arg_2 = y;
- return true;
- }
- }
-
- // Case 2) X == ARG_TYPE_2 / Y == ARG_TYPE_1
- {
- auto x = dynamic_cast<ARG_TYPE_2 *>(node->x());
- auto y = dynamic_cast<ARG_TYPE_1 *>(node->y());
-
- if (x && y)
- {
- *_arg_1 = y;
- *_arg_2 = x;
- return true;
- }
- }
-
- return false;
-}
-
-} // namespace
-
// Helper to check detail
/// @return true When node has shape of '1 x .. x 1 x depth'
@@ -150,11 +61,10 @@ bool is_instance_mean_v0(luci::CircleMean *mean)
//
// CHECK 1) input is rank 4
//
- auto input = mean->input();
- if (not loco::shape_known(input))
+ auto input = loco::must_cast<luci::CircleNode *>(mean->input());
+ if (input->shape_status() != luci::ShapeStatus::VALID)
return false;
- auto input_shape = loco::shape_get(input).as<loco::TensorShape>();
- if (input_shape.rank() != 4)
+ if (input->rank() != 4)
return false;
//
@@ -195,11 +105,10 @@ bool is_instance_mean_v1(luci::CircleMean *mean)
//
// CHECK 1) input is rank 5 (NHWCX)
//
- auto input = mean->input();
- if (not loco::shape_known(input))
+ auto input = loco::must_cast<luci::CircleNode *>(mean->input());
+ if (input->shape_status() != luci::ShapeStatus::VALID)
return false;
- auto input_shape = loco::shape_get(input).as<loco::TensorShape>();
- if (input_shape.rank() != 5)
+ if (input->rank() != 5)
return false;
//
@@ -445,8 +354,9 @@ bool InstanceNormPattern::matched()
// So it is handled in the separate if statement
if (_pv == PatternVersion::Version_2)
{
- CHECK_OR_FALSE(fill(&mul_gamma, &const_as_beta).with_commutative_args_of(add_as_terminal));
- CHECK_OR_FALSE(fill(&div, &const_as_gamma).with_commutative_args_of(mul_gamma));
+ CHECK_OR_FALSE(
+ luci::fill(&mul_gamma, &const_as_beta).with_commutative_args_of(add_as_terminal));
+ CHECK_OR_FALSE(luci::fill(&div, &const_as_gamma).with_commutative_args_of(mul_gamma));
sub = dynamic_cast<luci::CircleSub *>(div->x());
CHECK_OR_FALSE(sub);
@@ -456,6 +366,7 @@ bool InstanceNormPattern::matched()
luci::CircleNode *ifm_node = loco::must_cast<luci::CircleNode *>(ifm);
CHECK_OR_FALSE(ifm_node->rank() == 4);
+ CHECK_OR_FALSE(ifm_node->dim(3).known());
uint32_t ifm_channel_depth = ifm_node->dim(3).value();
mean_of_ifm = dynamic_cast<luci::CircleMean *>(sub->y());
@@ -477,7 +388,7 @@ bool InstanceNormPattern::matched()
CHECK_OR_FALSE(zero_point_five->at<loco::DataType::FLOAT32>(0) == 0.5);
CHECK_OR_FALSE(
- fill(&mean_as_variance, &const_as_epsilon).with_commutative_args_of(add_as_variance));
+ luci::fill(&mean_as_variance, &const_as_epsilon).with_commutative_args_of(add_as_variance));
CHECK_OR_FALSE(const_as_epsilon->dtype() == loco::DataType::FLOAT32);
// TODO Support regarding broadcast
CHECK_OR_FALSE(const_as_epsilon->size<loco::DataType::FLOAT32>() == 1);
@@ -489,7 +400,8 @@ bool InstanceNormPattern::matched()
loco::Node *ifm_should_be = nullptr;
luci::CircleMean *mean_of_ifm_should_be = nullptr;
- CHECK_OR_FALSE(fill(&ifm_should_be, &mean_of_ifm_should_be).with_commutative_args_of(sqdiff));
+ CHECK_OR_FALSE(
+ luci::fill(&ifm_should_be, &mean_of_ifm_should_be).with_commutative_args_of(sqdiff));
CHECK_OR_FALSE(ifm == ifm_should_be);
CHECK_OR_FALSE(mean_of_ifm == mean_of_ifm_should_be);
@@ -503,25 +415,25 @@ bool InstanceNormPattern::matched()
if (_pv == PatternVersion::Version_0)
{
- CHECK_OR_FALSE(fill(&mul_as_scaled_ifm, &sub).with_commutative_args_of(add_as_terminal));
- CHECK_OR_FALSE(fill(&ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_ifm));
+ CHECK_OR_FALSE(luci::fill(&mul_as_scaled_ifm, &sub).with_commutative_args_of(add_as_terminal));
+ CHECK_OR_FALSE(luci::fill(&ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_ifm));
}
if (_pv == PatternVersion::Version_1)
{
- CHECK_OR_FALSE(fill(&mul_as_scaled_reshape, &sub).with_commutative_args_of(add_as_terminal));
CHECK_OR_FALSE(
- fill(&reshape_of_ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_reshape));
+ luci::fill(&mul_as_scaled_reshape, &sub).with_commutative_args_of(add_as_terminal));
+ CHECK_OR_FALSE(
+ luci::fill(&reshape_of_ifm, &mul_gamma).with_commutative_args_of(mul_as_scaled_reshape));
ifm = reshape_of_ifm->tensor();
}
- CHECK_OR_FALSE(loco::shape_known(ifm));
- auto ifm_shape = loco::shape_get(ifm);
- CHECK_OR_FALSE(ifm_shape.domain() == loco::Domain::Tensor);
- auto ifm_tensor_shape = ifm_shape.as<loco::TensorShape>();
- CHECK_OR_FALSE(ifm_tensor_shape.rank() == 4);
- uint32_t ifm_channel_depth = ifm_tensor_shape.dim(3).value();
+ auto ifm_circle = loco::must_cast<luci::CircleNode *>(ifm);
+ CHECK_OR_FALSE(ifm_circle->shape_status() == luci::ShapeStatus::VALID);
+ CHECK_OR_FALSE(ifm_circle->rank() == 4);
+ CHECK_OR_FALSE(ifm_circle->dim(3).known());
+ uint32_t ifm_channel_depth = ifm_circle->dim(3).value();
- CHECK_OR_FALSE(fill(&rsqrt, &const_as_gamma).with_commutative_args_of(mul_gamma));
+ CHECK_OR_FALSE(luci::fill(&rsqrt, &const_as_gamma).with_commutative_args_of(mul_gamma));
if (_pv == PatternVersion::Version_0)
{
@@ -536,7 +448,7 @@ bool InstanceNormPattern::matched()
CHECK_OR_FALSE(add_as_variance);
CHECK_OR_FALSE(
- fill(&mean_as_variance, &const_as_epsilon).with_commutative_args_of(add_as_variance));
+ luci::fill(&mean_as_variance, &const_as_epsilon).with_commutative_args_of(add_as_variance));
CHECK_OR_FALSE(const_as_epsilon->dtype() == loco::DataType::FLOAT32);
// TODO Support regarding broadcast
@@ -557,7 +469,7 @@ bool InstanceNormPattern::matched()
if (_pv == PatternVersion::Version_0)
{
loco::Node *ifm_should_be = nullptr;
- CHECK_OR_FALSE(fill(&ifm_should_be, &mean_of_ifm).with_commutative_args_of(sqdiff));
+ CHECK_OR_FALSE(luci::fill(&ifm_should_be, &mean_of_ifm).with_commutative_args_of(sqdiff));
CHECK_OR_FALSE(ifm == ifm_should_be);
CHECK_OR_FALSE(is_instance_mean_v0(mean_of_ifm));
CHECK_OR_FALSE(ifm == mean_of_ifm->input());
@@ -565,7 +477,8 @@ bool InstanceNormPattern::matched()
if (_pv == PatternVersion::Version_1)
{
loco::Node *reshape_should_be = nullptr;
- CHECK_OR_FALSE(fill(&reshape_should_be, &mean_of_reshape).with_commutative_args_of(sqdiff));
+ CHECK_OR_FALSE(
+ luci::fill(&reshape_should_be, &mean_of_reshape).with_commutative_args_of(sqdiff));
CHECK_OR_FALSE(reshape_of_ifm == reshape_should_be);
CHECK_OR_FALSE(is_instance_mean_v1(mean_of_reshape));
CHECK_OR_FALSE(reshape_of_ifm == mean_of_reshape->input());
@@ -592,15 +505,15 @@ bool InstanceNormPattern::matched()
if (_pv == PatternVersion::Version_0)
{
- CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_ifm_should_be)
- .with_commutative_args_of(mul_as_scaled_mean));
+ CHECK_OR_FALSE(luci::fill(&mul_gamma_should_be, &mean_of_ifm_should_be)
+ .with_commutative_args_of(mul_as_scaled_mean));
CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be);
CHECK_OR_FALSE(mean_of_ifm == mean_of_ifm_should_be);
}
if (_pv == PatternVersion::Version_1)
{
- CHECK_OR_FALSE(fill(&mul_gamma_should_be, &mean_of_reshape_should_be)
- .with_commutative_args_of(mul_as_scaled_mean));
+ CHECK_OR_FALSE(luci::fill(&mul_gamma_should_be, &mean_of_reshape_should_be)
+ .with_commutative_args_of(mul_as_scaled_mean));
CHECK_OR_FALSE(mul_gamma == mul_gamma_should_be);
CHECK_OR_FALSE(mean_of_reshape == mean_of_reshape_should_be);
}
@@ -631,47 +544,59 @@ void fuse_instance_norm(const InstanceNormPattern &p)
auto graph = p.add_as_terminal->graph();
- // Special case for version 2 (no need to reshape)
- if (p.version() == InstanceNormPattern::Version_2)
+ // Versions 0 and 1 need to reshape
+ if (p.version() != InstanceNormPattern::Version_2)
{
- // Make Instance Norm to replace
- auto instance_norm = graph->nodes()->create<luci::CircleInstanceNorm>();
- instance_norm->input(p.ifm);
- instance_norm->gamma(p.const_as_gamma);
- instance_norm->beta(p.const_as_beta);
- float epsilon = p.const_as_epsilon->at<loco::DataType::FLOAT32>(0);
- instance_norm->epsilon(epsilon);
- instance_norm->fusedActivationFunction(p.add_as_terminal->fusedActivationFunction());
-
- replace(p.add_as_terminal).with(instance_norm);
-
- return;
- }
-
- // Make reshape for gamma & beta
- auto reshape_gamma = graph->nodes()->create<luci::CircleReshape>();
- auto reshape_beta = graph->nodes()->create<luci::CircleReshape>();
- {
- auto ifm_shape = loco::shape_get(p.ifm).as<loco::TensorShape>();
- uint32_t ifm_channel_depth = ifm_shape.dim(3).value();
-
- int32_t new_shape[1] = {static_cast<int32_t>(ifm_channel_depth)};
-
- reshape_gamma->tensor(p.const_as_gamma);
- reshape_beta->tensor(p.const_as_beta);
+ p.const_as_gamma->rank(1);
+ p.const_as_gamma->dim(0).set(p.const_as_gamma->size<loco::DataType::FLOAT32>());
+ p.const_as_beta->rank(1);
+ p.const_as_beta->dim(0).set(p.const_as_beta->size<loco::DataType::FLOAT32>());
- luci::set_new_shape(reshape_gamma, new_shape, 1);
- luci::set_new_shape(reshape_beta, new_shape, 1);
+ p.const_as_gamma->shape_status(luci::ShapeStatus::UNDEFINED);
+ p.const_as_beta->shape_status(luci::ShapeStatus::UNDEFINED);
}
// Make Instance Norm to replace
auto instance_norm = graph->nodes()->create<luci::CircleInstanceNorm>();
instance_norm->input(p.ifm);
- instance_norm->gamma(reshape_gamma);
- instance_norm->beta(reshape_beta);
+ instance_norm->gamma(p.const_as_gamma);
+ instance_norm->beta(p.const_as_beta);
float epsilon = p.const_as_epsilon->at<loco::DataType::FLOAT32>(0);
instance_norm->epsilon(epsilon);
instance_norm->fusedActivationFunction(p.add_as_terminal->fusedActivationFunction());
+ // NOTE unique name should be assigned in export
+ instance_norm->name("InstanceNorm");
+
+ // set origin
+ std::vector<std::shared_ptr<luci::CircleNodeOrigin>> origin_vec{
+ luci::get_origin(p.sqdiff),
+ luci::get_origin(p.mean_as_variance),
+ luci::get_origin(p.add_as_variance),
+ luci::get_origin(p.mul_gamma),
+ luci::get_origin(p.sub),
+ luci::get_origin(p.add_as_terminal)};
+ if (p.version() == InstanceNormPattern::PatternVersion::Version_0)
+ {
+ origin_vec.push_back(luci::get_origin(p.mean_of_ifm));
+ origin_vec.push_back(luci::get_origin(p.rsqrt));
+ origin_vec.push_back(luci::get_origin(p.mul_as_scaled_ifm));
+ origin_vec.push_back(luci::get_origin(p.mul_as_scaled_mean));
+ }
+ if (p.version() == InstanceNormPattern::PatternVersion::Version_1)
+ {
+ origin_vec.push_back(luci::get_origin(p.reshape_of_ifm));
+ origin_vec.push_back(luci::get_origin(p.mean_of_reshape));
+ origin_vec.push_back(luci::get_origin(p.rsqrt));
+ origin_vec.push_back(luci::get_origin(p.mul_as_scaled_mean));
+ origin_vec.push_back(luci::get_origin(p.mul_as_scaled_reshape));
+ }
+ if (p.version() == InstanceNormPattern::PatternVersion::Version_2)
+ {
+ origin_vec.push_back(luci::get_origin(p.mean_of_ifm));
+ origin_vec.push_back(luci::get_origin(p.pow));
+ origin_vec.push_back(luci::get_origin(p.div));
+ }
+ luci::add_origin(instance_norm, luci::composite_origin(origin_vec));
replace(p.add_as_terminal).with(instance_norm);
}
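
For context, the fused CircleInstanceNorm replaces the matched subgraph with the standard instance normalization, per channel c: y = gamma[c] * (x - mean_c) / sqrt(var_c + epsilon) + beta[c]. A scalar sketch of that semantics, assuming the per-channel statistics the matched subgraph computed explicitly are given:

#include <cmath>

// mean and var are the per-channel statistics; gamma, beta, epsilon come from
// the matched constants.
float instance_norm(float x, float mean, float var, float gamma, float beta,
                    float epsilon)
{
  return gamma * (x - mean) / std::sqrt(var + epsilon) + beta;
}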
diff --git a/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp b/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp
index 3037f3def..b83ccca50 100644
--- a/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp
+++ b/compiler/luci/pass/src/FuseInstanceNormPass.test.cpp
@@ -16,6 +16,8 @@
#include "FuseInstanceNormPassInternal.h"
+#include "luci/Pass/FuseInstanceNormPass.h"
+
#include <vector>
#include <gtest/gtest.h>
@@ -34,6 +36,13 @@ void setShape(luci::CircleNode &node, const std::vector<int> &v)
} // namespace
+TEST(FuseInstanceNormPassTest, name)
+{
+ luci::FuseInstanceNormPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
TEST(FuseInstanceNormPass, is_quasi_1D_with_dummy_dim)
{
luci::CircleConst const_node;
diff --git a/compiler/luci/pass/src/FusePreActivationBatchNormPass.cpp b/compiler/luci/pass/src/FusePreActivationBatchNormPass.cpp
index bcde5fac4..469fcddbb 100644
--- a/compiler/luci/pass/src/FusePreActivationBatchNormPass.cpp
+++ b/compiler/luci/pass/src/FusePreActivationBatchNormPass.cpp
@@ -16,9 +16,11 @@
#include "luci/Pass/FusePreActivationBatchNormPass.h"
#include "FusePreActivationBatchNormPassInternal.h"
+#include "BatchNormPatternFinder.h"
#include <luci/IR/CircleNodes.h>
#include <luci/Log.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace
{
@@ -37,83 +39,6 @@ bool is_non_negative(const luci::CircleConst *node)
return true;
}
-// Check if mul is batchnorm mul
-bool is_batchnorm_mul(const luci::CircleMul *mul, luci::CircleNode *&pred_node,
- luci::CircleConst *&gamma)
-{
- auto x = dynamic_cast<luci::CircleConst *>(mul->x());
- auto y = dynamic_cast<luci::CircleConst *>(mul->y());
-
- luci::CircleNode *pred = nullptr;
- luci::CircleConst *constant = nullptr;
-
- if (x != nullptr && y == nullptr)
- {
- pred = loco::must_cast<luci::CircleNode *>(mul->y());
- constant = x;
- }
- else if (x == nullptr && y != nullptr)
- {
- pred = loco::must_cast<luci::CircleNode *>(mul->x());
- constant = y;
- }
- else
- {
- return false;
- }
-
- if (constant->rank() != 1)
- return false;
-
- auto channel_dim = constant->dim(0);
- if (!(channel_dim == mul->dim(mul->rank() - 1)))
- return false;
-
- pred_node = pred;
- gamma = constant;
- return true;
-}
-
-// Check if add is batchnorm add
-bool is_batchnorm_add(const luci::CircleAdd *add, luci::CircleMul *&mul, luci::CircleConst *&beta)
-{
- auto x = loco::must_cast<luci::CircleNode *>(add->x());
- auto y = loco::must_cast<luci::CircleNode *>(add->y());
-
- luci::CircleMul *pred = nullptr;
- luci::CircleConst *constant = nullptr;
-
- if (add->fusedActivationFunction() != luci::FusedActFunc::RELU)
- return false;
-
- if (x->opcode() == luci::CircleOpcode::CIRCLECONST && y->opcode() == luci::CircleOpcode::MUL)
- {
- pred = loco::must_cast<luci::CircleMul *>(y);
- constant = loco::must_cast<luci::CircleConst *>(x);
- }
- else if (x->opcode() == luci::CircleOpcode::MUL && y->opcode() == luci::CircleOpcode::CIRCLECONST)
- {
- pred = loco::must_cast<luci::CircleMul *>(x);
- constant = loco::must_cast<luci::CircleConst *>(y);
- }
- else
- {
- return false;
- }
-
- if (constant->rank() != 1)
- return false;
-
- auto channel_dim = constant->dim(0);
- // Assumption: Layout is channel-last
- if (!(channel_dim == add->dim(add->rank() - 1)))
- return false;
-
- mul = pred;
- beta = constant;
- return true;
-}
-
const luci::CircleConv2D *get_forward_conv2d(const luci::CircleNode *node, uint32_t channel_size)
{
auto opcode = node->opcode();
@@ -249,6 +174,9 @@ bool update_conv_bias_with_beta(luci::CircleConv2D *conv, const luci::CircleCons
auto size = beta->dim(0).value();
auto bias = dynamic_cast<luci::CircleConst *>(conv->bias());
+ auto name = conv->name();
+ assert(name.length() > 0);
+
if (bias == nullptr)
{
bias = conv->graph()->nodes()->create<luci::CircleConst>();
@@ -256,6 +184,7 @@ bool update_conv_bias_with_beta(luci::CircleConv2D *conv, const luci::CircleCons
bias->rank(1);
bias->dim(0).set(size);
bias->size<loco::DataType::FLOAT32>(size);
+ bias->name(name + "/bias");
conv->bias(bias);
}
else
@@ -282,14 +211,12 @@ bool update_conv_bias_with_beta(luci::CircleConv2D *conv, const luci::CircleCons
luci::CircleSub *insert_sub(luci::CircleNode *pred, luci::CircleConst *beta)
{
+ auto name = pred->name();
+ assert(name.length() > 0);
+
auto sub = pred->graph()->nodes()->create<luci::CircleSub>();
- sub->dtype(loco::DataType::FLOAT32);
- sub->rank(pred->rank());
- for (uint32_t i = 0; i < sub->rank(); i++)
- {
- sub->dim(i).set(pred->dim(i).value());
- }
sub->fusedActivationFunction(luci::FusedActFunc::NONE);
+ sub->name(name + "/Sub");
loco::replace(pred).with(sub);
@@ -366,6 +293,8 @@ bool fuse_sub_with_conv(luci::CircleSub *sub)
if (!update_conv_bias_with_beta(conv, beta, false))
return false;
+ luci::add_origin(conv, luci::get_origin(sub));
+
auto pred = sub->x();
loco::replace(sub).with(pred);
@@ -442,6 +371,7 @@ bool fuse_add_with_conv(luci::CircleAdd *add, std::vector<luci::CircleSub *> &su
if (!update_conv_bias_with_beta(conv, beta, true))
return false;
+ luci::add_origin(conv, luci::get_origin(add));
loco::replace(add).with(pred);
add->drop();
@@ -462,6 +392,8 @@ bool fuse_add_with_conv(luci::CircleAdd *add, std::vector<luci::CircleSub *> &su
if (!update_conv_bias_with_beta(conv, beta, true))
return false;
+ luci::add_origin(conv, luci::get_origin(add));
+
auto relu = *loco::succs(add).begin();
auto relu_node = loco::must_cast<luci::CircleRelu *>(relu);
assert(relu_node != nullptr);
@@ -471,6 +403,7 @@ bool fuse_add_with_conv(luci::CircleAdd *add, std::vector<luci::CircleSub *> &su
add->drop();
sub_list.push_back(insert_sub(pred, beta));
+ luci::add_origin(sub_list.back(), luci::get_origin(add));
relu_node->features(pred);
@@ -530,6 +463,11 @@ bool fuse_mul_with_conv(luci::CircleMul *mul)
// Update CONV weights
update_conv_weights_with_gamma(conv, gamma);
+
+ // Update origin
+ // TODO need to remove const
+ luci::add_origin(const_cast<luci::CircleConv2D *>(conv),
+ luci::get_origin(loco::must_cast<luci::CircleNode *>(mul)));
}
loco::replace(mul).with(pred_node);
@@ -568,6 +506,8 @@ bool swap_mul_add(luci::CircleAdd *add, std::vector<luci::CircleMul *> &mul_list
if (!is_batchnorm_add(add, mul, beta))
return false;
+ if (add->fusedActivationFunction() != luci::FusedActFunc::RELU)
+ return false;
if (loco::succs(mul).size() != 1)
return false;
@@ -582,8 +522,13 @@ bool swap_mul_add(luci::CircleAdd *add, std::vector<luci::CircleMul *> &mul_list
return false;
// Insert Relu at the bottom
+ auto name = add->name();
+ assert(name.length() > 0);
+
auto relu = add->graph()->nodes()->create<luci::CircleRelu>();
relu->features(mul);
+ relu->name(name + "/Relu");
+ luci::add_origin(relu, luci::get_origin(add));
loco::replace(add).with(relu);
// Replace beta <- beta / gamma
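
The swap_mul_add rewrite rests on the identity x * gamma + beta == (x + beta / gamma) * gamma for nonzero gamma, which is why beta is replaced by beta / gamma right above. A tiny numeric check of that identity (illustrative only, gamma assumed nonzero as the pass's non-negativity check implies):

#include <cassert>
#include <cmath>

void check_swap_identity(float x, float gamma, float beta)
{
  float before = x * gamma + beta;          // Mul then Add
  float after = (x + beta / gamma) * gamma; // Add then Mul after the swap
  assert(std::fabs(before - after) < 1e-4f);
}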
diff --git a/compiler/luci/pass/src/FusePreActivationBatchNormPass.test.cpp b/compiler/luci/pass/src/FusePreActivationBatchNormPass.test.cpp
index a79b5bd5d..3d5791c9e 100644
--- a/compiler/luci/pass/src/FusePreActivationBatchNormPass.test.cpp
+++ b/compiler/luci/pass/src/FusePreActivationBatchNormPass.test.cpp
@@ -16,6 +16,8 @@
#include "FusePreActivationBatchNormPassInternal.h"
+#include "luci/Pass/FusePreActivationBatchNormPass.h"
+
#include <luci/IR/CircleNodes.h>
#include <math.h>
@@ -148,6 +150,22 @@ public:
conv_filter->at<loco::DataType::FLOAT32>(i * out_size + j) = i * out_size + j;
}
}
+
+ pred_conv->name("pred_conv");
+ pred_conv_filter->name("pred_conv_filter");
+ pred_conv_bias->name("pred_conv_bias");
+ pred_conv2->name("pred_conv2");
+ pred_conv2_filter->name("pred_conv2_filter");
+ pred_conv2_bias->name("pred_conv2_bias");
+ pred_add->name("pred_add");
+ mul->name("mul");
+ mul_gamma->name("mul_gamma");
+ add->name("add");
+ add_beta->name("add_beta");
+ conv->name("conv");
+ conv_filter->name("conv_filter");
+ conv_bias->name("conv_bias");
+ succ_add->name("succ_add");
}
public:
@@ -171,6 +189,13 @@ public:
} // namespace
+TEST(FusePreActivationBatchNormPassTest, name)
+{
+ luci::FusePreActivationBatchNormPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
TEST(FusePreActivationBatchNorm, swap_mul_add)
{
SimpleGraph g;
diff --git a/compiler/luci/pass/src/MakeBatchNormGammaPositivePass.cpp b/compiler/luci/pass/src/MakeBatchNormGammaPositivePass.cpp
index 281d1b081..96776dc92 100644
--- a/compiler/luci/pass/src/MakeBatchNormGammaPositivePass.cpp
+++ b/compiler/luci/pass/src/MakeBatchNormGammaPositivePass.cpp
@@ -16,6 +16,8 @@
#include "luci/Pass/MakeBatchNormGammaPositivePass.h"
+#include "BatchNormPatternFinder.h"
+
#include <luci/IR/CircleNodes.h>
namespace
@@ -39,71 +41,27 @@ bool negative_gamma_to_positive(luci::CircleConst *gamma)
return changed;
}
-// Check if add is batchnorm add
-bool is_batchnorm_add(const luci::CircleAdd *add)
+bool make_positive_gamma(luci::CircleAdd *add)
{
- auto x = dynamic_cast<luci::CircleConst *>(add->x());
- auto y = dynamic_cast<luci::CircleConst *>(add->y());
-
- luci::CircleConst *constant = nullptr;
+ luci::CircleMul *mul = nullptr;
+ luci::CircleConst *beta = nullptr;
+ luci::CircleConst *gamma = nullptr;
+ luci::CircleNode *pred = nullptr;
- if (x != nullptr && y == nullptr)
- constant = x;
- else if (x == nullptr && y != nullptr)
- constant = y;
- else
+ if (!is_batchnorm_add(add, mul, beta))
return false;
- if (constant->rank() != 1)
+ if (loco::succs(mul).size() != 1)
return false;
+ if (!is_batchnorm_mul(mul, pred, gamma))
+ return false;
+ assert(pred == add);
// Only support Relu
if (add->fusedActivationFunction() != luci::FusedActFunc::RELU)
return false;
- auto channel_dim = constant->dim(0);
- if (!(channel_dim == add->dim(add->rank() - 1)))
- return false;
-
- return true;
-}
-
-// Check if mul is batchnorm mul
-bool is_batchnorm_mul(const luci::CircleMul *mul, luci::CircleConst *&gamma)
-{
- auto x = dynamic_cast<luci::CircleConst *>(mul->x());
- auto y = dynamic_cast<luci::CircleConst *>(mul->y());
-
- luci::CircleConst *constant = nullptr;
-
- if (x != nullptr && y == nullptr)
- constant = x;
- else if (x == nullptr && y != nullptr)
- constant = y;
- else
- return false;
-
- if (constant->rank() != 1)
- return false;
-
- auto channel_dim = constant->dim(0);
- if (!(channel_dim == mul->dim(mul->rank() - 1)))
- return false;
-
- // Check successor is batchnorm add
- auto succs = loco::succs(mul);
- if (succs.size() != 1)
- return false;
-
- auto add = dynamic_cast<luci::CircleAdd *>(*succs.begin());
- if (add == nullptr)
- return false;
-
- if (!is_batchnorm_add(add))
- return false;
-
- gamma = constant;
- return true;
+ return negative_gamma_to_positive(gamma);
}
} // namespace
@@ -111,18 +69,29 @@ bool is_batchnorm_mul(const luci::CircleMul *mul, luci::CircleConst *&gamma)
namespace luci
{
+/**
+ * Replace negative gamma values of Mul-Add (as BatchNorm) with a small positive value (1e-10)
+ *
+ * PATTERN:
+ * |
+ * [CircleNode] [CircleConst](as gamma)
+ * | |
+ * [CircleMul] [CircleConst]
+ * | |
+ * [CircleAdd]
+ * |
+ */
bool MakeBatchNormGammaPositivePass::run(loco::Graph *g)
{
bool changed = false;
for (auto node : loco::active_nodes(loco::output_nodes(g)))
{
- auto mul = dynamic_cast<luci::CircleMul *>(node);
- if (mul == nullptr)
+ auto add = dynamic_cast<luci::CircleAdd *>(node);
+ if (add == nullptr)
continue;
- luci::CircleConst *gamma;
- if (is_batchnorm_mul(mul, gamma))
- changed = negative_gamma_to_positive(gamma);
+ if (make_positive_gamma(add))
+ changed = true;
}
return changed;
}
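
The rewrite this pass performs amounts to clamping negative per-channel gamma values to 1e-10, per the comment above. A sketch over a plain float buffer (the real negative_gamma_to_positive mutates a CircleConst):

#include <vector>

bool make_gamma_positive(std::vector<float> &gamma)
{
  bool changed = false;
  for (auto &g : gamma)
  {
    if (g < 0.0f)
    {
      g = 1e-10f; // clamp negative gamma to a small positive value
      changed = true;
    }
  }
  return changed;
}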
diff --git a/compiler/luci/pass/src/MakeBatchNormGammaPositivePass.test.cpp b/compiler/luci/pass/src/MakeBatchNormGammaPositivePass.test.cpp
new file mode 100644
index 000000000..83093edc8
--- /dev/null
+++ b/compiler/luci/pass/src/MakeBatchNormGammaPositivePass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/MakeBatchNormGammaPositivePass.h"
+
+#include <gtest/gtest.h>
+
+TEST(MakeBatchNormGammaPositivePassTest, name)
+{
+ luci::MakeBatchNormGammaPositivePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/MigrateLegacyShapeDtypePass.cpp b/compiler/luci/pass/src/MigrateLegacyShapeDtypePass.cpp
deleted file mode 100644
index beb962a05..000000000
--- a/compiler/luci/pass/src/MigrateLegacyShapeDtypePass.cpp
+++ /dev/null
@@ -1,112 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "luci/Pass/MigrateLegacyShapeDtypePass.h"
-
-#include <loco/Service/ShapeInference.h>
-#include <loco/Service/TypeInference.h>
-
-#include <luci/IR/CircleNodes.h>
-
-#include <loco.h>
-
-namespace
-{
-
-bool has_same_shape(luci::CircleNode *node, loco::TensorShape shape)
-{
- if (node->rank() != shape.rank())
- return false;
-
- for (uint32_t i = 0; i < shape.rank(); ++i)
- if (!(node->dim(i) == shape.dim(i)))
- return false;
-
- return true;
-}
-
-} // namespace
-
-namespace luci
-{
-
-bool MigrateLegacyShapeDtypePass::run(luci::Module *m)
-{
- bool changed = false;
-
- for (size_t g = 0; g < m->size(); ++g)
- {
- if (run(m->graph(g)))
- changed = true;
- }
-
- return changed;
-}
-
-bool MigrateLegacyShapeDtypePass::run(loco::Graph *g)
-{
- bool changed = false;
-
- for (auto node : loco::all_nodes(g))
- {
- auto circle_node = loco::must_cast<luci::CircleNode *>(node);
- if (loco::shape_known(node))
- {
- auto loco_shape = loco::shape_get(node).as<loco::TensorShape>();
-
- assert(circle_node->shape_signature().rank() == 0 ||
- circle_node->shape_signature().rank() == loco_shape.rank());
-
- // When shape of loco is copied to circle node, ShapeSignature should be applied.
- loco::TensorShape new_shape;
- new_shape.rank(loco_shape.rank());
- for (uint32_t i = 0; i < loco_shape.rank(); ++i)
- {
- if (circle_node->shape_signature().rank() > 0 &&
- circle_node->shape_signature().dim(i) == -1)
- new_shape.dim(i) = 1;
- else
- new_shape.dim(i) = loco_shape.dim(i);
- }
-
- if (circle_node->shape_status() == luci::ShapeStatus::UNDEFINED ||
- !has_same_shape(circle_node, new_shape))
- {
- circle_node->rank(new_shape.rank());
- for (uint32_t i = 0; i < new_shape.rank(); ++i)
- circle_node->dim(i) = new_shape.dim(i);
-
- if (circle_node->shape_status() == luci::ShapeStatus::UNDEFINED)
- circle_node->shape_status(luci::ShapeStatus::VALID);
-
- changed = true;
- }
- }
-
- if (loco::dtype_known(node))
- {
- if (loco::dtype_get(node) != circle_node->dtype())
- {
- circle_node->dtype(loco::dtype_get(node));
- changed = true;
- }
- }
- }
-
- return changed;
-}
-
-} // namespace luci
diff --git a/compiler/luci/pass/src/ModulePhase.test.cpp b/compiler/luci/pass/src/ModulePhase.test.cpp
new file mode 100644
index 000000000..5d92c59f4
--- /dev/null
+++ b/compiler/luci/pass/src/ModulePhase.test.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ModulePhase.h"
+
+#include "luci/Pass/CircleShapeInferencePass.h"
+
+#include <loco.h>
+
+#include <gtest/gtest.h>
+
+TEST(ModulePhaseTest, saturate)
+{
+ auto m = luci::make_module();
+ auto g = loco::make_graph();
+ m->add(std::move(g));
+
+ luci::Phase phase;
+
+ // Any Pass will do for testing
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+
+ luci::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{m.get()};
+ phase_runner.run(phase);
+
+ SUCCEED();
+}
+
+TEST(ModulePhaseTest, restart)
+{
+ auto m = luci::make_module();
+ auto g = loco::make_graph();
+ m->add(std::move(g));
+
+ luci::Phase phase;
+
+ // Any Pass will do for testing
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+
+ luci::PhaseRunner<logo::PhaseStrategy::Restart> phase_runner{m.get()};
+ phase_runner.run(phase);
+
+ SUCCEED();
+}
diff --git a/compiler/luci/pass/src/PassTestGraphs.h b/compiler/luci/pass/src/PassTestGraphs.h
new file mode 100644
index 000000000..f5ae24f0b
--- /dev/null
+++ b/compiler/luci/pass/src/PassTestGraphs.h
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_TEST_GRAPHS_H__
+#define __LUCI_PASS_TEST_GRAPHS_H__
+
+#include <loco.h>
+#include <luci/IR/CircleNodes.h>
+
+namespace luci
+{
+
+/**
+ * ConstantFoldingTestGraph is a base class for testing
+ * constant folding passes. It creates the Input and Output
+ * nodes of the graph below. Child classes must implement
+ * the Connector and the Folded pattern.
+ *
+ *        [Input]   [Folded pattern] (Implemented by child class)
+ *            \          /
+ *            [Connector] (Implemented by child class)
+ *                 |
+ *             [Output]
+ *
+ * The Connector must satisfy the conditions below:
+ *  - Input type == Output type == Folded pattern type
+ *  - Input shape == Output shape == Folded pattern shape
+ *
+ * For example, Add, Mul, and Sub can serve as a Connector.
+ */
+class ConstantFoldingTestGraph
+{
+public:
+ ConstantFoldingTestGraph(std::vector<uint32_t> input_shape, loco::DataType input_dtype)
+ {
+ _input = _g.nodes()->create<luci::CircleInput>();
+ _output = _g.nodes()->create<luci::CircleOutput>();
+
+ auto graph_input = _g.inputs()->create();
+ _input->index(graph_input->index());
+ auto graph_output = _g.outputs()->create();
+ _output->index(graph_output->index());
+
+ graph_input->dtype(input_dtype);
+ graph_output->dtype(input_dtype);
+ _input->dtype(input_dtype);
+ _output->dtype(input_dtype);
+
+ auto input_tensor_shape = std::make_unique<loco::TensorShape>();
+ input_tensor_shape->rank(input_shape.size());
+ for (int i = 0; i < input_shape.size(); i++)
+ input_tensor_shape->dim(i).set(input_shape[i]);
+ graph_input->shape(std::move(input_tensor_shape));
+
+ auto output_tensor_shape = std::make_unique<loco::TensorShape>();
+ output_tensor_shape->rank(input_shape.size());
+ for (int i = 0; i < input_shape.size(); i++)
+ output_tensor_shape->dim(i).set(input_shape[i]);
+ graph_output->shape(std::move(output_tensor_shape));
+
+ _input->rank(input_shape.size());
+ for (int i = 0; i < input_shape.size(); i++)
+ _input->dim(i).set(input_shape[i]);
+
+ _output->rank(input_shape.size());
+ for (int i = 0; i < input_shape.size(); i++)
+ _output->dim(i).set(input_shape[i]);
+
+ _input->name("input");
+ _output->name("output");
+ }
+
+ virtual void init() = 0;
+
+ virtual ~ConstantFoldingTestGraph() = default;
+
+ virtual loco::Node *createFoldedPattern() = 0;
+
+ virtual luci::CircleConst *getFoldedPattern() = 0;
+
+ loco::Graph *graph() { return &_g; }
+
+ // NOTE: these members are protected, not public, so they keep the '_' prefix
+protected:
+ loco::Graph _g;
+ luci::CircleInput *_input = nullptr;
+ luci::CircleOutput *_output = nullptr;
+};
+
+/**
+ * ConstantFoldingAddTestGraph is a ConstantFoldingTestGraph
+ * whose Connector is Add.
+ */
+class ConstantFoldingAddTestGraph : public ConstantFoldingTestGraph
+{
+protected:
+ ConstantFoldingAddTestGraph(std::vector<uint32_t> input_shape, loco::DataType input_dtype)
+ : ConstantFoldingTestGraph(input_shape, input_dtype)
+ {
+ _add = _g.nodes()->create<luci::CircleAdd>();
+ _add->dtype(input_dtype);
+
+ _add->rank(input_shape.size());
+ for (int i = 0; i < input_shape.size(); i++)
+ _add->dim(i).set(input_shape[i]);
+
+ _add->x(_input);
+
+ _output->from(_add);
+
+ _add->name("add");
+ }
+
+protected:
+ void init() override { _add->y(createFoldedPattern()); }
+
+protected:
+ luci::CircleConst *getFoldedPattern() override
+ {
+ return dynamic_cast<luci::CircleConst *>(_add->y());
+ }
+
+protected:
+ luci::CircleAdd *_add = nullptr;
+};
+
+} // namespace luci
+
+#endif // __LUCI_PASS_TEST_GRAPHS_H__
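To make the intended usage concrete, here is a minimal sketch of a child class built on these helpers. It is illustrative only and not part of this patch; FoldedConstTestGraph and its one-element constant are assumed names.

// Hypothetical example: the child supplies the Folded pattern, here a lone
// FLOAT32 constant of shape {1} feeding the Add connector.
class FoldedConstTestGraph : public luci::ConstantFoldingAddTestGraph
{
public:
  FoldedConstTestGraph() : ConstantFoldingAddTestGraph({1}, loco::DataType::FLOAT32) {}

  loco::Node *createFoldedPattern() override
  {
    _const = _g.nodes()->create<luci::CircleConst>();
    _const->dtype(loco::DataType::FLOAT32); // must match the Connector's type
    _const->rank(1);
    _const->dim(0).set(1);                  // and the Connector's shape
    _const->size<loco::DataType::FLOAT32>(1);
    _const->at<loco::DataType::FLOAT32>(0) = 1.0f;
    _const->name("folded");
    return _const;
  }

  luci::CircleConst *getFoldedPattern() override { return _const; }

private:
  luci::CircleConst *_const = nullptr;
};

A test would instantiate this, call init() through the ConstantFoldingTestGraph interface to wire the pattern into the Add connector, and then run the folding pass under test on graph().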
diff --git a/compiler/luci/pass/src/ProgressReporter.h b/compiler/luci/pass/src/ProgressReporter.h
index cf30da735..8c6c95e65 100644
--- a/compiler/luci/pass/src/ProgressReporter.h
+++ b/compiler/luci/pass/src/ProgressReporter.h
@@ -30,7 +30,7 @@ class ProgressReporter : public logo::PhaseEventListener
{
public:
ProgressReporter(loco::Graph *graph, logo::PhaseStrategy strategy)
- : _graph{graph}, _strategy{strategy}
+ : _graph{graph}, _strategy{strategy}
{
// DO NOTHING
}
@@ -54,7 +54,7 @@ class ModuleProgressReporter : public logo::PhaseEventListener
{
public:
ModuleProgressReporter(luci::Module *module, logo::PhaseStrategy strategy)
- : _module{module}, _strategy{strategy}
+ : _module{module}, _strategy{strategy}
{
// DO NOTHING
}
diff --git a/compiler/luci/pass/src/PropagateConcatenationQparam.test.cpp b/compiler/luci/pass/src/PropagateConcatenationQparam.test.cpp
index 0f8d562e9..de973a431 100644
--- a/compiler/luci/pass/src/PropagateConcatenationQparam.test.cpp
+++ b/compiler/luci/pass/src/PropagateConcatenationQparam.test.cpp
@@ -136,30 +136,34 @@ class ConstInputConcatGraph
public:
ConstInputConcatGraph(loco::DataType quant_type)
{
- concat_node.dtype(quant_type);
- concat_node.fusedActivationFunction(luci::FusedActFunc::NONE);
- input_1.dtype(loco::DataType::FLOAT32);
- input_1.size<loco::DataType::FLOAT32>(5);
+ concat_node = g.nodes()->create<luci::CircleConcatenation>(2);
+ input_1 = g.nodes()->create<luci::CircleConst>();
+ input_2 = g.nodes()->create<luci::CircleConv2D>();
+
+ concat_node->dtype(quant_type);
+ concat_node->fusedActivationFunction(luci::FusedActFunc::NONE);
+ input_1->dtype(loco::DataType::FLOAT32);
+ input_1->size<loco::DataType::FLOAT32>(5);
for (int i = 0; i < 5; i++)
{
// Set data {-2, -1, 0, 1, 2}
- input_1.at<loco::DataType::FLOAT32>(i) = i - 2.0;
+ input_1->at<loco::DataType::FLOAT32>(i) = i - 2.0;
}
- input_2.dtype(quant_type);
+ input_2->dtype(quant_type);
- concat_node.values(0, &input_1);
- concat_node.values(1, &input_2);
+ concat_node->values(0, input_1);
+ concat_node->values(1, input_2);
if (quant_type == loco::DataType::U8)
{
- addQuantParam(concat_node, {0.1}, {10});
- addQuantParam(input_2, {2.0}, {2});
+ addQuantParam(*concat_node, {0.1}, {10});
+ addQuantParam(*input_2, {2.0}, {2});
}
else if (quant_type == loco::DataType::S16)
{
- addQuantParam(concat_node, {0.1}, {0});
- addQuantParam(input_2, {2.0}, {0});
+ addQuantParam(*concat_node, {0.1}, {0});
+ addQuantParam(*input_2, {2.0}, {0});
}
else
{
@@ -167,16 +171,11 @@ public:
}
}
- ~ConstInputConcatGraph()
- {
- concat_node.values(0, nullptr);
- concat_node.values(1, nullptr);
- }
-
public:
- luci::CircleConcatenation concat_node{2};
- luci::CircleConst input_1;
- luci::CircleConv2D input_2;
+ loco::Graph g;
+ luci::CircleConcatenation *concat_node = nullptr;
+ luci::CircleConst *input_1 = nullptr;
+ luci::CircleConv2D *input_2 = nullptr;
};
} // namespace
@@ -223,19 +222,20 @@ TEST(PropagateConcatenationQparam, propagate_concat_quantparam_u8)
// input_1 is const. const values are quantized with the qparam of concat
ConstInputConcatGraph cg(loco::DataType::U8);
- luci::propagate_concat_quantparam(&cg.concat_node, loco::DataType::U8);
- EXPECT_FLOAT_EQ(0.1, cg.concat_node.quantparam()->scale[0]);
- EXPECT_EQ(10, cg.concat_node.quantparam()->zerop[0]);
- EXPECT_FLOAT_EQ(0.1, cg.input_1.quantparam()->scale[0]);
- EXPECT_EQ(10, cg.input_1.quantparam()->zerop[0]);
- EXPECT_FLOAT_EQ(0.1, cg.input_2.quantparam()->scale[0]);
- EXPECT_EQ(10, cg.input_2.quantparam()->zerop[0]);
- EXPECT_EQ(loco::DataType::U8, cg.input_1.dtype());
- EXPECT_EQ(0, cg.input_1.at<loco::DataType::U8>(0));
- EXPECT_EQ(0, cg.input_1.at<loco::DataType::U8>(1));
- EXPECT_EQ(10, cg.input_1.at<loco::DataType::U8>(2));
- EXPECT_EQ(20, cg.input_1.at<loco::DataType::U8>(3));
- EXPECT_EQ(30, cg.input_1.at<loco::DataType::U8>(4));
+ luci::propagate_concat_quantparam(cg.concat_node, loco::DataType::U8);
+ EXPECT_FLOAT_EQ(0.1, cg.concat_node->quantparam()->scale[0]);
+ EXPECT_EQ(10, cg.concat_node->quantparam()->zerop[0]);
+ const auto cg_input_1 = loco::must_cast<luci::CircleConst *>(cg.concat_node->values(0));
+ EXPECT_FLOAT_EQ(0.1, cg_input_1->quantparam()->scale[0]);
+ EXPECT_EQ(10, cg_input_1->quantparam()->zerop[0]);
+ EXPECT_FLOAT_EQ(0.1, cg.input_2->quantparam()->scale[0]);
+ EXPECT_EQ(10, cg.input_2->quantparam()->zerop[0]);
+ EXPECT_EQ(loco::DataType::U8, cg_input_1->dtype());
+ EXPECT_EQ(0, cg_input_1->at<loco::DataType::U8>(0));
+ EXPECT_EQ(0, cg_input_1->at<loco::DataType::U8>(1));
+ EXPECT_EQ(10, cg_input_1->at<loco::DataType::U8>(2));
+ EXPECT_EQ(20, cg_input_1->at<loco::DataType::U8>(3));
+ EXPECT_EQ(30, cg_input_1->at<loco::DataType::U8>(4));
}
TEST(PropagateConcatenationQparam, propagate_concat_quantparam_u8_NEG)
@@ -260,20 +260,21 @@ TEST(PropagateConcatenationQparam, propagate_concat_quantparam_u8_NEG)
// concat has fused activation function and input_1 is const.
// const values are quantized using its min/max
ConstInputConcatGraph cg(loco::DataType::U8);
- cg.concat_node.fusedActivationFunction(luci::FusedActFunc::RELU);
- luci::propagate_concat_quantparam(&cg.concat_node, loco::DataType::U8);
- EXPECT_FLOAT_EQ(0.1, cg.concat_node.quantparam()->scale[0]);
- EXPECT_EQ(10, cg.concat_node.quantparam()->zerop[0]);
- EXPECT_FLOAT_EQ(0.015686275, cg.input_1.quantparam()->scale[0]);
- EXPECT_EQ(128, cg.input_1.quantparam()->zerop[0]);
- EXPECT_FLOAT_EQ(2.0, cg.input_2.quantparam()->scale[0]);
- EXPECT_EQ(2, cg.input_2.quantparam()->zerop[0]);
- EXPECT_EQ(loco::DataType::U8, cg.input_1.dtype());
- EXPECT_EQ(quantize(-2, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(0));
- EXPECT_EQ(quantize(-1, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(1));
- EXPECT_EQ(quantize(0, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(2));
- EXPECT_EQ(quantize(1, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(3));
- EXPECT_EQ(quantize(2, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::U8>(4));
+ cg.concat_node->fusedActivationFunction(luci::FusedActFunc::RELU);
+ luci::propagate_concat_quantparam(cg.concat_node, loco::DataType::U8);
+ EXPECT_FLOAT_EQ(0.1, cg.concat_node->quantparam()->scale[0]);
+ EXPECT_EQ(10, cg.concat_node->quantparam()->zerop[0]);
+ const auto cg_input_1 = loco::must_cast<luci::CircleConst *>(cg.concat_node->values(0));
+ EXPECT_FLOAT_EQ(0.015686275, cg_input_1->quantparam()->scale[0]);
+ EXPECT_EQ(128, cg_input_1->quantparam()->zerop[0]);
+ EXPECT_FLOAT_EQ(2.0, cg.input_2->quantparam()->scale[0]);
+ EXPECT_EQ(2, cg.input_2->quantparam()->zerop[0]);
+ EXPECT_EQ(loco::DataType::U8, cg_input_1->dtype());
+ EXPECT_EQ(quantize(-2, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::U8>(0));
+ EXPECT_EQ(quantize(-1, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::U8>(1));
+ EXPECT_EQ(quantize(0, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::U8>(2));
+ EXPECT_EQ(quantize(1, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::U8>(3));
+ EXPECT_EQ(quantize(2, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::U8>(4));
}
TEST(PropagateConcatenationQparam, propagate_concat_quantparam_i16)
@@ -318,19 +319,20 @@ TEST(PropagateConcatenationQparam, propagate_concat_quantparam_i16)
// input_1 is const. const values are quantized with the qparam of concat
ConstInputConcatGraph cg(loco::DataType::S16);
- luci::propagate_concat_quantparam(&cg.concat_node, loco::DataType::S16);
- EXPECT_FLOAT_EQ(0.1, cg.concat_node.quantparam()->scale[0]);
- EXPECT_EQ(0, cg.concat_node.quantparam()->zerop[0]);
- EXPECT_FLOAT_EQ(0.1, cg.input_1.quantparam()->scale[0]);
- EXPECT_EQ(0, cg.input_1.quantparam()->zerop[0]);
- EXPECT_FLOAT_EQ(0.1, cg.input_2.quantparam()->scale[0]);
- EXPECT_EQ(0, cg.input_2.quantparam()->zerop[0]);
- EXPECT_EQ(loco::DataType::S16, cg.input_1.dtype());
- EXPECT_EQ(-20, cg.input_1.at<loco::DataType::S16>(0));
- EXPECT_EQ(-10, cg.input_1.at<loco::DataType::S16>(1));
- EXPECT_EQ(0, cg.input_1.at<loco::DataType::S16>(2));
- EXPECT_EQ(10, cg.input_1.at<loco::DataType::S16>(3));
- EXPECT_EQ(20, cg.input_1.at<loco::DataType::S16>(4));
+ luci::propagate_concat_quantparam(cg.concat_node, loco::DataType::S16);
+ EXPECT_FLOAT_EQ(0.1, cg.concat_node->quantparam()->scale[0]);
+ EXPECT_EQ(0, cg.concat_node->quantparam()->zerop[0]);
+ const auto cg_input_1 = loco::must_cast<luci::CircleConst *>(cg.concat_node->values(0));
+ EXPECT_FLOAT_EQ(0.1, cg_input_1->quantparam()->scale[0]);
+ EXPECT_EQ(0, cg_input_1->quantparam()->zerop[0]);
+ EXPECT_FLOAT_EQ(0.1, cg.input_2->quantparam()->scale[0]);
+ EXPECT_EQ(0, cg.input_2->quantparam()->zerop[0]);
+ EXPECT_EQ(loco::DataType::S16, cg_input_1->dtype());
+ EXPECT_EQ(-20, cg_input_1->at<loco::DataType::S16>(0));
+ EXPECT_EQ(-10, cg_input_1->at<loco::DataType::S16>(1));
+ EXPECT_EQ(0, cg_input_1->at<loco::DataType::S16>(2));
+ EXPECT_EQ(10, cg_input_1->at<loco::DataType::S16>(3));
+ EXPECT_EQ(20, cg_input_1->at<loco::DataType::S16>(4));
}
TEST(PropagateConcatenationQparam, propagate_concat_quantparam_i16_NEG)
@@ -355,18 +357,19 @@ TEST(PropagateConcatenationQparam, propagate_concat_quantparam_i16_NEG)
// concat has fused activation function and input_1 is const.
// const values are quantized using its min/max
ConstInputConcatGraph cg(loco::DataType::S16);
- cg.concat_node.fusedActivationFunction(luci::FusedActFunc::RELU);
- luci::propagate_concat_quantparam(&cg.concat_node, loco::DataType::S16);
- EXPECT_FLOAT_EQ(0.1, cg.concat_node.quantparam()->scale[0]);
- EXPECT_EQ(0, cg.concat_node.quantparam()->zerop[0]);
- EXPECT_FLOAT_EQ(0.000061037, cg.input_1.quantparam()->scale[0]);
- EXPECT_EQ(0, cg.input_1.quantparam()->zerop[0]);
- EXPECT_FLOAT_EQ(2.0, cg.input_2.quantparam()->scale[0]);
- EXPECT_EQ(0, cg.input_2.quantparam()->zerop[0]);
- EXPECT_EQ(loco::DataType::S16, cg.input_1.dtype());
- EXPECT_EQ(quantize(-2, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(0));
- EXPECT_EQ(quantize(-1, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(1));
- EXPECT_EQ(quantize(0, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(2));
- EXPECT_EQ(quantize(1, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(3));
- EXPECT_EQ(quantize(2, cg.input_1.quantparam()), cg.input_1.at<loco::DataType::S16>(4));
+ cg.concat_node->fusedActivationFunction(luci::FusedActFunc::RELU);
+ luci::propagate_concat_quantparam(cg.concat_node, loco::DataType::S16);
+ EXPECT_FLOAT_EQ(0.1, cg.concat_node->quantparam()->scale[0]);
+ EXPECT_EQ(0, cg.concat_node->quantparam()->zerop[0]);
+ const auto cg_input_1 = loco::must_cast<luci::CircleConst *>(cg.concat_node->values(0));
+ EXPECT_FLOAT_EQ(0.000061037, cg_input_1->quantparam()->scale[0]);
+ EXPECT_EQ(0, cg_input_1->quantparam()->zerop[0]);
+ EXPECT_FLOAT_EQ(2.0, cg.input_2->quantparam()->scale[0]);
+ EXPECT_EQ(0, cg.input_2->quantparam()->zerop[0]);
+ EXPECT_EQ(loco::DataType::S16, cg_input_1->dtype());
+ EXPECT_EQ(quantize(-2, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::S16>(0));
+ EXPECT_EQ(quantize(-1, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::S16>(1));
+ EXPECT_EQ(quantize(0, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::S16>(2));
+ EXPECT_EQ(quantize(1, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::S16>(3));
+ EXPECT_EQ(quantize(2, cg_input_1->quantparam()), cg_input_1->at<loco::DataType::S16>(4));
}
diff --git a/compiler/luci/pass/src/PropagateQuantParamPass.cpp b/compiler/luci/pass/src/PropagateQuantParamPass.cpp
index af83cd83b..26282086b 100644
--- a/compiler/luci/pass/src/PropagateQuantParamPass.cpp
+++ b/compiler/luci/pass/src/PropagateQuantParamPass.cpp
@@ -91,9 +91,8 @@ bool PropagateQuantParamPass::run(loco::Graph *g)
INFO(l) << "PropagateQuantParamPass visit node: " << circle_node->name() << std::endl;
PropagateQuantParam pqp;
- changed = circle_node->accept(&pqp);
- if (changed)
- break;
+ if (circle_node->accept(&pqp))
+ changed = true;
}
return changed;
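The removed early break meant the loop returned after the first node that reported a change, so at most one propagation happened per run of the pass. The fix visits every node and latches the flag instead; schematically (visit_one stands in for circle_node->accept(&pqp) and is not a real helper):

bool changed = false;
for (auto node : loco::active_nodes(loco::output_nodes(g)))
{
  if (visit_one(node)) // true if this node's qparam was propagated
    changed = true;    // latched: a later false result cannot clear it
}
return changed;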
diff --git a/compiler/luci/pass/src/PropagateQuantParamPass.test.cpp b/compiler/luci/pass/src/PropagateQuantParamPass.test.cpp
index 15adbfc01..ed1f96828 100644
--- a/compiler/luci/pass/src/PropagateQuantParamPass.test.cpp
+++ b/compiler/luci/pass/src/PropagateQuantParamPass.test.cpp
@@ -83,6 +83,13 @@ public:
} // namespace
+TEST(PropagateQuantParamPassTest, name)
+{
+ luci::PropagateQuantParamPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
TEST(PropagateQuantParam, simple)
{
SimpleGraph g;
diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp
index fa0141114..85d600e47 100644
--- a/compiler/luci/pass/src/QuantizationUtils.cpp
+++ b/compiler/luci/pass/src/QuantizationUtils.cpp
@@ -96,7 +96,7 @@ void asymmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float
data = data < nudged_min ? nudged_min : data;
data = data > nudged_max ? nudged_max : data;
quantized_values[i] =
- static_cast<int32_t>(std::round((data - nudged_min) * scaling_factor_inv));
+ static_cast<int32_t>(std::round((data - nudged_min) * scaling_factor_inv));
}
node->dtype(loco::DataType::U8); // change the type of tensor
@@ -133,14 +133,14 @@ void symmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float
for (uint32_t i = 0; i < size; ++i)
{
node->at<loco::DataType::S16>(i) =
- std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+ std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
}
}
void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp,
float &nudged_min, float &nudged_max)
{
- assert(min != max);
+ assert(min <= max);
const int32_t kMaxScale = std::numeric_limits<int16_t>::max();
const int32_t kMinScale = -kMaxScale;
@@ -158,8 +158,8 @@ void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &
scale_factor_from_max_side = rmax / qmax_double;
scaling_factor = scale_factor_from_min_side > scale_factor_from_max_side
- ? scale_factor_from_min_side
- : scale_factor_from_max_side;
+ ? scale_factor_from_min_side
+ : scale_factor_from_max_side;
zp = 0;
nudged_min = static_cast<float>(qmin_double * scaling_factor);
nudged_max = static_cast<float>(qmax_double * scaling_factor);
@@ -226,7 +226,8 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t
zp = nudged_zero_point;
}
-bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension, int &channel_dim_index)
+bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension,
+ int32_t &channel_dim_index)
{
auto succs = loco::succs(node);
@@ -304,7 +305,7 @@ bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension, int
uint32_t cal_offset(loco::TensorShape &dimension, uint32_t *indices)
{
return indices[0] * dimension.dim(1).value() * dimension.dim(2).value() *
- dimension.dim(3).value() +
+ dimension.dim(3).value() +
indices[1] * dimension.dim(2).value() * dimension.dim(3).value() +
indices[2] * dimension.dim(3).value() + indices[3];
}
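The relaxed assertion admits degenerate ranges such as an all-equal tensor (min == max), while the scale computation itself picks whichever side of the range needs the larger scale. A self-contained sketch with assumed example values; note that min = -2, max = 2 reproduces the 0.000061037 scale expected by the S16 concat tests above:

#include <algorithm>
#include <cstdio>

int main()
{
  // int16 symmetric quantization: zero point is fixed at 0
  const double qmax = 32767.0, qmin = -32767.0;
  double rmin = -2.0, rmax = 2.0;                    // assumed recorded min/max
  double scale = std::max(rmin / qmin, rmax / qmax); // larger side wins
  double nudged_min = qmin * scale;                  // -2.0
  double nudged_max = qmax * scale;                  //  2.0
  std::printf("scale=%.9f range=[%g, %g]\n", scale, nudged_min, nudged_max);
  return 0;
}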
diff --git a/compiler/luci/pass/src/QuantizationUtils.h b/compiler/luci/pass/src/QuantizationUtils.h
index 22a5cf1ee..c8c558d3c 100644
--- a/compiler/luci/pass/src/QuantizationUtils.h
+++ b/compiler/luci/pass/src/QuantizationUtils.h
@@ -37,7 +37,8 @@ void symmetric_wquant_with_minmax_per_layer(CircleConst *node, float min, float
float &scaling_factor, int64_t &zp, float &nudged_min,
float &nudged_max);
-bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension, int &channel_dim_index);
+bool get_channel_dim_index(CircleConst *node, loco::TensorShape &dimension,
+ int32_t &channel_dim_index);
uint32_t cal_offset(loco::TensorShape &dimension, uint32_t *indices);
diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
index e10c4bb4d..e99c7b389 100644
--- a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.cpp
@@ -24,33 +24,29 @@
#include <iostream>
#include <cmath>
-
-namespace luci
-{
+#include <functional>
namespace
{
-void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max)
+using namespace luci;
+using IterFunc = std::function<void(uint32_t *, loco::TensorShape &, int32_t)>;
+
+void iterate_per_channel(CircleConst *node, IterFunc func)
{
loco::TensorShape dimension;
dimension.rank(4);
uint32_t indices[4] = {
- 0,
+ 0,
};
- int channel_dim_index{0};
- int size{0};
+ int32_t channel_dim_index{0};
if (!get_channel_dim_index(node, dimension, channel_dim_index))
{
assert(false);
return;
}
- size = dimension.dim(channel_dim_index).value();
- std::vector<bool> has_min_max_value(size, false);
- min.resize(size);
- max.resize(size);
for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
{
for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
@@ -59,25 +55,57 @@ void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vec
{
for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
{
- int channel_idx = indices[channel_dim_index];
- auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
- if (has_min_max_value[channel_idx])
- {
- min[channel_idx] = data < min[channel_idx] ? data : min[channel_idx];
- max[channel_idx] = data > max[channel_idx] ? data : max[channel_idx];
- }
- else
- {
- min[channel_idx] = data;
- max[channel_idx] = data;
- has_min_max_value[channel_idx] = true;
- }
+ func(indices, dimension, channel_dim_index);
}
}
}
}
}
+} // namespace
+
+namespace luci
+{
+
+namespace
+{
+
+void cal_minmax_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max)
+{
+ loco::TensorShape dimension;
+ dimension.rank(4);
+ int32_t channel_dim_index{0};
+
+ if (!get_channel_dim_index(node, dimension, channel_dim_index))
+ {
+ assert(false);
+ return;
+ }
+ auto size = dimension.dim(channel_dim_index).value();
+
+ std::vector<bool> has_min_max_value(size, false);
+ min.resize(size);
+ max.resize(size);
+
+ auto cal_minmax = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
+ int channel_idx = indices[channel_dim_index];
+ auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+ if (has_min_max_value[channel_idx])
+ {
+ min[channel_idx] = data < min[channel_idx] ? data : min[channel_idx];
+ max[channel_idx] = data > max[channel_idx] ? data : max[channel_idx];
+ }
+ else
+ {
+ min[channel_idx] = data;
+ max[channel_idx] = data;
+ has_min_max_value[channel_idx] = true;
+ }
+ };
+
+ iterate_per_channel(node, cal_minmax);
+}
+
void sym_wquant_per_channel(CircleConst *node, std::vector<float> &min, std::vector<float> &max,
std::vector<float> &scaling_factor, std::vector<int64_t> &zp,
std::vector<float> &nudged_min, std::vector<float> &nudged_max)
@@ -94,45 +122,24 @@ void sym_wquant_per_channel(CircleConst *node, std::vector<float> &min, std::vec
compute_sym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]);
}
- loco::TensorShape dimension;
- dimension.rank(4);
- uint32_t indices[4] = {
- 0,
+ auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
+ int channel_idx = indices[channel_dim_index];
+ const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
+ auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+ data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data;
+ data = data > nudged_max[channel_idx] ? nudged_max[channel_idx] : data;
+ quantized_values[cal_offset(dimension, indices)] =
+ static_cast<int32_t>(std::round(data * scaling_factor_inv));
};
- int channel_dim_index{0};
-
- if (!get_channel_dim_index(node, dimension, channel_dim_index))
- {
- assert(false);
- return;
- }
- for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
- {
- for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
- {
- for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
- {
- for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
- {
- int channel_idx = indices[channel_dim_index];
- const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
- auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
- data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data;
- data = data > nudged_max[channel_idx] ? nudged_max[channel_idx] : data;
- quantized_values[cal_offset(dimension, indices)] =
- static_cast<int32_t>(std::round(data * scaling_factor_inv));
- }
- }
- }
- }
+ iterate_per_channel(node, quantize);
node->dtype(loco::DataType::S16); // change the type of tensor
node->size<loco::DataType::S16>(size); // resize tensor
for (uint32_t i = 0; i < size; ++i)
{
node->at<loco::DataType::S16>(i) =
- std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+ std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
}
}
@@ -142,35 +149,14 @@ void sym_wdequant_per_channel(CircleConst *node, std::vector<float> &scaling_fac
uint32_t size = node->size<loco::DataType::S16>();
std::vector<float> dequantized_values(size);
- loco::TensorShape dimension;
- dimension.rank(4);
- uint32_t indices[4] = {
- 0,
+ auto dequantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
+ int channel_idx = indices[channel_dim_index];
+ auto data = node->at<loco::DataType::S16>(cal_offset(dimension, indices));
+ dequantized_values[cal_offset(dimension, indices)] =
+ static_cast<float>(data) * scaling_factor[channel_idx];
};
- int channel_dim_index{0};
-
- if (!get_channel_dim_index(node, dimension, channel_dim_index))
- {
- assert(false);
- return;
- }
- for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
- {
- for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
- {
- for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
- {
- for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
- {
- int channel_idx = indices[channel_dim_index];
- auto data = node->at<loco::DataType::S16>(cal_offset(dimension, indices));
- dequantized_values[cal_offset(dimension, indices)] =
- static_cast<float>(data) * scaling_factor[channel_idx];
- }
- }
- }
- }
+ iterate_per_channel(node, dequantize);
node->dtype(loco::DataType::FLOAT32); // change the type of tensor
node->size<loco::DataType::FLOAT32>(size); // resize tensor
@@ -198,38 +184,17 @@ void asymmetric_wquant_per_channel(CircleConst *node, std::vector<float> &min,
compute_asym_scale_zp(min[i], max[i], scaling_factor[i], zp[i], nudged_min[i], nudged_max[i]);
}
- loco::TensorShape dimension;
- dimension.rank(4);
- uint32_t indices[4] = {
- 0,
+ auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
+ int channel_idx = indices[channel_dim_index];
+ const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
+ auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+ data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data;
+ data = data > nudged_max[channel_idx] ? nudged_max[channel_idx] : data;
+ quantized_values[cal_offset(dimension, indices)] =
+ static_cast<int32_t>(std::round((data - nudged_min[channel_idx]) * scaling_factor_inv));
};
- int channel_dim_index{0};
-
- if (!get_channel_dim_index(node, dimension, channel_dim_index))
- {
- assert(false);
- return;
- }
- for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
- {
- for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
- {
- for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
- {
- for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
- {
- int channel_idx = indices[channel_dim_index];
- const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
- auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
- data = data < nudged_min[channel_idx] ? nudged_min[channel_idx] : data;
- data = data > nudged_max[channel_idx] ? nudged_max[channel_idx] : data;
- quantized_values[cal_offset(dimension, indices)] = static_cast<int32_t>(
- std::round((data - nudged_min[channel_idx]) * scaling_factor_inv));
- }
- }
- }
- }
+ iterate_per_channel(node, quantize);
node->dtype(loco::DataType::U8); // change the type of tensor
node->size<loco::DataType::U8>(size); // resize tensor
@@ -246,35 +211,14 @@ void asymmetric_wdequant_per_channel(CircleConst *node, std::vector<float> &scal
uint32_t size = node->size<loco::DataType::U8>();
std::vector<float> dequantized_values(size);
- loco::TensorShape dimension;
- dimension.rank(4);
- uint32_t indices[4] = {
- 0,
+ auto dequantize = [&](uint32_t *indices, loco::TensorShape &dimension, int channel_dim_index) {
+ int channel_idx = indices[channel_dim_index];
+ auto data = node->at<loco::DataType::U8>(cal_offset(dimension, indices));
+ dequantized_values[cal_offset(dimension, indices)] =
+ static_cast<float>(data) * scaling_factor[channel_idx] + nudged_min[channel_idx];
};
- int channel_dim_index{0};
-
- if (!get_channel_dim_index(node, dimension, channel_dim_index))
- {
- assert(false);
- return;
- }
- for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
- {
- for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
- {
- for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
- {
- for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
- {
- int channel_idx = indices[channel_dim_index];
- auto data = node->at<loco::DataType::U8>(cal_offset(dimension, indices));
- dequantized_values[cal_offset(dimension, indices)] =
- static_cast<float>(data) * scaling_factor[channel_idx] + nudged_min[channel_idx];
- }
- }
- }
- }
+ iterate_per_channel(node, dequantize);
node->dtype(loco::DataType::FLOAT32); // change the type of tensor
node->size<loco::DataType::FLOAT32>(size); // resize tensor
@@ -311,7 +255,7 @@ struct QuantizeDequantizeWeights final : public luci::CircleNodeMutableVisitor<b
{
QuantizeDequantizeWeights(loco::DataType input, loco::DataType output,
QuantizationGranularity granularity)
- : input_type(input), output_type(output), granularity(granularity)
+ : input_type(input), output_type(output), granularity(granularity)
{
}
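The net effect of this refactor is that the rank-4 index loop lives in one place and each routine contributes only its per-element body as an IterFunc. A new channel-wise computation would follow the same shape; count_per_channel below is a hypothetical example (not part of the patch) that assumes access to the file-local iterate_per_channel helper:

// Count how many elements each channel of a rank-4 constant holds.
// The caller is expected to size 'counts' to the channel count beforehand.
void count_per_channel(luci::CircleConst *node, std::vector<uint32_t> &counts)
{
  auto counter = [&](uint32_t *indices, loco::TensorShape &dimension,
                     int32_t channel_dim_index) {
    (void)dimension; // no offset computation needed here
    counts.at(indices[channel_dim_index])++; // one element of this channel
  };
  iterate_per_channel(node, counter); // drives the four nested index loops
}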
diff --git a/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.test.cpp b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.test.cpp
new file mode 100644
index 000000000..f226253c2
--- /dev/null
+++ b/compiler/luci/pass/src/QuantizeDequantizeWeightsPass.test.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/QuantizeDequantizeWeightsPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(QuantizeDequantizeWeightsPassTest, name)
+{
+ luci::QuantizeDequantizeWeightsPass pass(loco::DataType::FLOAT32, loco::DataType::U8,
+ luci::QuantizationGranularity::LayerWise);
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
index f6eebe3b9..4707ad0e9 100644
--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
@@ -19,12 +19,51 @@
#include <luci/IR/CircleNodes.h>
#include <luci/IR/CircleNodeVisitor.h>
+#include <luci/Service/Nodes/CircleConst.h>
#include <luci/Log.h>
#include <oops/UserExn.h>
#include <iostream>
#include <cmath>
+#include <functional>
+
+namespace
+{
+
+using namespace luci;
+using IterFunc = std::function<void(uint32_t *, loco::TensorShape &, int32_t)>;
+
+void iterate_per_channel(CircleConst *node, int32_t &channel_dim_index, IterFunc func)
+{
+ loco::TensorShape dimension;
+ dimension.rank(4);
+ uint32_t indices[4] = {
+ 0,
+ };
+
+ if (!get_channel_dim_index(node, dimension, channel_dim_index))
+ {
+ assert(false);
+ return;
+ }
+
+ for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
+ {
+ for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
+ {
+ for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
+ {
+ for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
+ {
+ func(indices, dimension, channel_dim_index);
+ }
+ }
+ }
+ }
+}
+
+} // namespace
namespace luci
{
@@ -32,6 +71,30 @@ namespace luci
namespace
{
+// Create a new const node from an existing node.
+// The new node has the following characteristics:
+// type: T
+// shape: same as 'node' (given as an argument)
+// buffer size: 'size' (given as an argument)
+// Note that contents are not filled in this function.
+template <loco::DataType T>
+luci::CircleConst *create_empty_const_from(luci::CircleConst *node, uint32_t size)
+{
+ auto new_node = node->graph()->nodes()->create<CircleConst>();
+ // TODO: We don't have any naming convention for quantized nodes yet.
+ // Fix this when we have one.
+ new_node->name(node->name());
+ new_node->dtype(T);
+ new_node->rank(node->rank());
+ for (uint32_t i = 0; i < node->rank(); i++)
+ new_node->dim(i).set(node->dim(i).value());
+
+ new_node->size<T>(size);
+ new_node->shape_status(luci::ShapeStatus::VALID);
+
+ return new_node;
+}
+
void overwrite_quantparam(luci::CircleConcatenation *concat, luci::CircleNode *target)
{
auto concat_qparam = concat->quantparam();
@@ -44,6 +107,9 @@ void overwrite_quantparam(luci::CircleConcatenation *concat, luci::CircleNode *t
auto quantparam = std::make_unique<CircleQuantParam>();
target->quantparam(std::move(quantparam));
target_qparam = target->quantparam();
+
+ if (target_qparam == nullptr)
+ throw std::runtime_error("Creating new quant param failed");
}
target_qparam->min = concat_qparam->min;
target_qparam->max = concat_qparam->max;
@@ -79,7 +145,7 @@ void quant_const_values(luci::CircleConst *const_node, float scaling_factor, flo
const_node->size<loco::DataType::S16>(size); // resize tensor
for (uint32_t i = 0; i < size; ++i)
const_node->at<loco::DataType::S16>(i) =
- std::min(32767, std::max(-32767, quantized_values[i]));
+ std::min(32767, std::max(-32767, quantized_values[i]));
break;
default:
throw std::runtime_error("Unsupported data type");
@@ -219,17 +285,16 @@ void quant_const(CircleConst *node, loco::DataType quant_type)
}
// Check if the node is the bias of Conv2D, DepthwiseConv2D, FullyConnected, or TransposeConv layer
-// If true, return <input, weight> pair of the successor node (used to quantize bias)
-// If flase, return <nullptr, nullptr>
-std::pair<loco::Node *, loco::Node *> get_input_weight_of_bias(CircleNode *node)
+// Returns a list of <input, weights, output> vectors for the above operators.
+// Note that it returns a 'list' because bias can be used by multiple operators.
+std::vector<std::vector<loco::Node *>> get_input_weight_output_of_bias(CircleNode *node)
{
+ std::vector<std::vector<loco::Node *>> result;
auto circle_const = dynamic_cast<CircleConst *>(node);
if (circle_const == nullptr)
- return std::make_pair(nullptr, nullptr);
+ return result;
auto succs = loco::succs(node);
- if (succs.size() != 1) // assume bias is used by only one node
- return std::make_pair(nullptr, nullptr);
for (auto out : succs)
{
@@ -238,35 +303,39 @@ std::pair<loco::Node *, loco::Node *> get_input_weight_of_bias(CircleNode *node)
{
assert(conv->input() != nullptr);
assert(conv->filter() != nullptr);
- return std::make_pair(conv->input(), conv->filter());
+ result.push_back({conv->input(), conv->filter(), conv});
+ continue;
}
auto dw_conv = dynamic_cast<CircleDepthwiseConv2D *>(out);
if (dw_conv != nullptr && dw_conv->bias() == circle_const)
{
assert(dw_conv->input() != nullptr);
assert(dw_conv->filter() != nullptr);
- return std::make_pair(dw_conv->input(), dw_conv->filter());
+ result.push_back({dw_conv->input(), dw_conv->filter(), dw_conv});
+ continue;
}
auto fc = dynamic_cast<CircleFullyConnected *>(out);
if (fc != nullptr && fc->bias() == circle_const)
{
assert(fc->input() != nullptr);
assert(fc->weights() != nullptr);
- return std::make_pair(fc->input(), fc->weights());
+ result.push_back({fc->input(), fc->weights(), fc});
+ continue;
}
auto tconv = dynamic_cast<CircleTransposeConv *>(out);
if (tconv != nullptr && tconv->bias() == circle_const)
{
assert(tconv->outBackprop() != nullptr);
assert(tconv->filter() != nullptr);
- return std::make_pair(tconv->outBackprop(), tconv->filter());
+ result.push_back({tconv->outBackprop(), tconv->filter(), tconv});
+ continue;
}
}
- return std::make_pair(nullptr, nullptr);
+ return result;
}
-void asym_quant_bias_per_layer(CircleConst *node, float input_scale, float weight_scale,
- float *scaling_factor, int64_t *zp)
+CircleConst *asym_quant_bias_per_layer(CircleConst *node, float input_scale, float weight_scale,
+ float *scaling_factor, int64_t *zp)
{
float scale = input_scale * weight_scale;
const float scaling_factor_inv = (scale == 0) ? 0 : 1.0 / scale;
@@ -276,24 +345,27 @@ void asym_quant_bias_per_layer(CircleConst *node, float input_scale, float weigh
for (uint32_t i = 0; i < size; ++i)
{
quantized_values[i] =
- static_cast<int32_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv));
+ static_cast<int32_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv));
}
- node->dtype(loco::DataType::S32); // change the type of tensor
- node->size<loco::DataType::S32>(size); // resize tensor
+ auto new_bias = create_empty_const_from<loco::DataType::S32>(node, size);
+
const int32_t kMinScale = std::numeric_limits<int32_t>::lowest();
const int32_t kMaxScale = std::numeric_limits<int32_t>::max();
for (uint32_t i = 0; i < size; ++i)
{
- node->at<loco::DataType::S32>(i) =
- std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+ new_bias->at<loco::DataType::S32>(i) =
+ std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
}
*scaling_factor = scale;
*zp = 0;
+
+ return new_bias;
}
-void quant_bias_per_channel(CircleConst *node, float input_scale, std::vector<float> &weight_scale,
- std::vector<float> &scaling_factor, std::vector<int64_t> &zp)
+CircleConst *quant_bias_per_channel(CircleConst *node, float input_scale,
+ std::vector<float> &weight_scale,
+ std::vector<float> &scaling_factor, std::vector<int64_t> &zp)
{
float scaling_factor_inv{0};
@@ -305,24 +377,27 @@ void quant_bias_per_channel(CircleConst *node, float input_scale, std::vector<fl
scaling_factor[i] = input_scale * weight_scale[i];
scaling_factor_inv = (scaling_factor[i] == 0) ? 0 : 1.0 / scaling_factor[i];
quantized_values[i] =
- static_cast<int32_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv));
+ static_cast<int32_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv));
zp[i] = 0;
}
- node->dtype(loco::DataType::S32); // change the type of tensor
- node->size<loco::DataType::S32>(size); // resize tensor
+ auto new_bias = create_empty_const_from<loco::DataType::S32>(node, size);
+
const int32_t kMinScale = std::numeric_limits<int32_t>::lowest();
const int32_t kMaxScale = std::numeric_limits<int32_t>::max();
for (uint32_t i = 0; i < size; ++i)
{
- node->at<loco::DataType::S32>(i) =
- std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+ new_bias->at<loco::DataType::S32>(i) =
+ std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
}
+
+ return new_bias;
}
-void int16_quant_bias_per_channel(CircleConst *node, float input_scale,
- std::vector<float> &weight_scale,
- std::vector<float> &scaling_factor, std::vector<int64_t> &zp)
+CircleConst *int16_quant_bias_per_channel(CircleConst *node, float input_scale,
+ std::vector<float> &weight_scale,
+ std::vector<float> &scaling_factor,
+ std::vector<int64_t> &zp)
{
float scaling_factor_inv{0};
@@ -334,16 +409,18 @@ void int16_quant_bias_per_channel(CircleConst *node, float input_scale,
scaling_factor[i] = input_scale * weight_scale[i];
scaling_factor_inv = (scaling_factor[i] == 0) ? 0 : 1.0 / scaling_factor[i];
quantized_values[i] =
- static_cast<int64_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv));
+ static_cast<int64_t>(std::round(node->at<loco::DataType::FLOAT32>(i) * scaling_factor_inv));
zp[i] = 0;
}
- node->dtype(loco::DataType::S64); // change the type of tensor
- node->size<loco::DataType::S64>(size); // resize tensor
+ auto new_bias = create_empty_const_from<loco::DataType::S64>(node, size);
+
for (uint32_t i = 0; i < size; ++i)
{
- node->at<loco::DataType::S64>(i) = quantized_values[i];
+ new_bias->at<loco::DataType::S64>(i) = quantized_values[i];
}
+
+ return new_bias;
}
bool has_min_max(const CircleNode *node)
@@ -362,42 +439,22 @@ void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_facto
uint32_t size = node->size<loco::DataType::FLOAT32>();
std::vector<int32_t> quantized_values(size);
- loco::TensorShape dimension;
- dimension.rank(4);
- uint32_t indices[4] = {
- 0,
+ auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int32_t channel_dim_index) {
+ int channel_idx = indices[channel_dim_index];
+ const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
+ auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+ quantized_values[cal_offset(dimension, indices)] =
+ static_cast<int32_t>(std::round(data * scaling_factor_inv));
};
- if (!get_channel_dim_index(node, dimension, channel_dim_index))
- {
- assert(false);
- return;
- }
-
- for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
- {
- for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
- {
- for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
- {
- for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
- {
- int channel_idx = indices[channel_dim_index];
- const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
- auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
- quantized_values[cal_offset(dimension, indices)] =
- static_cast<int32_t>(std::round(data * scaling_factor_inv));
- }
- }
- }
- }
+ iterate_per_channel(node, channel_dim_index, quantize);
node->dtype(loco::DataType::S16); // change the type of tensor
node->size<loco::DataType::S16>(size); // resize tensor
for (uint32_t i = 0; i < size; ++i)
{
node->at<loco::DataType::S16>(i) =
- std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
+ std::min(kMaxScale, std::max(kMinScale, quantized_values[i]));
}
}
@@ -412,35 +469,15 @@ void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min,
uint32_t size = node->size<loco::DataType::FLOAT32>();
std::vector<int32_t> quantized_values(size);
- loco::TensorShape dimension;
- dimension.rank(4);
- uint32_t indices[4] = {
- 0,
+ auto quantize = [&](uint32_t *indices, loco::TensorShape &dimension, int32_t channel_dim_index) {
+ int channel_idx = indices[channel_dim_index];
+ const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
+ auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
+ quantized_values[cal_offset(dimension, indices)] =
+ static_cast<int32_t>(std::round((data - min[channel_idx]) * scaling_factor_inv));
};
- if (!get_channel_dim_index(node, dimension, channel_dim_index))
- {
- assert(false);
- return;
- }
-
- for (indices[0] = 0; indices[0] < dimension.dim(0).value(); indices[0]++)
- {
- for (indices[1] = 0; indices[1] < dimension.dim(1).value(); indices[1]++)
- {
- for (indices[2] = 0; indices[2] < dimension.dim(2).value(); indices[2]++)
- {
- for (indices[3] = 0; indices[3] < dimension.dim(3).value(); indices[3]++)
- {
- int channel_idx = indices[channel_dim_index];
- const float scaling_factor_inv = 1.0 / scaling_factor[channel_idx];
- auto data = node->at<loco::DataType::FLOAT32>(cal_offset(dimension, indices));
- quantized_values[cal_offset(dimension, indices)] =
- static_cast<int32_t>(std::round((data - min[channel_idx]) * scaling_factor_inv));
- }
- }
- }
- }
+ iterate_per_channel(node, channel_dim_index, quantize);
node->dtype(loco::DataType::U8); // change the type of tensor
node->size<loco::DataType::U8>(size); // resize tensor
@@ -473,6 +510,21 @@ void asym_wquant_per_layer(CircleConst *node, float min, float scaling_factor)
}
}
+void set_bias(luci::CircleNode *node, luci::CircleConst *bias)
+{
+ if (auto conv = dynamic_cast<CircleConv2D *>(node))
+ conv->bias(bias);
+ else if (auto dconv = dynamic_cast<CircleDepthwiseConv2D *>(node))
+ dconv->bias(bias);
+ else if (auto tconv = dynamic_cast<CircleTransposeConv *>(node))
+ tconv->bias(bias);
+ else if (auto fc = dynamic_cast<CircleFullyConnected *>(node))
+ fc->bias(bias);
+ else
+ throw std::runtime_error("Only convolution, depthwise convolution, transposed convolution, and "
+ "fully-connected layer have bias");
+}
+
/**
* @brief QuantizeActivation quantizes tensors for activations
* @details Quantize using recorded min/max values
@@ -480,7 +532,7 @@ void asym_wquant_per_layer(CircleConst *node, float min, float scaling_factor)
struct QuantizeActivation final : public luci::CircleNodeMutableVisitor<bool>
{
QuantizeActivation(loco::DataType input, loco::DataType output)
- : input_type(input), output_type(output)
+ : input_type(input), output_type(output)
{
}
@@ -503,8 +555,12 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor<bool>
continue;
// Check if this is bias (bias is quantized later)
- auto iw = get_input_weight_of_bias(circle_node);
- if (iw.first != nullptr && iw.second != nullptr)
+ auto iwo = get_input_weight_output_of_bias(circle_node);
+ if (iwo.size() > 0)
+ continue;
+
+ // Check if this is bool type (bool type is not quantized)
+ if (circle_node->dtype() == loco::DataType::BOOL)
continue;
// Check if this is activation
@@ -547,7 +603,7 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor<bool>
struct QuantizeBias final : public luci::CircleNodeMutableVisitor<bool>
{
QuantizeBias(loco::DataType input, loco::DataType output, QuantizationGranularity gr)
- : input_type(input), output_type(output), granularity(gr)
+ : input_type(input), output_type(output), granularity(gr)
{
}
@@ -562,65 +618,77 @@ struct QuantizeBias final : public luci::CircleNodeMutableVisitor<bool>
if (is_quantized(node))
return false;
- // Check if this is bias
- auto iw = get_input_weight_of_bias(node);
- if (iw.first == nullptr || iw.second == nullptr)
- return false;
-
- auto input = loco::must_cast<luci::CircleNode *>(iw.first);
- auto weight = loco::must_cast<luci::CircleNode *>(iw.second);
+ auto iwo_list = get_input_weight_output_of_bias(node);
- if (granularity == QuantizationGranularity::ChannelWise)
+ for (auto iwo : iwo_list)
{
- assert(input->quantparam()->scale.size() == 1); // input scale's layer-wise
- auto input_scale = input->quantparam()->scale[0];
+ assert(iwo.size() == 3);
- assert(weight->quantparam() != nullptr); // weight scale's channel-wise
- auto weight_scale = weight->quantparam()->scale;
+ auto input = loco::must_cast<luci::CircleNode *>(iwo[0]);
+ auto weight = loco::must_cast<luci::CircleNode *>(iwo[1]);
+ auto output = loco::must_cast<luci::CircleNode *>(iwo[2]);
- auto circle_const = loco::must_cast<luci::CircleConst *>(node);
+ auto const_bias = loco::must_cast<luci::CircleConst *>(node);
+ assert(const_bias->dtype() == loco::DataType::FLOAT32);
- uint32_t size = circle_const->size<loco::DataType::FLOAT32>();
- assert(size == weight_scale.size());
- std::vector<float> scaling_factor(size);
- std::vector<int64_t> zp(size);
+ CircleConst *new_bias = nullptr;
- if (output_type == loco::DataType::U8)
- {
- quant_bias_per_channel(circle_const, input_scale, weight_scale, scaling_factor, zp);
- }
- else if (output_type == loco::DataType::S16)
+ if (granularity == QuantizationGranularity::ChannelWise)
{
- int16_quant_bias_per_channel(circle_const, input_scale, weight_scale, scaling_factor, zp);
+ assert(input->quantparam()->scale.size() == 1); // input scale is layer-wise
+ auto input_scale = input->quantparam()->scale[0];
+
+ assert(weight->quantparam() != nullptr); // weight scale is channel-wise
+ auto weight_scale = weight->quantparam()->scale;
+
+ uint32_t size = const_bias->size<loco::DataType::FLOAT32>();
+ assert(size == weight_scale.size());
+ std::vector<float> scaling_factor(size);
+ std::vector<int64_t> zp(size);
+
+ if (output_type == loco::DataType::U8)
+ {
+ new_bias =
+ quant_bias_per_channel(const_bias, input_scale, weight_scale, scaling_factor, zp);
+ }
+ else if (output_type == loco::DataType::S16)
+ {
+ new_bias =
+ int16_quant_bias_per_channel(const_bias, input_scale, weight_scale, scaling_factor, zp);
+ }
+ else
+ {
+ throw std::runtime_error("Unsupported quantization type.");
+ }
+
+ auto quantparam = std::make_unique<CircleQuantParam>();
+ quantparam->scale = scaling_factor;
+ quantparam->zerop = zp;
+ assert(new_bias->quantparam() == nullptr); // bias should not be quantized before
+ new_bias->quantparam(std::move(quantparam));
+
+ set_bias(output, new_bias);
}
else
{
- throw std::runtime_error("Unsupported quantization type.");
- }
+ assert(input->quantparam()->scale.size() == 1); // Only support per-layer quant
+ auto input_scale = input->quantparam()->scale[0];
- auto quantparam = std::make_unique<CircleQuantParam>();
- quantparam->scale = scaling_factor;
- quantparam->zerop = zp;
- assert(circle_const->quantparam() == nullptr); // bias should not be quantized before
- circle_const->quantparam(std::move(quantparam));
- }
- else
- {
- assert(input->quantparam()->scale.size() == 1); // Only support per-layer quant
- auto input_scale = input->quantparam()->scale[0];
-
- assert(weight->quantparam()->scale.size() == 1); // Only support per-layer quant
- auto weight_scale = weight->quantparam()->scale[0];
-
- auto circle_const = loco::must_cast<luci::CircleConst *>(node);
- float scaling_factor{0};
- int64_t zp{0};
- asym_quant_bias_per_layer(circle_const, input_scale, weight_scale, &scaling_factor, &zp);
- auto quantparam = std::make_unique<CircleQuantParam>();
- quantparam->scale.push_back(scaling_factor);
- quantparam->zerop.push_back(zp);
- assert(circle_const->quantparam() == nullptr); // bias should not be quantized before
- circle_const->quantparam(std::move(quantparam));
+ assert(weight->quantparam()->scale.size() == 1); // Only support per-layer quant
+ auto weight_scale = weight->quantparam()->scale[0];
+
+ float scaling_factor{0};
+ int64_t zp{0};
+ new_bias =
+ asym_quant_bias_per_layer(const_bias, input_scale, weight_scale, &scaling_factor, &zp);
+ auto quantparam = std::make_unique<CircleQuantParam>();
+ quantparam->scale.push_back(scaling_factor);
+ quantparam->zerop.push_back(zp);
+ assert(new_bias->quantparam() == nullptr); // bias should not be quantized before
+ new_bias->quantparam(std::move(quantparam));
+
+ set_bias(output, new_bias);
+ }
}
return false;
}
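Condensed, the reworked QuantizeBias now does the following for every consumer of a float bias; quantize_for_consumer is a hypothetical stand-in for the per-layer and per-channel branches above:

for (auto iwo : get_input_weight_output_of_bias(node)) // one entry per consumer
{
  auto input = loco::must_cast<luci::CircleNode *>(iwo[0]);
  auto weight = loco::must_cast<luci::CircleNode *>(iwo[1]);
  auto output = loco::must_cast<luci::CircleNode *>(iwo[2]);

  // Out-of-place: a fresh S32/S64 const comes from create_empty_const_from,
  // so the original float bias stays intact for any other consumer that
  // needs a different input/weight scale.
  CircleConst *new_bias = quantize_for_consumer(const_bias, input, weight);
  set_bias(output, new_bias); // rewire only this consumer
}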
@@ -633,7 +701,7 @@ struct QuantizeBias final : public luci::CircleNodeMutableVisitor<bool>
struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
{
QuantizeWeights(loco::DataType input, loco::DataType output, QuantizationGranularity gr)
- : input_type(input), output_type(output), granularity(gr)
+ : input_type(input), output_type(output), granularity(gr)
{
}
@@ -641,116 +709,179 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
loco::DataType output_type;
QuantizationGranularity granularity;
- // Quantize input tensors of each node
- bool visit(luci::CircleNode *node)
+private:
+ void quantize_weights(luci::CircleConst *weights)
{
- LOGGER(l);
- INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl;
- auto arity = node->arity();
- for (uint32_t i = 0; i < arity; i++)
+ // Find min/max channel-wise
+ if (granularity == QuantizationGranularity::ChannelWise)
{
- auto input_node = node->arg(i);
- auto circle_node = loco::must_cast<luci::CircleNode *>(input_node);
+ auto quantparam = weights->quantparam();
+ if (quantparam == nullptr)
+ {
+ assert(false && "quantparam is nullptr");
+ return;
+ }
- // Check if this is already quantized
- if (is_quantized(circle_node))
- continue;
+ auto min = quantparam->min;
+ auto scaling_factor = quantparam->scale;
+ int32_t channel_dim_index = 0;
- if (is_weights(circle_node))
+ if (output_type == loco::DataType::U8)
{
- auto circle_const = loco::must_cast<luci::CircleConst *>(circle_node);
-
- // Find min/max per channel-wise
- if (granularity == QuantizationGranularity::ChannelWise)
- {
- auto quantparam = circle_node->quantparam();
- if (quantparam == nullptr)
- {
- assert(false && "quantparam is nullptr");
- return false;
- }
-
- auto min = quantparam->min;
- auto scaling_factor = quantparam->scale;
- int32_t channel_dim_index = 0;
-
- if (output_type == loco::DataType::U8)
- {
- asym_wquant_per_channel(circle_const, min, scaling_factor, channel_dim_index);
- }
- else
- {
- sym_wquant_per_channel(circle_const, scaling_factor, channel_dim_index);
- }
- quantparam->min.clear();
- quantparam->max.clear();
- quantparam->quantized_dimension = channel_dim_index;
- }
- // Find min/max per layer-wise
- else
- {
- // Quantize using recorded quantparam
- auto quantparam = circle_node->quantparam();
- assert(quantparam != nullptr);
- assert(quantparam->min.size() == 1); // only support layer-wise quant
- assert(quantparam->scale.size() == 1); // only support layer-wise quant
- auto min = quantparam->min[0];
- auto scaling_factor = quantparam->scale[0];
- asym_wquant_per_layer(circle_const, min, scaling_factor);
- quantparam->min.clear();
- quantparam->max.clear();
- }
+ asym_wquant_per_channel(weights, min, scaling_factor, channel_dim_index);
+ }
+ else
+ {
+ sym_wquant_per_channel(weights, scaling_factor, channel_dim_index);
}
+ quantparam->min.clear();
+ quantparam->max.clear();
+ quantparam->quantized_dimension = channel_dim_index;
+ }
+  // Find min/max per layer
+ else
+ {
+ // Quantize using recorded quantparam
+ auto quantparam = weights->quantparam();
+ assert(quantparam != nullptr);
+ assert(quantparam->min.size() == 1); // only support layer-wise quant
+ assert(quantparam->scale.size() == 1); // only support layer-wise quant
+ auto min = quantparam->min[0];
+ auto scaling_factor = quantparam->scale[0];
+ asym_wquant_per_layer(weights, min, scaling_factor);
+ quantparam->min.clear();
+ quantparam->max.clear();
}
- return false;
}
-};
-void quant_instnorm(luci::CircleInstanceNorm *node, loco::DataType output_type,
- QuantizationGranularity granularity)
-{
- auto gamma = loco::must_cast<luci::CircleConst *>(node->gamma());
- auto beta = loco::must_cast<luci::CircleConst *>(node->beta());
- assert(gamma->dtype() == loco::DataType::FLOAT32);
- assert(beta->dtype() == loco::DataType::FLOAT32);
+ bool visit(luci::CircleConv2D *node)
+ {
+ LOGGER(l);
+ INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl;
- if (granularity == QuantizationGranularity::LayerWise)
+ auto weights = loco::must_cast<luci::CircleConst *>(node->filter());
+ if (!is_quantized(weights))
+ {
+ auto new_weights = luci::clone(weights);
+ node->filter(new_weights);
+ quantize_weights(new_weights);
+ return true;
+ }
+ return false;
+ }
+
+ bool visit(luci::CircleDepthwiseConv2D *node)
{
- quant_const(gamma, output_type);
- quant_const(beta, output_type);
+ LOGGER(l);
+ INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl;
+
+ auto weights = loco::must_cast<luci::CircleConst *>(node->filter());
+ if (!is_quantized(weights))
+ {
+ auto new_weights = luci::clone(weights);
+ node->filter(new_weights);
+ quantize_weights(new_weights);
+ return true;
+ }
+ return false;
}
- else if (granularity == QuantizationGranularity::ChannelWise)
+
+ bool visit(luci::CircleInstanceNorm *node)
{
- quant_const_per_channel(gamma, output_type);
- quant_const_per_channel(beta, output_type);
+ LOGGER(l);
+ INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl;
+
+ auto gamma = loco::must_cast<luci::CircleConst *>(node->gamma());
+ auto beta = loco::must_cast<luci::CircleConst *>(node->beta());
+
+ bool changed = false;
+ if (!is_quantized(gamma))
+ {
+ assert(gamma->dtype() == loco::DataType::FLOAT32);
+ auto new_gamma = luci::clone(gamma);
+ if (granularity == QuantizationGranularity::LayerWise)
+ quant_const(new_gamma, output_type);
+ else if (granularity == QuantizationGranularity::ChannelWise)
+ quant_const_per_channel(new_gamma, output_type);
+ node->gamma(new_gamma);
+ changed = true;
+ }
+ if (!is_quantized(beta))
+ {
+ assert(beta->dtype() == loco::DataType::FLOAT32);
+ auto new_beta = luci::clone(beta);
+ if (granularity == QuantizationGranularity::LayerWise)
+ quant_const(new_beta, output_type);
+ else if (granularity == QuantizationGranularity::ChannelWise)
+ quant_const_per_channel(new_beta, output_type);
+ node->beta(new_beta);
+ changed = true;
+ }
+
+ return changed;
}
- else
- throw std::runtime_error("Quantization granularity must be either 'layer' or 'channel'");
-}
-void quant_prelu(luci::CirclePRelu *node, loco::DataType output_type,
- QuantizationGranularity granularity)
-{
- auto alpha = loco::must_cast<luci::CircleConst *>(node->alpha());
- assert(alpha->dtype() == loco::DataType::FLOAT32);
+ bool visit(luci::CirclePRelu *node)
+ {
+ LOGGER(l);
+ INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl;
+
+ auto alpha = loco::must_cast<luci::CircleConst *>(node->alpha());
+
+ if (!is_quantized(alpha))
+ {
+ assert(alpha->dtype() == loco::DataType::FLOAT32);
+ auto new_alpha = luci::clone(alpha);
+ if (granularity == QuantizationGranularity::LayerWise)
+ quant_const(new_alpha, output_type);
+ else if (granularity == QuantizationGranularity::ChannelWise)
+ quant_const_per_channel(new_alpha, output_type);
+ node->alpha(new_alpha);
+ return true;
+ }
- if (granularity == QuantizationGranularity::LayerWise)
+ return false;
+ }
+
+ bool visit(luci::CircleTransposeConv *node)
{
- quant_const(alpha, output_type);
+ LOGGER(l);
+ INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl;
+
+ auto weights = loco::must_cast<luci::CircleConst *>(node->filter());
+ if (!is_quantized(weights))
+ {
+ auto new_weights = luci::clone(weights);
+ node->filter(new_weights);
+ quantize_weights(new_weights);
+ return true;
+ }
+ return false;
}
- else if (granularity == QuantizationGranularity::ChannelWise)
+
+ bool visit(luci::CircleFullyConnected *node)
{
- quant_const_per_channel(alpha, output_type);
+ LOGGER(l);
+ INFO(l) << "QuantizeWeights visit node: " << node->name() << std::endl;
+
+ auto weights = loco::must_cast<luci::CircleConst *>(node->weights());
+ if (!is_quantized(weights))
+ {
+ auto new_weights = luci::clone(weights);
+ node->weights(new_weights);
+ quantize_weights(new_weights);
+ return true;
+ }
+ return false;
}
- else
- throw std::runtime_error("Quantization granularity must be either 'layer' or 'channel'");
-}
+
+ bool visit(luci::CircleNode *) { return false; }
+};
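
Each visit method above clones the const before quantizing, so a filter shared with another (possibly still-float) consumer keeps its original data; the old const becomes a dead node if nothing else uses it, as the Concatenation diagram later in this diff illustrates. The per-layer path of quantize_weights corresponds, in the conventional asymmetric U8 scheme, to roughly the following sketch (scale/zero-point derivation shown for clarity; in the pass itself the scale is already recorded in quantparam):

  #include <algorithm>
  #include <cmath>
  #include <cstddef>
  #include <cstdint>
  #include <vector>

  // Per-layer asymmetric quantization to uint8: one (scale, zero point) pair
  // for the whole tensor, derived from the recorded min/max range.
  void asym_wquant_per_layer_sketch(const std::vector<float> &w, float min, float max,
                                    std::vector<uint8_t> &q, float &scale, int64_t &zp)
  {
    const float qmin = 0.0f, qmax = 255.0f;
    scale = (max - min) / (qmax - qmin);
    zp = static_cast<int64_t>(std::round(qmin - min / scale));
    q.resize(w.size());
    for (std::size_t i = 0; i < w.size(); ++i)
    {
      const float v = std::round(w[i] / scale) + static_cast<float>(zp);
      q[i] = static_cast<uint8_t>(std::min(qmax, std::max(qmin, v)));
    }
  }
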
/**
* @brief Quantize const input tensors using min/max of const values
*/
-void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type,
- QuantizationGranularity granularity)
+void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type)
{
auto opcode = node->opcode();
auto arity = node->arity();
@@ -763,6 +894,8 @@ void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type,
case luci::CircleOpcode::CONV_2D:
case luci::CircleOpcode::DEPTHWISE_CONV_2D:
case luci::CircleOpcode::FULLY_CONNECTED:
+ case luci::CircleOpcode::INSTANCE_NORM:
+ case luci::CircleOpcode::PRELU:
case luci::CircleOpcode::TRANSPOSE_CONV:
// Handled in QuantizeWeights and QuantizeBias
break;
@@ -771,8 +904,13 @@ void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type,
// Handled in propagate_concat_quantparam
break;
+ case luci::CircleOpcode::LOGICAL_OR:
+ // Inputs of logical Ops are bool, thus not quantized
+ break;
+
case luci::CircleOpcode::ARG_MAX:
case luci::CircleOpcode::ARG_MIN:
+ case luci::CircleOpcode::BATCH_TO_SPACE_ND:
case luci::CircleOpcode::MEAN:
case luci::CircleOpcode::PAD:
case luci::CircleOpcode::REDUCE_ANY:
@@ -783,6 +921,9 @@ void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type,
case luci::CircleOpcode::RESIZE_BILINEAR:
case luci::CircleOpcode::RESIZE_NEAREST_NEIGHBOR:
case luci::CircleOpcode::REVERSE_SEQUENCE:
+ case luci::CircleOpcode::SLICE:
+ case luci::CircleOpcode::SPACE_TO_BATCH_ND:
+ case luci::CircleOpcode::STRIDED_SLICE:
case luci::CircleOpcode::SUM:
case luci::CircleOpcode::TILE:
case luci::CircleOpcode::TOPK_V2:
@@ -791,41 +932,53 @@ void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type,
// Ex: axis, paddings
input_node = node->arg(0);
const_node = dynamic_cast<luci::CircleConst *>(input_node);
- if (const_node != nullptr)
+ if (const_node != nullptr && !is_quantized(const_node))
quant_const(const_node, output_type);
break;
- case luci::CircleOpcode::INSTANCE_NORM:
- quant_instnorm(loco::must_cast<luci::CircleInstanceNorm *>(node), output_type, granularity);
- break;
-
- case luci::CircleOpcode::PRELU:
- quant_prelu(loco::must_cast<luci::CirclePRelu *>(node), output_type, granularity);
- break;
-
case luci::CircleOpcode::ADD:
case luci::CircleOpcode::ADD_N:
+ case luci::CircleOpcode::DEPTH_TO_SPACE:
case luci::CircleOpcode::DIV:
+ case luci::CircleOpcode::ELU:
case luci::CircleOpcode::EQUAL:
+ case luci::CircleOpcode::FLOOR:
+ case luci::CircleOpcode::FLOOR_DIV:
case luci::CircleOpcode::GREATER:
case luci::CircleOpcode::GREATER_EQUAL:
case luci::CircleOpcode::LESS:
case luci::CircleOpcode::LESS_EQUAL:
+ case luci::CircleOpcode::LOGISTIC:
case luci::CircleOpcode::MAXIMUM:
case luci::CircleOpcode::MINIMUM:
case luci::CircleOpcode::MUL:
case luci::CircleOpcode::NOT_EQUAL:
+ case luci::CircleOpcode::POW:
+ case luci::CircleOpcode::RSQRT:
+ case luci::CircleOpcode::SOFTMAX:
+ case luci::CircleOpcode::SPACE_TO_DEPTH:
+ case luci::CircleOpcode::SQRT:
case luci::CircleOpcode::SUB:
+ case luci::CircleOpcode::TANH:
// Quantize all const inputs using their values
for (uint32_t i = 0; i < arity; i++)
{
input_node = node->arg(i);
const_node = dynamic_cast<luci::CircleConst *>(input_node);
- if (const_node != nullptr)
+ if (const_node != nullptr && !is_quantized(const_node))
quant_const(const_node, output_type);
}
break;
+ case luci::CircleOpcode::SPLIT:
+ // Only the second input is quantized
+ // First input should not be quantized (e.g., split_dim)
+ input_node = node->arg(1);
+ const_node = dynamic_cast<luci::CircleConst *>(input_node);
+ if (const_node != nullptr && !is_quantized(const_node))
+ quant_const(const_node, output_type);
+ break;
+
default:
for (uint32_t i = 0; i < arity; i++)
{
@@ -850,8 +1003,8 @@ void quantize_const_inputs(luci::CircleNode *node, loco::DataType output_type,
* (U8 qparam2)
*
* AFTER
- * [CircleNode] [CircleConst]
- * (U8 qparam2) (U8 qparam2)
+ * [CircleNode] [CircleConst] [CircleConst] <- Dead node
+ * (U8 qparam2) (U8 qparam2) (FP32)
* \ /
* \ /
* [CircleConcatenation]
@@ -871,7 +1024,11 @@ void propagate_concat_quantparam(luci::CircleConcatenation *concat, loco::DataTy
auto node = concat->arg(i);
auto const_node = dynamic_cast<luci::CircleConst *>(node);
if (const_node != nullptr)
- quant_const(const_node, quant_type);
+ {
+ auto new_const = luci::clone(const_node);
+ quant_const(new_const, quant_type);
+ concat->values(i, new_const);
+ }
}
return;
}
@@ -884,20 +1041,6 @@ void propagate_concat_quantparam(luci::CircleConcatenation *concat, loco::DataTy
if (node->opcode() == luci::CircleOpcode::CONCATENATION)
continue;
- // Skip if this input is used by other Ops
- auto succs = loco::succs(node);
- if (succs.size() != 1)
- {
- if (node->opcode() == luci::CircleOpcode::CIRCLECONST)
- {
- luci::CircleConst *const_node = loco::must_cast<luci::CircleConst *>(node);
- quant_const(const_node, quant_type);
- }
- continue;
- }
-
- assert(succs.find(concat) != succs.end());
-
// Quantize constant values
if (node->opcode() == luci::CircleOpcode::CIRCLECONST)
{
@@ -913,15 +1056,21 @@ void propagate_concat_quantparam(luci::CircleConcatenation *concat, loco::DataTy
const auto scaling_factor = concat_qparam->scale[0];
const auto zerop = concat_qparam->zerop[0];
- quant_const_values(const_node, scaling_factor, zerop, quant_type);
+ auto new_const = luci::clone(const_node);
+ quant_const_values(new_const, scaling_factor, zerop, quant_type);
+ concat->values(i, new_const);
+ overwrite_quantparam(concat, new_const);
}
else
{
+ const auto succs = loco::succs(node);
+ if (succs.size() > 1)
+ continue;
+
// Non-const input must have been quantized
assert(node->quantparam() != nullptr);
+ overwrite_quantparam(concat, node);
}
-
- overwrite_quantparam(concat, node);
}
}
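
The reworked loop above clones const inputs, quantizes them with the concat's scale and zero point, and then overwrites each input's quantparam so that every operand shares the concat's quantization domain and no requantization is needed at runtime. A simplified sketch of that overwrite step, using the CircleQuantParam fields (scale, zerop) seen earlier in this diff:

  #include <cassert>
  #include <luci/IR/CircleNodes.h>

  // Copy the concat's (scale, zero point) onto one of its inputs so both
  // tensors share a single quantization domain. Simplified sketch only.
  void overwrite_quantparam_sketch(luci::CircleNode *concat, luci::CircleNode *input)
  {
    auto src = concat->quantparam();
    auto dst = input->quantparam();
    assert(src != nullptr && dst != nullptr); // both must be quantized already
    dst->scale = src->scale;
    dst->zerop = src->zerop;
  }
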
@@ -954,13 +1103,6 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
circle_node->accept(&qb);
}
- // Quantize const inputs other than weights and bias
- for (auto node : loco::active_nodes(loco::output_nodes(g)))
- {
- auto circle_node = loco::must_cast<luci::CircleNode *>(node);
- quantize_const_inputs(circle_node, _output_dtype, _granularity);
- }
-
// Propagate quantization parameters of concat Op
for (auto node : loco::active_nodes(loco::output_nodes(g)))
{
@@ -976,6 +1118,13 @@ bool QuantizeWithMinMaxPass::run(loco::Graph *g)
propagate_concat_quantparam(concat, _output_dtype);
}
+ // Quantize const inputs other than weights and bias
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+ quantize_const_inputs(circle_node, _output_dtype);
+ }
+
// Update output dtype
auto graph_outputs = g->outputs();
for (auto node : loco::output_nodes(g))
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.test.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.test.cpp
new file mode 100644
index 000000000..75ec0cfd8
--- /dev/null
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.test.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/QuantizeWithMinMaxPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(QuantizeWithMinMaxPassTest, name)
+{
+ luci::QuantizeWithMinMaxPass pass(loco::DataType::FLOAT32, loco::DataType::U8,
+ luci::QuantizationGranularity::LayerWise);
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/QuantizedModelVerifier.cpp b/compiler/luci/pass/src/QuantizedModelVerifier.cpp
new file mode 100644
index 000000000..5ea803cc9
--- /dev/null
+++ b/compiler/luci/pass/src/QuantizedModelVerifier.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QuantizedModelVerifier.h"
+
+#include "VerifyQuantizedNodeLayerWiseGranularity.h"
+#include "VerifyQuantizedNodeChannelWiseGranularity.h"
+#include "VerifyQuantizedNodeU8Type.h"
+#include "VerifyQuantizedNodeS16Type.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+
+namespace luci
+{
+
+void QuantizedModelVerifier::verify(loco::Graph *g)
+{
+ if (_quantized_dtype != Type::U8 && _quantized_dtype != Type::S16)
+ throw std::runtime_error("Unsupported quantized dtype");
+
+ if (_granularity != Granularity::ChannelWise && _granularity != Granularity::LayerWise)
+ throw std::runtime_error("Unsupported granularity");
+
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+
+ // Verify Type
+ if (_quantized_dtype == Type::U8)
+ {
+ VerifyQuantizedNodeU8Type vt;
+ if (!circle_node->accept(&vt))
+ throw std::runtime_error("Wrong data type");
+ }
+ else if (_quantized_dtype == Type::S16)
+ {
+ VerifyQuantizedNodeS16Type vt;
+ if (!circle_node->accept(&vt))
+ throw std::runtime_error("Wrong data type");
+ }
+
+ // Verify Granularity
+ if (_granularity == Granularity::LayerWise)
+ {
+ VerifyQuantizedNodeLayerWiseGranularity vg;
+ if (!circle_node->accept(&vg))
+ throw std::runtime_error("Wrong granularity");
+ }
+ else if (_granularity == Granularity::ChannelWise)
+ {
+ VerifyQuantizedNodeChannelWiseGranularity vg;
+ if (!circle_node->accept(&vg))
+ throw std::runtime_error("Wrong granularity");
+ }
+ }
+}
+
+} // namespace luci
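
The intended usage is to run QuantizeWithMinMaxPass first and then let the verifier throw if any node ends up with the wrong dtype or granularity. The new tests below wrap exactly this sequence; a standalone sketch:

  #include "QuantizedModelVerifier.h"
  #include "luci/Pass/QuantizeWithMinMaxPass.h"

  // Quantize a float graph to U8/channel-wise, then verify the result.
  // verify() throws std::runtime_error if any node fails the checks.
  void quantize_and_check(loco::Graph *g)
  {
    luci::QuantizeWithMinMaxPass pass(loco::DataType::FLOAT32, loco::DataType::U8,
                                      luci::QuantizationGranularity::ChannelWise);
    pass.run(g);

    luci::QuantizedModelVerifier verifier(loco::DataType::U8,
                                          luci::QuantizationGranularity::ChannelWise);
    verifier.verify(g);
  }
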
diff --git a/compiler/luci/pass/src/QuantizedModelVerifier.h b/compiler/luci/pass/src/QuantizedModelVerifier.h
new file mode 100644
index 000000000..d5fbb8e74
--- /dev/null
+++ b/compiler/luci/pass/src/QuantizedModelVerifier.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_QUANTIZED_MODEL_VERIFIER_H__
+#define __LUCI_QUANTIZED_MODEL_VERIFIER_H__
+
+#include "luci/Pass/QuantizationParameters.h"
+
+#include <loco.h>
+
+namespace luci
+{
+
+/**
+ * @brief Class to verify quantized model
+ *
+ * TODO Move this to luci/service
+ */
+struct QuantizedModelVerifier
+{
+public:
+ QuantizedModelVerifier(loco::DataType quantized_dtype, QuantizationGranularity granularity)
+ : _quantized_dtype(quantized_dtype), _granularity(granularity)
+ {
+ }
+
+ void verify(loco::Graph *g);
+
+private:
+ loco::DataType _quantized_dtype;
+ QuantizationGranularity _granularity;
+};
+
+} // namespace luci
+
+#endif // __LUCI_QUANTIZED_MODEL_VERIFIER_H__
diff --git a/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp b/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp
new file mode 100644
index 000000000..eae1b0c1f
--- /dev/null
+++ b/compiler/luci/pass/src/QuantizedModelVerifier.test.cpp
@@ -0,0 +1,1668 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QuantizedModelVerifier.h"
+
+#include "luci/Pass/QuantizeWithMinMaxPass.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <gtest/gtest.h>
+
+using Type = loco::DataType;
+using Granularity = luci::QuantizationGranularity;
+
+namespace
+{
+
+/**
+ * @brief A helper function to create a dummy const node
+ */
+template <Type T> luci::CircleConst *create_dummy_const(loco::Graph *g, luci::test::ShapeU32 shape)
+{
+ auto node = g->nodes()->create<luci::CircleConst>();
+ {
+ node->dtype(T);
+ node->shape(shape);
+ node->size<T>(luci::test::num_elements(shape));
+
+    for (uint32_t i = 0; i < luci::test::num_elements(shape); i++)
+ {
+ // DESIGN NOTE
+ //
+      // Filling with any random numbers is fine
+      // Q. Should it include negative numbers?
+ switch (T)
+ {
+ case Type::FLOAT32:
+ // Fill with index
+ node->at<T>(i) = static_cast<float>(i);
+ break;
+ case Type::BOOL:
+ // Fill by flip
+        node->at<T>(i) = (i % 2) != 0;
+ break;
+ case Type::U8:
+ // Fill with index
+ node->at<T>(i) = static_cast<uint8_t>(i);
+ break;
+ case Type::S16:
+ // Fill with index
+ node->at<T>(i) = static_cast<int16_t>(i);
+ break;
+ }
+ }
+ }
+
+ return node;
+}
+
+/**
+ * @brief A helper function to create a const node with given values
+ */
+template <Type DT, typename T>
+luci::CircleConst *create_const(loco::Graph *g, luci::test::ShapeU32 shape,
+ std::initializer_list<T> values)
+{
+ auto node = g->nodes()->create<luci::CircleConst>();
+ {
+ node->dtype(DT);
+ node->shape(shape);
+ node->size<DT>(luci::test::num_elements(shape));
+
+ assert(values.size() == node->size<DT>());
+
+ uint32_t index = 0;
+ for (auto val : values)
+ {
+ node->at<DT>(index++) = static_cast<T>(val);
+ }
+ }
+
+ return node;
+}
+
+void insert_scale_zp(luci::CircleNode *node, float scale, int64_t zp)
+{
+ auto qparam = node->quantparam();
+ assert(qparam != nullptr); // FIX_CALLER_UNLESS
+ qparam->scale.push_back(scale);
+ qparam->zerop.push_back(zp);
+}
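
insert_scale_zp appends a single (scale, zero point) pair, which is what the layer-wise granularity checks expect. Under the affine scheme used throughout, a quantized value decodes as follows:

  // Affine de-quantization: r = scale * (q - zero_point).
  // Example: scale = 0.5, zero_point = 10  =>  q = 12 encodes r = 0.5 * (12 - 10) = 1.0
  inline float dequantize(int64_t q, float scale, int64_t zero_point)
  {
    return scale * static_cast<float>(q - zero_point);
  }
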
+
+void quantize_and_verify(loco::Graph *g, Type quantized_dtype, Granularity granularity)
+{
+ luci::QuantizeWithMinMaxPass pass(Type::FLOAT32, quantized_dtype, granularity);
+ pass.run(g);
+
+ luci::QuantizedModelVerifier verifier(quantized_dtype, granularity);
+ verifier.verify(g);
+}
+
+// Helper function to reduce duplicated test code
+// Assumption: g->output()->from() is the target node
+void quantize_and_verify_with_wrong_type(luci::test::TestIOGraph *g, Type quantized_dtype,
+ Granularity granularity, Type wrong_dtype)
+{
+ luci::QuantizeWithMinMaxPass pass(Type::FLOAT32, quantized_dtype, granularity);
+ pass.run(g->g());
+
+ auto node = loco::must_cast<luci::CircleNode *>(g->output()->from());
+ node->dtype(wrong_dtype);
+
+ luci::QuantizedModelVerifier verifier(quantized_dtype, granularity);
+ verifier.verify(g->g());
+}
+
+// Helper function to reduce duplicated test code
+// Assumption: g->output()->from() is the target node
+void quantize_and_verify_with_wrong_granularity(luci::test::TestIOGraph *g, Type quantized_dtype,
+ Granularity granularity)
+{
+ luci::QuantizeWithMinMaxPass pass(Type::FLOAT32, quantized_dtype, granularity);
+ pass.run(g->g());
+
+ auto node = loco::must_cast<luci::CircleNode *>(g->output()->from());
+ insert_scale_zp(node, 1.0, 1);
+
+ luci::QuantizedModelVerifier verifier(quantized_dtype, granularity);
+ verifier.verify(g->g());
+}
+
+// Helper function to reduce duplicated test code
+void quantize_and_verify_with_wrong_granularity(luci::test::TestIOGraph *g, Type quantized_dtype,
+ Granularity granularity, luci::CircleNode *target)
+{
+ luci::QuantizeWithMinMaxPass pass(Type::FLOAT32, quantized_dtype, granularity);
+ pass.run(g->g());
+
+ insert_scale_zp(target, 1.0, 1);
+
+ luci::QuantizedModelVerifier verifier(quantized_dtype, granularity);
+ verifier.verify(g->g());
+}
+
+// Set min/max for all non-const nodes in the graph
+void set_minmax_to_non_const(loco::Graph *g, float min, float max)
+{
+ for (auto node : loco::all_nodes(g))
+ {
+ auto const_node = dynamic_cast<luci::CircleConst *>(node);
+ if (const_node != nullptr)
+ continue;
+
+ // Min/Max is not recorded for ArgMax
+ // See MinMaxObserver.cpp in record_minmax module
+ auto argmax_node = dynamic_cast<luci::CircleArgMax *>(node);
+ if (argmax_node != nullptr)
+ continue;
+
+ // Min/Max is not recorded for Split
+ // See MinMaxObserver.cpp in record_minmax module
+ auto split_node = dynamic_cast<luci::CircleSplit *>(node);
+ if (split_node != nullptr)
+ continue;
+
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+ auto qparam = std::make_unique<luci::CircleQuantParam>();
+ {
+ qparam->min.emplace_back(min);
+ qparam->max.emplace_back(max);
+ }
+ circle_node->quantparam(std::move(qparam));
+ }
+}
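
set_minmax_to_non_const records only the float range; QuantizeWithMinMaxPass later converts it to (scale, zero point). With min = -1 and max = 1 in U8, for instance, the usual affine derivation gives scale = 2/255 ≈ 0.00784 and zero_point ≈ 128 (exact rounding may differ by implementation).
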
+
+/**
+ * @brief Simple Test Graph
+ * @note
+ * The simple test graph's nodes are initialized with
+ * simple shapes and values.
+ */
+class SimpleTestGraph : public luci::test::TestIOGraph
+{
+public:
+ virtual void init(void) = 0;
+};
+
+class InstanceNormTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _gamma = create_dummy_const<Type::FLOAT32>(g(), {32});
+ _beta = create_dummy_const<Type::FLOAT32>(g(), {32});
+ _instnorm = g()->nodes()->create<luci::CircleInstanceNorm>();
+ {
+ _instnorm->input(input());
+ _instnorm->gamma(_gamma);
+ _instnorm->beta(_beta);
+ }
+ output()->from(_instnorm);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ loco::Node *gamma(void) const { return _instnorm->gamma(); }
+ loco::Node *beta(void) const { return _instnorm->beta(); }
+
+public:
+ luci::CircleInstanceNorm *_instnorm = nullptr;
+ luci::CircleConst *_input = nullptr;
+ luci::CircleConst *_gamma = nullptr;
+ luci::CircleConst *_beta = nullptr;
+};
+
+class LogisticTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _logistic = g()->nodes()->create<luci::CircleLogistic>();
+ {
+ _logistic->x(input());
+ }
+ output()->from(_logistic);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleLogistic *_logistic = nullptr;
+};
+
+class SoftmaxTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _softmax = g()->nodes()->create<luci::CircleSoftmax>();
+ {
+ _softmax->logits(input());
+      _softmax->beta(0.1f);
+ }
+ output()->from(_softmax);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleSoftmax *_softmax = nullptr;
+};
+
+class SpaceToBatchNDTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({1, 2, 2, 1}, {4, 1, 1, 1});
+ _block_shape = create_dummy_const<Type::S32>(g(), {2});
+ for (uint32_t i = 0; i < 2; i++)
+ _block_shape->at<Type::S32>(i) = 2;
+
+ _paddings = create_dummy_const<Type::S32>(g(), {2, 2});
+ for (uint32_t i = 0; i < 4; i++)
+ _paddings->at<Type::S32>(i) = 0;
+
+ _stob = g()->nodes()->create<luci::CircleSpaceToBatchND>();
+ {
+ _stob->input(input());
+ _stob->block_shape(_block_shape);
+ _stob->paddings(_paddings);
+ }
+ output()->from(_stob);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleSpaceToBatchND *_stob = nullptr;
+ luci::CircleConst *_block_shape = nullptr;
+ luci::CircleConst *_paddings = nullptr;
+};
+
+class SpaceToDepthTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({1, 2, 2, 1}, {1, 1, 1, 4});
+ _stod = g()->nodes()->create<luci::CircleSpaceToDepth>();
+ {
+ _stod->input(input());
+ _stod->block_size(2);
+ }
+ output()->from(_stod);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleSpaceToDepth *_stod = nullptr;
+};
+
+template <Type indexT> class SliceTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _begin = g()->nodes()->create<luci::CircleConst>();
+ {
+ _begin->dtype(indexT);
+ }
+ _size = g()->nodes()->create<luci::CircleConst>();
+ {
+ _size->dtype(indexT);
+ }
+ _slice = g()->nodes()->create<luci::CircleSlice>();
+ {
+ _slice->input(input());
+ _slice->begin(_begin);
+ _slice->size(_size);
+ }
+ output()->from(_slice);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleSlice *_slice = nullptr;
+ luci::CircleConst *_begin = nullptr;
+ luci::CircleConst *_size = nullptr;
+};
+
+class SplitTestGraph final : public luci::test::TestIOGraph
+{
+public:
+ void init(void)
+ {
+ TestIOGraph::init({1, 32}, {32});
+ _split_dim = create_dummy_const<Type::S32>(g(), {1});
+ _split = g()->nodes()->create<luci::CircleSplit>();
+ {
+ _split->input(input());
+ _split->split_dim(_split_dim);
+ }
+ _split_o1 = g()->nodes()->create<luci::CircleSplitOut>();
+ {
+ _split_o1->input(_split);
+ _split_o1->index(0);
+ }
+
+ output()->from(_split_o1);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleSplit *_split = nullptr;
+ luci::CircleSplitOut *_split_o1 = nullptr;
+ luci::CircleConst *_split_dim = nullptr;
+};
+
+class StridedSliceTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _begin = g()->nodes()->create<luci::CircleConst>();
+ {
+ _begin->dtype(Type::S32);
+ }
+ _end = g()->nodes()->create<luci::CircleConst>();
+ {
+ _end->dtype(Type::S32);
+ }
+ _strides = g()->nodes()->create<luci::CircleConst>();
+ {
+ _strides->dtype(Type::S32);
+ }
+ _slice = g()->nodes()->create<luci::CircleStridedSlice>();
+ {
+ _slice->input(input());
+ _slice->begin(_begin);
+ _slice->end(_end);
+ _slice->strides(_strides);
+ }
+ output()->from(_slice);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleStridedSlice *_slice = nullptr;
+ luci::CircleConst *_begin = nullptr;
+ luci::CircleConst *_end = nullptr;
+ luci::CircleConst *_strides = nullptr;
+};
+
+class ReshapeTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _shape = g()->nodes()->create<luci::CircleConst>();
+ {
+ _shape->dtype(Type::S32);
+ }
+ _reshape = g()->nodes()->create<luci::CircleReshape>();
+ {
+ _reshape->tensor(input());
+ _reshape->shape(_shape);
+ }
+ output()->from(_reshape);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleReshape *_reshape = nullptr;
+ luci::CircleConst *_shape = nullptr;
+};
+
+class TanhTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _tanh = g()->nodes()->create<luci::CircleTanh>();
+ {
+ _tanh->x(input());
+ }
+ output()->from(_tanh);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleTanh *_tanh = nullptr;
+};
+
+class FloorTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _floor = g()->nodes()->create<luci::CircleFloor>();
+ {
+ _floor->x(input());
+ }
+ output()->from(_floor);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleFloor *_floor = nullptr;
+};
+
+template <Type indexT> class ArgMaxTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {1});
+ // output dtype is float by default, but ArgMax should have indexType (s32/s64)
+ output()->dtype(indexT);
+ _dimension = g()->nodes()->create<luci::CircleConst>();
+ {
+ _dimension->dtype(indexT);
+ }
+ _argmax = g()->nodes()->create<luci::CircleArgMax>();
+ {
+ _argmax->input(input());
+ _argmax->dimension(_dimension);
+ _argmax->output_type(indexT);
+ _argmax->dtype(indexT);
+ }
+ output()->from(_argmax);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleArgMax *_argmax = nullptr;
+ luci::CircleConst *_dimension = nullptr;
+};
+
+class BatchToSpaceNDTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _block_shape = g()->nodes()->create<luci::CircleConst>();
+ {
+ _block_shape->dtype(Type::S32);
+ }
+ _crops = g()->nodes()->create<luci::CircleConst>();
+ {
+ _crops->dtype(Type::S32);
+ }
+ _btos = g()->nodes()->create<luci::CircleBatchToSpaceND>();
+ {
+ _btos->input(input());
+ _btos->block_shape(_block_shape);
+ _btos->crops(_crops);
+ }
+ output()->from(_btos);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleBatchToSpaceND *_btos = nullptr;
+ luci::CircleConst *_block_shape = nullptr;
+ luci::CircleConst *_crops = nullptr;
+};
+
+class DepthToSpaceTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({1, 1, 1, 4}, {1, 2, 2, 1});
+ _dtos = g()->nodes()->create<luci::CircleDepthToSpace>();
+ {
+ _dtos->input(input());
+ _dtos->block_size(2);
+ }
+ output()->from(_dtos);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleDepthToSpace *_dtos = nullptr;
+};
+
+class PadTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _paddings = g()->nodes()->create<luci::CircleConst>();
+ {
+ _paddings->dtype(Type::S32);
+ }
+ _pad = g()->nodes()->create<luci::CirclePad>();
+ {
+ _pad->input(input());
+ _pad->paddings(_paddings);
+ }
+ output()->from(_pad);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CirclePad *_pad = nullptr;
+ luci::CircleConst *_paddings = nullptr;
+};
+
+class TransposeTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _perm = g()->nodes()->create<luci::CircleConst>();
+ {
+ _perm->dtype(Type::S32);
+ }
+ _transpose = g()->nodes()->create<luci::CircleTranspose>();
+ {
+ _transpose->a(input());
+ _transpose->perm(_perm);
+ }
+ output()->from(_transpose);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleTranspose *_transpose = nullptr;
+ luci::CircleConst *_perm = nullptr;
+};
+
+class ConcatenationTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({16}, {32});
+ _param = create_dummy_const<Type::FLOAT32>(g(), {16});
+ _concat = g()->nodes()->create<luci::CircleConcatenation>(2);
+ {
+ _concat->values(0, input());
+ _concat->values(1, _param);
+ _concat->axis(0);
+ }
+ output()->from(_concat);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleConcatenation *_concat = nullptr;
+ luci::CircleConst *_param = nullptr;
+};
+
+// Test graph for comparison Ops
+// GREATER, GREATER_EQUAL, LESS, LESS_EQUAL, EQUAL, NOT_EQUAL
+template <class Op> class ComparisonOpTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ output()->dtype(loco::DataType::BOOL);
+ _y = create_dummy_const<Type::FLOAT32>(g(), {32});
+ _op = g()->nodes()->create<Op>();
+ {
+ _op->x(input());
+ _op->y(_y);
+ _op->dtype(loco::DataType::BOOL);
+ }
+ output()->from(_op);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+ loco::Node *x(void) const { return _op->x(); }
+ loco::Node *y(void) const { return _op->y(); }
+
+public:
+ Op *_op = nullptr;
+ luci::CircleConst *_y = nullptr;
+};
+
+// Test graph for binary logical Ops
+// LOGICAL_OR, LOGICAL_AND
+template <class Op> class BinaryLogicalOpTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ input()->dtype(loco::DataType::BOOL);
+ output()->dtype(loco::DataType::BOOL);
+ _y = create_dummy_const<Type::BOOL>(g(), {32});
+ _op = g()->nodes()->create<Op>();
+ {
+ _op->x(input());
+ _op->y(_y);
+ _op->dtype(loco::DataType::BOOL);
+ }
+ output()->from(_op);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+ loco::Node *x(void) const { return _op->x(); }
+ loco::Node *y(void) const { return _op->y(); }
+
+public:
+ Op *_op = nullptr;
+ luci::CircleConst *_y = nullptr;
+};
+
+class DivTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+
+ _const = create_dummy_const<Type::FLOAT32>(g(), {32});
+ _div = g()->nodes()->create<luci::CircleDiv>();
+ {
+ _div->x(input());
+ _div->y(_const);
+ }
+ output()->from(_div);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+ loco::Node *x() { return _div->x(); }
+
+ loco::Node *y() { return _div->y(); }
+
+private:
+ luci::CircleDiv *_div = nullptr;
+ luci::CircleConst *_const = nullptr;
+};
+
+class FloorDivTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+
+ _const = create_dummy_const<Type::FLOAT32>(g(), {32});
+ _floor_div = g()->nodes()->create<luci::CircleFloorDiv>();
+ {
+ _floor_div->x(input());
+ _floor_div->y(_const);
+ }
+ output()->from(_floor_div);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+ loco::Node *x() { return _floor_div->x(); }
+
+ loco::Node *y() { return _floor_div->y(); }
+
+private:
+ luci::CircleFloorDiv *_floor_div = nullptr;
+ luci::CircleConst *_const = nullptr;
+};
+
+class RsqrtTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _rsqrt = g()->nodes()->create<luci::CircleRsqrt>();
+ {
+ _rsqrt->x(input());
+ }
+ output()->from(_rsqrt);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleRsqrt *_rsqrt = nullptr;
+};
+
+class SqrtTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _sqrt = g()->nodes()->create<luci::CircleSqrt>();
+ {
+ _sqrt->x(input());
+ }
+ output()->from(_sqrt);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleSqrt *_sqrt = nullptr;
+};
+
+class EluTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+ _elu = g()->nodes()->create<luci::CircleElu>();
+ {
+ _elu->features(input());
+ }
+ output()->from(_elu);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+public:
+ luci::CircleElu *_elu = nullptr;
+};
+
+class PowTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({32}, {32});
+
+ _const = create_dummy_const<Type::FLOAT32>(g(), {32});
+ _pow = g()->nodes()->create<luci::CirclePow>();
+ {
+ _pow->x(input());
+ _pow->y(_const);
+ }
+ output()->from(_pow);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+ loco::Node *x() { return _pow->x(); }
+
+ loco::Node *y() { return _pow->y(); }
+
+private:
+ luci::CirclePow *_pow = nullptr;
+ luci::CircleConst *_const = nullptr;
+};
+
+class ResizeBilinearTestGraph final : public SimpleTestGraph
+{
+public:
+ void init(void) override
+ {
+ TestIOGraph::init({1, 4, 4, 1}, {1, 8, 8, 1});
+
+ _size = create_const<Type::S32, int32_t>(g(), {2}, {8, 8});
+ _resize_bilinear = g()->nodes()->create<luci::CircleResizeBilinear>();
+ {
+ _resize_bilinear->input(input());
+ _resize_bilinear->size(_size);
+ }
+ output()->from(_resize_bilinear);
+
+ set_minmax_to_non_const(g(), -1, 1);
+ }
+
+private:
+ luci::CircleResizeBilinear *_resize_bilinear = nullptr;
+ luci::CircleConst *_size = nullptr;
+};
+
+} // namespace
+
+// Quantize and verify with given configurations
+#define TEST_WITH_GRAPH(graph, type, granularity) \
+ do \
+ { \
+ graph g; \
+ g.init(); \
+ EXPECT_NO_THROW(quantize_and_verify(g.g(), type, granularity)); \
+ } while (0)
+
+// Quantize and verify with wrong type
+#define TEST_WITH_WRONG_TYPE(graph, type, granularity, wrong_dtype) \
+ do \
+ { \
+ graph g; \
+ g.init(); \
+ EXPECT_ANY_THROW(quantize_and_verify_with_wrong_type(&g, type, granularity, wrong_dtype)); \
+ } while (0)
+
+// Quantize and verify with wrong granularity
+#define TEST_WITH_WRONG_GRANULARITY(graph, type, granularity) \
+ do \
+ { \
+ graph g; \
+ g.init(); \
+ EXPECT_ANY_THROW(quantize_and_verify_with_wrong_granularity(&g, type, granularity)); \
+ } while (0)
+
+// Quantize and verify with wrong granularity
+// Users can specify the test target
+#define TEST_WITH_WRONG_GRANULARITY_TARGET(graph, type, granularity, target) \
+ do \
+ { \
+ graph g; \
+ g.init(); \
+ auto node = loco::must_cast<luci::CircleNode *>(target); \
+ EXPECT_ANY_THROW(quantize_and_verify_with_wrong_granularity(&g, type, granularity, node)); \
+ } while (0)
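
Each macro body is wrapped in do { ... } while (0), the standard idiom that makes a multi-statement macro expand to a single statement, so invocations compose safely inside unbraced if/else branches and require the usual trailing semicolon.
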
+
+// Test a local helper function
+TEST(QuantizedModelVerifierTest, LocalCreateDummyConst)
+{
+ loco::Graph g;
+
+ EXPECT_NO_THROW(create_dummy_const<Type::FLOAT32>(&g, {32, 32}));
+}
+
+TEST(QuantizedModelVerifierTest, LocalCreateConst)
+{
+ loco::Graph g;
+ std::initializer_list<float> values = {0.1, 0, -5, 100};
+ luci::CircleConst *node = create_const<Type::FLOAT32, float>(&g, {2, 2}, values);
+
+ uint32_t index = 0;
+ for (auto val : values)
+ {
+ EXPECT_EQ(node->at<Type::FLOAT32>(index++), val);
+ }
+}
+
+TEST(QuantizedModelVerifierTest, InstanceNorm)
+{
+ TEST_WITH_GRAPH(InstanceNormTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(InstanceNormTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(InstanceNormTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, InstanceNorm_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(InstanceNormTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(InstanceNormTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(InstanceNormTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, InstanceNorm_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(InstanceNormTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(InstanceNormTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(InstanceNormTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Logistic)
+{
+ TEST_WITH_GRAPH(LogisticTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(LogisticTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(LogisticTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Logistic_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(LogisticTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(LogisticTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(LogisticTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Logistic_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(LogisticTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(LogisticTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(LogisticTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Softmax)
+{
+ TEST_WITH_GRAPH(SoftmaxTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(SoftmaxTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(SoftmaxTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Softmax_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(SoftmaxTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SoftmaxTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SoftmaxTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Softmax_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(SoftmaxTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(SoftmaxTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(SoftmaxTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, SpaceToBatchND)
+{
+ TEST_WITH_GRAPH(SpaceToBatchNDTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(SpaceToBatchNDTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(SpaceToBatchNDTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, SpaceToBatchND_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(SpaceToBatchNDTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SpaceToBatchNDTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SpaceToBatchNDTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, SpaceToBatchND_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(SpaceToBatchNDTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(SpaceToBatchNDTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(SpaceToBatchNDTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, SpaceToDepth)
+{
+ TEST_WITH_GRAPH(SpaceToDepthTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(SpaceToDepthTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(SpaceToDepthTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, SpaceToDepth_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(SpaceToDepthTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SpaceToDepthTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SpaceToDepthTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, SpaceToDepth_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(SpaceToDepthTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(SpaceToDepthTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(SpaceToDepthTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Slice)
+{
+ TEST_WITH_GRAPH(SliceTestGraph<Type::S32>, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(SliceTestGraph<Type::S32>, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(SliceTestGraph<Type::S32>, Type::S16, Granularity::ChannelWise);
+
+ TEST_WITH_GRAPH(SliceTestGraph<Type::S64>, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(SliceTestGraph<Type::S64>, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(SliceTestGraph<Type::S64>, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Slice_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(SliceTestGraph<Type::S32>, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SliceTestGraph<Type::S32>, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SliceTestGraph<Type::S32>, Type::S16, Granularity::ChannelWise, Type::U8);
+
+ TEST_WITH_WRONG_TYPE(SliceTestGraph<Type::S64>, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SliceTestGraph<Type::S64>, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SliceTestGraph<Type::S64>, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Slice_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(SliceTestGraph<Type::S32>, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(SliceTestGraph<Type::S32>, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(SliceTestGraph<Type::S32>, Type::S16, Granularity::ChannelWise);
+
+ TEST_WITH_WRONG_GRANULARITY(SliceTestGraph<Type::S64>, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(SliceTestGraph<Type::S64>, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(SliceTestGraph<Type::S64>, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Split)
+{
+ TEST_WITH_GRAPH(SplitTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(SplitTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(SplitTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Split_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(SplitTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SplitTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SplitTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Split_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(SplitTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(SplitTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(SplitTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, StridedSlice)
+{
+ TEST_WITH_GRAPH(StridedSliceTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(StridedSliceTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(StridedSliceTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, StridedSlice_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(StridedSliceTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(StridedSliceTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(StridedSliceTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, StridedSlice_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(StridedSliceTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(StridedSliceTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(StridedSliceTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, ArgMax)
+{
+ TEST_WITH_GRAPH(ArgMaxTestGraph<Type::S32>, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(ArgMaxTestGraph<Type::S32>, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(ArgMaxTestGraph<Type::S32>, Type::S16, Granularity::ChannelWise);
+
+ TEST_WITH_GRAPH(ArgMaxTestGraph<Type::S64>, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(ArgMaxTestGraph<Type::S64>, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(ArgMaxTestGraph<Type::S64>, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, ArgMax_wrong_dimension_type_NEG)
+{
+ ArgMaxTestGraph<Type::S32> g;
+ g.init();
+ luci::QuantizeWithMinMaxPass pass(Type::FLOAT32, Type::U8, Granularity::LayerWise);
+ pass.run(g.g());
+
+ g._dimension->dtype(Type::U8);
+
+ luci::QuantizedModelVerifier verifier(Type::U8, Granularity::LayerWise);
+ EXPECT_ANY_THROW(verifier.verify(g.g()));
+}
+
+TEST(QuantizedModelVerifierTest, ArgMax_wrong_input_granularity_NEG)
+{
+ ArgMaxTestGraph<Type::S32> g;
+ g.init();
+
+ luci::QuantizeWithMinMaxPass pass(Type::FLOAT32, Type::U8, Granularity::LayerWise);
+ pass.run(g.g());
+
+ insert_scale_zp(loco::must_cast<luci::CircleNode *>(g._argmax->input()), 1.0, 1);
+
+ luci::QuantizedModelVerifier verifier(Type::U8, Granularity::LayerWise);
+ EXPECT_ANY_THROW(verifier.verify(g.g()));
+}
+
+TEST(QuantizedModelVerifierTest, BatchToSpaceND)
+{
+ TEST_WITH_GRAPH(BatchToSpaceNDTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(BatchToSpaceNDTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(BatchToSpaceNDTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, BatchToSpaceND_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(BatchToSpaceNDTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(BatchToSpaceNDTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(BatchToSpaceNDTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, BatchToSpaceND_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(BatchToSpaceNDTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(BatchToSpaceNDTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(BatchToSpaceNDTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, DepthToSpace)
+{
+ TEST_WITH_GRAPH(DepthToSpaceTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(DepthToSpaceTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(DepthToSpaceTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, DepthToSpace_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(DepthToSpaceTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(DepthToSpaceTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(DepthToSpaceTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, DepthToSpace_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(DepthToSpaceTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(DepthToSpaceTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(DepthToSpaceTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Concatenation)
+{
+ TEST_WITH_GRAPH(ConcatenationTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(ConcatenationTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(ConcatenationTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Concatenation_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(ConcatenationTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(ConcatenationTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(ConcatenationTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Concatenation_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(ConcatenationTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(ConcatenationTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(ConcatenationTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, LogicalOr)
+{
+ TEST_WITH_GRAPH(BinaryLogicalOpTestGraph<luci::CircleLogicalOr>, Type::U8,
+ Granularity::LayerWise);
+ TEST_WITH_GRAPH(BinaryLogicalOpTestGraph<luci::CircleLogicalOr>, Type::U8,
+ Granularity::ChannelWise);
+ TEST_WITH_GRAPH(BinaryLogicalOpTestGraph<luci::CircleLogicalOr>, Type::S16,
+ Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, LogicalOr_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(BinaryLogicalOpTestGraph<luci::CircleLogicalOr>, Type::U8,
+ Granularity::LayerWise, Type::U8);
+ TEST_WITH_WRONG_TYPE(BinaryLogicalOpTestGraph<luci::CircleLogicalOr>, Type::U8,
+ Granularity::ChannelWise, Type::U8);
+ TEST_WITH_WRONG_TYPE(BinaryLogicalOpTestGraph<luci::CircleLogicalOr>, Type::S16,
+ Granularity::ChannelWise, Type::S16);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Reshape)
+{
+ TEST_WITH_GRAPH(ReshapeTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(ReshapeTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(ReshapeTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Reshape_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(ReshapeTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(ReshapeTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(ReshapeTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Reshape_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(ReshapeTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(ReshapeTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(ReshapeTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Tanh)
+{
+ TEST_WITH_GRAPH(TanhTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(TanhTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(TanhTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Tanh_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(TanhTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(TanhTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(TanhTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Tanh_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(TanhTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(TanhTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(TanhTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Pad)
+{
+ TEST_WITH_GRAPH(PadTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(PadTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(PadTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Pad_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(PadTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(PadTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(PadTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Pad_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(PadTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(PadTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(PadTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Transpose)
+{
+ TEST_WITH_GRAPH(TransposeTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(TransposeTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(TransposeTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Transpose_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(TransposeTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(TransposeTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(TransposeTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Transpose_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(TransposeTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(TransposeTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(TransposeTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Floor)
+{
+ TEST_WITH_GRAPH(FloorTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(FloorTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(FloorTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Floor_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(FloorTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(FloorTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(FloorTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Floor_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(FloorTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(FloorTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(FloorTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, GreaterEqual)
+{
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::U8,
+ Granularity::LayerWise);
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::U8,
+ Granularity::ChannelWise);
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::S16,
+ Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, GreaterEqual_wrong_type_NEG)
+{
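+  // NOTE Comparison Ops produce BOOL outputs, so even the matching numeric
+  //      dtype (U8/S16) counts as a wrong output type here.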
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::U8,
+ Granularity::LayerWise, Type::U8);
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::U8,
+ Granularity::ChannelWise, Type::U8);
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::S16,
+ Granularity::ChannelWise, Type::S16);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, GreaterEqual_wrong_granularity_NEG)
+{
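+  // NOTE `g` is the test-graph instance declared inside the macro; passing
+  //      g.x() or g.y() selects which input receives the broken granularity.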
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::U8,
+ Granularity::LayerWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::U8,
+ Granularity::ChannelWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::S16,
+ Granularity::ChannelWise, g.x());
+
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::U8,
+ Granularity::LayerWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::U8,
+ Granularity::ChannelWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreaterEqual>, Type::S16,
+ Granularity::ChannelWise, g.y());
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Greater)
+{
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleGreater>, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleGreater>, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleGreater>, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Greater_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleGreater>, Type::U8, Granularity::LayerWise,
+ Type::U8);
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleGreater>, Type::U8,
+ Granularity::ChannelWise, Type::U8);
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleGreater>, Type::S16,
+ Granularity::ChannelWise, Type::S16);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Greater_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreater>, Type::U8,
+ Granularity::LayerWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreater>, Type::U8,
+ Granularity::ChannelWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreater>, Type::S16,
+ Granularity::ChannelWise, g.x());
+
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreater>, Type::U8,
+ Granularity::LayerWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreater>, Type::U8,
+ Granularity::ChannelWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleGreater>, Type::S16,
+ Granularity::ChannelWise, g.y());
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, NotEqual)
+{
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, NotEqual_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::U8,
+ Granularity::LayerWise, Type::U8);
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::U8,
+ Granularity::ChannelWise, Type::U8);
+ TEST_WITH_WRONG_TYPE(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::S16,
+ Granularity::ChannelWise, Type::S16);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, NotEqual_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::U8,
+ Granularity::LayerWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::U8,
+ Granularity::ChannelWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::S16,
+ Granularity::ChannelWise, g.x());
+
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::U8,
+ Granularity::LayerWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::U8,
+ Granularity::ChannelWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(ComparisonOpTestGraph<luci::CircleNotEqual>, Type::S16,
+ Granularity::ChannelWise, g.y());
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Div)
+{
+ TEST_WITH_GRAPH(DivTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(DivTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(DivTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Div_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(DivTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(DivTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(DivTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Div_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY_TARGET(DivTestGraph, Type::U8, Granularity::LayerWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(DivTestGraph, Type::U8, Granularity::ChannelWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(DivTestGraph, Type::S16, Granularity::ChannelWise, g.x());
+
+ TEST_WITH_WRONG_GRANULARITY_TARGET(DivTestGraph, Type::U8, Granularity::LayerWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(DivTestGraph, Type::U8, Granularity::ChannelWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(DivTestGraph, Type::S16, Granularity::ChannelWise, g.y());
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, FloorDiv)
+{
+ TEST_WITH_GRAPH(FloorDivTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(FloorDivTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(FloorDivTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, FloorDiv_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(FloorDivTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(FloorDivTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(FloorDivTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, FloorDiv_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY_TARGET(FloorDivTestGraph, Type::U8, Granularity::LayerWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(FloorDivTestGraph, Type::U8, Granularity::ChannelWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(FloorDivTestGraph, Type::S16, Granularity::ChannelWise, g.x());
+
+ TEST_WITH_WRONG_GRANULARITY_TARGET(FloorDivTestGraph, Type::U8, Granularity::LayerWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(FloorDivTestGraph, Type::U8, Granularity::ChannelWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(FloorDivTestGraph, Type::S16, Granularity::ChannelWise, g.y());
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Rsqrt)
+{
+ TEST_WITH_GRAPH(RsqrtTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(RsqrtTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(RsqrtTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Rsqrt_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(RsqrtTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(RsqrtTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(RsqrtTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Rsqrt_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(RsqrtTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(RsqrtTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(RsqrtTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Sqrt)
+{
+ TEST_WITH_GRAPH(SqrtTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(SqrtTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(SqrtTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Sqrt_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(SqrtTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SqrtTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(SqrtTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Sqrt_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(SqrtTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(SqrtTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(SqrtTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Elu)
+{
+ TEST_WITH_GRAPH(EluTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(EluTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(EluTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Elu_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(EluTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(EluTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(EluTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Elu_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(EluTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(EluTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(EluTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Pow)
+{
+ TEST_WITH_GRAPH(PowTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(PowTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(PowTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Pow_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(PowTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(PowTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(PowTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, Pow_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY_TARGET(PowTestGraph, Type::U8, Granularity::LayerWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(PowTestGraph, Type::U8, Granularity::ChannelWise, g.x());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(PowTestGraph, Type::S16, Granularity::ChannelWise, g.x());
+
+ TEST_WITH_WRONG_GRANULARITY_TARGET(PowTestGraph, Type::U8, Granularity::LayerWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(PowTestGraph, Type::U8, Granularity::ChannelWise, g.y());
+ TEST_WITH_WRONG_GRANULARITY_TARGET(PowTestGraph, Type::S16, Granularity::ChannelWise, g.y());
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, ResizeBilinear)
+{
+ TEST_WITH_GRAPH(ResizeBilinearTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_GRAPH(ResizeBilinearTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_GRAPH(ResizeBilinearTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, ResizeBilinear_wrong_type_NEG)
+{
+ TEST_WITH_WRONG_TYPE(ResizeBilinearTestGraph, Type::U8, Granularity::LayerWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(ResizeBilinearTestGraph, Type::U8, Granularity::ChannelWise, Type::S16);
+ TEST_WITH_WRONG_TYPE(ResizeBilinearTestGraph, Type::S16, Granularity::ChannelWise, Type::U8);
+ SUCCEED();
+}
+
+TEST(QuantizedModelVerifierTest, ResizeBilinear_wrong_granularity_NEG)
+{
+ TEST_WITH_WRONG_GRANULARITY(ResizeBilinearTestGraph, Type::U8, Granularity::LayerWise);
+ TEST_WITH_WRONG_GRANULARITY(ResizeBilinearTestGraph, Type::U8, Granularity::ChannelWise);
+ TEST_WITH_WRONG_GRANULARITY(ResizeBilinearTestGraph, Type::S16, Granularity::ChannelWise);
+ SUCCEED();
+}
+
+#undef TEST_WITH_GRAPH
+#undef TEST_WITH_WRONG_TYPE
+#undef TEST_WITH_WRONG_GRANULARITY
+#undef TEST_WITH_WRONG_GRANULARITY_TARGET
diff --git a/compiler/luci/pass/src/RemoveRedundantReshape.cpp b/compiler/luci/pass/src/RemoveRedundantReshape.cpp
new file mode 100644
index 000000000..2f0b22ae6
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveRedundantReshape.cpp
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveRedundantReshapePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+
+bool remove_redundant_reshape(luci::CircleReshape *node)
+{
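+  // Fold only when the producer is itself a Reshape; the second Reshape can
+  // then read directly from the first Reshape's input tensor.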
+ auto pred_node = dynamic_cast<luci::CircleReshape *>(node->tensor());
+ if (pred_node == nullptr)
+ return false;
+
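+  // Bypass the predecessor; the first Reshape loses this user and is dropped
+  // later by dead-node cleanup if nothing else consumes it.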
+ node->tensor(pred_node->tensor());
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * BEFORE
+ *
+ * [CircleNode]
+ * |
+ * [CircleReshape_1]
+ * |
+ * [CircleReshape_2]
+ * |
+ * [CircleNode]
+ *
+ * AFTER
+ *
+ * [CircleNode]
+ * / \
+ * [CircleReshape_1] [CircleReshape_2]
+ * |
+ * [CircleNode]
+ **/
+bool RemoveRedundantReshapePass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(node))
+ {
+ if (remove_redundant_reshape(reshape_node))
+ changed = true;
+ }
+ }
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/RemoveRedundantReshape.test.cpp b/compiler/luci/pass/src/RemoveRedundantReshape.test.cpp
new file mode 100644
index 000000000..617840f3a
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveRedundantReshape.test.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci/Pass/RemoveRedundantReshapePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+class RemoveRedundantReshape : public ::testing::Test
+{
+public:
+ RemoveRedundantReshape() {}
+
+ void createReshapeConst(luci::CircleReshape *target, const std::vector<int32_t> shape)
+ {
+ auto shape_const = g.nodes()->create<luci::CircleConst>();
+ shape_const->dtype(loco::DataType::S32);
+ shape_const->size<loco::DataType::S32>(shape.size());
+ shape_const->shape_status(luci::ShapeStatus::VALID);
+ shape_const->rank(1);
+ shape_const->dim(0).set(shape.size());
+    for (uint32_t i = 0; i < shape.size(); i++)
+ {
+ shape_const->at<loco::DataType::S32>(i) = shape.at(i);
+ }
+ shape_const->name("shape_const");
+ target->shape(shape_const);
+ }
+
+ void buildGraph(const std::initializer_list<uint32_t> base_shape,
+ const std::vector<int32_t> first_shape, const std::vector<int32_t> second_shape)
+ {
+    // Create input.
+ input = g.nodes()->create<luci::CircleInput>();
+ auto graph_input = g.inputs()->create();
+ input->index(graph_input->index());
+ input->shape_status(luci::ShapeStatus::VALID);
+ input->rank(base_shape.size());
+ input->shape(base_shape);
+ input->name("input");
+
+ // Create first reshape.
+ first_reshape = g.nodes()->create<luci::CircleReshape>();
+ first_reshape->tensor(input);
+ first_reshape->name("Reshape");
+ createReshapeConst(first_reshape, first_shape);
+
+ // Create second reshape.
+ second_reshape = g.nodes()->create<luci::CircleReshape>();
+ second_reshape->tensor(first_reshape);
+ second_reshape->name("second_reshape");
+ createReshapeConst(second_reshape, second_shape);
+
+    // Connect output.
+ output = g.nodes()->create<luci::CircleOutput>();
+ output->from(second_reshape);
+ output->name("output");
+ auto graph_output = g.outputs()->create();
+ output->index(graph_output->index());
+ }
+
+public:
+ loco::Graph g;
+ luci::CircleInput *input = nullptr;
+ luci::CircleReshape *first_reshape = nullptr;
+ luci::CircleReshape *second_reshape = nullptr;
+ luci::CircleOutput *output = nullptr;
+};
+
+} // namespace
+
+TEST(RemoveRedundantReshapePassTest, name)
+{
+ luci::RemoveRedundantReshapePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST_F(RemoveRedundantReshape, simple_case)
+{
+ buildGraph({4, 6}, {-1, 4, 6}, {1, -1, 2, 3});
+ luci::RemoveRedundantReshapePass pass;
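+  // Run the pass to a fixed point: each iteration may expose a new
+  // back-to-back Reshape pair.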
+ while (pass.run(&g))
+ ;
+ int count = 0;
+ for (auto node : loco::active_nodes(loco::output_nodes(&g)))
+ {
+ if (auto reshape = dynamic_cast<luci::CircleReshape *>(node))
+ {
+ count++;
+ }
+ }
+ ASSERT_EQ(1, count);
+}
diff --git a/compiler/luci/pass/src/RemoveRedundantTranspose.test.cpp b/compiler/luci/pass/src/RemoveRedundantTranspose.test.cpp
deleted file mode 100644
index db608b674..000000000
--- a/compiler/luci/pass/src/RemoveRedundantTranspose.test.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "luci/Pass/RemoveRedundantTransposePass.h"
-
-#include <luci/IR/CircleNodes.h>
-
-#include <vector>
-
-#include <gtest/gtest.h>
-
-namespace
-{
-
-void setValue(luci::CircleConst *node, const std::vector<int> &v)
-{
- node->dtype(loco::DataType::S32);
- node->size<loco::DataType::S32>(v.size());
- node->rank(1);
- node->dim(0).set(v.size());
- for (int i = 0; i < v.size(); ++i)
- {
- node->at<loco::DataType::S32>(i) = v[i];
- }
-}
-
-/**
- * Type1
- * BEFORE
- * |
- * [CircleNode] [CircleConst]
- * \ /
- * [CircleTranspose] [CircleConst]
- * \ /
- * [CircleTranspose]
- * |
- *
- * AFTER
- * |
- * [CircleNode]
- * | Remove Both
- *
- * --------------------------------------------
- *
- * Type2
- * BEFORE
- * |
- * [CircleNode] [CircleConst]
- * \ /
- * [CircleTranspose] [CircleConst]
- * \ /
- * [CircleTranspose]
- * |
- *
- * AFTER
- * | |
- * [CircleNode] [CircleConst]
- * \ /
- * [CircleTranspose]
- * |
- *
- */
-void create_redundunt_transpose(loco::Graph *g, const std::vector<int32_t> &perm1,
- const std::vector<int32_t> &perm2)
-{
- assert(g);
-
- auto input = g->nodes()->create<luci::CircleInput>();
- auto graph_input = g->inputs()->create();
- input->index(graph_input->index());
-
- // Create perm1
- auto perm1_node = g->nodes()->create<luci::CircleConst>();
- setValue(perm1_node, perm1);
-
- auto transpose1 = g->nodes()->create<luci::CircleTranspose>();
- transpose1->dtype(loco::DataType::FLOAT32);
- transpose1->a(input);
- transpose1->perm(perm1_node);
-
- // Create perm2
- auto perm2_node = g->nodes()->create<luci::CircleConst>();
- setValue(perm2_node, perm2);
-
- auto transpose2 = g->nodes()->create<luci::CircleTranspose>();
- transpose2->dtype(loco::DataType::FLOAT32);
- transpose2->a(transpose1);
- transpose2->perm(perm2_node);
-
- // Output
- auto output = g->nodes()->create<luci::CircleOutput>();
- output->from(transpose2);
- auto graph_output = g->outputs()->create();
- output->index(graph_output->index());
-}
-
-} // namespace
-
-TEST(RemoveRedundantTransposePass, remove_consecutive_transpose_function_type1)
-{
- auto graph = loco::make_graph();
- create_redundunt_transpose(graph.get(), {1, 0, 2, 3}, {1, 0, 2, 3});
-
- luci::RemoveRedundantTransposePass pass;
- while (pass.run(graph.get()))
- ;
- luci::CircleTranspose *transpose_node = nullptr;
- for (auto node : loco::active_nodes(loco::output_nodes(graph.get())))
- {
- auto trans = dynamic_cast<luci::CircleTranspose *>(node);
- if (not trans)
- continue;
- transpose_node = trans;
- break;
- }
- // No transpose node is in graph.
- ASSERT_EQ(nullptr, transpose_node);
-}
-
-TEST(RemoveRedundantTransposePass, remove_consecutive_transpose_function_type2)
-{
- auto graph = loco::make_graph();
- create_redundunt_transpose(graph.get(), {0, 1, 3, 2}, {1, 0, 2, 3});
-
- luci::RemoveRedundantTransposePass pass;
- while (pass.run(graph.get()))
- ;
- luci::CircleTranspose *transpose_node = nullptr;
- for (auto node : loco::active_nodes(loco::output_nodes(graph.get())))
- {
- auto trans = dynamic_cast<luci::CircleTranspose *>(node);
- if (not trans)
- continue;
- transpose_node = trans;
- break;
- }
- // Just one transpose node, with updated perm constant.
- ASSERT_NE(nullptr, transpose_node);
- auto perm = loco::must_cast<luci::CircleConst *>(transpose_node->perm());
- ASSERT_EQ(1, perm->at<loco::DataType::S32>(0));
- ASSERT_EQ(0, perm->at<loco::DataType::S32>(1));
- ASSERT_EQ(3, perm->at<loco::DataType::S32>(2));
- ASSERT_EQ(2, perm->at<loco::DataType::S32>(3));
-}
diff --git a/compiler/luci/pass/src/RemoveRedundantTranspose.cpp b/compiler/luci/pass/src/RemoveRedundantTransposePass.cpp
index 33cb76520..71c51ecda 100644
--- a/compiler/luci/pass/src/RemoveRedundantTranspose.cpp
+++ b/compiler/luci/pass/src/RemoveRedundantTransposePass.cpp
@@ -17,6 +17,7 @@
#include "luci/Pass/RemoveRedundantTransposePass.h"
#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace
{
@@ -35,47 +36,54 @@ bool check_perm(const luci::CircleConst *first_perm, const luci::CircleConst *se
return true;
}
-bool remove_consecutive_transpose_function(luci::CircleNode *node)
+bool remove_consecutive_transpose_function(luci::CircleTranspose *target_node)
{
- auto target_node = dynamic_cast<luci::CircleTranspose *>(node);
- if (target_node == nullptr)
- return false;
auto pred_node = dynamic_cast<luci::CircleTranspose *>(target_node->a());
if (pred_node == nullptr)
return false;
- if (loco::succs(pred_node).size() != 1)
- return false;
- auto pred_perm = dynamic_cast<luci::CircleConst *>(target_node->perm());
- if (pred_perm == nullptr)
+ auto target_perm = dynamic_cast<luci::CircleConst *>(target_node->perm());
+ if (target_perm == nullptr)
return false;
- auto main_perm = dynamic_cast<luci::CircleConst *>(pred_node->perm());
- if (main_perm == nullptr)
+ auto pred_perm = dynamic_cast<luci::CircleConst *>(pred_node->perm());
+ if (pred_perm == nullptr)
return false;
auto main_node = loco::must_cast<luci::CircleNode *>(pred_node->a());
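+  // If the two permutations compose to the identity, both Transposes vanish.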
- if (check_perm(pred_perm, main_perm))
+ if (check_perm(target_perm, pred_perm))
{
- replace(node).with(main_node);
+ replace(target_node).with(main_node);
}
else
{
- auto g = main_perm->graph();
+ auto name = target_node->name();
+ assert(name.length() > 0);
+
+ auto g = pred_perm->graph();
auto new_const_node = g->nodes()->create<luci::CircleConst>();
new_const_node->dtype(loco::DataType::S32);
new_const_node->rank(1);
- new_const_node->dim(0) = main_perm->dim(0);
- new_const_node->size<loco::DataType::S32>(main_perm->dim(0).value());
+ new_const_node->dim(0) = pred_perm->dim(0);
+ new_const_node->size<loco::DataType::S32>(pred_perm->dim(0).value());
new_const_node->shape_status(luci::ShapeStatus::VALID);
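+    // Compose the two permutations into one: applying pred_perm and then
+    // target_perm equals a single Transpose with perm[i] = pred_perm[target_perm[i]].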
- for (uint32_t i = 0; i < main_perm->size<loco::DataType::S32>(); i++)
+ for (uint32_t i = 0; i < pred_perm->size<loco::DataType::S32>(); i++)
{
new_const_node->at<loco::DataType::S32>(i) =
- pred_perm->at<loco::DataType::S32>(main_perm->at<loco::DataType::S32>(i));
+      pred_perm->at<loco::DataType::S32>(target_perm->at<loco::DataType::S32>(i));
}
- pred_node->perm(new_const_node);
- replace(node).with(pred_node);
+ new_const_node->name(name + "/Transpose/perm");
+
+ // Create New Transpose Node
+ auto new_transpose_node = g->nodes()->create<luci::CircleTranspose>();
+ new_transpose_node->dtype(target_node->dtype());
+ new_transpose_node->a(main_node);
+ new_transpose_node->perm(new_const_node);
+ new_transpose_node->name(name + "/Transpose");
+ luci::add_origin(new_transpose_node, luci::get_origin(target_node));
+
+ replace(target_node).with(new_transpose_node);
}
return true;
}
@@ -84,41 +92,36 @@ bool remove_consecutive_transpose_function(luci::CircleNode *node)
namespace luci
{
+
/**
* BEFORE
* |
* [CircleNode] [CircleConst]
- * (main_node) (main_perm)
- * \ /
+ * | (pred_perm)
+ * \ /
* [CircleTranspose] [CircleConst]
- * (pred_node) (pred_perm)
+ * (pred_node) (target_perm)
* \ /
* [CircleTranspose]
* (target_node)
* |
*
* AFTER
- * <Optional Case>
- *
- * | | |
- * [CircleNode] [CircleConst] |
- * (main_node) (new_const_node) |
- * \ / or [CircleNode]
- * [CircleTranspose] (main_node)
- * (pred_node) |
+ * | |
+ * [CircleNode] [CircleConst](new) |
+ * \ / or [CircleNode]
+ * [CircleTranspose](new) |
* | |
- *
*/
bool RemoveRedundantTransposePass::run(loco::Graph *g)
{
bool changed = false;
for (auto node : loco::active_nodes(loco::output_nodes(g)))
{
- auto circle_node = loco::must_cast<luci::CircleNode *>(node);
- if (remove_consecutive_transpose_function(circle_node))
+ if (auto transpose = dynamic_cast<luci::CircleTranspose *>(node))
{
- changed = true;
- break;
+ if (remove_consecutive_transpose_function(transpose))
+ changed = true;
}
}
return changed;
diff --git a/compiler/luci/pass/src/RemoveRedundantTransposePass.test.cpp b/compiler/luci/pass/src/RemoveRedundantTransposePass.test.cpp
new file mode 100644
index 000000000..e80623499
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveRedundantTransposePass.test.cpp
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci/Pass/RemoveRedundantTransposePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <vector>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+void setValue(luci::CircleConst *node, const std::vector<int> &v)
+{
+ node->dtype(loco::DataType::S32);
+ node->size<loco::DataType::S32>(v.size());
+ node->rank(1);
+ node->dim(0).set(v.size());
+  for (size_t i = 0; i < v.size(); ++i)
+ {
+ node->at<loco::DataType::S32>(i) = v[i];
+ }
+}
+
+/**
+ * Removal of consecutive Transposes
+ *
+ * Type1: Remove both Transposes
+ * BEFORE
+ * |
+ * [CircleNode] [CircleConst]
+ * \ /
+ * [CircleTranspose] [CircleConst]
+ * \ /
+ * [CircleTranspose]
+ * |
+ *
+ * AFTER
+ * |
+ * [CircleNode]
+ * |
+ *
+ * --------------------------------------------
+ *
+ * Type2: Merge into one Transpose
+ * BEFORE
+ * |
+ * [CircleNode] [CircleConst]
+ * \ /
+ * [CircleTranspose] [CircleConst]
+ * \ /
+ * [CircleTranspose]
+ * |
+ *
+ * AFTER
+ * |
+ * [CircleNode] [CircleConst]
+ * \ /
+ * [CircleTranspose]
+ * |
+ *
+ */
+void create_redundunt_transpose(loco::Graph *g, const std::vector<int32_t> &perm1,
+ const std::vector<int32_t> &perm2)
+{
+ assert(g);
+
+ auto input = g->nodes()->create<luci::CircleInput>();
+ auto graph_input = g->inputs()->create();
+ input->index(graph_input->index());
+ input->name("input");
+
+ // Create perm1
+ auto perm1_node = g->nodes()->create<luci::CircleConst>();
+ setValue(perm1_node, perm1);
+ perm1_node->name("perm1_node");
+
+ auto transpose1 = g->nodes()->create<luci::CircleTranspose>();
+ transpose1->dtype(loco::DataType::FLOAT32);
+ transpose1->a(input);
+ transpose1->perm(perm1_node);
+ transpose1->name("transpose1");
+
+ // Create perm2
+ auto perm2_node = g->nodes()->create<luci::CircleConst>();
+ setValue(perm2_node, perm2);
+ perm2_node->name("perm2_node");
+
+ auto transpose2 = g->nodes()->create<luci::CircleTranspose>();
+ transpose2->dtype(loco::DataType::FLOAT32);
+ transpose2->a(transpose1);
+ transpose2->perm(perm2_node);
+ transpose2->name("transpose2");
+
+ // Output
+ auto output = g->nodes()->create<luci::CircleOutput>();
+ output->from(transpose2);
+ auto graph_output = g->outputs()->create();
+ output->index(graph_output->index());
+ output->name("output");
+}
+
+/**
+ * Removal of consecutive Transposes with branching
+ *
+ * BEFORE
+ * |
+ * [CircleNode] [CircleConst]
+ * \ /
+ * [CircleConst] [CircleTranspose] [CircleConst]
+ * \ / \ /
+ * [CircleTranspose] [CircleTranspose]
+ * | |
+ * [CircleNode] [CircleNode]
+ * | |
+ *
+ * AFTER
+ * Type 1: Remove all Transpose
+ * |
+ * [CircleNode]
+ * / \
+ * [CircleNode] [CircleNode]
+ * | |
+ *
+ *  Type 2: Remove both on one side and create a new Transpose for the other side
+ * |
+ * [CircleNode] [CircleConst](new)
+ * / \ /
+ * / [CircleTranspose](new)
+ * | |
+ * [CircleNode] [CircleNode]
+ * | |
+ */
+void create_redundunt_transpose_with_branch(loco::Graph *g, const std::vector<int32_t> &perm1,
+ const std::vector<int32_t> &perm2,
+ const std::vector<int32_t> &perm3)
+{
+ assert(g);
+
+ auto input = g->nodes()->create<luci::CircleInput>();
+ auto graph_input = g->inputs()->create();
+ input->dtype(loco::DataType::FLOAT32);
+ input->index(graph_input->index());
+ input->name("input");
+ graph_input->dtype(loco::DataType::FLOAT32);
+
+ graph_input->shape({4, 4, 4, 4});
+ input->shape({4, 4, 4, 4});
+
+ // Create perm1
+ auto perm1_node = g->nodes()->create<luci::CircleConst>();
+ setValue(perm1_node, perm1);
+ perm1_node->name("perm1_node");
+
+ auto transpose1 = g->nodes()->create<luci::CircleTranspose>();
+ transpose1->dtype(loco::DataType::FLOAT32);
+ transpose1->a(input);
+ transpose1->perm(perm1_node);
+ transpose1->name("transpose1");
+
+ // Create perm2
+ auto perm2_node = g->nodes()->create<luci::CircleConst>();
+ setValue(perm2_node, perm2);
+ perm2_node->name("perm2_node");
+
+ auto transpose2 = g->nodes()->create<luci::CircleTranspose>();
+ transpose2->dtype(loco::DataType::FLOAT32);
+ transpose2->a(transpose1);
+ transpose2->perm(perm2_node);
+ transpose2->name("transpose2");
+
+  // Create perm3
+ auto perm3_node = g->nodes()->create<luci::CircleConst>();
+ setValue(perm3_node, perm3);
+ perm3_node->name("perm3_node");
+
+ auto transpose3 = g->nodes()->create<luci::CircleTranspose>();
+ transpose3->dtype(loco::DataType::FLOAT32);
+ transpose3->a(transpose1);
+ transpose3->perm(perm3_node);
+ transpose3->name("transpose3");
+
+ // Output
+ auto output1 = g->nodes()->create<luci::CircleOutput>();
+ output1->from(transpose2);
+ output1->name("output1");
+ auto output2 = g->nodes()->create<luci::CircleOutput>();
+ output2->from(transpose3);
+ output2->name("output2");
+ auto graph_output1 = g->outputs()->create();
+ output1->index(graph_output1->index());
+ auto graph_output2 = g->outputs()->create();
+ output2->index(graph_output2->index());
+ output1->dtype(loco::DataType::FLOAT32);
+ output2->dtype(loco::DataType::FLOAT32);
+ graph_output1->dtype(loco::DataType::FLOAT32);
+ graph_output2->dtype(loco::DataType::FLOAT32);
+ output1->shape({4, 4, 4, 4});
+ output2->shape({4, 4, 4, 4});
+ graph_output1->shape({4, 4, 4, 4});
+ graph_output2->shape({4, 4, 4, 4});
+}
+
+} // namespace
+
+TEST(RemoveRedundantTransposePassTest, name)
+{
+ luci::RemoveRedundantTransposePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(RemoveRedundantTransposePass, remove_consecutive_transpose_function_type1)
+{
+ auto graph = loco::make_graph();
+ create_redundunt_transpose(graph.get(), {1, 0, 2, 3}, {1, 0, 2, 3});
+
+ luci::RemoveRedundantTransposePass pass;
+ while (pass.run(graph.get()))
+ ;
+ luci::CircleTranspose *transpose_node = nullptr;
+ for (auto node : loco::active_nodes(loco::output_nodes(graph.get())))
+ {
+ auto trans = dynamic_cast<luci::CircleTranspose *>(node);
+ if (not trans)
+ continue;
+ transpose_node = trans;
+ break;
+ }
+ // No transpose node is in graph.
+ ASSERT_EQ(nullptr, transpose_node);
+}
+
+TEST(RemoveRedundantTransposePass, remove_consecutive_transpose_function_type2)
+{
+ auto graph = loco::make_graph();
+ create_redundunt_transpose(graph.get(), {0, 1, 3, 2}, {1, 0, 2, 3});
+
+ luci::RemoveRedundantTransposePass pass;
+ while (pass.run(graph.get()))
+ ;
+ luci::CircleTranspose *transpose_node = nullptr;
+ for (auto node : loco::active_nodes(loco::output_nodes(graph.get())))
+ {
+ auto trans = dynamic_cast<luci::CircleTranspose *>(node);
+ if (not trans)
+ continue;
+ transpose_node = trans;
+ break;
+ }
+ // Just one transpose node, with updated perm constant.
+ ASSERT_NE(nullptr, transpose_node);
+ auto perm = loco::must_cast<luci::CircleConst *>(transpose_node->perm());
+ ASSERT_EQ(1, perm->at<loco::DataType::S32>(0));
+ ASSERT_EQ(0, perm->at<loco::DataType::S32>(1));
+ ASSERT_EQ(3, perm->at<loco::DataType::S32>(2));
+ ASSERT_EQ(2, perm->at<loco::DataType::S32>(3));
+}
+
+/**
+ * @brief Test case where the output of the first Transpose becomes the input of more
+ *        than one operation.
+ */
+TEST(RemoveRedundantTransposePass, remove_consecutive_transpose_function_with_branch_remove_case)
+{
+ auto graph = loco::make_graph();
+ create_redundunt_transpose_with_branch(graph.get(), {1, 0, 2, 3}, {1, 0, 2, 3}, {1, 0, 2, 3});
+
+ luci::RemoveRedundantTransposePass pass;
+ while (pass.run(graph.get()))
+ ;
+ luci::CircleTranspose *transpose_node = nullptr;
+ for (auto node : loco::active_nodes(loco::output_nodes(graph.get())))
+ {
+ auto trans = dynamic_cast<luci::CircleTranspose *>(node);
+ if (not trans)
+ continue;
+ transpose_node = trans;
+ break;
+ }
+ // No transpose node is in graph.
+ ASSERT_EQ(nullptr, transpose_node);
+}
+
+TEST(RemoveRedundantTransposePass, remove_consecutive_transpose_function_with_branch_leave_one)
+{
+ auto graph = loco::make_graph();
+ create_redundunt_transpose_with_branch(graph.get(), {1, 0, 2, 3}, {1, 0, 2, 3}, {0, 1, 3, 2});
+
+ luci::RemoveRedundantTransposePass pass;
+ while (pass.run(graph.get()))
+ ;
+ luci::CircleTranspose *transpose_node = nullptr;
+ for (auto node : loco::active_nodes(loco::output_nodes(graph.get())))
+ {
+ auto trans = dynamic_cast<luci::CircleTranspose *>(node);
+ if (not trans)
+ continue;
+ transpose_node = trans;
+ break;
+ }
+ ASSERT_NE(nullptr, transpose_node);
+ auto perm = loco::must_cast<luci::CircleConst *>(transpose_node->perm());
+ ASSERT_EQ(1, perm->at<loco::DataType::S32>(0));
+ ASSERT_EQ(0, perm->at<loco::DataType::S32>(1));
+ ASSERT_EQ(3, perm->at<loco::DataType::S32>(2));
+ ASSERT_EQ(2, perm->at<loco::DataType::S32>(3));
+}
diff --git a/compiler/luci/pass/src/RemoveUnnecessaryReshapePass.cpp b/compiler/luci/pass/src/RemoveUnnecessaryReshapePass.cpp
new file mode 100644
index 000000000..3f0c4ee82
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessaryReshapePass.cpp
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveUnnecessaryReshapePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+
+bool remove_no_effect_reshape(luci::CircleNode *node)
+{
+ auto target_node = dynamic_cast<luci::CircleReshape *>(node);
+ if (target_node == nullptr)
+ return false;
+
+ auto new_shape = dynamic_cast<luci::CircleConst *>(target_node->shape());
+ if (new_shape == nullptr)
+ return false;
+
+ // Compare updated shape and input shape.
+ auto input_node = loco::must_cast<luci::CircleNode *>(target_node->tensor());
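+  // Equal rank is required; a rank-changing Reshape is never treated as a no-op.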
+ if (input_node->rank() != new_shape->dim(0).value())
+ return false;
+ for (uint32_t i = 0; i < input_node->rank(); i++)
+ {
+    // A -1 entry in the updated shape matches any dimension, so skip it.
+    // TODO Check that the updated shape contains -1 at most once.
+    if (new_shape->at<loco::DataType::S32>(i) == -1)
+      continue;
+    // If the input shape is dynamic, the Reshape cannot be removed.
+    if (!input_node->dim(i).known())
+      return false;
+    // If the input shape and the updated shape differ, it cannot be removed either.
+ if (input_node->dim(i).value() != static_cast<uint32_t>(new_shape->at<loco::DataType::S32>(i)))
+ return false;
+ }
+
+ replace(target_node).with(input_node);
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool RemoveUnnecessaryReshapePass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+ if (remove_no_effect_reshape(circle_node))
+ {
+ changed = true;
+ }
+ }
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/RemoveUnnecessaryReshapePass.test.cpp b/compiler/luci/pass/src/RemoveUnnecessaryReshapePass.test.cpp
new file mode 100644
index 000000000..9d2e758b4
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessaryReshapePass.test.cpp
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveUnnecessaryReshapePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <luci/test/TestIOGraph.h>
+#include "test/TestFirstNode.h"
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class ReshapeGraphlet
+{
+public:
+ ReshapeGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 input_shape, bool remove)
+ {
+ std::vector<uint32_t> shape_vector{input_shape};
+
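+    // remove == true  : shape const repeats the input shape, so the Reshape is a no-op
+    // remove == false : shape const is a single -1, so the Reshape must be kept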
+ auto dim0_val = remove ? shape_vector.size() : 1;
+ _reshape_shape = g->nodes()->create<luci::CircleConst>();
+ _reshape_shape->rank(1);
+ _reshape_shape->dim(0).set(dim0_val);
+ _reshape_shape->shape_status(luci::ShapeStatus::VALID);
+ _reshape_shape->dtype(loco::DataType::S32);
+
+ _reshape_shape->size<loco::DataType::S32>(dim0_val);
+ for (uint32_t i = 0; i < dim0_val; i++)
+ {
+ if (remove)
+ _reshape_shape->at<loco::DataType::S32>(i) = static_cast<int32_t>(shape_vector.at(i));
+ else
+ _reshape_shape->at<loco::DataType::S32>(i) = -1;
+ }
+ _reshape_shape->name("reshape_shape");
+
+ // Reshape create
+ auto newshape_rank = remove ? shape_vector.size() : 1;
+ _reshape = g->nodes()->create<luci::CircleReshape>();
+ _reshape->newShape()->rank(newshape_rank);
+ for (uint32_t i = 0; i < newshape_rank; i++)
+ {
+ if (remove)
+ _reshape->newShape()->dim(i) = static_cast<int32_t>(shape_vector.at(i));
+ else
+ _reshape->newShape()->dim(i) = -1;
+ }
+ _reshape->name("reshape");
+ }
+
+protected:
+ luci::CircleReshape *_reshape = nullptr;
+ luci::CircleConst *_reshape_shape = nullptr;
+};
+
+class ReshapeGraph : public TestIOGraph, public ReshapeGraphlet
+{
+public:
+ ReshapeGraph() = default;
+
+public:
+ void init(const ShapeU32 shape, bool remove)
+ {
+ TestIOGraph::init(shape, shape);
+ ReshapeGraphlet::init(g(), shape, remove);
+
+ // connect graph
+ _reshape->tensor(input());
+ _reshape->shape(_reshape_shape);
+
+ output()->from(_reshape);
+ }
+};
+
+// TODO use ::testing::Test
+
+} // namespace
+
+TEST(RemoveUnnecessaryReshapePassTest, name)
+{
+ luci::RemoveUnnecessaryReshapePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(RemoveUnnecessaryReshapePass, removed)
+{
+ ReshapeGraph g;
+
+ g.init({1, 2, 3, 4}, true);
+
+ // confirm graph has Reshape
+ auto reshape_node = luci::test::first_node<luci::CircleReshape>(g.g());
+ ASSERT_NE(nullptr, reshape_node);
+ luci::RemoveUnnecessaryReshapePass pass;
+ while (pass.run(g.g()))
+ ;
+
+ // check Reshape is removed
+ reshape_node = luci::test::first_node<luci::CircleReshape>(g.g());
+ ASSERT_EQ(nullptr, reshape_node);
+}
+
+TEST(RemoveUnnecessaryReshapePass, not_removed_NEG)
+{
+ ReshapeGraph g;
+
+ g.init({1, 2, 3, 4}, false);
+
+ // confirm graph has Reshape
+ auto reshape_node = luci::test::first_node<luci::CircleReshape>(g.g());
+ ASSERT_NE(nullptr, reshape_node);
+ luci::RemoveUnnecessaryReshapePass pass;
+ while (pass.run(g.g()))
+ ;
+
+ // check Reshape is NOT removed
+ reshape_node = luci::test::first_node<luci::CircleReshape>(g.g());
+ ASSERT_NE(nullptr, reshape_node);
+}
diff --git a/compiler/luci/pass/src/RemoveUnnecessarySlicePass.cpp b/compiler/luci/pass/src/RemoveUnnecessarySlicePass.cpp
new file mode 100644
index 000000000..0720813cd
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessarySlicePass.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveUnnecessarySlicePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+
+/**
+ * @brief Return the value at position idx in a CircleConst as int64.
+ * @details Supports S32 and S64 constants. For Slice, each begin value must be
+ *          greater than or equal to 0 and each size value greater than or
+ *          equal to -1.
+ */
+int64_t value_from_circle_const(const luci::CircleConst *node, uint32_t idx)
+{
+ assert(node->rank() == 1 && node->dim(0).value() > idx);
+ assert(node->dtype() == loco::DataType::S64 || node->dtype() == loco::DataType::S32);
+
+ if (node->dtype() == loco::DataType::S64)
+ return node->at<loco::DataType::S64>(idx);
+ return static_cast<int64_t>(node->at<loco::DataType::S32>(idx));
+}
+
+bool remove_no_effect_slice(luci::CircleNode *node)
+{
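+  // begin and size must be compile-time constants to prove the Slice a no-op.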
+ auto target_node = dynamic_cast<luci::CircleSlice *>(node);
+ if (target_node == nullptr)
+ return false;
+
+ auto begin_const = dynamic_cast<luci::CircleConst *>(target_node->begin());
+ if (begin_const == nullptr)
+ return false;
+
+ auto size_const = dynamic_cast<luci::CircleConst *>(target_node->size());
+ if (size_const == nullptr)
+ return false;
+
+ // Check input output shape.
+ auto input_node = loco::must_cast<luci::CircleNode *>(target_node->input());
+ for (uint32_t i = 0; i < input_node->rank(); i++)
+ {
+ if (value_from_circle_const(begin_const, i) != 0)
+ return false;
+
+    int64_t size_value = value_from_circle_const(size_const, i);
+    if (size_value == -1)
+      continue;
+
+    // A dynamic dimension must be rejected before its value is read.
+    if (!input_node->dim(i).known())
+      return false;
+    if (size_value != static_cast<int64_t>(input_node->dim(i).value()))
+      return false;
+ }
+ replace(target_node).with(input_node);
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+/**
+ * BEFORE
+ *
+ * [CircleNode]
+ * |
+ * [CircleSlice]
+ * |
+ * [CircleNode]
+ *
+ * AFTER
+ *
+ * [CircleNode]
+ * |
+ * [CircleNode]
+ *
+ * Slice OP has no effect if,
+ * 1. Static Shape : begin_const[idx] is 0 AND size_const[idx] is (-1 OR input_dimension[idx])
+ * 2. Dynamic Shape : begin_const[idx] is 0 AND size_const[idx] is -1
+ */
+bool RemoveUnnecessarySlicePass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ auto circle_node = loco::must_cast<luci::CircleNode *>(node);
+ if (remove_no_effect_slice(circle_node))
+ {
+ changed = true;
+ }
+ }
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/RemoveUnnecessarySlicePass.test.cpp b/compiler/luci/pass/src/RemoveUnnecessarySlicePass.test.cpp
new file mode 100644
index 000000000..80921a93a
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessarySlicePass.test.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci/Pass/RemoveUnnecessarySlicePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <luci/test/TestIOGraph.h>
+#include "test/TestFirstNode.h"
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class SliceGraphlet
+{
+public:
+ SliceGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 input_shape, bool remove)
+ {
+ // Begin Create.
+ _begin = g->nodes()->create<luci::CircleConst>();
+ _begin->rank(1);
+ _begin->dim(0).set(input_shape.size());
+ _begin->shape_status(luci::ShapeStatus::VALID);
+ _begin->dtype(loco::DataType::S32);
+ _begin->size<loco::DataType::S32>(input_shape.size());
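+    // begin == 0 on every axis makes the Slice removable; begin == 1 keeps it.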
+    for (size_t i = 0; i < input_shape.size(); ++i)
+ _begin->at<loco::DataType::S32>(i) = remove ? 0 : 1;
+ _begin->name("begin");
+
+ // Size Create.
+ _size = g->nodes()->create<luci::CircleConst>();
+ _size->rank(1);
+ _size->dim(0).set(input_shape.size());
+ _size->shape_status(luci::ShapeStatus::VALID);
+ _size->dtype(loco::DataType::S32);
+ _size->size<loco::DataType::S32>(input_shape.size());
+    for (size_t i = 0; i < input_shape.size(); ++i)
+ _size->at<loco::DataType::S32>(i) = -1;
+ _size->name("size");
+
+ // Slice Node create.
+ _slice = g->nodes()->create<luci::CircleSlice>();
+ _slice->dtype(loco::DataType::S32);
+ _slice->name("slice");
+ }
+
+protected:
+ luci::CircleSlice *_slice = nullptr;
+ luci::CircleConst *_begin = nullptr;
+ luci::CircleConst *_size = nullptr;
+};
+
+class SliceGraph : public TestIOGraph, public SliceGraphlet
+{
+public:
+ SliceGraph() = default;
+
+public:
+ void init(const ShapeU32 shape, bool remove)
+ {
+ TestIOGraph::init(shape, shape);
+ SliceGraphlet::init(g(), shape, remove);
+
+ _slice->input(input());
+ _slice->begin(_begin);
+ _slice->size(_size);
+
+ output()->from(_slice);
+ }
+};
+
+} // namespace
+
+TEST(RemoveUnnecessarySlicePass, name)
+{
+ luci::RemoveUnnecessarySlicePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(RemoveUnnecessarySlicePass, removed)
+{
+ SliceGraph g;
+
+ g.init({2, 4, 2, 3}, true);
+
+ // confirm graph has Slice
+ auto slice_node = luci::test::first_node<luci::CircleSlice>(g.g());
+ ASSERT_NE(nullptr, slice_node);
+ luci::RemoveUnnecessarySlicePass pass;
+ while (pass.run(g.g()))
+ ;
+
+ // check Slice is removed
+ slice_node = luci::test::first_node<luci::CircleSlice>(g.g());
+ ASSERT_EQ(nullptr, slice_node);
+}
+
+TEST(RemoveUnnecessarySlicePass, not_removed_NEG)
+{
+ SliceGraph g;
+
+ g.init({2, 4, 2, 3}, false);
+
+ // confirm graph has Slice
+ auto slice_node = luci::test::first_node<luci::CircleSlice>(g.g());
+ ASSERT_NE(nullptr, slice_node);
+ luci::RemoveUnnecessarySlicePass pass;
+ while (pass.run(g.g()))
+ ;
+
+ // check Slice is NOT removed
+ slice_node = luci::test::first_node<luci::CircleSlice>(g.g());
+ ASSERT_NE(nullptr, slice_node);
+}
diff --git a/compiler/luci/pass/src/ShapeSignatureInferencePass.cpp b/compiler/luci/pass/src/RemoveUnnecessarySplitPass.cpp
index 115b77a96..3243f6213 100644
--- a/compiler/luci/pass/src/ShapeSignatureInferencePass.cpp
+++ b/compiler/luci/pass/src/RemoveUnnecessarySplitPass.cpp
@@ -14,49 +14,50 @@
* limitations under the License.
*/
-#include "luci/Pass/ShapeSignatureInferencePass.h"
+#include "luci/Pass/RemoveUnnecessarySplitPass.h"
-#include <luci/IR/CircleShapeSignature.h>
-#include <luci/Service/CircleShapeSignatureInference.h>
+#include <luci/IR/CircleNodes.h>
-#include <loco.h>
-
-namespace luci
+namespace
{
-
-bool ShapeSignatureInferencePass::run(luci::Module *m)
+bool remove_unnecessary_split(luci::CircleNode *node)
{
- bool changed = false;
+ auto target_node = dynamic_cast<luci::CircleSplitOut *>(node);
+ if (target_node == nullptr)
+ return false;
+
+ auto split_node = dynamic_cast<luci::CircleSplit *>(target_node->input());
+ if (split_node == nullptr)
+ return false;
- for (size_t g = 0; g < m->size(); ++g)
+ if (loco::succs(split_node).size() != 1)
+ return false;
+
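+  // A Split with num_split == 1 is an identity: forward its input directly.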
+ if (split_node->num_split() == 1)
{
- if (run(m->graph(g)))
- changed = true;
+ auto input_node = loco::must_cast<luci::CircleNode *>(split_node->input());
+ replace(target_node).with(input_node);
+ return true;
}
-
- return changed;
+ return false;
}
-bool ShapeSignatureInferencePass::run(loco::Graph *g)
+} // namespace
+
+namespace luci
{
- luci::ssinf::Rule signature_inference_rule;
- bool changed = false;
- for (auto node : loco::postorder_traversal(loco::output_nodes(g)))
+bool RemoveUnnecessarySplitPass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
{
- luci::ShapeSignature shape_signature;
-
auto circle_node = loco::must_cast<luci::CircleNode *>(node);
- if (signature_inference_rule.infer(circle_node, shape_signature))
+ if (remove_unnecessary_split(circle_node))
{
- if (!(circle_node->shape_signature() == shape_signature))
- {
- circle_node->shape_signature(shape_signature);
- changed = true;
- }
+ changed = true;
}
}
-
return changed;
}
diff --git a/compiler/luci/pass/src/RemoveUnnecessarySplitPass.test.cpp b/compiler/luci/pass/src/RemoveUnnecessarySplitPass.test.cpp
new file mode 100644
index 000000000..f292b5357
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessarySplitPass.test.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveUnnecessarySplitPass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <luci/test/TestIOGraph.h>
+#include "test/TestFirstNode.h"
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class SplitGraphlet
+{
+public:
+ SplitGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, uint32_t nout)
+ {
+ assert(nout == 1 || nout == 2);
+
+ _dim = g->nodes()->create<luci::CircleConst>();
+ set_shape_vector(_dim, {0});
+ _dim->name("dim");
+
+ _split = g->nodes()->create<luci::CircleSplit>();
+ _split->num_split(nout);
+ _split->name("split");
+
+ _split_out_0 = g->nodes()->create<luci::CircleSplitOut>();
+ _split_out_0->index(0);
+ _split_out_0->name("split_out_0");
+
+ if (nout == 2)
+ {
+ _split_out_1 = g->nodes()->create<luci::CircleSplitOut>();
+ _split_out_1->index(1);
+ _split_out_1->name("split_out_1");
+ }
+ }
+
+protected:
+ luci::CircleSplit *_split = nullptr;
+ luci::CircleConst *_dim = nullptr;
+ luci::CircleSplitOut *_split_out_0 = nullptr;
+ luci::CircleSplitOut *_split_out_1 = nullptr;
+};
+
+class SplitOneGraph : public TestIGraphlet, public TestOGraphlet, public SplitGraphlet
+{
+public:
+ SplitOneGraph() = default;
+
+public:
+ void init()
+ {
+ TestIGraphlet::init(g(), {1});
+ TestOGraphlet::init(g(), {1});
+ SplitGraphlet::init(g(), 1);
+
+ _split->input(input());
+ _split->split_dim(_dim);
+ _split_out_0->input(_split);
+
+ output()->from(_split_out_0);
+ }
+};
+
+class SplitTwoGraph : public TestIGraphlet, public TestOsGraphlet<2>, public SplitGraphlet
+{
+public:
+ SplitTwoGraph() = default;
+
+public:
+ void init()
+ {
+ TestIGraphlet::init(g(), {1});
+ TestOsGraphlet<2>::init(g(), {{1}, {1}});
+ SplitGraphlet::init(g(), 2);
+
+ _split->input(input());
+ _split->split_dim(_dim);
+ _split_out_0->input(_split);
+ _split_out_1->input(_split);
+
+ output(0)->from(_split_out_0);
+ output(1)->from(_split_out_1);
+ }
+};
+
+// TODO use ::testing::Test
+
+} // namespace
+
+TEST(RemoveUnnecessarySplitPass, name)
+{
+ luci::RemoveUnnecessarySplitPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST(RemoveUnnecessarySplitPass, create_unnecessary_split)
+{
+ SplitOneGraph g;
+
+ g.init();
+
+ luci::RemoveUnnecessarySplitPass pass;
+ while (pass.run(g.g()))
+ ;
+
+ auto split_node = luci::test::first_node<luci::CircleSplit>(g.g());
+ // No Split node is in graph.
+ ASSERT_EQ(nullptr, split_node);
+}
+
+TEST(RemoveUnnecessarySplitPass, create_unnecessary_split_NEG)
+{
+ SplitTwoGraph g;
+
+ g.init();
+
+ luci::RemoveUnnecessarySplitPass pass;
+ while (pass.run(g.g()))
+ ;
+
+ auto split_node = luci::test::first_node<luci::CircleSplit>(g.g());
+ // Split node is in graph.
+ ASSERT_NE(nullptr, split_node);
+}
diff --git a/compiler/luci/pass/src/RemoveUnnecessaryStridedSlicePass.cpp b/compiler/luci/pass/src/RemoveUnnecessaryStridedSlicePass.cpp
new file mode 100644
index 000000000..22b1aa64f
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessaryStridedSlicePass.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RemoveUnnecessaryStridedSlicePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+namespace
+{
+
+/**
+ * @brief Return the value stored in a CircleConst.
+ * @details Return the value at the given position of the CircleConst, widened to int64.
+ */
+int64_t value_from_circle_const(const luci::CircleConst *node, uint32_t idx)
+{
+ assert(node->rank() == 1 && node->dim(0).value() > idx);
+ assert(node->dtype() == loco::DataType::S64 || node->dtype() == loco::DataType::S32);
+
+ if (node->dtype() == loco::DataType::S64)
+ return node->at<loco::DataType::S64>(idx);
+ return static_cast<int64_t>(node->at<loco::DataType::S32>(idx));
+}
+
+bool remove_no_effect_strided_slice(luci::CircleStridedSlice *target_node)
+{
+ auto begin_const = dynamic_cast<luci::CircleConst *>(target_node->begin());
+ if (begin_const == nullptr)
+ return false;
+
+ auto strides_const = dynamic_cast<luci::CircleConst *>(target_node->strides());
+ if (strides_const == nullptr)
+ return false;
+
+ auto end_const = dynamic_cast<luci::CircleConst *>(target_node->end());
+ if (end_const == nullptr)
+ return false;
+
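+  // The slice is a no-op only if it copies every axis in full:
+  // begin == 0, stride == 1, and end == -1 or the full (known) dimension.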
+ auto input_node = loco::must_cast<luci::CircleNode *>(target_node->input());
+ for (uint32_t i = 0; i < input_node->rank(); i++)
+ {
+ if (value_from_circle_const(begin_const, i) != 0)
+ return false;
+
+ int64_t strides_value = value_from_circle_const(strides_const, i);
+ if (strides_value != 1)
+ return false;
+
+ int64_t end_value = value_from_circle_const(end_const, i);
+ if (end_value == -1)
+ continue;
+
+    // the dimension must be known before its value can be compared
+    if (!input_node->dim(i).known())
+      return false;
+
+    if (end_value != input_node->dim(i).value())
+      return false;
+ }
+
+ /**
+   * After the shape checks, also require the mask attributes to be zero,
+   * so a StridedSlice that inserts or removes axes is not treated as a no-op.
+ */
+ if (target_node->new_axis_mask() != 0 || target_node->shrink_axis_mask() != 0)
+ return false;
+
+ replace(target_node).with(input_node);
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+/**
+ * BEFORE
+ *
+ * [CircleNode]
+ * |
+ * [CircleStridedSlice]
+ * |
+ * [CircleNode]
+ *
+ * AFTER
+ *
+ * [CircleNode]
+ * |
+ * [CircleNode] [CircleStridedSlice]
+ *
+ * StridedSlice OP has no effect (and is removed) when, for every index idx,
+ * 1. begin_const[idx] is 0
+ * 2. strides_const[idx] is 1
+ * 3. end_const[idx] is -1, or equals the known input_dimension[idx]
+ * and both new_axis_mask and shrink_axis_mask are 0.
+ *
+ * e.g., begin=[0,0], strides=[1,1], end=[-1,4] on a 2x4 input copies the
+ * whole tensor, so the StridedSlice can be removed.
+ */
+bool RemoveUnnecessaryStridedSlicePass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ auto target_node = dynamic_cast<luci::CircleStridedSlice *>(node);
+ if (target_node != nullptr)
+ if (remove_no_effect_strided_slice(target_node))
+ changed = true;
+ }
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/RemoveUnnecessaryStridedSlicePass.test.cpp b/compiler/luci/pass/src/RemoveUnnecessaryStridedSlicePass.test.cpp
new file mode 100644
index 000000000..7d611c864
--- /dev/null
+++ b/compiler/luci/pass/src/RemoveUnnecessaryStridedSlicePass.test.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci/Pass/RemoveUnnecessaryStridedSlicePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <luci/test/TestIOGraph.h>
+#include "test/TestFirstNode.h"
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class StridedSliceGraphlet
+{
+public:
+ StridedSliceGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 input_shape, bool remove)
+ {
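+    // remove == true builds begin=0 / strides=1 / end=shape, a no-op slice the pass removes;
+    // remove == false builds begin=1 / strides=-1 / end=-1, which the pass must keep.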
+    // Create begin const
+ _begin = g->nodes()->create<luci::CircleConst>();
+ _begin->rank(1);
+ _begin->dim(0).set(input_shape.size());
+ _begin->shape_status(luci::ShapeStatus::VALID);
+ _begin->dtype(loco::DataType::S32);
+ _begin->size<loco::DataType::S32>(input_shape.size());
+ for (int i = 0; i < input_shape.size(); ++i)
+ {
+ _begin->at<loco::DataType::S32>(i) = remove ? 0 : 1;
+ }
+
+    // Create strides const
+ _strides = g->nodes()->create<luci::CircleConst>();
+ _strides->rank(1);
+ _strides->dim(0).set(input_shape.size());
+ _strides->shape_status(luci::ShapeStatus::VALID);
+ _strides->dtype(loco::DataType::S32);
+ _strides->size<loco::DataType::S32>(input_shape.size());
+ for (int i = 0; i < input_shape.size(); ++i)
+ {
+ _strides->at<loco::DataType::S32>(i) = remove ? 1 : -1;
+ }
+
+ std::vector<uint32_t> shape_vector{input_shape};
+
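+    // Create end const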
+ _end = g->nodes()->create<luci::CircleConst>();
+ _end->rank(1);
+ _end->dim(0).set(input_shape.size());
+ _end->shape_status(luci::ShapeStatus::VALID);
+ _end->dtype(loco::DataType::S32);
+ _end->size<loco::DataType::S32>(input_shape.size());
+ for (int i = 0; i < input_shape.size(); ++i)
+ {
+ if (remove)
+ _end->at<loco::DataType::S32>(i) = static_cast<int32_t>(shape_vector.at(i));
+ else
+ _end->at<loco::DataType::S32>(i) = -1;
+ }
+
+    // Create StridedSlice node
+ _strided_slice = g->nodes()->create<luci::CircleStridedSlice>();
+ _strided_slice->dtype(loco::DataType::S32);
+ }
+
+protected:
+ luci::CircleStridedSlice *_strided_slice = nullptr;
+ luci::CircleConst *_begin = nullptr;
+ luci::CircleConst *_strides = nullptr;
+ luci::CircleConst *_end = nullptr;
+};
+
+class StridedSliceGraph : public TestIOGraph, public StridedSliceGraphlet
+{
+public:
+ StridedSliceGraph() = default;
+
+public:
+ void init(const ShapeU32 shape, bool remove)
+ {
+ TestIOGraph::init(shape, shape);
+ StridedSliceGraphlet::init(g(), shape, remove);
+
+ _strided_slice->input(input());
+ _strided_slice->begin(_begin);
+ _strided_slice->strides(_strides);
+ _strided_slice->end(_end);
+
+ output()->from(_strided_slice);
+ }
+};
+
+} // namespace
+
+TEST(RemoveUnnecessaryStridedSlicePass, basic_case)
+{
+ StridedSliceGraph g;
+
+ g.init({2, 4, 2, 3}, true);
+
+ auto strided_slice_node = luci::test::first_node<luci::CircleStridedSlice>(g.g());
+ ASSERT_NE(nullptr, strided_slice_node);
+ luci::RemoveUnnecessaryStridedSlicePass pass;
+ while (pass.run(g.g()))
+ ;
+
+ strided_slice_node = luci::test::first_node<luci::CircleStridedSlice>(g.g());
+ ASSERT_EQ(nullptr, strided_slice_node);
+}
+
+TEST(RemoveUnnecessaryStridedSlicePass, basic_fail_case_NEG)
+{
+ StridedSliceGraph g;
+
+ g.init({2, 4, 2, 3}, false);
+
+ auto strided_slice_node = luci::test::first_node<luci::CircleStridedSlice>(g.g());
+ ASSERT_NE(nullptr, strided_slice_node);
+ luci::RemoveUnnecessaryStridedSlicePass pass;
+ while (pass.run(g.g()))
+ ;
+
+ strided_slice_node = luci::test::first_node<luci::CircleStridedSlice>(g.g());
+ ASSERT_NE(nullptr, strided_slice_node);
+}
diff --git a/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.cpp b/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.cpp
index 7096c2591..a0cc0194f 100644
--- a/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.cpp
+++ b/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.cpp
@@ -16,7 +16,10 @@
#include "luci/Pass/ReplaceMulAddWithDepthwiseConvPass.h"
+#include "BatchNormPatternFinder.h"
+
#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace
{
@@ -26,6 +29,9 @@ luci::CircleConst *create_weights_from_gamma(luci::CircleConst *gamma)
assert(gamma->rank() == 1);
auto channel_size = gamma->dim(0).value();
+ auto name = gamma->name();
+ assert(name.length() > 0);
+
// Channel-wise MUL is the same as DEPTHWISE_CONV2D with filter shape (1,1,1,channel_size)
auto weights = gamma->graph()->nodes()->create<luci::CircleConst>();
weights->dtype(loco::DataType::FLOAT32);
@@ -40,6 +46,7 @@ luci::CircleConst *create_weights_from_gamma(luci::CircleConst *gamma)
{
weights->at<loco::DataType::FLOAT32>(i) = gamma->at<loco::DataType::FLOAT32>(i);
}
+ weights->name(name + "_weights");
return weights;
}
@@ -49,6 +56,9 @@ luci::CircleConst *create_bias_from_beta(luci::CircleConst *beta)
assert(beta->rank() == 1);
auto channel_size = beta->dim(0).value();
+ auto name = beta->name();
+ assert(name.length() > 0);
+
// Channel-wise ADD is the same as bias (shape = (channel_size)) of DEPTHWISE_CONV2D
auto bias = beta->graph()->nodes()->create<luci::CircleConst>();
bias->dtype(loco::DataType::FLOAT32);
@@ -60,83 +70,11 @@ luci::CircleConst *create_bias_from_beta(luci::CircleConst *beta)
{
bias->at<loco::DataType::FLOAT32>(i) = beta->at<loco::DataType::FLOAT32>(i);
}
+ bias->name(name + "_bias");
return bias;
}
-bool is_batchnorm_add(const luci::CircleAdd *add, luci::CircleMul *&mul, luci::CircleConst *&beta)
-{
- auto x = loco::must_cast<luci::CircleNode *>(add->x());
- auto y = loco::must_cast<luci::CircleNode *>(add->y());
-
- luci::CircleMul *pred = nullptr;
- luci::CircleConst *constant = nullptr;
-
- if (x->opcode() == luci::CircleOpcode::CIRCLECONST && y->opcode() == luci::CircleOpcode::MUL)
- {
- pred = loco::must_cast<luci::CircleMul *>(y);
- constant = loco::must_cast<luci::CircleConst *>(x);
- }
- else if (x->opcode() == luci::CircleOpcode::MUL && y->opcode() == luci::CircleOpcode::CIRCLECONST)
- {
- pred = loco::must_cast<luci::CircleMul *>(x);
- constant = loco::must_cast<luci::CircleConst *>(y);
- }
- else
- {
- return false;
- }
-
- if (constant->rank() != 1)
- return false;
-
- auto channel_dim = constant->dim(0);
- // Assumption: Layout is channel-last
- if (!(channel_dim == add->dim(add->rank() - 1)))
- return false;
-
- mul = pred;
- beta = constant;
- return true;
-}
-
-// Check if mul is batchnorm mul
-bool is_batchnorm_mul(const luci::CircleMul *mul, luci::CircleNode *&pred_node,
- luci::CircleConst *&gamma)
-{
- auto x = dynamic_cast<luci::CircleConst *>(mul->x());
- auto y = dynamic_cast<luci::CircleConst *>(mul->y());
-
- luci::CircleNode *pred = nullptr;
- luci::CircleConst *constant = nullptr;
-
- if (x != nullptr && y == nullptr)
- {
- pred = loco::must_cast<luci::CircleNode *>(mul->y());
- constant = x;
- }
- else if (x == nullptr && y != nullptr)
- {
- pred = loco::must_cast<luci::CircleNode *>(mul->x());
- constant = y;
- }
- else
- {
- return false;
- }
-
- if (constant->rank() != 1)
- return false;
-
- auto channel_dim = constant->dim(0);
- if (!(channel_dim == mul->dim(mul->rank() - 1)))
- return false;
-
- pred_node = pred;
- gamma = constant;
- return true;
-}
-
/**
* Replace channel-wise Mul/Add with DepthwiseConv2D
*
@@ -180,6 +118,9 @@ bool replace_mul_add_with_dwconv(luci::CircleAdd *add)
auto weights = create_weights_from_gamma(gamma);
auto bias = create_bias_from_beta(beta);
+ auto name = add->name();
+ assert(name.length() > 0);
+
auto dwconv = add->graph()->nodes()->create<luci::CircleDepthwiseConv2D>();
dwconv->input(pred_node);
dwconv->filter(weights);
@@ -191,6 +132,8 @@ bool replace_mul_add_with_dwconv(luci::CircleAdd *add)
dwconv->dilation()->w(1);
dwconv->dilation()->h(1);
dwconv->fusedActivationFunction(add->fusedActivationFunction());
+ dwconv->name(name + "/DepthwiseConv2D");
+ luci::add_origin(dwconv, luci::composite_origin({luci::get_origin(mul), luci::get_origin(add)}));
loco::replace(add).with(dwconv);
return true;
@@ -206,14 +149,10 @@ bool ReplaceMulAddWithDepthwiseConvPass::run(loco::Graph *g)
bool changed = false;
for (auto node : loco::active_nodes(loco::output_nodes(g)))
{
- auto add = dynamic_cast<luci::CircleAdd *>(node);
- if (not add)
- continue;
-
- if (replace_mul_add_with_dwconv(add))
+ if (auto add = dynamic_cast<luci::CircleAdd *>(node))
{
- changed = true;
- break;
+ if (replace_mul_add_with_dwconv(add))
+ changed = true;
}
}
diff --git a/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.test.cpp b/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.test.cpp
index a90182aaa..903d4dcc9 100644
--- a/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.test.cpp
+++ b/compiler/luci/pass/src/ReplaceMulAddWithDepthwiseConvPass.test.cpp
@@ -85,6 +85,13 @@ public:
add->x(mul);
add->y(beta);
output->from(add);
+
+ input->name("input");
+ mul->name("mul");
+ gamma->name("gamma");
+ add->name("add");
+ beta->name("beta");
+ output->name("output");
}
public:
@@ -99,6 +106,13 @@ public:
} // namespace
+TEST(ReplaceMulAddWithDepthwiseConv, name)
+{
+ luci::ReplaceMulAddWithDepthwiseConvPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
TEST(ReplaceMulAddWithDepthwiseConv, simple)
{
SimpleGraph g;
diff --git a/compiler/luci/pass/src/RequantizePass.cpp b/compiler/luci/pass/src/RequantizePass.cpp
index fe84e3bc3..a56536251 100644
--- a/compiler/luci/pass/src/RequantizePass.cpp
+++ b/compiler/luci/pass/src/RequantizePass.cpp
@@ -113,7 +113,7 @@ void requant_const_int8_to_uint8(CircleConst *node)
struct RequantizeNonConst final : public luci::CircleNodeMutableVisitor<bool>
{
RequantizeNonConst(loco::DataType input, loco::DataType output)
- : _input_type(input), _output_type(output)
+ : _input_type(input), _output_type(output)
{
}
@@ -157,7 +157,7 @@ struct RequantizeNonConst final : public luci::CircleNodeMutableVisitor<bool>
struct RequantizeConst final : public luci::CircleNodeMutableVisitor<bool>
{
RequantizeConst(loco::DataType input, loco::DataType output)
- : _input_type(input), _output_type(output)
+ : _input_type(input), _output_type(output)
{
}
diff --git a/compiler/luci/pass/src/RequantizePass.test.cpp b/compiler/luci/pass/src/RequantizePass.test.cpp
new file mode 100644
index 000000000..d26743c9d
--- /dev/null
+++ b/compiler/luci/pass/src/RequantizePass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/RequantizePass.h"
+
+#include <gtest/gtest.h>
+
+TEST(RequantizePassTest, name)
+{
+ luci::RequantizePass pass(loco::DataType::FLOAT32, loco::DataType::U8);
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp b/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp
index e52d667d7..1737e5dd6 100644
--- a/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp
+++ b/compiler/luci/pass/src/ResolveCustomOpAddPass.cpp
@@ -20,6 +20,7 @@
#include <luci/IR/CircleNodes.h>
#include <luci/IR/AttrFusedActFunc.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace
{
@@ -67,10 +68,17 @@ bool resolve_with_BroadcastTo(luci::CircleCustom *addv2)
auto input = loco::must_cast<const luci::CircleCustomOut *>(addv2->inputs(broadcastTo_idx));
auto broadcastTo = loco::must_cast<luci::CircleCustom *>(input->input());
+ auto name = addv2->name();
+ assert(name.length() > 0);
+
auto add = addv2->graph()->nodes()->create<luci::CircleAdd>();
add->fusedActivationFunction(luci::FusedActFunc::NONE);
add->x(addv2->inputs(1 - broadcastTo_idx));
add->y(broadcastTo->inputs(0));
+ add->name(name + "/Add");
+ luci::add_origin(
+ add, luci::composite_origin({luci::get_origin(broadcastTo), luci::get_origin(addv2)}));
+
auto customOut = loco::succs(addv2);
assert(customOut.size() == 1);
replace(*customOut.begin()).with(add);
@@ -86,13 +94,39 @@ bool resolve_custom_op(luci::CircleCustom *addv2)
if (custom_code != "AddV2")
return false;
+ if (addv2->numInputs() != 2)
+ return false;
+
+  // check if input data types are supported
+ for (uint32_t i = 0; i < addv2->numInputs(); i++)
+ {
+ auto input = loco::must_cast<luci::CircleNode *>(addv2->inputs(i));
+ switch (input->dtype())
+ {
+ case loco::DataType::U8:
+ case loco::DataType::S8:
+ case loco::DataType::S16:
+ case loco::DataType::S32:
+ case loco::DataType::FLOAT32:
+ break;
+ default:
+ return false;
+ }
+ }
+
if (resolve_with_BroadcastTo(addv2))
return true;
+ auto name = addv2->name();
+ assert(name.length() > 0);
+
auto add = addv2->graph()->nodes()->create<luci::CircleAdd>();
add->fusedActivationFunction(luci::FusedActFunc::NONE);
add->x(addv2->inputs(0));
add->y(addv2->inputs(1));
+ add->name(name + "/Add");
+ luci::add_origin(add, luci::get_origin(addv2));
+
auto customOut = loco::succs(addv2);
assert(customOut.size() == 1);
replace(*customOut.begin()).with(add);
@@ -115,7 +149,8 @@ bool ResolveCustomOpAddPass::run(loco::Graph *g)
if (not cop)
continue;
- changed |= resolve_custom_op(cop);
+ if (resolve_custom_op(cop))
+ changed = true;
}
return changed;
diff --git a/compiler/luci/pass/src/ResolveCustomOpAddPass.test.cpp b/compiler/luci/pass/src/ResolveCustomOpAddPass.test.cpp
new file mode 100644
index 000000000..31c245b0e
--- /dev/null
+++ b/compiler/luci/pass/src/ResolveCustomOpAddPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ResolveCustomOpAddPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(ResolveCustomOpAddPassTest, name)
+{
+ luci::ResolveCustomOpAddPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp
index 145e9cb62..5e9466a63 100644
--- a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp
+++ b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.cpp
@@ -19,6 +19,7 @@
#include "flatbuffers/flexbuffers.h"
#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace
{
@@ -30,6 +31,9 @@ bool resolve_custom_op(luci::CircleCustom *cop)
if (custom_code == "BatchMatMulV2")
{
+ auto name = cop->name();
+ assert(name.length() > 0);
+
auto batch_matmul = cop->graph()->nodes()->create<luci::CircleBatchMatMul>();
// input
batch_matmul->x(cop->inputs(0));
@@ -39,10 +43,16 @@ bool resolve_custom_op(luci::CircleCustom *cop)
auto map = flexbuffers::GetRoot(custom_options).AsMap();
batch_matmul->adj_x(map["adj_x"].AsBool());
batch_matmul->adj_y(map["adj_y"].AsBool());
+ batch_matmul->name(name + "/BatchMatMul");
+ luci::add_origin(batch_matmul, luci::get_origin(cop));
+
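+      // Replace the CircleCustomOut successor, not the CircleCustom itself,
+      // so consumers of the custom op's output are rewired to the new node.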
+ auto customOut = loco::succs(cop);
+ assert(customOut.size() == 1);
+ replace(*customOut.begin()).with(batch_matmul);
- replace(cop).with(batch_matmul);
return true;
}
+
return false;
}
@@ -51,6 +61,27 @@ bool resolve_custom_op(luci::CircleCustom *cop)
namespace luci
{
+/**
+ * BEFORE
+ * | |
+ * [CircleNode] [CircleNode]
+ * \ /
+ * [CircleCustom]("BatchMatMulV2")
+ * |
+ * [CircleCustomOut]
+ * |
+ * [CircleNode]
+ * |
+ *
+ * AFTER
+ * | |
+ * [CircleNode] [CircleNode]
+ * \ /
+ * [CircleBatchMatMul]
+ * |
+ * [CircleNode]
+ * |
+ */
bool ResolveCustomOpBatchMatMulPass::run(loco::Graph *g)
{
bool changed = false;
@@ -60,7 +91,8 @@ bool ResolveCustomOpBatchMatMulPass::run(loco::Graph *g)
if (not cop)
continue;
- changed |= resolve_custom_op(cop);
+ if (resolve_custom_op(cop))
+ changed = true;
}
return changed;
diff --git a/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.test.cpp b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.test.cpp
new file mode 100644
index 000000000..435016f9d
--- /dev/null
+++ b/compiler/luci/pass/src/ResolveCustomOpBatchMatMulPass.test.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ResolveCustomOpBatchMatMulPass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include "flatbuffers/flatbuffers.h"
+#include "flatbuffers/flexbuffers.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+const int N = 1;
+const int C = 2;
+const int H_X = 1;
+const int W_X = 4;
+const int H_Y = 4;
+const int W_Y = 4;
+
+/**
+ * graph having Custom operator BatchMatMulV2
+ *
+ * [CircleInput] [CircleInput]
+ * \ /
+ * [CircleCustom]
+ * |
+ * [CircleCustomOut]
+ * |
+ * [CircleOutput]
+ */
+class BatchMatmulV2Graphlet
+{
+public:
+ BatchMatmulV2Graphlet() = default;
+
+public:
+ void init(loco::Graph *g)
+ {
+    // custom options: serialize adj_x/adj_y into a FlexBuffer map
+ auto flatbuffer_builder =
+ std::unique_ptr<flatbuffers::FlatBufferBuilder>(new flatbuffers::FlatBufferBuilder(1024));
+ auto flex_buffers = std::make_unique<flexbuffers::Builder>();
+ size_t map_start = flex_buffers->StartMap();
+ flex_buffers->Bool("adj_x", false);
+ flex_buffers->Bool("adj_y", false);
+ flex_buffers->Int("T", 0 /* circle::TensorType_FLOAT32 */);
+ flex_buffers->EndMap(map_start);
+ flex_buffers->Finish();
+
+ // CircleCustom(BatchMatMulV2, adj_x=False, adj_y=False)
+ _batchmatmulv2 = g->nodes()->create<luci::CircleCustom>(2, 1);
+ _batchmatmulv2->custom_code("BatchMatMulV2");
+ _batchmatmulv2->custom_options(flex_buffers->GetBuffer());
+ _batchmatmulv2->shape({N, C, H_X, W_Y});
+ _batchmatmulv2->dtype(loco::DataType::FLOAT32);
+ _batchmatmulv2->name("batchmatmulv2");
+
+ // CircleCustomOut
+ _batchmatmulv2_out = g->nodes()->create<luci::CircleCustomOut>();
+ _batchmatmulv2_out->shape({N, C, H_X, W_Y});
+ _batchmatmulv2_out->dtype(loco::DataType::FLOAT32);
+ _batchmatmulv2_out->index(0);
+ }
+
+public:
+ luci::CircleCustom *batchmatmulv2() { return _batchmatmulv2; }
+
+protected:
+ luci::CircleCustom *_batchmatmulv2 = nullptr;
+ luci::CircleCustomOut *_batchmatmulv2_out = nullptr;
+};
+
+class BatchMatmulV2Graph : public TestIsGraphlet<2>,
+ public TestOGraphlet,
+ public BatchMatmulV2Graphlet
+{
+public:
+ BatchMatmulV2Graph() = default;
+
+ void init(void)
+ {
+ TestIsGraphlet<2>::init(g(), {{N, C, H_X, W_X}, {N, C, H_X, W_X}});
+ TestOGraphlet::init(g(), {N, C, H_X, W_Y});
+ BatchMatmulV2Graphlet::init(g());
+
+    // TODO support setting a different shape per input in TestIsGraphlet::init
+    // update the shape of the second input
+ input(1)->shape({N, C, H_Y, W_Y});
+
+ // connect graph
+ _batchmatmulv2->inputs(0, input(0));
+ _batchmatmulv2->inputs(1, input(1));
+ _batchmatmulv2_out->input(_batchmatmulv2);
+
+ output()->from(_batchmatmulv2_out);
+ }
+};
+
+class BatchMatmulV2GraphTest : public ::testing::Test
+{
+public:
+ BatchMatmulV2Graph g;
+ luci::ResolveCustomOpBatchMatMulPass pass;
+};
+
+} // namespace
+
+TEST(ResolveCustomOpBatchMatMulPassTest, name)
+{
+ luci::ResolveCustomOpBatchMatMulPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+/**
+ * Optimized graph looks like below.
+ *
+ * [CircleInput]
+ * |
+ * [CircleBatchMatMul]
+ * |
+ * [CircleOutput]
+ */
+TEST_F(BatchMatmulV2GraphTest, simple_test)
+{
+ g.init();
+
+ auto ret = pass.run(g.g());
+ EXPECT_EQ(true, ret);
+
+ auto batchmatmul = dynamic_cast<luci::CircleBatchMatMul *>(g.output()->from());
+ EXPECT_NE(nullptr, batchmatmul);
+
+ auto input_0 = dynamic_cast<luci::CircleInput *>(batchmatmul->x());
+ auto input_1 = dynamic_cast<luci::CircleInput *>(batchmatmul->y());
+ EXPECT_NE(nullptr, input_0);
+ EXPECT_NE(nullptr, input_1);
+}
+
+TEST_F(BatchMatmulV2GraphTest, wrong_condition_NEG)
+{
+ g.init();
+
+ // wrong custom code
+ g.batchmatmulv2()->custom_code("BatchMatMulv2"); // v is lower case
+ auto ret = pass.run(g.g());
+
+ EXPECT_EQ(false, ret);
+}
diff --git a/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp
index 547fd22fc..216778066 100644
--- a/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp
+++ b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.cpp
@@ -20,11 +20,10 @@
#include <loco/IR/DataTypeTraits.h>
#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
#include <loco.h>
#include <oops/InternalExn.h>
-#include <loco/Service/ShapeInference.h>
-#include <loco/Service/TypeInference.h>
namespace
{
@@ -44,6 +43,7 @@ luci::CircleConst *create_const_node(loco::Graph *g, const loco::DataType dtype,
node->dim(i) = shape.at(i);
size *= shape.at(i);
}
+ node->shape_status(luci::ShapeStatus::VALID);
#define INIT_VALUES(DT) \
{ \
@@ -90,6 +90,9 @@ bool resolve_matmul(luci::CircleCustom *cop)
const auto S32 = loco::DataType::S32;
const auto FLOAT32 = loco::DataType::FLOAT32;
+ auto name = cop->name();
+ assert(name.length() > 0);
+
bool transpose_a = map["transpose_a"].AsBool();
bool transpose_b = map["transpose_b"].AsBool();
@@ -97,34 +100,38 @@ bool resolve_matmul(luci::CircleCustom *cop)
loco::Node *rhs = cop->inputs(1);
// Check that the type of the first input is known
- CHECK_OR_FALSE(loco::dtype_known(lhs));
- auto lhs_dtype = loco::dtype_get(cop->inputs(0));
+ auto lhs_dtype = loco::must_cast<luci::CircleNode *>(cop->inputs(0))->dtype();
+ CHECK_OR_FALSE(lhs_dtype != loco::DataType::Unknown);
// If transpose of first input is requested, its shape must be known
- CHECK_OR_FALSE(!transpose_a || loco::shape_known(lhs));
+ auto circle_lhs = loco::must_cast<luci::CircleNode *>(lhs);
+ CHECK_OR_FALSE(!transpose_a || circle_lhs->shape_status() == luci::ShapeStatus::VALID);
// and its rank should be at least 2
- CHECK_OR_FALSE(!transpose_a || loco::shape_get(lhs).as<loco::TensorShape>().rank() >= 2);
+ CHECK_OR_FALSE(!transpose_a || circle_lhs->rank() >= 2);
// Check that the shape of the 2nd input is known
- CHECK_OR_FALSE(loco::shape_known(rhs));
+ auto circle_rhs = loco::must_cast<luci::CircleNode *>(rhs);
+ CHECK_OR_FALSE(circle_rhs->shape_status() == luci::ShapeStatus::VALID);
// TODO as of 06/23/20 TFLite only supports rank 2 for 2nd input. Fix this once that changes!
- CHECK_OR_FALSE(loco::shape_get(rhs).as<loco::TensorShape>().rank() == 2);
+ CHECK_OR_FALSE(circle_rhs->rank() == 2);
// Check that input data type is supported
CHECK_OR_THROW(lhs_dtype == U8 || lhs_dtype == S16 || lhs_dtype == FLOAT32,
"Only UInt8, Int16 and Float32 data types are supported by MatMul");
if (transpose_a)
{
- auto a_shape = loco::shape_get(lhs).as<loco::TensorShape>();
// Create a permutation constant node
std::vector<uint32_t> perm;
- for (uint32_t i = 0; i < a_shape.rank(); ++i)
+ for (uint32_t i = 0; i < circle_lhs->rank(); ++i)
perm.push_back(i);
- std::swap(perm[a_shape.rank() - 1], perm[a_shape.rank() - 2]);
- auto perm_node = create_const_node(graph, S32, {a_shape.rank()}, perm);
+ std::swap(perm[circle_lhs->rank() - 1], perm[circle_lhs->rank() - 2]);
+ auto perm_node = create_const_node(graph, S32, {circle_lhs->rank()}, perm);
+ perm_node->name(name + "/lhs/Transpose/perm");
// Now make a transpose node
auto transpose_node = graph->nodes()->create<luci::CircleTranspose>();
transpose_node->a(lhs);
transpose_node->perm(perm_node);
+ transpose_node->name(name + "/lhs/Transpose");
+ luci::add_origin(transpose_node, luci::get_origin(cop));
lhs = transpose_node;
}
@@ -135,24 +142,29 @@ bool resolve_matmul(luci::CircleCustom *cop)
{
const std::vector<uint32_t> perm{1, 0};
auto perm_node = create_const_node(graph, S32, {2}, perm);
+ perm_node->name(name + "/rhs/Transpose/perm");
auto transpose_node = graph->nodes()->create<luci::CircleTranspose>();
transpose_node->a(rhs);
transpose_node->perm(perm_node);
+ transpose_node->name(name + "/rhs/Transpose");
+ luci::add_origin(transpose_node, luci::get_origin(cop));
rhs = transpose_node;
}
- // Make a constant zero-filled bias node
- auto b_shape = loco::shape_get(cop->inputs(1)).as<loco::TensorShape>();
- uint32_t bias_size = b_shape.dim(transpose_b ? 1 : 0).value();
- const std::vector<float> val(bias_size, .0f);
- auto bias_node = create_const_node(graph, lhs_dtype, {bias_size}, val);
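+  // CircleOutputExclude acts as a "no bias" placeholder here,
+  // avoiding the zero-filled bias constant used before.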
+ auto empty_bias = graph->nodes()->create<luci::CircleOutputExclude>();
+ empty_bias->dtype(loco::DataType::FLOAT32); // Needed for type inference
+
auto fc_node = graph->nodes()->create<luci::CircleFullyConnected>();
fc_node->input(lhs);
fc_node->weights(rhs);
- fc_node->bias(bias_node);
+ fc_node->bias(empty_bias);
fc_node->fusedActivationFunction(luci::FusedActFunc::NONE);
+ fc_node->name(name + "/FullyConnected");
+ luci::add_origin(fc_node, luci::get_origin(cop));
- replace(cop).with(fc_node);
+ auto customOut = loco::succs(cop);
+ assert(customOut.size() == 1);
+ replace(*customOut.begin()).with(fc_node);
return true;
}
diff --git a/compiler/luci/pass/src/ResolveCustomOpMatMulPass.test.cpp b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.test.cpp
new file mode 100644
index 000000000..c4ea3ea06
--- /dev/null
+++ b/compiler/luci/pass/src/ResolveCustomOpMatMulPass.test.cpp
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/ResolveCustomOpMatMulPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(ResolveCustomOpMatMulPassTest, name)
+{
+ luci::ResolveCustomOpMatMulPass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/ShapeInferencePass.cpp b/compiler/luci/pass/src/ShapeInferencePass.cpp
deleted file mode 100644
index 4bd0aaed4..000000000
--- a/compiler/luci/pass/src/ShapeInferencePass.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "luci/Pass/ShapeInferencePass.h"
-
-#include <luci/IR/CircleDialect.h>
-#include <luci/Service/CircleShapeInferenceRule.h>
-
-#include <loco.h>
-#include <loco/IR/CanonicalDialect.h>
-#include <loco/Service/CanonicalShapeInferenceRule.h>
-#include <loco/Service/ShapeInference.h>
-#include <loco/Service/MultiDialectShapeInferenceRule.h>
-
-namespace luci
-{
-
-bool ShapeInferencePass::run(luci::Module *m)
-{
- bool changed = false;
-
- for (size_t g = 0; g < m->size(); ++g)
- {
- if (run(m->graph(g)))
- changed = true;
- }
-
- return changed;
-}
-
-bool ShapeInferencePass::run(loco::Graph *g)
-{
- loco::CanonicalShapeInferenceRule canonical_rule;
- luci::CircleShapeInferenceRule circle_rule;
-
- loco::MultiDialectShapeInferenceRule rules;
-
- rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(luci::CircleDialect::get(), &circle_rule);
-
- return loco::apply(&rules).to(g);
-}
-
-} // namespace luci
diff --git a/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.cpp b/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.cpp
index 6a58f18c5..92060f625 100644
--- a/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.cpp
+++ b/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.cpp
@@ -72,6 +72,9 @@ luci::CircleConst *shuffle_weight(luci::CircleFullyConnected *fc)
{
auto the_weights = loco::must_cast<luci::CircleConst *>(fc->weights());
+ auto name = fc->name();
+ assert(name.length() > 0);
+
// create CircleConst where shuffled data will be stored
luci::CircleConst *new_weights = fc->graph()->nodes()->create<luci::CircleConst>();
new_weights->dtype(loco::DataType::FLOAT32);
@@ -82,6 +85,7 @@ luci::CircleConst *shuffle_weight(luci::CircleFullyConnected *fc)
{
new_weights->dim(r).set(the_weights->dim(r).value());
}
+ new_weights->name(name + "/shuffle_weight");
  // shuffle weight
const uint32_t MULTIPLE = 16;
@@ -96,7 +100,7 @@ luci::CircleConst *shuffle_weight(luci::CircleFullyConnected *fc)
for (uint32_t i = 0; i < MULTIPLE; i++)
{
new_weights->at<loco::DataType::FLOAT32>(index++) =
- the_weights->at<loco::DataType::FLOAT32>((r * MULTIPLE + i) * cols + c);
+ the_weights->at<loco::DataType::FLOAT32>((r * MULTIPLE + i) * cols + c);
}
}
}
@@ -131,6 +135,8 @@ bool ShuffleWeightTo16x1Float32Pass::run(loco::Graph *g)
fc->weights(new_weights);
fc->weights_format(luci::CircleFullyConnected::WeightsFormat::SHUFFLED16x1FLOAT32);
}
+
+ changed = true;
}
return changed;
diff --git a/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.test.cpp b/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.test.cpp
index 9745e5754..077985977 100644
--- a/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.test.cpp
+++ b/compiler/luci/pass/src/ShuffleWeightTo16x1Float32Pass.test.cpp
@@ -18,61 +18,86 @@
#include <luci/IR/CircleNodes.h>
+#include <luci/test/TestIOGraph.h>
+#include "test/TestFirstNode.h"
+
#include <gtest/gtest.h>
-void create_fc_net(loco::Graph *g)
+namespace
{
- assert(g);
-
- const uint32_t ROW = 16;
- const uint32_t COL = 2;
- const uint32_t elements_num = ROW * COL;
-
- // input
- auto input = g->nodes()->create<luci::CircleInput>();
- auto graph_input = g->inputs()->create();
- input->index(graph_input->index());
-
- // fc weights
- auto weights = g->nodes()->create<luci::CircleConst>();
- weights->dtype(loco::DataType::FLOAT32);
- weights->size<loco::DataType::FLOAT32>(elements_num);
- weights->rank(2);
- weights->dim(0).set(ROW);
- weights->dim(1).set(COL);
- for (uint32_t idx = 0; idx < elements_num; idx++)
+
+using namespace luci::test;
+
+class FCGraphlet
+{
+public:
+ FCGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 wshape)
{
- weights->at<loco::DataType::FLOAT32>(idx) = idx;
+ const uint32_t elements_num = num_elements(wshape);
+
+ // fc weights
+ _weights = g->nodes()->create<luci::CircleConst>();
+ _weights->dtype(loco::DataType::FLOAT32);
+ _weights->shape(wshape);
+ _weights->size<loco::DataType::FLOAT32>(elements_num);
+ for (uint32_t idx = 0; idx < elements_num; idx++)
+ {
+ _weights->at<loco::DataType::FLOAT32>(idx) = idx;
+ }
+ _weights->name("weights");
+
+ // fc
+ _fc = g->nodes()->create<luci::CircleFullyConnected>();
+ _fc->dtype(loco::DataType::FLOAT32);
+ _fc->name("fc");
}
- // fc
- auto fc = g->nodes()->create<luci::CircleFullyConnected>();
- fc->dtype(loco::DataType::FLOAT32);
- fc->input(input);
- fc->weights(weights);
-
- // output
- auto output = g->nodes()->create<luci::CircleOutput>();
- output->from(fc);
- auto graph_output = g->outputs()->create();
- output->index(graph_output->index());
-}
+protected:
+ luci::CircleFullyConnected *_fc = nullptr;
+ luci::CircleConst *_weights = nullptr;
+};
-TEST(ShuffleWeightTo16x1Float32PassTest, SimpleTest1)
+class FCGraph : public TestIGraphlet, public TestOGraphlet, public FCGraphlet
{
- auto graph = loco::make_graph();
- create_fc_net(graph.get());
+public:
+ FCGraph() = default;
- luci::CircleFullyConnected *fc_node = nullptr;
- for (auto node : loco::active_nodes(loco::output_nodes(graph.get())))
+ void init(const ShapeU32 shape, const ShapeU32 wshape)
{
- auto fc = dynamic_cast<luci::CircleFullyConnected *>(node);
- if (not fc)
- continue;
+ TestIGraphlet::init(g(), shape);
+ TestOGraphlet::init(g(), shape);
+ FCGraphlet::init(g(), wshape);
+
+ // connect graph
+ _fc->input(input());
+ _fc->weights(_weights);
- fc_node = fc;
- break;
+ output()->from(_fc);
}
+};
+
+} // namespace
+
+TEST(ShuffleWeightTo16x1Float32PassTest, name)
+{
+ luci::ShuffleWeightTo16x1Float32Pass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+const uint32_t ROW = 16;
+const uint32_t COL = 2;
+
+TEST(ShuffleWeightTo16x1Float32PassTest, SimpleTest1)
+{
+ FCGraph g;
+
+ g.init({ROW, COL}, {ROW, COL});
+
+ auto fc_node = luci::test::first_node<luci::CircleFullyConnected>(g.g());
ASSERT_NE(fc_node, nullptr);
auto weights = loco::must_cast<luci::CircleConst *>(fc_node->weights());
// before
@@ -94,7 +119,7 @@ TEST(ShuffleWeightTo16x1Float32PassTest, SimpleTest1)
ASSERT_EQ(15, weights->at<loco::DataType::FLOAT32>(15));
luci::ShuffleWeightTo16x1Float32Pass pass;
- while (pass.run(graph.get()))
+ while (pass.run(g.g()))
;
weights = loco::must_cast<luci::CircleConst *>(fc_node->weights());
@@ -116,3 +141,33 @@ TEST(ShuffleWeightTo16x1Float32PassTest, SimpleTest1)
ASSERT_EQ(28, weights->at<loco::DataType::FLOAT32>(14));
ASSERT_EQ(30, weights->at<loco::DataType::FLOAT32>(15));
}
+
+TEST(ShuffleWeightTo16x1Float32PassTest, invalid_weight_shape_NEG)
+{
+ FCGraph g;
+
+ g.init({ROW, COL}, {1, ROW, COL, 1});
+
+ auto fc_node = luci::test::first_node<luci::CircleFullyConnected>(g.g());
+ ASSERT_NE(fc_node, nullptr);
+
+ luci::ShuffleWeightTo16x1Float32Pass pass;
+ auto ret = pass.run(g.g());
+
+ ASSERT_FALSE(ret);
+}
+
+TEST(ShuffleWeightTo16x1Float32PassTest, invalid_weight_row16_NEG)
+{
+ FCGraph g;
+
+ g.init({COL, ROW}, {COL, ROW});
+
+ auto fc_node = luci::test::first_node<luci::CircleFullyConnected>(g.g());
+ ASSERT_NE(fc_node, nullptr);
+
+ luci::ShuffleWeightTo16x1Float32Pass pass;
+ auto ret = pass.run(g.g());
+
+ ASSERT_FALSE(ret);
+}
diff --git a/compiler/luci/pass/src/Sparsifier.cpp b/compiler/luci/pass/src/Sparsifier.cpp
index 210c1a34c..18ab45f98 100644
--- a/compiler/luci/pass/src/Sparsifier.cpp
+++ b/compiler/luci/pass/src/Sparsifier.cpp
@@ -26,8 +26,8 @@ Sparsifier<T>::Sparsifier(const std::vector<int32_t> &shape,
const std::vector<DimensionType> &format,
const std::vector<int32_t> &block_size,
const std::vector<int32_t> &block_map)
- : _dense_shape(shape), _traversal_order(traversal_order), _block_size(block_size),
- _block_map(block_map)
+ : _dense_shape(shape), _traversal_order(traversal_order), _block_size(block_size),
+ _block_map(block_map)
{
_dense_size = 1;
int32_t block_dim = 0;
diff --git a/compiler/luci/pass/src/Sparsifier.test.cpp b/compiler/luci/pass/src/Sparsifier.test.cpp
index 272e0e934..14e24aad7 100644
--- a/compiler/luci/pass/src/Sparsifier.test.cpp
+++ b/compiler/luci/pass/src/Sparsifier.test.cpp
@@ -190,6 +190,6 @@ TEST(SparsifierTest, WrongFormatRank_NEG)
const std::vector<int32_t> block_size = {4, 1};
const std::vector<int32_t> block_map = {0, 1};
EXPECT_THROW(
- luci::Sparsifier<int32_t>(dense_shape, traversal_order, format, block_size, block_map),
- std::out_of_range);
+ luci::Sparsifier<int32_t>(dense_shape, traversal_order, format, block_size, block_map),
+ std::out_of_range);
}
diff --git a/compiler/luci/pass/src/SparsifyTensorPass.cpp b/compiler/luci/pass/src/SparsifyTensorPass.cpp
index 2f1a36e77..1a75bfb0c 100644
--- a/compiler/luci/pass/src/SparsifyTensorPass.cpp
+++ b/compiler/luci/pass/src/SparsifyTensorPass.cpp
@@ -69,11 +69,11 @@ template <loco::DataType DT> void SparsifyTensorPass::sparsify_tensor(luci::Circ
else if (_format.at(idx) == DimensionType::SPARSE_CSR)
{
sparsityparam->dim_metadata.emplace_back(
- DimensionType::SPARSE_CSR, /* dense size */ 0,
- /* array_segments */ SparseIndexVector{SparseIndexVectorType::U16,
- dim_metadata.at(idx * 2)},
- /* array_indices */ SparseIndexVector{SparseIndexVectorType::U16,
- dim_metadata.at(idx * 2 + 1)});
+ DimensionType::SPARSE_CSR, /* dense size */ 0,
+ /* array_segments */
+ SparseIndexVector{SparseIndexVectorType::U16, dim_metadata.at(idx * 2)},
+ /* array_indices */
+ SparseIndexVector{SparseIndexVectorType::U16, dim_metadata.at(idx * 2 + 1)});
}
}
for (uint32_t i = 0; i < _block_size.size(); i++)
diff --git a/compiler/luci/pass/src/SparsifyTensorPass.test.cpp b/compiler/luci/pass/src/SparsifyTensorPass.test.cpp
new file mode 100644
index 000000000..372e8e5ca
--- /dev/null
+++ b/compiler/luci/pass/src/SparsifyTensorPass.test.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/SparsifyTensorPass.h"
+
+#include <gtest/gtest.h>
+
+TEST(SparsifyTensorPassTest, name)
+{
+ std::vector<int32_t> to;
+ std::vector<luci::DimensionType> vdt;
+ std::vector<int32_t> bs;
+ std::vector<int32_t> bm;
+ luci::SparsifyTensorPass pass("", to, vdt, bs, bm);
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
diff --git a/compiler/luci/pass/src/SubstitutePackToReshapePass.cpp b/compiler/luci/pass/src/SubstitutePackToReshapePass.cpp
index 44e974b91..d8676cd62 100644
--- a/compiler/luci/pass/src/SubstitutePackToReshapePass.cpp
+++ b/compiler/luci/pass/src/SubstitutePackToReshapePass.cpp
@@ -17,10 +17,22 @@
#include "luci/Pass/SubstitutePackToReshapePass.h"
#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
namespace
{
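+// Return the number of dimensions whose size is not known.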
+int32_t unknown_dim_count(luci::CircleNode *node)
+{
+ int32_t count = 0;
+
+ for (uint32_t i = 0; i < node->rank(); ++i)
+ if (!node->dim(i).known())
+ ++count;
+
+ return count;
+}
+
bool substitute_pack_to_reshape(luci::CircleNode *node)
{
auto target_node = dynamic_cast<luci::CirclePack *>(node);
@@ -35,9 +47,14 @@ bool substitute_pack_to_reshape(luci::CircleNode *node)
if (axis < 0)
axis = axis + static_cast<int32_t>(value_node->rank()) + 1;
+ auto name = node->name();
+ assert(name.length() > 0);
+
auto graph = target_node->graph();
auto reshape_node = graph->nodes()->create<luci::CircleReshape>();
reshape_node->tensor(value_node);
+ reshape_node->name(name + "/Reshape");
+ luci::add_origin(reshape_node, luci::get_origin(node));
auto const_node = graph->nodes()->create<luci::CircleConst>();
const_node->dtype(loco::DataType::S32);
@@ -53,13 +70,16 @@ bool substitute_pack_to_reshape(luci::CircleNode *node)
}
else if (i < axis)
{
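+      // Unknown dimensions become -1 so Reshape can infer them at runtime.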
- const_node->at<loco::DataType::S32>(i) = value_node->dim(i).value();
+ const_node->at<loco::DataType::S32>(i) =
+ value_node->dim(i).known() ? value_node->dim(i).value() : -1;
}
else
{
- const_node->at<loco::DataType::S32>(i) = value_node->dim(i - 1).value();
+ const_node->at<loco::DataType::S32>(i) =
+ value_node->dim(i - 1).known() ? value_node->dim(i - 1).value() : -1;
}
}
+ const_node->name(name + "/Reshape/shape");
reshape_node->shape(const_node);
replace(target_node).with(reshape_node);
return true;
@@ -71,24 +91,23 @@ namespace luci
{
/**
- * BEFORE
- * |
- * [CircleNode]
- * |
- * [CirclePack]
- * |
- * [CircleNode]
- * |
+ * BEFORE
+ * |
+ * [CircleNode]
+ * |
+ * [CirclePack]
+ * |
+ * [CircleNode]
+ * |
*
- * AFTER
- * |
- * [CircleNode] [CircleConst]
- * \ /
- * [CircleReshape]
+ * AFTER
* |
- * [CircleNode]
- * |
- *
+ * [CircleNode] [CircleConst]
+ * | \ /
+ * [CirclePack] [CircleReshape]
+ * |
+ * [CircleNode]
+ * |
*/
bool SubstitutePackToReshapePass::run(loco::Graph *g)
{
@@ -96,7 +115,7 @@ bool SubstitutePackToReshapePass::run(loco::Graph *g)
for (auto node : loco::active_nodes(loco::output_nodes(g)))
{
auto circle_node = loco::must_cast<luci::CircleNode *>(node);
- if (substitute_pack_to_reshape(circle_node))
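+    // Reshape can infer at most one -1 dimension, so skip nodes with two or more unknown dims.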
+ if (unknown_dim_count(circle_node) <= 1 && substitute_pack_to_reshape(circle_node))
{
changed = true;
}
diff --git a/compiler/luci/pass/src/SubstitutePackToReshapePass.test.cpp b/compiler/luci/pass/src/SubstitutePackToReshapePass.test.cpp
index 143b88896..3b5d4ea2c 100644
--- a/compiler/luci/pass/src/SubstitutePackToReshapePass.test.cpp
+++ b/compiler/luci/pass/src/SubstitutePackToReshapePass.test.cpp
@@ -22,26 +22,6 @@
namespace
{
-/**
- * BEFORE
- * |
- * [CircleNode]
- * |
- * [CirclePack]
- * |
- * [CircleNode]
- * |
- *
- * AFTER
- * |
- * [CircleNode] [CircleConst]
- * \ /
- * [CircleReshape]
- * |
- * [CircleNode]
- * |
- *
- */
void create_substitute_pack_to_reshape(loco::Graph *g, const std::initializer_list<uint32_t> shape,
int32_t axis)
{
@@ -54,23 +34,33 @@ void create_substitute_pack_to_reshape(loco::Graph *g, const std::initializer_li
input->shape_status(luci::ShapeStatus::VALID);
input->rank(shape.size());
input->shape(shape);
+ input->name("input");
// Pack Node create.
auto pack = g->nodes()->create<luci::CirclePack>(1);
pack->values(0, input);
pack->axis(axis);
+ pack->name("pack");
// Output Connect.
auto output = g->nodes()->create<luci::CircleOutput>();
output->from(pack);
auto graph_output = g->outputs()->create();
output->index(graph_output->index());
+ output->name("output");
return;
}
} // namespace
+TEST(SubstitutePackToReshapePassTest, name)
+{
+ luci::SubstitutePackToReshapePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
TEST(SubstitutePackToReshapePass, simple_case)
{
auto graph = loco::make_graph();
diff --git a/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.cpp b/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.cpp
new file mode 100644
index 000000000..74be86a4c
--- /dev/null
+++ b/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/SubstituteSqueezeToReshapePass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+namespace
+{
+
+/**
+ * @brief return TRUE if all dims are known
+ * @note This pass could also be applied when some dimensions are unknown.
+         For now, that case is not handled; the logic may be updated later.
+ */
+bool can_squeeze_shape(const luci::CircleNode *node)
+{
+ for (uint32_t r = 0; r < node->rank(); ++r)
+ {
+ if (not node->dim(r).known())
+ return false;
+ }
+ return true;
+}
+
+/**
+ * @brief return a valid unsigned dim value in [0, rank-1]
+ * @note dim can range from -rank to (rank-1)
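+ *       e.g., with rank 4, dim -1 maps to 3 and dim 2 stays 2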
+ */
+uint32_t valid_unsigned_dim(uint32_t rank, int32_t dim)
+{
+ int32_t irank = static_cast<int32_t>(rank);
+ return dim >= 0 ? static_cast<uint32_t>(dim) : static_cast<uint32_t>(irank + dim);
+}
+
+/**
+ * @brief return TRUE if the input dim is 1 for every value in squeeze_dims
+ */
+bool is_valid_input(const luci::CircleNode *node, const std::vector<int32_t> &squeeze_dims)
+{
+ auto rank = node->rank();
+ for (auto dim : squeeze_dims)
+ {
+ auto udim = valid_unsigned_dim(rank, dim);
+ if (node->dim(udim).value() != 1)
+ return false;
+ }
+ return true;
+}
+
+/**
+ * @brief return shape vector from input
+ */
+std::vector<uint32_t> node_shape(const luci::CircleNode *input)
+{
+ std::vector<uint32_t> shape;
+ uint32_t rank = input->rank();
+ for (uint32_t r = 0; r < rank; ++r)
+ shape.push_back(input->dim(r).value());
+
+ return shape;
+}
+
+/**
+ * @brief return CircleConst ptr with values of new_shape
+ */
+luci::CircleConst *create_shape_const(loco::Graph *graph, const std::vector<uint32_t> &new_shape)
+{
+ // NOTE dim_size can be 0
+ uint32_t dim_size = static_cast<uint32_t>(new_shape.size());
+
+ auto shape_const = graph->nodes()->create<luci::CircleConst>();
+
+ // const shape/dtype
+ shape_const->dtype(loco::DataType::S32);
+ if (dim_size > 0)
+ {
+ shape_const->rank(1);
+ shape_const->dim(0).set(dim_size);
+ }
+ else
+ shape_const->rank(0);
+ shape_const->shape_status(luci::ShapeStatus::VALID);
+
+ // constant values
+ shape_const->size<loco::DataType::S32>(dim_size);
+ for (uint32_t i = 0; i < dim_size; ++i)
+ shape_const->at<loco::DataType::S32>(i) = new_shape.at(i);
+
+ return shape_const;
+}
+
+bool substitute_squeeze_to_reshape(luci::CircleSqueeze *squeeze)
+{
+ assert(squeeze != nullptr);
+
+ auto input = loco::must_cast<luci::CircleNode *>(squeeze->input());
+  // the input node shape is required and all dims must be known
+ if (input->shape_status() != luci::ShapeStatus::VALID)
+ return false;
+ if (not can_squeeze_shape(input))
+ return false;
+
+  // the squeeze output shape becomes the new reshape shape
+ if (squeeze->shape_status() != luci::ShapeStatus::VALID)
+ return false;
+
+ auto squeeze_dims = squeeze->squeeze_dims();
+ if (not is_valid_input(input, squeeze_dims))
+ throw std::runtime_error("Invalid values in squeeze_dims: " + squeeze->name());
+
+ auto name = squeeze->name();
+ assert(name.length() > 0);
+
+ auto reshape_shape = node_shape(squeeze);
+ auto graph = squeeze->graph();
+ auto reshape = graph->nodes()->create<luci::CircleReshape>();
+ auto shape_const = create_shape_const(graph, reshape_shape);
+ reshape->name(name + "/Reshape");
+ luci::add_origin(reshape, luci::get_origin(squeeze));
+ shape_const->name(name + "/Reshape/shape");
+
+ // graph connection
+ reshape->tensor(input);
+ reshape->shape(shape_const);
+ replace(squeeze).with(reshape);
+
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * BEFORE
+ * |
+ * [CircleNode]
+ * |
+ * [CircleSqueeze]
+ * |
+ * [CircleNode]
+ * |
+ *
+ * AFTER
+ * |
+ * [CircleNode] [CircleConst]
+ * | \ /
+ * [CircleSqueeze] [CircleReshape]
+ * |
+ * [CircleNode]
+ * |
+ */
+bool SubstituteSqueezeToReshapePass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto squeeze = dynamic_cast<luci::CircleSqueeze *>(node))
+ {
+ if (substitute_squeeze_to_reshape(squeeze))
+ changed = true;
+ }
+ }
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.test.cpp b/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.test.cpp
new file mode 100644
index 000000000..d917af678
--- /dev/null
+++ b/compiler/luci/pass/src/SubstituteSqueezeToReshapePass.test.cpp
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci/Pass/SubstituteSqueezeToReshapePass.h"
+#include "luci/Pass/CircleShapeInferencePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using uilist = std::initializer_list<uint32_t>;
+using ilist = std::initializer_list<int32_t>;
+
+class PassTestGraph
+{
+public:
+ PassTestGraph() = default;
+
+public:
+ void init(const uilist shape_in, const uilist shape_out)
+ {
+ _graph_input = _g.inputs()->create();
+ _graph_output = _g.outputs()->create();
+
+ _input = _g.nodes()->create<luci::CircleInput>();
+ _input->shape(shape_in);
+ _input->shape_status(luci::ShapeStatus::VALID);
+ _input->name("input");
+
+ _output = _g.nodes()->create<luci::CircleOutput>();
+ _output->shape(shape_out);
+ _output->shape_status(luci::ShapeStatus::VALID);
+ _output->name("output");
+
+ _input->index(_graph_input->index());
+ _output->index(_graph_output->index());
+
+ auto input_shape = std::make_unique<loco::TensorShape>();
+ set(input_shape.get(), shape_in);
+ _graph_input->shape(std::move(input_shape));
+
+ auto output_shape = std::make_unique<loco::TensorShape>();
+ set(output_shape.get(), shape_out);
+ _graph_output->shape(std::move(output_shape));
+ }
+
+protected:
+ void set(loco::TensorShape *shape, const uilist &values)
+ {
+ uint32_t r = 0;
+ shape->rank(values.size());
+ for (auto v : values)
+ shape->dim(r++).set(v);
+ }
+
+public:
+ loco::Graph *g(void) { return &_g; }
+ luci::CircleOutput *output(void) { return _output; }
+
+protected:
+ loco::Graph _g;
+ loco::GraphInput *_graph_input = nullptr;
+ loco::GraphOutput *_graph_output = nullptr;
+ luci::CircleInput *_input = nullptr;
+ luci::CircleOutput *_output = nullptr;
+};
+
+class SubstituteSqueezeToReshapeGraph : public PassTestGraph
+{
+public:
+ SubstituteSqueezeToReshapeGraph() = default;
+
+public:
+ void init(const uilist shape_in, const uilist shape_out, const ilist squeeze_dims)
+ {
+ PassTestGraph::init(shape_in, shape_out);
+
+ _squeeze = _g.nodes()->create<luci::CircleSqueeze>();
+ _squeeze->input(_input);
+ _squeeze->squeeze_dims(squeeze_dims);
+ _squeeze->name("squeeze");
+
+ _output->from(_squeeze);
+ }
+
+protected:
+ luci::CircleSqueeze *_squeeze = nullptr;
+};
+
+class SubstituteSqueezeToReshapeTest : public ::testing::Test
+{
+public:
+ SubstituteSqueezeToReshapeTest() = default;
+
+ void run_pass(void)
+ {
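+    // Alternate shape inference and the substitute pass until neither makes progress.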
+ while (_shapeinf.run(_graph.g()) || _pass.run(_graph.g()))
+ ;
+ }
+
+protected:
+ SubstituteSqueezeToReshapeGraph _graph;
+ luci::SubstituteSqueezeToReshapePass _pass;
+ luci::CircleShapeInferencePass _shapeinf;
+};
+
+} // namespace
+
+TEST(SubstituteSqueezeToReshapePassTest, name)
+{
+ luci::SubstituteSqueezeToReshapePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST_F(SubstituteSqueezeToReshapeTest, simple_with_squeeze_dims)
+{
+ _graph.init({1, 16, 1, 1}, {1, 16}, {2, 3});
+
+ run_pass();
+
+ auto reshape = dynamic_cast<luci::CircleReshape *>(_graph.output()->from());
+ auto squeeze = dynamic_cast<luci::CircleSqueeze *>(_graph.output()->from());
+ ASSERT_NE(nullptr, reshape);
+ ASSERT_EQ(nullptr, squeeze);
+ auto reshape_shape = loco::must_cast<luci::CircleConst *>(reshape->shape());
+ ASSERT_EQ(2, reshape_shape->size<loco::DataType::S32>());
+ ASSERT_EQ(1, reshape_shape->at<loco::DataType::S32>(0));
+ ASSERT_EQ(16, reshape_shape->at<loco::DataType::S32>(1));
+}
+
+TEST_F(SubstituteSqueezeToReshapeTest, simple_without_squeeze_dims)
+{
+ _graph.init({1, 16, 1, 1}, {16}, {});
+
+ run_pass();
+
+ auto reshape = dynamic_cast<luci::CircleReshape *>(_graph.output()->from());
+ auto squeeze = dynamic_cast<luci::CircleSqueeze *>(_graph.output()->from());
+ ASSERT_NE(nullptr, reshape);
+ ASSERT_EQ(nullptr, squeeze);
+ auto reshape_shape = loco::must_cast<luci::CircleConst *>(reshape->shape());
+ ASSERT_EQ(1, reshape_shape->size<loco::DataType::S32>());
+ ASSERT_EQ(16, reshape_shape->at<loco::DataType::S32>(0));
+}
+
+TEST_F(SubstituteSqueezeToReshapeTest, input_with_0_dims)
+{
+ _graph.init({1, 16, 0, 1}, {16, 0}, {});
+
+ run_pass();
+
+ auto reshape = dynamic_cast<luci::CircleReshape *>(_graph.output()->from());
+ auto squeeze = dynamic_cast<luci::CircleSqueeze *>(_graph.output()->from());
+ ASSERT_NE(nullptr, reshape);
+ ASSERT_EQ(nullptr, squeeze);
+ auto reshape_shape = loco::must_cast<luci::CircleConst *>(reshape->shape());
+ ASSERT_EQ(2, reshape_shape->size<loco::DataType::S32>());
+ ASSERT_EQ(16, reshape_shape->at<loco::DataType::S32>(0));
+ ASSERT_EQ(0, reshape_shape->at<loco::DataType::S32>(1));
+}
+
+TEST_F(SubstituteSqueezeToReshapeTest, nothing_to_squeeze)
+{
+ _graph.init({2, 16, 16, 3}, {2, 16, 16, 3}, {});
+
+ run_pass();
+
+ auto reshape = dynamic_cast<luci::CircleReshape *>(_graph.output()->from());
+ auto squeeze = dynamic_cast<luci::CircleSqueeze *>(_graph.output()->from());
+ ASSERT_NE(nullptr, reshape);
+ ASSERT_EQ(nullptr, squeeze);
+}
+
+TEST_F(SubstituteSqueezeToReshapeTest, all_to_squeeze)
+{
+ _graph.init({1, 1}, {}, {});
+
+ run_pass();
+
+ auto reshape = dynamic_cast<luci::CircleReshape *>(_graph.output()->from());
+ auto squeeze = dynamic_cast<luci::CircleSqueeze *>(_graph.output()->from());
+ ASSERT_NE(nullptr, reshape);
+ ASSERT_EQ(nullptr, squeeze);
+}
+
+TEST_F(SubstituteSqueezeToReshapeTest, wrong_squeeze_dims_NEG)
+{
+ _graph.init({1, 16, 1, 1}, {1, 16, 1, 1}, {1});
+
+ // shape inference will throw for invalid squeeze_dims
+ EXPECT_THROW(run_pass(), std::exception);
+}
diff --git a/compiler/luci/pass/src/SubstituteTransposeToReshapePass.cpp b/compiler/luci/pass/src/SubstituteTransposeToReshapePass.cpp
new file mode 100644
index 000000000..dfd5e6cf2
--- /dev/null
+++ b/compiler/luci/pass/src/SubstituteTransposeToReshapePass.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/SubstituteTransposeToReshapePass.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+namespace
+{
+
+/**
+ * @brief Convert transpose op in a certain condition to reshape op
+ * @details Convert a transpose op if it satisfies all conditions below
+ *          1. it has a CircleConst perm value.
+ *          2. its input has fewer than two unknown dimensions.
+ *          3. the order of the dimensions, excluding dimensions of value 1,
+ *             is the same for the input and the output.
+ *          e.g.) input shape  = (126, 201, 1, 1) => (126, 201)
+ *                output shape = (1, 126, 1, 201) => (126, 201)
+ */
+bool substitute_transpose_to_reshape(luci::CircleTranspose *node)
+{
+ auto perm_const = dynamic_cast<luci::CircleConst *>(node->perm());
+ if (perm_const == nullptr)
+ return false;
+
+ assert(perm_const->dtype() == loco::DataType::S32);
+
+ auto input_node = loco::must_cast<luci::CircleNode *>(node->a());
+ if (perm_const->dim(0).value() != input_node->rank())
+ return false;
+
+  // If the input has more than one unknown dimension, the transpose is not
+  // changed. e.g., (1, ?, 1, 3) is acceptable, (?, ?, 1, 3) is not.
+ int count = 0;
+ for (uint32_t i = 0; i < input_node->rank(); i++)
+ if (!input_node->dim(i).known())
+ count++;
+ if (count > 1)
+ return false;
+
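+  // The transpose only moves size-1 axes (and is thus a pure reshape) when
+  // the non-1 dimensions keep their relative order. e.g., for input
+  // (126, 201, 1, 1) and perm {2, 0, 3, 1}, the non-1 dims are visited as
+  // 0 then 1 (increasing), so the result is reachable by a reshape.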
+ uint32_t idx = 0;
+ auto size_items = perm_const->size<loco::DataType::S32>();
+ for (uint32_t i = 0; i < size_items; i++)
+ {
+ assert(perm_const->at<loco::DataType::S32>(i) >= 0 &&
+ perm_const->at<loco::DataType::S32>(i) < static_cast<int32_t>(input_node->rank()));
+ const auto perm_value = static_cast<uint32_t>(perm_const->at<loco::DataType::S32>(i));
+ if (input_node->dim(perm_value).known() && input_node->dim(perm_value).value() == 1)
+ continue;
+    // Check that the non-1 dimension indices appear in increasing order
+ if (idx > perm_value)
+ return false;
+ idx = perm_value;
+ }
+
+ auto name = node->name();
+ assert(name.length() > 0);
+
+ auto new_const_node = node->graph()->nodes()->create<luci::CircleConst>();
+ new_const_node->dtype(loco::DataType::S32);
+ new_const_node->size<loco::DataType::S32>(size_items);
+ new_const_node->shape_status(luci::ShapeStatus::VALID);
+ new_const_node->rank(1);
+ new_const_node->dim(0).set(size_items);
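+  // A known dimension keeps its size; an unknown dimension is written as -1,
+  // which Reshape interprets as "infer this dimension from the others".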
+ for (uint32_t i = 0; i < size_items; i++)
+ {
+ if (input_node->dim(static_cast<uint32_t>(perm_const->at<loco::DataType::S32>(i))).known())
+ new_const_node->at<loco::DataType::S32>(i) = static_cast<int32_t>(
+ input_node->dim(static_cast<uint32_t>(perm_const->at<loco::DataType::S32>(i))).value());
+ else
+ new_const_node->at<loco::DataType::S32>(i) = -1;
+ }
+
+ auto new_reshape_node = node->graph()->nodes()->create<luci::CircleReshape>();
+ new_reshape_node->tensor(input_node);
+ new_reshape_node->shape(new_const_node);
+ new_reshape_node->name(name + "/Reshape");
+ luci::add_origin(new_reshape_node, luci::get_origin(node));
+ new_const_node->name(name + "/Reshape/shape");
+
+ replace(node).with(new_reshape_node);
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+/**
+ * BEFORE
+ *
+ * [CircleNode] [CircleConst]
+ * \ /
+ * [CircleTranspose]
+ * |
+ * [CircleNode]
+ *
+ * AFTER
+ *
+ * [CircleNode] [CircleConst]
+ * \ /
+ * [CircleReshape]
+ * |
+ * [CircleNode]
+ *
+ */
+bool SubstituteTransposeToReshapePass::run(loco::Graph *g)
+{
+ bool changed = false;
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto circle_node = dynamic_cast<luci::CircleTranspose *>(node))
+ {
+ if (substitute_transpose_to_reshape(circle_node))
+ {
+ changed = true;
+ }
+ }
+ }
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/SubstituteTransposeToReshapePass.test.cpp b/compiler/luci/pass/src/SubstituteTransposeToReshapePass.test.cpp
new file mode 100644
index 000000000..f81f7e615
--- /dev/null
+++ b/compiler/luci/pass/src/SubstituteTransposeToReshapePass.test.cpp
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "luci/Pass/SubstituteTransposeToReshapePass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+class SubstituteTransposeToReshapeTest : public ::testing::Test
+{
+public:
+ SubstituteTransposeToReshapeTest() {}
+
+ void buildGraph(const std::initializer_list<uint32_t> shape, const std::vector<int32_t> perm)
+ {
+    // Create the input node.
+ input = g.nodes()->create<luci::CircleInput>();
+ auto graph_input = g.inputs()->create();
+ input->index(graph_input->index());
+ input->shape_status(luci::ShapeStatus::VALID);
+ input->rank(shape.size());
+ input->shape(shape);
+ input->name("input");
+
+    // Create the permutation constant.
+ auto perm_const = g.nodes()->create<luci::CircleConst>();
+ perm_const->dtype(loco::DataType::S32);
+ perm_const->size<loco::DataType::S32>(perm.size());
+ perm_const->shape_status(luci::ShapeStatus::VALID);
+ perm_const->rank(1);
+ perm_const->dim(0).set(perm.size());
+ for (uint32_t i = 0; i < static_cast<uint32_t>(perm.size()); i++)
+ {
+ perm_const->at<loco::DataType::S32>(i) = perm.at(i);
+ }
+ perm_const->name("perm_const");
+
+    // Create the transpose node.
+ auto transpose_node = g.nodes()->create<luci::CircleTranspose>();
+ transpose_node->a(input);
+ transpose_node->perm(perm_const);
+ transpose_node->name("transpose_node");
+
+    // Create and connect the output node.
+ output = g.nodes()->create<luci::CircleOutput>();
+ output->from(transpose_node);
+ auto graph_output = g.outputs()->create();
+ output->index(graph_output->index());
+ output->name("output");
+ }
+
+public:
+ loco::Graph g;
+ luci::CircleInput *input = nullptr;
+ luci::CircleOutput *output = nullptr;
+};
+
+} // namespace
+
+TEST(SubstituteTransposeToReshapePassTest, name)
+{
+ luci::SubstituteTransposeToReshapePass pass;
+ auto const name = pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+TEST_F(SubstituteTransposeToReshapeTest, simple_case)
+{
+  // Create a graph that transposes input {126, 201, 1, 1} with permutation {2, 0, 3, 1}
+ buildGraph({126, 201, 1, 1}, std::vector<int32_t>({2, 0, 3, 1}));
+ // With this input shape and permutation values, output shape will be [1, 126, 1, 201].
+ // The order of non-one values is unchanged (126, 201).
+ // So this Transpose op can be converted to Reshape op.
+ luci::SubstituteTransposeToReshapePass pass;
+ while (pass.run(&g))
+ ;
+
+ auto reshape_node = dynamic_cast<luci::CircleReshape *>(output->from());
+ auto transpose_node = dynamic_cast<luci::CircleTranspose *>(output->from());
+ ASSERT_NE(nullptr, reshape_node);
+ ASSERT_EQ(nullptr, transpose_node);
+ auto new_shape = loco::must_cast<luci::CircleConst *>(reshape_node->shape());
+ ASSERT_EQ(1, new_shape->at<loco::DataType::S32>(0));
+ ASSERT_EQ(126, new_shape->at<loco::DataType::S32>(1));
+ ASSERT_EQ(1, new_shape->at<loco::DataType::S32>(2));
+ ASSERT_EQ(201, new_shape->at<loco::DataType::S32>(3));
+}
+
+TEST_F(SubstituteTransposeToReshapeTest, failed_to_substitute_NEG)
+{
+  // Create a graph that transposes input {126, 201, 1, 1} with permutation {2, 1, 3, 0}
+ buildGraph({126, 201, 1, 1}, std::vector<int32_t>({2, 1, 3, 0}));
+ // With this input shape and permutation values, output shape will be [1, 201, 1, 126].
+ // The order of non-one values is changed (126, 201) -> (201, 126).
+ // So this Transpose op cannot be converted to Reshape op.
+ luci::SubstituteTransposeToReshapePass pass;
+ while (pass.run(&g))
+ ;
+
+ auto reshape_node = dynamic_cast<luci::CircleReshape *>(output->from());
+ auto transpose_node = dynamic_cast<luci::CircleTranspose *>(output->from());
+ ASSERT_EQ(nullptr, reshape_node);
+ ASSERT_NE(nullptr, transpose_node);
+}
diff --git a/compiler/luci/pass/src/TransformMinMaxToRelu6Pass.cpp b/compiler/luci/pass/src/TransformMinMaxToRelu6Pass.cpp
new file mode 100644
index 000000000..c15a3b676
--- /dev/null
+++ b/compiler/luci/pass/src/TransformMinMaxToRelu6Pass.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/TransformMinMaxToRelu6Pass.h"
+
+#include "helpers/NodeFiller.h"
+#include "helpers/TypeMapper.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Profile/CircleNodeOrigin.h>
+
+namespace
+{
+
+template <loco::DataType DT>
+bool is_scalar_with_value(luci::CircleConst *node, typename loco::DataTypeImpl<DT>::Type val)
+{
+ if (node->dtype() != DT)
+ return false;
+ if (node->rank() != 0)
+ return false;
+ if (node->size<DT>() != 1)
+ return false;
+ if (node->at<DT>(0) != static_cast<typename loco::DataTypeImpl<DT>::Type>(val))
+ return false;
+
+ return true;
+}
+
+/**
+ * BEFORE
+ * [CircleNode]
+ * |
+ * [CircleMinimum]
+ * |
+ * [CircleMaximum]
+ * |
+ * [CircleNode]
+ *
+ * AFTER
+ *
+ * [CircleNode]
+ * |
+ * [CircleRelu6]
+ * |
+ * [CircleNode]
+ *
+ * NOTE Only the max(min(input, 6), 0) pattern will be transformed.
+ */
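+// Since max(min(x, 6), 0) clamps x to the range [0, 6], the pattern is
+// numerically identical to Relu6 and the two nodes can be collapsed into one.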
+template <loco::DataType DT> bool transform_min_max_pattern(luci::CircleMaximum *maxi)
+{
+ if (not maxi)
+ return false;
+
+ if (maxi->dtype() != DT)
+ return false;
+
+ luci::CircleConst *maxi_const = nullptr;
+ luci::CircleMinimum *mini = nullptr;
+
+ // There are two ways Maximum takes inputs.
+ // 1. Maximum(x = CircleConst, y = CircleMinimum)
+ // 2. Maximum(x = CircleMinimum, y = CircleConst)
+ if (not luci::fill(&maxi_const, &mini).with_commutative_args_of(maxi))
+ return false;
+
+  // The Maximum constant should be a scalar whose value is 0.
+ if (not is_scalar_with_value<DT>(maxi_const,
+ static_cast<typename loco::DataTypeImpl<DT>::Type>(0)))
+ return false;
+
+ luci::CircleConst *mini_const = nullptr;
+ loco::Node *mini_input = nullptr;
+
+  // There are two ways Minimum takes inputs.
+  // 1. Minimum(x = CircleConst, y = CircleNode)
+  // 2. Minimum(x = CircleNode, y = CircleConst)
+ if (not luci::fill(&mini_const, &mini_input).with_commutative_args_of(mini))
+ return false;
+
+  // The Minimum constant should be a scalar whose value is 6.
+ if (not is_scalar_with_value<DT>(mini_const,
+ static_cast<typename loco::DataTypeImpl<DT>::Type>(6)))
+ return false;
+
+ auto name = maxi->name();
+ assert(name.length() > 0);
+
+ // Create Relu6 op
+ auto relu6 = mini->graph()->nodes()->create<luci::CircleRelu6>();
+ relu6->features(mini_input);
+ relu6->name(name + "/Relu6");
+ luci::add_origin(relu6, luci::composite_origin({luci::get_origin(maxi), luci::get_origin(mini)}));
+
+ replace(maxi).with(relu6);
+
+ return true;
+}
+
+} // namespace
+
+namespace luci
+{
+
+bool TransformMinMaxToRelu6Pass::run(loco::Graph *g)
+{
+ bool changed = false;
+
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto maxi = dynamic_cast<luci::CircleMaximum *>(node))
+ {
+ if (transform_min_max_pattern<loco::DataType::FLOAT32>(maxi))
+ changed = true;
+ }
+ }
+
+ return changed;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/TransformMinMaxToRelu6Pass.test.cpp b/compiler/luci/pass/src/TransformMinMaxToRelu6Pass.test.cpp
new file mode 100644
index 000000000..9755a70cf
--- /dev/null
+++ b/compiler/luci/pass/src/TransformMinMaxToRelu6Pass.test.cpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Pass/TransformMinMaxToRelu6Pass.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+/**
+ * Minimum-Maximum pattern graph
+ *
+ * [CircleInput] [CircleConst]
+ * \ /
+ * [CircleMinimum] [CircleConst]
+ * | /
+ * [CircleMaximum]
+ * |
+ * [CircleOutput]
+ */
+struct MinMaxGraph
+{
+ loco::Graph _g;
+ luci::CircleInput *_input = nullptr;
+ luci::CircleMinimum *_mini = nullptr;
+ luci::CircleConst *_mini_const = nullptr;
+ luci::CircleMaximum *_maxi = nullptr;
+ luci::CircleConst *_maxi_const = nullptr;
+ luci::CircleOutput *_output = nullptr;
+};
+
+class TransformMinMaxToRelu6PassTest : public ::testing::Test
+{
+protected:
+ virtual void SetUp()
+ {
+ const int N = 1;
+ const int H = 4;
+ const int W = 4;
+ const int C = 3;
+
+ // graph input and output
+ auto graph_input = _min_max_g._g.inputs()->create();
+ auto graph_output = _min_max_g._g.outputs()->create();
+
+ // CircleInput
+ _min_max_g._input = _min_max_g._g.nodes()->create<luci::CircleInput>();
+ _min_max_g._input->index(graph_input->index());
+ _min_max_g._input->shape({N, H, W, C});
+ _min_max_g._input->dtype(loco::DataType::FLOAT32);
+ _min_max_g._input->name("input");
+
+ // CircleConst
+ _min_max_g._mini_const = _min_max_g._g.nodes()->create<luci::CircleConst>();
+ _min_max_g._mini_const->shape({}); // scalar
+ _min_max_g._mini_const->dtype(loco::DataType::FLOAT32);
+ _min_max_g._mini_const->size<loco::DataType::FLOAT32>(1);
+ _min_max_g._mini_const->at<loco::DataType::FLOAT32>(0) = 6.;
+ _min_max_g._mini_const->name("mini_const");
+
+ // CircleMinimum
+ _min_max_g._mini = _min_max_g._g.nodes()->create<luci::CircleMinimum>();
+ _min_max_g._mini->x(_min_max_g._input);
+ _min_max_g._mini->y(_min_max_g._mini_const);
+ _min_max_g._mini->shape({N, H, W, C});
+ _min_max_g._mini->dtype(loco::DataType::FLOAT32);
+ _min_max_g._mini->name("mini");
+
+ // CircleConst
+ _min_max_g._maxi_const = _min_max_g._g.nodes()->create<luci::CircleConst>();
+    _min_max_g._maxi_const->shape({}); // scalar
+ _min_max_g._maxi_const->dtype(loco::DataType::FLOAT32);
+ _min_max_g._maxi_const->size<loco::DataType::FLOAT32>(1);
+ _min_max_g._maxi_const->at<loco::DataType::FLOAT32>(0) = 0.;
+ _min_max_g._maxi_const->name("maxi_const");
+
+ // CircleMaximum
+ _min_max_g._maxi = _min_max_g._g.nodes()->create<luci::CircleMaximum>();
+ _min_max_g._maxi->x(_min_max_g._mini);
+ _min_max_g._maxi->y(_min_max_g._maxi_const);
+ _min_max_g._maxi->shape({N, H, W, C});
+ _min_max_g._maxi->dtype(loco::DataType::FLOAT32);
+ _min_max_g._maxi->name("maxi");
+
+ // CircleOutput
+ _min_max_g._output = _min_max_g._g.nodes()->create<luci::CircleOutput>();
+ _min_max_g._output->index(graph_output->index());
+ _min_max_g._output->from(_min_max_g._maxi);
+ _min_max_g._output->shape({N, H, W, C});
+ _min_max_g._output->dtype(loco::DataType::FLOAT32);
+ _min_max_g._output->name("output");
+ }
+
+protected:
+ luci::TransformMinMaxToRelu6Pass _pass;
+ MinMaxGraph _min_max_g;
+};
+
+} // namespace
+
+TEST_F(TransformMinMaxToRelu6PassTest, name)
+{
+ auto const name = _pass.name();
+ ASSERT_NE(nullptr, name);
+}
+
+/**
+ * Optimized graph looks like below.
+ *
+ * [CircleInput]
+ * |
+ * [CircleRelu6]
+ * |
+ * [CircleOutput]
+ */
+TEST_F(TransformMinMaxToRelu6PassTest, simple_test)
+{
+ auto ret = _pass.run(&_min_max_g._g);
+ EXPECT_TRUE(ret);
+
+ auto relu6 = dynamic_cast<luci::CircleRelu6 *>(_min_max_g._output->from());
+ EXPECT_NE(nullptr, relu6);
+
+ auto input = dynamic_cast<luci::CircleInput *>(relu6->features());
+ EXPECT_NE(nullptr, input);
+}
+
+TEST_F(TransformMinMaxToRelu6PassTest, wrong_condition_NEG)
+{
+ _min_max_g._maxi_const->at<loco::DataType::FLOAT32>(0) = 2.;
+
+ auto ret = _pass.run(&_min_max_g._g);
+
+ EXPECT_FALSE(ret);
+}
diff --git a/compiler/luci/pass/src/TypeInferencePass.cpp b/compiler/luci/pass/src/TypeInferencePass.cpp
deleted file mode 100644
index 63744045c..000000000
--- a/compiler/luci/pass/src/TypeInferencePass.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "luci/Pass/TypeInferencePass.h"
-
-#include <luci/IR/CircleDialect.h>
-#include <luci/Service/CircleTypeInferenceRule.h>
-
-#include <loco.h>
-#include <loco/IR/CanonicalDialect.h>
-#include <loco/Service/TypeInference.h>
-
-namespace luci
-{
-
-bool TypeInferencePass::run(luci::Module *m)
-{
- bool changed = false;
-
- for (size_t g = 0; g < m->size(); ++g)
- {
- if (run(m->graph(g)))
- changed = true;
- }
-
- return changed;
-}
-
-bool TypeInferencePass::run(loco::Graph *g)
-{
- loco::CanonicalTypeInferenceRule canonical_rule;
- luci::CircleTypeInferenceRule circle_rule;
-
- loco::MultiDialectTypeInferenceRule rules;
-
- rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(luci::CircleDialect::get(), &circle_rule);
-
- return loco::apply(&rules).to(g);
-}
-
-} // namespace luci
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeChannelWiseGranularity.h b/compiler/luci/pass/src/VerifyQuantizedNodeChannelWiseGranularity.h
new file mode 100644
index 000000000..32f0d1a34
--- /dev/null
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeChannelWiseGranularity.h
@@ -0,0 +1,401 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_VERIFY_QUANTIZED_NODE_CHANNELWISE_GRANULARITY_H__
+#define __LUCI_VERIFY_QUANTIZED_NODE_CHANNELWISE_GRANULARITY_H__
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+#include <luci/Pass/QuantizationParameters.h>
+
+using Granularity = luci::QuantizationGranularity;
+
+// This macro is undef at the end of the file
+#define RETURN_FALSE_UNLESS(ARG) \
+ if (not(ARG)) \
+ { \
+ return false; \
+ }
+
+namespace luci
+{
+
+/**
+ * @brief Verify the granularity of a channel-wise quantized node
+ * @details
+ *
+ * Targets to verify
+ * - node's output (i.e., node itself)
+ * - node's inputs
+ */
+struct VerifyQuantizedNodeChannelWiseGranularity final : public luci::CircleNodeVisitor<bool>
+{
+private:
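+  // A node is layer-wise (per-tensor) quantized when it carries exactly one
+  // scale and one zero-point for the whole tensor.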
+ bool is_lwq(const loco::Node *node)
+ {
+ auto circle_node = loco::must_cast<const luci::CircleNode *>(node);
+
+ if (circle_node->quantparam() == nullptr)
+ return false;
+
+ if (circle_node->quantparam()->scale.size() != 1)
+ return false;
+
+ if (circle_node->quantparam()->zerop.size() != 1)
+ return false;
+
+ return true;
+ }
+
+ uint32_t rank(const loco::Node *node)
+ {
+ auto circle_node = loco::must_cast<const luci::CircleNode *>(node);
+ return circle_node->rank();
+ }
+
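+  // A const is channel-wise quantized along channel_dim when it carries one
+  // scale and one zero-point per element of that dimension, e.g. a filter of
+  // shape (32, 3, 3, 16) quantized along dimension 0 needs 32 of each.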
+ bool is_cwq_const(const loco::Node *node, uint32_t channel_dim)
+ {
+ auto circle_node = loco::must_cast<const luci::CircleConst *>(node);
+
+ assert(channel_dim < circle_node->rank()); // FIX_CALLER_UNLESS
+ auto channel_size = circle_node->dim(channel_dim).value();
+
+ if (circle_node->quantparam() == nullptr)
+ return false;
+
+ if (circle_node->quantparam()->quantized_dimension != static_cast<int32_t>(channel_dim))
+ return false;
+
+ if (circle_node->quantparam()->scale.size() != channel_size)
+ return false;
+
+ if (circle_node->quantparam()->zerop.size() != channel_size)
+ return false;
+
+ return true;
+ }
+
+private:
+ bool visit(const luci::CircleConv2D *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->filter(), 0))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->bias(), rank(node->bias()) - 1))
+ return true;
+ }
+
+ bool visit(const luci::CircleConcatenation *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ for (uint32_t i = 0; i < node->numValues(); i++)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node->values(i)));
+ }
+ return true;
+ }
+
+ bool visit(const luci::CircleDepthToSpace *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ return true;
+ }
+
+ bool visit(const luci::CircleDepthwiseConv2D *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->filter(), 3))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->bias(), rank(node->bias()) - 1))
+ return true;
+ }
+
+ bool visit(const luci::CircleInstanceNorm *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->gamma(), rank(node->gamma()) - 1))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->beta(), rank(node->beta()) - 1))
+ return true;
+ }
+
+ bool visit(const luci::CirclePad *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ return true;
+ }
+
+ bool visit(const luci::CirclePRelu *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->alpha(), rank(node->alpha()) - 1))
+ return true;
+ }
+
+ bool visit(const luci::CircleTransposeConv *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->outBackprop()))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->filter(), 0))
+ luci::CircleConst *bias = dynamic_cast<luci::CircleConst *>(node->bias());
+ if (bias != nullptr)
+ RETURN_FALSE_UNLESS(is_cwq_const(node->bias(), rank(node->bias()) - 1))
+
+ return true;
+ }
+
+ bool visit(const luci::CircleFullyConnected *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->weights(), 0))
+ RETURN_FALSE_UNLESS(is_cwq_const(node->bias(), rank(node->bias()) - 1))
+ return true;
+ }
+
+ bool visit(const luci::CircleAdd *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleAveragePool2D *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->value()));
+ return true;
+ }
+
+ bool visit(const luci::CircleLogicalOr *)
+ {
+ // Logical OR has bool-type inputs and output
+ // Nothing to be checked
+ return true;
+ }
+
+ bool visit(const luci::CircleMaxPool2D *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->value()));
+ return true;
+ }
+
+ bool visit(const luci::CircleMean *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleMul *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleNotEqual *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleRelu *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->features()));
+ return true;
+ }
+
+ bool visit(const luci::CircleReshape *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->tensor()));
+ return true;
+ }
+
+ bool visit(const luci::CircleLogistic *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSoftmax *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->logits()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSpaceToBatchND *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSpaceToDepth *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSlice *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSplit *node)
+ {
+ // node's output is the input of CircleSplitOut, thus not quantized
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSplitOut *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ return true;
+ }
+
+ bool visit(const luci::CircleStridedSlice *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleArgMax *node)
+ {
+ // node's output is index, thus not quantized
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleBatchToSpaceND *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleTanh *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleTranspose *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->a()));
+ return true;
+ }
+
+ bool visit(const luci::CircleFloor *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleGreater *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleGreaterEqual *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleDiv *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleFloorDiv *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleRsqrt *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSqrt *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleElu *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->features()));
+ return true;
+ }
+
+ bool visit(const luci::CirclePow *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleResizeBilinear *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ // TODO: Implement more Ops
+
+ bool visit(const luci::CircleNode *) { return true; }
+};
+
+} // namespace luci
+
+#undef RETURN_FALSE_UNLESS
+
+#endif // __LUCI_VERIFY_QUANTIZED_NODE_CHANNELWISE_GRANULARITY_H__
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeLayerWiseGranularity.h b/compiler/luci/pass/src/VerifyQuantizedNodeLayerWiseGranularity.h
new file mode 100644
index 000000000..1e6fd53c0
--- /dev/null
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeLayerWiseGranularity.h
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_VERIFY_QUANTIZED_NODE_LAYERWISE_GRANULARITY_H__
+#define __LUCI_VERIFY_QUANTIZED_NODE_LAYERWISE_GRANULARITY_H__
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+#include <luci/Pass/QuantizationParameters.h>
+
+using Granularity = luci::QuantizationGranularity;
+
+// This macro is undef at the end of the file
+#define RETURN_FALSE_UNLESS(ARG) \
+ if (not(ARG)) \
+ { \
+ return false; \
+ }
+
+namespace luci
+{
+
+/**
+ * @brief Verify the granularity of a layer-wise quantized node
+ * @details
+ *
+ * Targets to verify
+ * - node's output (i.e., node itself)
+ * - node's inputs
+ */
+struct VerifyQuantizedNodeLayerWiseGranularity final : public luci::CircleNodeVisitor<bool>
+{
+private:
+ bool is_lwq(const loco::Node *node)
+ {
+ auto circle_node = loco::must_cast<const luci::CircleNode *>(node);
+
+ if (circle_node->quantparam() == nullptr)
+ return false;
+
+ if (circle_node->quantparam()->scale.size() != 1)
+ return false;
+
+ if (circle_node->quantparam()->zerop.size() != 1)
+ return false;
+
+ return true;
+ }
+
+ bool is_lwq_const(const loco::Node *node)
+ {
+ auto circle_node = loco::must_cast<const luci::CircleConst *>(node);
+
+ if (circle_node->quantparam() == nullptr)
+ return false;
+
+ if (circle_node->quantparam()->scale.size() != 1)
+ return false;
+
+ if (circle_node->quantparam()->zerop.size() != 1)
+ return false;
+
+ return true;
+ }
+
+private:
+ bool visit(const luci::CircleConv2D *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->filter()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->bias()))
+ return true;
+ }
+
+ bool visit(const luci::CircleConcatenation *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ for (uint32_t i = 0; i < node->numValues(); i++)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node->values(i)));
+ }
+ return true;
+ }
+
+ bool visit(const luci::CircleDepthToSpace *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ return true;
+ }
+
+ bool visit(const luci::CircleDepthwiseConv2D *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->filter()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->bias()))
+ return true;
+ }
+
+ bool visit(const luci::CircleInstanceNorm *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->gamma()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->beta()))
+ return true;
+ }
+
+ bool visit(const luci::CirclePad *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ return true;
+ }
+
+ bool visit(const luci::CirclePRelu *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->alpha()))
+ return true;
+ }
+
+ bool visit(const luci::CircleTransposeConv *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->outBackprop()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->filter()))
+ luci::CircleConst *bias = dynamic_cast<luci::CircleConst *>(node->bias());
+ if (bias != nullptr)
+ RETURN_FALSE_UNLESS(is_lwq_const(node->bias()))
+ return true;
+ }
+
+ bool visit(const luci::CircleFullyConnected *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->weights()))
+ RETURN_FALSE_UNLESS(is_lwq_const(node->bias()))
+ return true;
+ }
+
+ bool visit(const luci::CircleAdd *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleAveragePool2D *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->value()));
+ return true;
+ }
+
+ bool visit(const luci::CircleLogicalOr *)
+ {
+ // Logical OR has bool-type inputs and output
+ // Nothing to be checked
+ return true;
+ }
+
+ bool visit(const luci::CircleMaxPool2D *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->value()));
+ return true;
+ }
+
+ bool visit(const luci::CircleMean *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleMul *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleNotEqual *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleRelu *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->features()));
+ return true;
+ }
+
+ bool visit(const luci::CircleReshape *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node))
+ RETURN_FALSE_UNLESS(is_lwq(node->tensor()));
+ return true;
+ }
+
+ bool visit(const luci::CircleLogistic *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSoftmax *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->logits()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSpaceToBatchND *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSpaceToDepth *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSlice *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSplit *node)
+ {
+ // node's output is the input of CircleSplitOut, thus not quantized
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSplitOut *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ return true;
+ }
+
+ bool visit(const luci::CircleStridedSlice *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleArgMax *node)
+ {
+ // node's output is index, thus not quantized
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleBatchToSpaceND *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ bool visit(const luci::CircleTanh *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleTranspose *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->a()));
+ return true;
+ }
+
+ bool visit(const luci::CircleFloor *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleGreater *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleGreaterEqual *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleDiv *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleFloorDiv *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleRsqrt *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleSqrt *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ return true;
+ }
+
+ bool visit(const luci::CircleElu *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->features()));
+ return true;
+ }
+
+ bool visit(const luci::CirclePow *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->x()));
+ RETURN_FALSE_UNLESS(is_lwq(node->y()));
+ return true;
+ }
+
+ bool visit(const luci::CircleResizeBilinear *node)
+ {
+ RETURN_FALSE_UNLESS(is_lwq(node));
+ RETURN_FALSE_UNLESS(is_lwq(node->input()));
+ return true;
+ }
+
+ // TODO: Implement more Ops
+
+ bool visit(const luci::CircleNode *) { return true; }
+};
+
+} // namespace luci
+
+#undef RETURN_FALSE_UNLESS
+
+#endif // __LUCI_VERIFY_QUANTIZED_NODE_LAYERWISE_GRANULARITY_H__
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeS16Type.h b/compiler/luci/pass/src/VerifyQuantizedNodeS16Type.h
new file mode 100644
index 000000000..e05d8325f
--- /dev/null
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeS16Type.h
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_VERIFY_QUANTIZED_NODE_S16_TYPE_H__
+#define __LUCI_VERIFY_QUANTIZED_NODE_S16_TYPE_H__
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+
+using Type = loco::DataType;
+
+// This macro is undef at the end of the file
+#define RETURN_FALSE_UNLESS(ARG) \
+ if (not(ARG)) \
+ { \
+ return false; \
+ }
+
+namespace luci
+{
+
+/**
+ * @brief Verify the data type of INT16 quantized node
+ * @details
+ *
+ * Targets to verify
+ * - node's output (i.e., node itself)
+ * - node's inputs
+ */
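+// NOTE In this INT16 scheme, activations and weights are S16, while the biases
+// of Conv2D/DepthwiseConv2D/FullyConnected/TransposeConv are S64 (matching the
+// wider accumulator) and index-like inputs (perm, paddings, ...) stay S32.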
+struct VerifyQuantizedNodeS16Type final : public luci::CircleNodeVisitor<bool>
+{
+private:
+ bool has_type(const loco::Node *node, Type dtype)
+ {
+ auto circle_node = loco::must_cast<const luci::CircleNode *>(node);
+ return circle_node->dtype() == dtype;
+ }
+
+private:
+ bool visit(const luci::CircleConv2D *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->filter(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->bias(), Type::S64))
+ return true;
+ }
+
+ bool visit(const luci::CircleConcatenation *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ for (uint32_t i = 0; i < node->numValues(); i++)
+ {
+ RETURN_FALSE_UNLESS(has_type(node->values(i), Type::S16))
+ }
+ return true;
+ }
+
+ bool visit(const luci::CircleDepthToSpace *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleDepthwiseConv2D *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->filter(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->bias(), Type::S64))
+ return true;
+ }
+
+ bool visit(const luci::CircleInstanceNorm *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->gamma(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->beta(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CirclePad *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->paddings(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CirclePRelu *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->alpha(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleTransposeConv *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->outBackprop(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->filter(), Type::S16))
+ luci::CircleConst *bias = dynamic_cast<luci::CircleConst *>(node->bias());
+ if (bias != nullptr)
+ RETURN_FALSE_UNLESS(has_type(bias, Type::S64))
+ return true;
+ }
+
+ bool visit(const luci::CircleFullyConnected *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->weights(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->bias(), Type::S64))
+ return true;
+ }
+
+ bool visit(const luci::CircleAdd *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleAveragePool2D *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->value(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleLogicalOr *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::BOOL))
+ return true;
+ }
+
+ bool visit(const luci::CircleMaxPool2D *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->value(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleMean *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->reduction_indices(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleMul *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleNotEqual *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleRelu *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->features(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleReshape *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->tensor(), Type::S16))
+ luci::CircleConst *shape = dynamic_cast<luci::CircleConst *>(node->shape());
+ if (shape != nullptr)
+ RETURN_FALSE_UNLESS(has_type(shape, Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleLogistic *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleSoftmax *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->logits(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleSpaceToBatchND *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleSpaceToDepth *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleSlice *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->begin(), Type::S32) || has_type(node->begin(), Type::S64))
+ RETURN_FALSE_UNLESS(has_type(node->size(), Type::S32) || has_type(node->size(), Type::S64))
+ return true;
+ }
+
+ bool visit(const luci::CircleSplit *node)
+ {
+ // node's output is the input of CircleSplitOut, thus not quantized
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleSplitOut *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleStridedSlice *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleArgMax *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, node->output_type()))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->dimension(), Type::S32) ||
+ has_type(node->dimension(), Type::S64))
+ return true;
+ }
+
+ bool visit(const luci::CircleBatchToSpaceND *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleTanh *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleTranspose *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->a(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->perm(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleFloor *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleGreater *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleGreaterEqual *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleDiv *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleFloorDiv *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleRsqrt *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleSqrt *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleElu *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->features(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CirclePow *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::S16))
+ return true;
+ }
+
+ bool visit(const luci::CircleResizeBilinear *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::S16))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::S16))
+ return true;
+ }
+
+ // TODO: Implement more Ops
+
+ bool visit(const luci::CircleNode *) { return true; }
+};
+
+} // namespace luci
+
+#undef RETURN_FALSE_UNLESS
+
+#endif // __LUCI_VERIFY_QUANTIZED_NODE_S16_TYPE_H__
diff --git a/compiler/luci/pass/src/VerifyQuantizedNodeU8Type.h b/compiler/luci/pass/src/VerifyQuantizedNodeU8Type.h
new file mode 100644
index 000000000..72ce5b8f8
--- /dev/null
+++ b/compiler/luci/pass/src/VerifyQuantizedNodeU8Type.h
@@ -0,0 +1,375 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_VERIFY_QUANTIZED_NODE_U8_TYPE_H__
+#define __LUCI_VERIFY_QUANTIZED_NODE_U8_TYPE_H__
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNodeVisitor.h>
+
+using Type = loco::DataType;
+
+// This macro is undef at the end of the file
+#define RETURN_FALSE_UNLESS(ARG) \
+ if (not(ARG)) \
+ { \
+ return false; \
+ }
+
+namespace luci
+{
+
+/**
+ * @brief Verify the data type of UINT8 quantized node
+ * @details
+ *
+ * Targets to verify
+ * - node's output (i.e., node itself)
+ * - node's inputs
+ */
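+// NOTE In this UINT8 scheme, activations and weights are U8, while biases are
+// S32 and comparison ops (Greater, NotEqual, ...) produce BOOL outputs.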
+struct VerifyQuantizedNodeU8Type final : public luci::CircleNodeVisitor<bool>
+{
+private:
+ bool has_type(const loco::Node *node, Type dtype)
+ {
+ auto circle_node = loco::must_cast<const luci::CircleNode *>(node);
+ return circle_node->dtype() == dtype;
+ }
+
+private:
+ bool visit(const luci::CircleConv2D *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->filter(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->bias(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleConcatenation *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ for (uint32_t i = 0; i < node->numValues(); i++)
+ {
+ RETURN_FALSE_UNLESS(has_type(node->values(i), Type::U8))
+ }
+ return true;
+ }
+
+ bool visit(const luci::CircleDepthToSpace *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleDepthwiseConv2D *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->filter(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->bias(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleInstanceNorm *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->gamma(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->beta(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CirclePad *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->paddings(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CirclePRelu *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->alpha(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleTransposeConv *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->outBackprop(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->filter(), Type::U8))
+ luci::CircleConst *bias = dynamic_cast<luci::CircleConst *>(node->bias());
+ if (bias != nullptr)
+ RETURN_FALSE_UNLESS(has_type(bias, Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleFullyConnected *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->weights(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->bias(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleAdd *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleAveragePool2D *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->value(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleBatchToSpaceND *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleLogicalOr *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::BOOL))
+ return true;
+ }
+
+ bool visit(const luci::CircleMaxPool2D *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->value(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleMean *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->reduction_indices(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleMul *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleNotEqual *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleRelu *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->features(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleReshape *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->tensor(), Type::U8))
+ luci::CircleConst *shape = dynamic_cast<luci::CircleConst *>(node->shape());
+ if (shape != nullptr)
+ RETURN_FALSE_UNLESS(has_type(shape, Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleLogistic *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleSoftmax *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->logits(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleSpaceToBatchND *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleSpaceToDepth *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleSlice *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->begin(), Type::S32) || has_type(node->begin(), Type::S64))
+ RETURN_FALSE_UNLESS(has_type(node->size(), Type::S32) || has_type(node->size(), Type::S64))
+ return true;
+ }
+
+ bool visit(const luci::CircleSplit *node)
+ {
+    // node's output is consumed only through CircleSplitOut nodes, so the split
+    // node itself is not quantized
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleSplitOut *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleStridedSlice *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleArgMax *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, node->output_type()))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->dimension(), Type::S32) ||
+ has_type(node->dimension(), Type::S64))
+ return true;
+ }
+
+ bool visit(const luci::CircleTanh *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleTranspose *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->a(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->perm(), Type::S32))
+ return true;
+ }
+
+ bool visit(const luci::CircleFloor *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleGreater *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleGreaterEqual *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::BOOL))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleDiv *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleFloorDiv *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleRsqrt *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleSqrt *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleElu *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->features(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CirclePow *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->x(), Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->y(), Type::U8))
+ return true;
+ }
+
+ bool visit(const luci::CircleResizeBilinear *node)
+ {
+ RETURN_FALSE_UNLESS(has_type(node, Type::U8))
+ RETURN_FALSE_UNLESS(has_type(node->input(), Type::U8))
+ return true;
+ }
+
+ // TODO: Implement more Ops
+
+ bool visit(const luci::CircleNode *) { return true; }
+};
+
+} // namespace luci
+
+#undef RETURN_FALSE_UNLESS
+
+#endif // __LUCI_VERIFY_QUNTIZED_NODE_U8_TYPE_H__
diff --git a/compiler/luci/pass/src/helpers/InferenceCandidates.cpp b/compiler/luci/pass/src/helpers/InferenceCandidates.cpp
new file mode 100644
index 000000000..2c8565932
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/InferenceCandidates.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "InferenceCandidates.h"
+
+#include <luci/IR/DeadNodeQueryService.h>
+
+#include <algorithm>
+
+namespace luci
+{
+
+std::vector<loco::Node *> inference_candidates(loco::Graph *g)
+{
+ auto candidates = loco::postorder_traversal(loco::output_nodes(g));
+
+ for (auto node : loco::all_nodes(g))
+ {
+    // already included as a candidate
+ if (std::find(candidates.begin(), candidates.end(), node) != candidates.end())
+ continue;
+
+    // Skip dead nodes: a node that is used neither for a graph output nor for a
+    // multiple-output operation cannot be a candidate.
+ if (node->dialect()->service<DeadNodeQueryServiceImpl>()->isDeadNode(node))
+ continue;
+
+ candidates.emplace_back(node);
+ }
+
+ return candidates;
+}
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/helpers/InferenceCandidates.h b/compiler/luci/pass/src/helpers/InferenceCandidates.h
new file mode 100644
index 000000000..f27e4fe60
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/InferenceCandidates.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_INFERENCE_CANDIDATES_H__
+#define __LUCI_INFERENCE_CANDIDATES_H__
+
+#include <loco.h>
+
+#include <vector>
+
+namespace luci
+{
+
+/**
+ * @brief Enumerate all the nodes whose shape/dtype should be inferred to export the graph.
+ */
+std::vector<loco::Node *> inference_candidates(loco::Graph *g);
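+
+// A minimal usage sketch (illustrative; 'infer' is a hypothetical shape/dtype
+// inference routine, not part of this header):
+//
+//   for (auto node : luci::inference_candidates(g))
+//     infer(loco::must_cast<luci::CircleNode *>(node));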
+
+} // namespace luci
+
+#endif // __LUCI_INFERENCE_CANDIDATES_H__
diff --git a/compiler/luci/pass/src/helpers/InferenceCandidates.test.cpp b/compiler/luci/pass/src/helpers/InferenceCandidates.test.cpp
new file mode 100644
index 000000000..e34421f5e
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/InferenceCandidates.test.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "InferenceCandidates.h"
+#include "luci/IR/CircleNode.h"
+
+#include <algorithm>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+bool contains(const std::vector<loco::Node *> &vec, loco::Node *val)
+{
+ return std::any_of(vec.begin(), vec.end(), [val](loco::Node *node) { return node == val; });
+}
+
+} // namespace
+
+TEST(LuciPassHelpersInferenceCandidates, inference_candidates)
+{
+ auto g = loco::make_graph();
+
+ // Create nodes
+ auto input = g->nodes()->create<luci::CircleInput>();
+ auto split = g->nodes()->create<luci::CircleSplit>();
+ auto split_out1 = g->nodes()->create<luci::CircleSplitOut>();
+ auto split_out2 = g->nodes()->create<luci::CircleSplitOut>();
+ auto split_dim = g->nodes()->create<luci::CircleConst>();
+ auto output = g->nodes()->create<luci::CircleOutput>();
+
+ // Build up initial graph
+ auto graph_input1 = g->inputs()->create();
+ input->index(graph_input1->index());
+
+ split->split_dim(split_dim);
+ split->input(input);
+ split->num_split(2);
+
+ split_out1->input(split);
+ split_out1->index(0);
+
+ split_out2->input(split);
+ split_out2->index(1);
+
+ auto graph_output = g->outputs()->create();
+ output->from(split_out1);
+ output->index(graph_output->index());
+
+ auto s = luci::inference_candidates(g.get());
+
+ ASSERT_EQ(6, s.size());
+ ASSERT_TRUE(contains(s, input));
+ ASSERT_TRUE(contains(s, split));
+ ASSERT_TRUE(contains(s, split_out1));
+ ASSERT_TRUE(contains(s, split_out2));
+ ASSERT_TRUE(contains(s, split_dim));
+ ASSERT_TRUE(contains(s, output));
+}
+
+TEST(LuciPassHelpersInferenceCandidates, inference_candidates_NEG)
+{
+ auto g = loco::make_graph();
+
+ // Create nodes
+ auto input = g->nodes()->create<luci::CircleInput>();
+ auto split = g->nodes()->create<luci::CircleSplit>();
+ auto split_out1 = g->nodes()->create<luci::CircleSplitOut>();
+ auto split_out2 = g->nodes()->create<luci::CircleSplitOut>();
+ auto split_dim = g->nodes()->create<luci::CircleConst>();
+ auto relu1 = g->nodes()->create<luci::CircleRelu>();
+ auto relu2 = g->nodes()->create<luci::CircleRelu>();
+ auto output = g->nodes()->create<luci::CircleOutput>();
+
+ // Build up initial graph
+ auto graph_input1 = g->inputs()->create();
+ input->index(graph_input1->index());
+
+ split->split_dim(split_dim);
+ split->input(input);
+ split->num_split(2);
+
+ split_out1->input(split);
+ split_out1->index(0);
+
+ split_out2->input(split);
+ split_out2->index(1);
+
+ relu1->features(split_out2);
+
+ relu2->features(input);
+
+ auto graph_output = g->outputs()->create();
+ output->from(split_out1);
+ output->index(graph_output->index());
+
+ auto s = luci::inference_candidates(g.get());
+
+ ASSERT_EQ(6, s.size());
+ ASSERT_TRUE(contains(s, input));
+ ASSERT_TRUE(contains(s, split));
+ ASSERT_TRUE(contains(s, split_out1));
+ ASSERT_TRUE(contains(s, split_out2));
+ ASSERT_TRUE(contains(s, split_dim));
+ ASSERT_TRUE(contains(s, output));
+ ASSERT_FALSE(contains(s, relu1));
+ ASSERT_FALSE(contains(s, relu2));
+}
diff --git a/compiler/luci/pass/src/helpers/NodeFiller.cpp b/compiler/luci/pass/src/helpers/NodeFiller.cpp
new file mode 100644
index 000000000..b1416655d
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/NodeFiller.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NodeFiller.h"
+
+// NOTE Do NOT delete this file; it forces the compiler to check that 'NodeFiller.h' is
+// self-contained.
diff --git a/compiler/luci/pass/src/helpers/NodeFiller.h b/compiler/luci/pass/src/helpers/NodeFiller.h
new file mode 100644
index 000000000..b80f085b0
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/NodeFiller.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_HELPERS_NODE_FILLER_H__
+#define __LUCI_PASS_HELPERS_NODE_FILLER_H__
+
+namespace luci
+{
+
+/**
+ * INTRODUCTION
+ * Binary operation f(x,y) is 'commutative' when
+ * f(x,y) == f(y,x) holds for all x, y.
+ * For example, ADD, MUL and SQUARED_DIFFERENCE are commutative.
+ * These helpers make it easy to find the commutative arguments of a commutative node.
+ *
+ * HOW TO USE
+ * COMM_NODE *node;
+ * ARG_TYPE_1 *arg1;
+ * ARG_TYPE_2 *arg2;
+ *
+ * bool ok = fill(&arg1, &arg2).with_commutative_args_of(node);
+ *
+ * Result
+ * If 'node's commutative argument types are actually {ARG_TYPE_1, ARG_TYPE_2}
+ * (as a set), 'arg1' and 'arg2' are set to 'node's actual arguments with the
+ * matching types, and the return value 'ok' is true.
+ * Otherwise, 'arg1' and 'arg2' are left unchanged and 'ok' is false.
+ */
+
+template <class ARG_TYPE_1, class ARG_TYPE_2> class NodeFiller final
+{
+public:
+ NodeFiller(ARG_TYPE_1 **arg_1, ARG_TYPE_2 **arg_2) : _arg_1(arg_1), _arg_2(arg_2)
+ {
+ // DO NOTHING
+ }
+
+ /**
+ * @return true  When 'node's argument types are 'ARG_TYPE_1' and 'ARG_TYPE_2'.
+ *               In that case, it assigns the actual arguments to '_arg_1' and '_arg_2'.
+ *
+ * @return false When 'node's argument types do NOT match 'ARG_TYPE_*'.
+ *               In that case, it does not modify '_arg_1' and '_arg_2'.
+ *
+ * @require COMM_NODE has member functions x() and y()
+ */
+ template <class COMM_NODE> bool with_commutative_args_of(const COMM_NODE *node);
+
+private:
+ ARG_TYPE_1 **_arg_1;
+ ARG_TYPE_2 **_arg_2;
+};
+
+template <class ARG_TYPE_1, class ARG_TYPE_2>
+inline NodeFiller<ARG_TYPE_1, ARG_TYPE_2> fill(ARG_TYPE_1 **arg_1, ARG_TYPE_2 **arg_2)
+{
+ return NodeFiller<ARG_TYPE_1, ARG_TYPE_2>{arg_1, arg_2};
+}
+
+template <class ARG_TYPE_1, class ARG_TYPE_2>
+template <class COMM_NODE>
+bool NodeFiller<ARG_TYPE_1, ARG_TYPE_2>::with_commutative_args_of(const COMM_NODE *node)
+{
+ // Case 1) X == ARG_TYPE_1 / Y == ARG_TYPE_2
+ {
+ auto x = dynamic_cast<ARG_TYPE_1 *>(node->x());
+ auto y = dynamic_cast<ARG_TYPE_2 *>(node->y());
+
+ if (x && y)
+ {
+ *_arg_1 = x;
+ *_arg_2 = y;
+ return true;
+ }
+ }
+
+ // Case 2) X == ARG_TYPE_2 / Y == ARG_TYPE_1
+ {
+ auto x = dynamic_cast<ARG_TYPE_2 *>(node->x());
+ auto y = dynamic_cast<ARG_TYPE_1 *>(node->y());
+
+ if (x && y)
+ {
+ *_arg_1 = y;
+ *_arg_2 = x;
+ return true;
+ }
+ }
+
+ return false;
+}
+
+} // namespace luci
+
+#endif // __LUCI_PASS_HELPERS_NODE_FILLER_H__
diff --git a/compiler/luci/pass/src/helpers/NodeFiller.test.cpp b/compiler/luci/pass/src/helpers/NodeFiller.test.cpp
new file mode 100644
index 000000000..9bbc7f264
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/NodeFiller.test.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+#include "NodeFiller.h"
+
+TEST(NodeFillerTest, simple_test)
+{
+ luci::CircleConst maxi_const;
+ luci::CircleMinimum mini;
+ luci::CircleMaximum maxi;
+ maxi.x(&maxi_const);
+ maxi.y(&mini);
+
+ luci::CircleConst *x = nullptr;
+ luci::CircleMinimum *y = nullptr;
+
+ EXPECT_TRUE(luci::fill(&x, &y).with_commutative_args_of(&maxi));
+ EXPECT_TRUE(x == &maxi_const);
+ EXPECT_TRUE(y == &mini);
+
+ x = nullptr;
+ y = nullptr;
+
+ EXPECT_TRUE(luci::fill(&y, &x).with_commutative_args_of(&maxi));
+ EXPECT_TRUE(x == &maxi_const);
+ EXPECT_TRUE(y == &mini);
+}
+
+TEST(NodeFillerTest, wrong_condition_NEG)
+{
+ luci::CircleConst add_const;
+ luci::CircleMinimum mini;
+ luci::CircleAdd add;
+ add.x(&add_const);
+ add.y(&mini);
+
+ luci::CircleMul *x = nullptr;
+ luci::CircleMinimum *y = nullptr;
+
+ EXPECT_FALSE(luci::fill(&x, &y).with_commutative_args_of(&add));
+ EXPECT_FALSE(luci::fill(&y, &x).with_commutative_args_of(&add));
+}
diff --git a/compiler/luci/pass/src/helpers/Strings.cpp b/compiler/luci/pass/src/helpers/Strings.cpp
new file mode 100644
index 000000000..d020f6ddc
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/Strings.cpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Strings.h"
+
+#include <algorithm>
+#include <cassert>
+#include <stdexcept>
+
+namespace luci
+{
+
+bool in_array(const std::string &str, const std::vector<std::string> &array)
+{
+ return std::find(array.begin(), array.end(), str) != array.end();
+}
+
+std::string to_string(const std::vector<std::string> &strings)
+{
+ assert(!strings.empty());
+
+ std::string res;
+ for (unsigned int i = 0; i < strings.size() - 1; i++)
+ res += strings[i] + ", ";
+
+ res += strings[strings.size() - 1];
+ return res;
+}
+
+std::string to_lower_case(std::string s)
+{
+ std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return std::tolower(c); });
+ return s;
+}
+
+loco::DataType str_to_dtype(const std::string &str)
+{
+  // Lower-case the input once instead of once per comparison
+  const auto lower = to_lower_case(str);
+
+  if (lower == "uint8")
+    return loco::DataType::U8;
+  if (lower == "uint16")
+    return loco::DataType::U16;
+  if (lower == "uint32")
+    return loco::DataType::U32;
+  if (lower == "uint64")
+    return loco::DataType::U64;
+
+  if (lower == "int8")
+    return loco::DataType::S8;
+  if (lower == "int16")
+    return loco::DataType::S16;
+  if (lower == "int32")
+    return loco::DataType::S32;
+  if (lower == "int64")
+    return loco::DataType::S64;
+
+  if (lower == "float16")
+    return loco::DataType::FLOAT16;
+  if (lower == "float32")
+    return loco::DataType::FLOAT32;
+  if (lower == "float64")
+    return loco::DataType::FLOAT64;
+
+  if (lower == "bool")
+    return loco::DataType::BOOL;
+
+  return loco::DataType::Unknown;
+}
+
+QuantizationGranularity str_to_granularity(const std::string &str)
+{
+  const auto lower = to_lower_case(str);
+
+  if (lower == "layer")
+    return QuantizationGranularity::LayerWise;
+
+  if (lower == "channel")
+    return QuantizationGranularity::ChannelWise;
+
+  throw std::runtime_error("Quantization granularity must be either 'layer' or 'channel'");
+}
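+
+// e.g. str_to_dtype("uint8") returns loco::DataType::U8 and
+// str_to_granularity("channel") returns QuantizationGranularity::ChannelWise
+// (see Strings.test.cpp)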
+
+} // namespace luci
diff --git a/compiler/luci/pass/src/helpers/Strings.h b/compiler/luci/pass/src/helpers/Strings.h
new file mode 100644
index 000000000..793d137fb
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/Strings.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_HELPERS_STRINGS_H__
+#define __LUCI_PASS_HELPERS_STRINGS_H__
+
+#include "luci/Pass/QuantizationParameters.h"
+
+#include <loco.h>
+
+#include <cassert>
+#include <sstream>
+#include <string>
+#include <vector>
+
+namespace luci
+{
+
+bool in_array(const std::string &, const std::vector<std::string> &);
+
+std::string to_string(const std::vector<std::string> &);
+
+std::string to_lower_case(std::string);
+
+loco::DataType str_to_dtype(const std::string &);
+
+QuantizationGranularity str_to_granularity(const std::string &);
+
+template <typename T> std::vector<T> csv_to_vector(const std::string &str)
+{
+ std::vector<T> ret;
+ std::istringstream is(str);
+ for (T i; is >> i;)
+ {
+ assert(i != ',');
+ ret.push_back(i);
+ if (is.peek() == ',')
+ is.ignore();
+ }
+ return ret;
+}
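+
+// e.g. csv_to_vector<int32_t>("1,2,3") returns {1, 2, 3}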
+
+} // namespace luci
+
+#endif // __LUCI_PASS_HELPERS_STRINGS_H__
diff --git a/compiler/luci/pass/src/helpers/Strings.test.cpp b/compiler/luci/pass/src/helpers/Strings.test.cpp
new file mode 100644
index 000000000..f6bb48951
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/Strings.test.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Strings.h"
+
+#include "luci/Pass/QuantizationParameters.h"
+
+#include <gtest/gtest.h>
+
+TEST(StringsTest, str_to_dtype)
+{
+ ASSERT_EQ(loco::DataType::U8, luci::str_to_dtype("uint8"));
+ ASSERT_EQ(loco::DataType::U16, luci::str_to_dtype("uint16"));
+ ASSERT_EQ(loco::DataType::U32, luci::str_to_dtype("uint32"));
+ ASSERT_EQ(loco::DataType::U64, luci::str_to_dtype("uint64"));
+
+ ASSERT_EQ(loco::DataType::S8, luci::str_to_dtype("int8"));
+ ASSERT_EQ(loco::DataType::S16, luci::str_to_dtype("int16"));
+ ASSERT_EQ(loco::DataType::S32, luci::str_to_dtype("int32"));
+ ASSERT_EQ(loco::DataType::S64, luci::str_to_dtype("int64"));
+
+ ASSERT_EQ(loco::DataType::FLOAT16, luci::str_to_dtype("float16"));
+ ASSERT_EQ(loco::DataType::FLOAT32, luci::str_to_dtype("float32"));
+ ASSERT_EQ(loco::DataType::FLOAT64, luci::str_to_dtype("float64"));
+
+ ASSERT_EQ(loco::DataType::BOOL, luci::str_to_dtype("bool"));
+
+ ASSERT_EQ(loco::DataType::Unknown, luci::str_to_dtype("foo"));
+}
+
+TEST(StringsTest, str_to_granularity)
+{
+ ASSERT_EQ(luci::QuantizationGranularity::LayerWise, luci::str_to_granularity("layer"));
+ ASSERT_EQ(luci::QuantizationGranularity::ChannelWise, luci::str_to_granularity("channel"));
+
+ EXPECT_THROW(luci::str_to_granularity("foo"), std::runtime_error);
+}
+
+TEST(StringsTest, csv_to_vector_int32)
+{
+ auto ret = luci::csv_to_vector<int32_t>("1,2,3");
+ ASSERT_EQ(3, ret.size());
+ ASSERT_EQ(1, ret.at(0));
+ ASSERT_EQ(3, ret.at(2));
+}
diff --git a/compiler/luci/pass/src/helpers/TypeMapper.cpp b/compiler/luci/pass/src/helpers/TypeMapper.cpp
new file mode 100644
index 000000000..ffa0159dd
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/TypeMapper.cpp
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TypeMapper.h"
+
+// NOTE Do NOT delete this file; it forces the compiler to check that 'TypeMapper.h' is
+// self-contained.
diff --git a/compiler/luci/pass/src/helpers/TypeMapper.h b/compiler/luci/pass/src/helpers/TypeMapper.h
new file mode 100644
index 000000000..90760e95b
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/TypeMapper.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_HELPERS_TYPE_MAPPER_H__
+#define __LUCI_PASS_HELPERS_TYPE_MAPPER_H__
+
+#include <loco/IR/DataType.h>
+
+#include <cstdint>
+
+namespace luci
+{
+
+/**
+ * @brief TypeMapper maps between c++ primitive data type and loco::DataType.
+ */
+template <typename T> struct TypeMapper
+{
+ static constexpr loco::DataType get() { return loco::DataType::Unknown; }
+};
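+
+// e.g. TypeMapper<float>::get() == loco::DataType::FLOAT32 and
+// TypeMapper<int32_t>::get() == loco::DataType::S32; any type without a
+// specialization below falls back to loco::DataType::Unknown.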
+
+template <> struct TypeMapper<float>
+{
+ static constexpr loco::DataType get() { return loco::DataType::FLOAT32; }
+};
+
+template <> struct TypeMapper<uint8_t>
+{
+ static constexpr loco::DataType get() { return loco::DataType::U8; }
+};
+
+template <> struct TypeMapper<uint16_t>
+{
+ static constexpr loco::DataType get() { return loco::DataType::U16; }
+};
+
+template <> struct TypeMapper<uint32_t>
+{
+ static constexpr loco::DataType get() { return loco::DataType::U32; }
+};
+
+template <> struct TypeMapper<uint64_t>
+{
+ static constexpr loco::DataType get() { return loco::DataType::U64; }
+};
+
+template <> struct TypeMapper<int8_t>
+{
+ static constexpr loco::DataType get() { return loco::DataType::S8; }
+};
+
+template <> struct TypeMapper<int16_t>
+{
+ static constexpr loco::DataType get() { return loco::DataType::S16; }
+};
+
+template <> struct TypeMapper<int32_t>
+{
+ static constexpr loco::DataType get() { return loco::DataType::S32; }
+};
+
+template <> struct TypeMapper<int64_t>
+{
+ static constexpr loco::DataType get() { return loco::DataType::S64; }
+};
+
+} // namespace luci
+
+#endif // __LUCI_PASS_HELPERS_TYPE_MAPPER_H__
diff --git a/compiler/luci/pass/src/helpers/TypeMapper.test.cpp b/compiler/luci/pass/src/helpers/TypeMapper.test.cpp
new file mode 100644
index 000000000..a7ac08a63
--- /dev/null
+++ b/compiler/luci/pass/src/helpers/TypeMapper.test.cpp
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+#include "TypeMapper.h"
+
+#include <vector>
+
+namespace
+{
+
+template <typename T> bool fill_const_node(luci::CircleConst *node, std::vector<T> &data)
+{
+ if (node->dtype() != luci::TypeMapper<T>::get())
+ return false;
+
+ node->size<luci::TypeMapper<T>::get()>(data.size());
+ for (uint32_t i = 0; i < data.size(); i++)
+ {
+ node->at<luci::TypeMapper<T>::get()>(i) = data.at(i);
+ }
+
+ return true;
+}
+
+class STRANGER
+{
+};
+
+} // namespace
+
+TEST(TypeMapperTest, simple_test)
+{
+ EXPECT_EQ(loco::DataType::FLOAT32, luci::TypeMapper<float>::get());
+ EXPECT_EQ(loco::DataType::U8, luci::TypeMapper<uint8_t>::get());
+ EXPECT_EQ(loco::DataType::U16, luci::TypeMapper<uint16_t>::get());
+ EXPECT_EQ(loco::DataType::U32, luci::TypeMapper<uint32_t>::get());
+ EXPECT_EQ(loco::DataType::U64, luci::TypeMapper<uint64_t>::get());
+ EXPECT_EQ(loco::DataType::S8, luci::TypeMapper<int8_t>::get());
+ EXPECT_EQ(loco::DataType::S16, luci::TypeMapper<int16_t>::get());
+ EXPECT_EQ(loco::DataType::S32, luci::TypeMapper<int32_t>::get());
+ EXPECT_EQ(loco::DataType::S64, luci::TypeMapper<int64_t>::get());
+}
+
+TEST(TypeMapperTest, with_template_test)
+{
+ std::vector<int32_t> int32_vec{0, 1, 2, 3, 4, 5, 6, 7};
+ luci::CircleConst const_node;
+ const_node.dtype(loco::DataType::S32);
+ EXPECT_TRUE(fill_const_node(&const_node, int32_vec));
+ EXPECT_EQ(8, const_node.size<loco::DataType::S32>());
+ EXPECT_EQ(0, const_node.at<loco::DataType::S32>(0));
+ EXPECT_EQ(1, const_node.at<loco::DataType::S32>(1));
+ EXPECT_EQ(2, const_node.at<loco::DataType::S32>(2));
+ EXPECT_EQ(3, const_node.at<loco::DataType::S32>(3));
+ EXPECT_EQ(4, const_node.at<loco::DataType::S32>(4));
+ EXPECT_EQ(5, const_node.at<loco::DataType::S32>(5));
+ EXPECT_EQ(6, const_node.at<loco::DataType::S32>(6));
+ EXPECT_EQ(7, const_node.at<loco::DataType::S32>(7));
+
+ std::vector<float> f32_vec{0.0, 1.1, 2.2, 3.3, 4.4, 5.5};
+ const_node.dtype(loco::DataType::FLOAT32);
+ EXPECT_FALSE(fill_const_node(&const_node, int32_vec));
+ EXPECT_TRUE(fill_const_node(&const_node, f32_vec));
+ EXPECT_EQ(6, const_node.size<loco::DataType::FLOAT32>());
+ EXPECT_FLOAT_EQ(0.0, const_node.at<loco::DataType::FLOAT32>(0));
+ EXPECT_FLOAT_EQ(1.1, const_node.at<loco::DataType::FLOAT32>(1));
+ EXPECT_FLOAT_EQ(2.2, const_node.at<loco::DataType::FLOAT32>(2));
+ EXPECT_FLOAT_EQ(3.3, const_node.at<loco::DataType::FLOAT32>(3));
+ EXPECT_FLOAT_EQ(4.4, const_node.at<loco::DataType::FLOAT32>(4));
+ EXPECT_FLOAT_EQ(5.5, const_node.at<loco::DataType::FLOAT32>(5));
+}
+
+TEST(TypeMapperTest, wrong_condition_NEG)
+{
+ EXPECT_EQ(loco::DataType::Unknown, luci::TypeMapper<STRANGER>::get());
+}
diff --git a/compiler/luci/pass/src/test/TestFirstNode.h b/compiler/luci/pass/src/test/TestFirstNode.h
new file mode 100644
index 000000000..21f859fcd
--- /dev/null
+++ b/compiler/luci/pass/src/test/TestFirstNode.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_TEST_FIRST_NODE_H__
+#define __LUCI_PASS_TEST_FIRST_NODE_H__
+
+#include <luci/IR/CircleNodes.h>
+
+#include <loco.h>
+
+namespace luci
+{
+namespace test
+{
+
+template <class T> T *first_node(loco::Graph *g)
+{
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ auto target_node = dynamic_cast<T *>(node);
+ if (target_node != nullptr)
+ return target_node;
+ }
+ return nullptr;
+}
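+
+// e.g. auto *add = luci::test::first_node<luci::CircleAdd>(g);
+// 'first_node' returns nullptr when no active node has the requested type.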
+
+} // namespace test
+} // namespace luci
+
+#endif // __LUCI_PASS_TEST_FIRST_NODE_H__
diff --git a/compiler/luci/pass/src/test/TestFirstNode.test.cpp b/compiler/luci/pass/src/test/TestFirstNode.test.cpp
new file mode 100644
index 000000000..b07ac6199
--- /dev/null
+++ b/compiler/luci/pass/src/test/TestFirstNode.test.cpp
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TestFirstNode.h"
+
+// This file validates "TestFirstNode.h". Please DO NOT remove this file.
diff --git a/compiler/luci/pass/src/test/TestIOGraph.h b/compiler/luci/pass/src/test/TestIOGraph.h
new file mode 100644
index 000000000..b1fc41f90
--- /dev/null
+++ b/compiler/luci/pass/src/test/TestIOGraph.h
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PASS_TEST_IO_GRAPH_H__
+#define __LUCI_PASS_TEST_IO_GRAPH_H__
+
+#include "TestShape.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <array>
+#include <memory>
+
+namespace luci
+{
+namespace test
+{
+
+/**
+ * @brief Graphlet with Inputs, plus the loco::Graph, for multiple inputs
+ * @note Every Graph will have Input(s) and Output(s).
+ *       We put the loco::Graph only in TestIsGraphlet to avoid declaring a
+ *       separate class for loco::Graph.
+ */
+template <unsigned N> class TestIsGraphlet
+{
+public:
+ TestIsGraphlet()
+ {
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _graph_inputs[n] = nullptr;
+ _inputs[n] = nullptr;
+ }
+ }
+
+public:
+ virtual void init(loco::Graph *g, const ShapeU32 shape_in)
+ {
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _graph_inputs[n] = g->inputs()->create();
+
+ _inputs[n] = g->nodes()->create<luci::CircleInput>();
+ _inputs[n]->shape(shape_in);
+ _inputs[n]->shape_status(luci::ShapeStatus::VALID);
+ _inputs[n]->dtype(loco::DataType::FLOAT32);
+ _inputs[n]->name("input_" + std::to_string(n));
+
+ _inputs[n]->index(_graph_inputs[n]->index());
+
+ auto input_shape = std::make_unique<loco::TensorShape>();
+ set_shape_vector(input_shape.get(), shape_in);
+ _graph_inputs[n]->shape(std::move(input_shape));
+ _graph_inputs[n]->dtype(loco::DataType::FLOAT32);
+ }
+ }
+
+public:
+ loco::Graph *g(void) { return &_g; }
+ luci::CircleInput *input(int idx) { return _inputs[idx]; }
+
+protected:
+ loco::Graph _g;
+ std::array<loco::GraphInput *, N> _graph_inputs;
+ std::array<luci::CircleInput *, N> _inputs;
+};
+
+/**
+ * @brief Graphlet with one Input
+ */
+class TestIGraphlet : public TestIsGraphlet<1>
+{
+public:
+ luci::CircleInput *input() { return _inputs[0]; }
+};
+
+/**
+ * @brief Graphlet with Outputs for multiple outputs
+ */
+template <unsigned N> class TestOsGraphlet
+{
+public:
+ TestOsGraphlet()
+ {
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _graph_outputs[n] = nullptr;
+ _outputs[n] = nullptr;
+ }
+ }
+
+public:
+ virtual void init(loco::Graph *g, const ShapeU32 shape_out)
+ {
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _graph_outputs[n] = g->outputs()->create();
+
+ _outputs[n] = g->nodes()->create<luci::CircleOutput>();
+ _outputs[n]->shape(shape_out);
+ _outputs[n]->shape_status(luci::ShapeStatus::VALID);
+ _outputs[n]->dtype(loco::DataType::FLOAT32);
+ _outputs[n]->name("output_" + std::to_string(n));
+
+ _outputs[n]->index(_graph_outputs[n]->index());
+
+ auto output_shape = std::make_unique<loco::TensorShape>();
+ set_shape_vector(output_shape.get(), shape_out);
+ _graph_outputs[n]->shape(std::move(output_shape));
+ _graph_outputs[n]->dtype(loco::DataType::FLOAT32);
+ }
+ }
+
+public:
+ luci::CircleOutput *output(int idx) { return _outputs[idx]; }
+
+protected:
+ std::array<loco::GraphOutput *, N> _graph_outputs;
+ std::array<luci::CircleOutput *, N> _outputs;
+};
+
+/**
+ * @brief Graphlet with one Output
+ */
+class TestOGraphlet : public TestOsGraphlet<1>
+{
+public:
+ luci::CircleOutput *output() { return _outputs[0]; }
+};
+
+/**
+ * @brief Graph with Input and Output
+ */
+class TestIOGraph : public TestIGraphlet, public TestOGraphlet
+{
+public:
+ TestIOGraph() = default;
+
+public:
+ virtual void init(const ShapeU32 shape_in, const ShapeU32 shape_out)
+ {
+ TestIsGraphlet<1>::init(g(), shape_in);
+ TestOsGraphlet<1>::init(g(), shape_out);
+ }
+};
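+
+// Usage sketch (hypothetical fixture): derive from TestIOGraph, call
+// init({1, 4}, {1, 4}) with the desired shapes, then wire test nodes
+// between input() and output()->from(...).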
+
+} // namespace test
+} // namespace luci
+
+#endif // __LUCI_PASS_TEST_IO_GRAPH_H__
diff --git a/compiler/luci/pass/src/test/TestIOGraph.test.cpp b/compiler/luci/pass/src/test/TestIOGraph.test.cpp
new file mode 100644
index 000000000..e58a13f2b
--- /dev/null
+++ b/compiler/luci/pass/src/test/TestIOGraph.test.cpp
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TestIOGraph.h"
+
+// This file validates "TestIOGraph.h". Please DO NOT remove this file.
diff --git a/compiler/luci/export/src/TypeBridge.h b/compiler/luci/pass/src/test/TestShape.h
index a63fbce54..ccc55c9da 100644
--- a/compiler/luci/export/src/TypeBridge.h
+++ b/compiler/luci/pass/src/test/TestShape.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,31 +14,27 @@
* limitations under the License.
*/
-#ifndef __TYPE_BRIDGE_H__
-#define __TYPE_BRIDGE_H__
+#ifndef __LUCI_PASS_TEST_SHAPE_H__
+#define __LUCI_PASS_TEST_SHAPE_H__
#include <luci/IR/CircleNode.h>
-#include <loco.h>
+#include <initializer_list>
namespace luci
{
+namespace test
+{
-/**
- * @brief node_shape() will return loco::TensorShape of CircleNode
- */
-loco::TensorShape node_shape(CircleNode *node);
+using ShapeU32 = std::initializer_list<uint32_t>;
+using ShapeI32 = std::initializer_list<int32_t>;
-/**
- * @brief node_dtype() will return loco::DataType of CircleNode
- */
-loco::DataType node_dtype(CircleNode *node);
+void set_shape_vector(loco::TensorShape *shape, const ShapeU32 &values);
+void set_shape_vector(luci::CircleConst *const_node, const ShapeI32 &values);
-/**
- * @brief copy_shape_dtype() will copy shape and dtype inference data to CircleNode
- */
-void copy_shape_dtype(loco::Graph *graph);
+uint32_t num_elements(const ShapeU32 shape);
+} // namespace test
} // namespace luci
-#endif // __TYPE_BRIDGE_H__
+#endif // __LUCI_PASS_TEST_SHAPE_H__
diff --git a/compiler/luci/pass/src/test/TestShape.test.cpp b/compiler/luci/pass/src/test/TestShape.test.cpp
new file mode 100644
index 000000000..39790c614
--- /dev/null
+++ b/compiler/luci/pass/src/test/TestShape.test.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "TestShape.h"
+
+/**
+ * @note This file does not hold any test cases but provides methods for tests
+ */
+
+namespace luci
+{
+namespace test
+{
+
+void set_shape_vector(loco::TensorShape *shape, const ShapeU32 &values)
+{
+ uint32_t r = 0;
+ shape->rank(values.size());
+ for (auto v : values)
+ shape->dim(r++).set(v);
+}
+
+void set_shape_vector(luci::CircleConst *const_node, const ShapeI32 &values)
+{
+ const_node->rank(1);
+ const_node->dim(0).set(values.size());
+ const_node->shape_status(luci::ShapeStatus::VALID);
+ const_node->dtype(loco::DataType::S32);
+ const_node->size<loco::DataType::S32>(values.size());
+ uint32_t idx = 0;
+ for (auto val : values)
+ const_node->at<loco::DataType::S32>(idx++) = val;
+}
+
+uint32_t num_elements(const ShapeU32 shape)
+{
+ uint32_t result = 1;
+ for (auto val : shape)
+ result = result * val;
+ return result;
+}
+
+} // namespace test
+} // namespace luci
diff --git a/compiler/luci/profile/CMakeLists.txt b/compiler/luci/profile/CMakeLists.txt
new file mode 100644
index 000000000..f2c6665da
--- /dev/null
+++ b/compiler/luci/profile/CMakeLists.txt
@@ -0,0 +1,22 @@
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+list(REMOVE_ITEM SOURCES ${TESTS})
+
+add_library(luci_profile SHARED ${SOURCES})
+target_include_directories(luci_profile PRIVATE src)
+target_include_directories(luci_profile PUBLIC include)
+target_link_libraries(luci_profile PUBLIC loco)
+target_link_libraries(luci_profile PUBLIC luci_lang)
+
+install(TARGETS luci_profile DESTINATION lib)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(luci_profile_test ${TESTS})
+target_include_directories(luci_profile_test PRIVATE src)
+target_link_libraries(luci_profile_test luci_lang)
+target_link_libraries(luci_profile_test luci_profile)
diff --git a/compiler/luci/profile/README.md b/compiler/luci/profile/README.md
new file mode 100644
index 000000000..577e60a7c
--- /dev/null
+++ b/compiler/luci/profile/README.md
@@ -0,0 +1,119 @@
+# luci-profile
+
+`luci-profile` provides profiling-related items.
+
+## CircleNodeOrigin
+
+`CircleNodeOrigin` lets us know where a node originated from.
+
+Let's assume the following graph transformations are done.
+
+```
+ | | |
+ [node1] --------+ | |
+(id = 1) | | |
+ | +--------> [node5] ----------------> [node6]
+ | | (origin = [1,2]) (origin = [1,2])
+ [node2] --------+ | |
+(id = 2) | |
+ | | |
+ [node3] -----------------> [node3] --------+-------> [node3]
+(id = 3) (origin = [3]) | (origin = [3,4])
+ | | | |
+ [node4] -----------------> [node4] --------+ |
+(id = 4) (origin = [4]) |
+ | | |
+
+<circle1> -- optimizer --> <circle2> -- quantizer --> <circle3>
+```
+
+The most important purpose of using `CircleNodeOrigin` is preserving origin information.
+The following changes show how origin information is preserved even after the graph is transformed.
+
+- `node3`
+  - `node4` is absorbed into the **existing** `node3`.
+  - The origin of `node4` is absorbed into the origin of `node3`.
+- `node5`
+  - `node1` and `node2` are fused into the **newly created** `node5`.
+  - The origins of `node1` and `node2` are inherited by `node5`.
+- `node6`
+  - `node5` is **replaced with the newly created** `node6`.
+  - The origin of `node5` is copied to the origin of `node6`.
+
+**Therefore, when using `CircleNodeOrigin`, please be aware of the most important principle: "Preserve origin information."**
+
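+A minimal sketch of how a pass is expected to follow this principle, using the
+`get_origin`/`composite_origin`/`add_origin` API introduced in this commit
+(`fused`, `node1`, and `node2` stand for hypothetical `luci::CircleNode *`s):
+
+```cpp
+// 'fused' is newly created from 'node1' and 'node2'
+luci::add_origin(fused,
+                 luci::composite_origin({luci::get_origin(node1), luci::get_origin(node2)}));
+```
+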
+The next sections describe the implementation details for storing the origin information.
+
+### Source Table
+
+The source table holds the id and name of each origin node.
+
+#### Binary format
+
+```
+[ entry_number : uint32_t ]
+[ id : uint32_t ][ length : uint32_t ][ data : char * length ] * entry_number
+```
+- entry_number : The number of entries
+ - Each entry consists of id, length, and data.
+- id : ID of origin node
+- length : Length of data
+- data : Name of origin node **(null-terminated string)**
+
+#### In-memory format
+```cpp
+// size = entry_number
+std::map<uint32_t /* id */, std::string /* name */>
+```
+
+#### Example
+
+The following example means "the name of origin 1 is node1".
+
+```
+[Binary Format]
+ 0x01 00 00 00 0x01 00 00 00 0x06 00 00 00 0x6e 0x6f 0x64 0x65 0x31 00
+ ------------- ------------- ------------- ---- ---- ---- ---- ---- ----
+entry_number=1 id=1 length=6 'n' 'o' 'd' 'e' '1' '\0'
+```
+```cpp
+[In-memory Format]
+std::map<uint32_t, std::string>{{1, "node1"}};
+```
+
+### Op Table
+
+The op table maps the id of each operation to the id(s) of the operation's origin nodes.
+
+#### Binary format
+
+The op table is stored in the circle file as binary data with the following format.
+```
+[ entry_number : uint32_t ]
+[ id : uint32_t ][ node_num : uint32_t ][ node_ids : uint32_t * node_num ] * entry_number
+```
+- entry_number : The number of entries
+ - Each entry consists of id, node_num, and node_ids.
+- id : ID of operation in circle model file
+- node_num : The number of operation's origin nodes
+- node_ids : Set of IDs of origin nodes
+
+#### In-memory format
+```cpp
+std::map<uint32_t /* id */, std::set<uint32_t> /* node_ids */>
+```
+
+#### Example
+
+The following example means "operation 5 originated from origin 1 and origin 2".
+
+```
+[Binary Format]
+ 0x01 00 00 00 0x05 00 00 00 0x02 00 00 00 0x01 00 00 00 0x02 00 00 00
+ ------------- ------------- ------------- ---------------------------
+entry_number=1 id=5 node_num=2 node_ids : 1, 2
+```
+```cpp
+[In-memory Format]
+std::map<uint32_t, std::set<uint32_t>>{{5, {1, 2}}};
+```
diff --git a/compiler/luci/pass/src/FuseActivationFunctionPassInternal.h b/compiler/luci/profile/include/luci/Profile/CircleNodeID.h
index 0cfb9d507..165866bcf 100644
--- a/compiler/luci/pass/src/FuseActivationFunctionPassInternal.h
+++ b/compiler/luci/profile/include/luci/Profile/CircleNodeID.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,18 +14,22 @@
* limitations under the License.
*/
-#ifndef __LUCI_CIRCLE_FUSE_ACTIVATION_FUNCTION_PASS_INTERNAL_H__
-#define __LUCI_CIRCLE_FUSE_ACTIVATION_FUNCTION_PASS_INTERNAL_H__
+#ifndef __LUCI_PROFILE_CIRCLE_NODE_ID_H__
+#define __LUCI_PROFILE_CIRCLE_NODE_ID_H__
-#include <luci/IR/CircleNodes.h>
+#include <luci/IR/CircleNode.h>
namespace luci
{
-// Fuse activation function with preceding Op
-/// @return true if success
-bool fuse_activation_function(luci::CircleNode *node);
+using CircleNodeID = uint32_t;
+
+bool has_node_id(const luci::CircleNode *circle_node);
+
+void set_node_id(luci::CircleNode *circle_node, CircleNodeID id);
+
+CircleNodeID get_node_id(const luci::CircleNode *circle_node);
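+
+// Usage sketch (mirrors CircleNodeID.test.cpp):
+//   luci::set_node_id(node, 3);
+//   assert(luci::has_node_id(node));
+//   assert(luci::get_node_id(node) == 3);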
} // namespace luci
-#endif // __LUCI_CIRCLE_FUSE_ACTIVATION_FUNCTION_PASS_INTERNAL_H__
+#endif // __LUCI_PROFILE_CIRCLE_NODE_ID_H__
diff --git a/compiler/luci/profile/include/luci/Profile/CircleNodeOrigin.h b/compiler/luci/profile/include/luci/Profile/CircleNodeOrigin.h
new file mode 100644
index 000000000..2d6558c92
--- /dev/null
+++ b/compiler/luci/profile/include/luci/Profile/CircleNodeOrigin.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_PROFILE_CIRCLE_NODE_ORIGIN_H__
+#define __LUCI_PROFILE_CIRCLE_NODE_ORIGIN_H__
+
+#include "CircleNodeID.h"
+
+#include <luci/IR/CircleNode.h>
+
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace luci
+{
+
+class CircleNodeOrigin
+{
+protected:
+ struct Source
+ {
+ public:
+ std::string name(void) const { return _name; }
+ void name(const std::string &name) { _name = name; }
+
+ uint32_t id(void) const { return _id; }
+ void id(const uint32_t id) { _id = id; }
+
+ private:
+ std::string _name;
+ uint32_t _id = 0;
+ };
+
+public:
+ virtual std::set<const Source *> sources(void) const = 0;
+};
+
+std::shared_ptr<CircleNodeOrigin> single_origin(uint32_t id, const std::string &name);
+
+std::shared_ptr<CircleNodeOrigin>
+composite_origin(const std::initializer_list<std::shared_ptr<CircleNodeOrigin>> origins);
+
+std::shared_ptr<CircleNodeOrigin>
+composite_origin(const std::vector<std::shared_ptr<CircleNodeOrigin>> &origins);
+
+} // namespace luci
+
+namespace luci
+{
+
+bool has_origin(const luci::CircleNode *circle_node);
+
+void add_origin(luci::CircleNode *circle_node, const std::shared_ptr<CircleNodeOrigin> origin);
+
+// NOTE When circle_node does not have an origin, nullptr is returned
+const std::shared_ptr<luci::CircleNodeOrigin> get_origin(const luci::CircleNode *circle_node);
+
+} // namespace luci
+
+#endif // __LUCI_PROFILE_CIRCLE_NODE_ORIGIN_H__
diff --git a/compiler/luci/profile/src/CircleNodeID.cpp b/compiler/luci/profile/src/CircleNodeID.cpp
new file mode 100644
index 000000000..750b36cae
--- /dev/null
+++ b/compiler/luci/profile/src/CircleNodeID.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Profile/CircleNodeID.h"
+
+#include <loco.h>
+
+#include <stdexcept>
+
+namespace
+{
+
+/**
+ * @brief Annotation to hold a circle node id
+ * @note Once a CircleNodeID is annotated, it should not be changed.
+ *       If the CircleNodeID needs to change, create a new annotation.
+ */
+class CircleNodeIDAnnotation final : public loco::NodeAnnotation
+{
+public:
+ CircleNodeIDAnnotation() = delete;
+
+ CircleNodeIDAnnotation(luci::CircleNodeID node_id) : _node_id{node_id}
+ {
+ // Do nothing
+ }
+
+public:
+ luci::CircleNodeID node_id(void) const { return _node_id; }
+ // No setter
+
+private:
+ luci::CircleNodeID _node_id;
+};
+
+} // namespace
+
+namespace luci
+{
+
+bool has_node_id(const luci::CircleNode *circle_node)
+{
+ return circle_node->annot<CircleNodeIDAnnotation>() != nullptr;
+}
+
+void set_node_id(luci::CircleNode *circle_node, luci::CircleNodeID id)
+{
+ circle_node->annot<CircleNodeIDAnnotation>(nullptr);
+ circle_node->annot(std::make_unique<CircleNodeIDAnnotation>(id));
+}
+
+luci::CircleNodeID get_node_id(const luci::CircleNode *circle_node)
+{
+ if (!has_node_id(circle_node))
+ throw std::runtime_error("Cannot find CircleNodeID");
+
+ return circle_node->annot<CircleNodeIDAnnotation>()->node_id();
+}
+
+} // namespace luci
diff --git a/compiler/luci/profile/src/CircleNodeID.test.cpp b/compiler/luci/profile/src/CircleNodeID.test.cpp
new file mode 100644
index 000000000..d80c09b2c
--- /dev/null
+++ b/compiler/luci/profile/src/CircleNodeID.test.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Profile/CircleNodeID.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+TEST(LuciCircleNodeID, simple_circle_node_id)
+{
+ auto g = loco::make_graph();
+ auto add = g->nodes()->create<luci::CircleAdd>();
+
+ ASSERT_FALSE(has_node_id(add));
+
+ set_node_id(add, 3);
+
+ ASSERT_TRUE(has_node_id(add));
+ ASSERT_EQ(3, get_node_id(add));
+}
+
+TEST(LuciCircleNodeID, simple_circle_node_id_NEG)
+{
+ auto g = loco::make_graph();
+ auto add = g->nodes()->create<luci::CircleAdd>();
+
+ ASSERT_FALSE(has_node_id(add));
+
+ ASSERT_ANY_THROW(get_node_id(add));
+}
diff --git a/compiler/luci/profile/src/CircleNodeOrigin.cpp b/compiler/luci/profile/src/CircleNodeOrigin.cpp
new file mode 100644
index 000000000..0a731a9ad
--- /dev/null
+++ b/compiler/luci/profile/src/CircleNodeOrigin.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Profile/CircleNodeOrigin.h"
+
+#include <loco.h>
+
+#include <cassert>
+#include <stdexcept>
+#include <vector>
+
+namespace
+{
+
+/**
+ * @brief Set annotation for recording origin information
+ * @note Once a CircleNodeOrigin is annotated, it should not be changed.
+ *       If the CircleNodeOrigin needs to change, create a new annotation.
+ */
+class CircleNodeOriginAnnotation final : public loco::NodeAnnotation
+{
+public:
+ CircleNodeOriginAnnotation() = delete;
+
+ CircleNodeOriginAnnotation(const std::shared_ptr<luci::CircleNodeOrigin> origin) : _origin(origin)
+ {
+ // Do nothing
+ }
+
+public:
+ const std::shared_ptr<luci::CircleNodeOrigin> origin(void) const { return _origin; }
+ // No setter
+
+private:
+ const std::shared_ptr<luci::CircleNodeOrigin> _origin;
+};
+
+} // namespace
+
+namespace
+{
+
+class SingleOrigin final : public luci::CircleNodeOrigin
+{
+public:
+ SingleOrigin() = delete;
+
+ SingleOrigin(uint32_t id, const std::string &name)
+ {
+ _source.id(id);
+ _source.name(name);
+ }
+
+public:
+ std::set<const Source *> sources(void) const final
+ {
+ std::set<const Source *> res;
+ res.emplace(&_source);
+ return res;
+ }
+
+private:
+ Source _source;
+};
+
+class CompositeOrigin final : public luci::CircleNodeOrigin
+{
+public:
+ CompositeOrigin() = delete;
+
+ template <typename T> CompositeOrigin(T origins)
+ {
+ if (origins.size() == 0)
+ throw std::invalid_argument("No origins provided");
+
+ for (auto &origin : origins)
+ {
+ if (origin != nullptr)
+ _origins.emplace_back(origin);
+ }
+ }
+
+public:
+ std::set<const Source *> sources(void) const final
+ {
+ std::set<const Source *> res;
+
+ for (auto &origin : _origins)
+ {
+ for (auto source : origin->sources())
+ {
+ res.emplace(source);
+ }
+ }
+
+ return res;
+ }
+
+private:
+ std::vector<std::shared_ptr<CircleNodeOrigin>> _origins;
+};
+
+} // namespace
+
+namespace luci
+{
+
+std::shared_ptr<CircleNodeOrigin> single_origin(uint32_t id, const std::string &name)
+{
+ return std::make_shared<SingleOrigin>(id, name);
+}
+
+std::shared_ptr<CircleNodeOrigin>
+composite_origin(const std::initializer_list<std::shared_ptr<CircleNodeOrigin>> origins)
+{
+ return std::make_shared<CompositeOrigin>(origins);
+}
+
+std::shared_ptr<CircleNodeOrigin>
+composite_origin(const std::vector<std::shared_ptr<CircleNodeOrigin>> &origins)
+{
+ return std::make_shared<CompositeOrigin>(origins);
+}
+
+} // namespace luci
+
+namespace luci
+{
+
+bool has_origin(const luci::CircleNode *circle_node)
+{
+ return circle_node->annot<CircleNodeOriginAnnotation>() != nullptr;
+}
+
+/**
+ * @brief 'origin' is added to the existing origin of circle_node.
+ * @note If 'origin' is nullptr, nothing is changed.
+ * For more details, please refer to the CompositeOrigin constructor.
+ */
+void add_origin(luci::CircleNode *circle_node, const std::shared_ptr<CircleNodeOrigin> origin)
+{
+ auto new_origin = composite_origin({get_origin(circle_node), origin});
+ circle_node->annot<CircleNodeOriginAnnotation>(nullptr);
+ circle_node->annot(std::make_unique<CircleNodeOriginAnnotation>(new_origin));
+}
+
+const std::shared_ptr<luci::CircleNodeOrigin> get_origin(const luci::CircleNode *circle_node)
+{
+ if (!has_origin(circle_node))
+ return nullptr;
+
+ assert(circle_node->annot<CircleNodeOriginAnnotation>()->origin() != nullptr);
+ return circle_node->annot<CircleNodeOriginAnnotation>()->origin();
+}
+
+} // namespace luci
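
The intended use is provenance tracking: when a transformation creates or fuses nodes, it records which original operators they came from. A minimal sketch, assuming a graph g and the Source getters exercised in the tests below:

    #include "luci/Profile/CircleNodeOrigin.h"
    #include <luci/IR/CircleNodes.h>
    #include <iostream>

    void record_fusion(loco::Graph *g)
    {
      // record that 'fused' was produced from operators 3 ("add") and 7 ("sub")
      auto fused = g->nodes()->create<luci::CircleMul>();
      luci::add_origin(fused, luci::composite_origin({luci::single_origin(3, "add"),
                                                      luci::single_origin(7, "sub")}));

      // a later add_origin() merges into the existing origin instead of overwriting
      luci::add_origin(fused, luci::single_origin(9, "relu"));

      for (auto source : luci::get_origin(fused)->sources())
        std::cout << source->id() << " " << source->name() << std::endl;
    }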
diff --git a/compiler/luci/profile/src/CircleNodeOrigin.test.cpp b/compiler/luci/profile/src/CircleNodeOrigin.test.cpp
new file mode 100644
index 000000000..34618e1ab
--- /dev/null
+++ b/compiler/luci/profile/src/CircleNodeOrigin.test.cpp
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Profile/CircleNodeID.h"
+#include "luci/Profile/CircleNodeOrigin.h"
+
+#include <luci/IR/CircleNodes.h>
+
+#include <gtest/gtest.h>
+
+TEST(LuciCircleNodeOrigin, simple_single_origin)
+{
+ auto g = loco::make_graph();
+ auto add = g->nodes()->create<luci::CircleAdd>();
+
+ ASSERT_FALSE(has_origin(add));
+
+ auto origin = luci::single_origin(3, "add");
+ add_origin(add, origin);
+
+ ASSERT_TRUE(has_origin(add));
+
+ auto sources = get_origin(add)->sources();
+ ASSERT_EQ(1, sources.size());
+ for (auto source : sources)
+ {
+ ASSERT_EQ(3, source->id());
+ ASSERT_EQ(0, source->name().compare("add"));
+ }
+}
+
+TEST(LuciCircleNodeOrigin, simple_composite_origin_with_initializer)
+{
+ auto g = loco::make_graph();
+ auto mul = g->nodes()->create<luci::CircleMul>();
+
+ ASSERT_FALSE(has_origin(mul));
+
+ auto origin =
+ luci::composite_origin({luci::single_origin(3, "add"), luci::single_origin(7, "sub")});
+ add_origin(mul, origin);
+
+ ASSERT_TRUE(has_origin(mul));
+
+ bool add_origin_passed = false;
+ bool sub_origin_passed = false;
+ auto sources = get_origin(mul)->sources();
+ ASSERT_EQ(2, sources.size());
+ for (auto source : sources)
+ {
+ if (source->id() == 3 && source->name().compare("add") == 0)
+ add_origin_passed = true;
+ if (source->id() == 7 && source->name().compare("sub") == 0)
+ sub_origin_passed = true;
+ }
+
+ ASSERT_EQ(true, add_origin_passed);
+ ASSERT_EQ(true, sub_origin_passed);
+}
+
+TEST(LuciCircleNodeOrigin, simple_composite_origin_with_vector)
+{
+ auto g = loco::make_graph();
+ auto mul = g->nodes()->create<luci::CircleMul>();
+
+ ASSERT_FALSE(has_origin(mul));
+
+ std::vector<std::shared_ptr<luci::CircleNodeOrigin>> vec;
+ vec.push_back(luci::single_origin(3, "add"));
+ vec.push_back(luci::single_origin(7, "sub"));
+ auto origin = luci::composite_origin(vec);
+ add_origin(mul, origin);
+
+ ASSERT_TRUE(has_origin(mul));
+
+ bool add_origin_passed = false;
+ bool sub_origin_passed = false;
+ auto sources = get_origin(mul)->sources();
+ ASSERT_EQ(2, sources.size());
+ for (auto source : sources)
+ {
+ if (source->id() == 3 && source->name().compare("add") == 0)
+ add_origin_passed = true;
+ if (source->id() == 7 && source->name().compare("sub") == 0)
+ sub_origin_passed = true;
+ }
+
+ ASSERT_EQ(true, add_origin_passed);
+ ASSERT_EQ(true, sub_origin_passed);
+}
+
+TEST(LuciCircleNodeOrigin, composite_origin_empty_ctor_NEG)
+{
+ ASSERT_ANY_THROW(luci::composite_origin({}));
+}
diff --git a/compiler/luci/service/CMakeLists.txt b/compiler/luci/service/CMakeLists.txt
index 9f50c9c4f..1c78031ab 100644
--- a/compiler/luci/service/CMakeLists.txt
+++ b/compiler/luci/service/CMakeLists.txt
@@ -22,4 +22,5 @@ nnas_find_package(GTest REQUIRED)
GTest_AddTest(luci_service_test ${TESTS})
target_include_directories(luci_service_test PRIVATE src)
target_link_libraries(luci_service_test luci_service)
+target_link_libraries(luci_service_test luci_testhelper)
target_link_libraries(luci_service_test oops)
diff --git a/compiler/luci/service/include/luci/Service/CircleNodeClone.h b/compiler/luci/service/include/luci/Service/CircleNodeClone.h
new file mode 100644
index 000000000..2429997cc
--- /dev/null
+++ b/compiler/luci/service/include/luci/Service/CircleNodeClone.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_CIRCLE_NODE_CLONE__
+#define __LUCI_CIRCLE_NODE_CLONE__
+
+#include <luci/IR/CircleNodes.h>
+
+#include <loco/IR/Graph.h>
+
+namespace luci
+{
+
+/**
+ * @brief Copy common attributes of CircleNode from src to dst.
+ */
+void copy_common_attributes(const luci::CircleNode *src, luci::CircleNode *dst);
+
+/**
+ * @brief Return a new cloned CircleNode object in graph with the same attribute values as node.
+ * @note Returns nullptr if cloning fails
+ */
+CircleNode *clone_node(const CircleNode *node, loco::Graph *graph);
+
+} // namespace luci
+
+#endif // __LUCI_CIRCLE_NODE_CLONE__
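
A short sketch of the intended call pattern, mirrored by CircleNodeClone.test.cpp further down; clone_node() returns nullptr when cloning fails:

    auto src_graph = loco::make_graph();
    auto node = src_graph->nodes()->create<luci::CircleAdd>();

    auto dst_graph = loco::make_graph();
    // clones the operator and copies name, dtype, shape, quantparam, ...
    auto cloned = luci::clone_node(node, dst_graph.get());
    if (cloned == nullptr)
    {
      // not every operator supports cloning yet (see CircleCloneNode.h)
    }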
diff --git a/compiler/luci/service/include/luci/Service/CircleShapeInference.h b/compiler/luci/service/include/luci/Service/CircleShapeInference.h
index c301db5f4..60bc16e48 100644
--- a/compiler/luci/service/include/luci/Service/CircleShapeInference.h
+++ b/compiler/luci/service/include/luci/Service/CircleShapeInference.h
@@ -17,29 +17,15 @@
#ifndef __LUCI_CIRCLE_SHAPE_INFERENCE_H__
#define __LUCI_CIRCLE_SHAPE_INFERENCE_H__
-#include "ShapeDescription.h"
-
#include <loco/IR/Nodes.h>
#include <luci/IR/CircleNodes.h>
#include <luci/IR/CircleNodeVisitor.h>
-#include <luci/Service/CircleShapeInferenceHelper.h>
+#include <luci/Service/CircleShapeInferenceRule.h>
namespace luci
{
-/**
- * @brief Get the shape of each node as a node annotation
- *
- * HOW TO USE
- *
- * ShapeInference::get(g->nodes()->at(..));
- */
-struct ShapeInference
-{
- static ShapeDescription get(loco::Node *node);
-};
-
namespace sinf // namespace for Shape Inference
{
@@ -52,7 +38,12 @@ class Algorithm final : public luci::CircleNodeVisitor<loco::TensorShape>
{
public:
 // TODO Remove this when all visit functions are implemented
- loco::TensorShape visit(const luci::CircleNode *node) final { return sinf::circle_shape(node); }
+ loco::TensorShape visit(const luci::CircleNode *node) final
+ {
+ loco::NodeShape shape;
+ luci::CircleShapeInferenceRule().infer(node, shape);
+ return shape.as<loco::TensorShape>();
+ }
// loco::TensorShape visit(const luci::CircleAbs *node) final;
// loco::TensorShape visit(const luci::CircleAdd *node) final;
@@ -77,6 +68,7 @@ public:
// loco::TensorShape visit(const luci::CircleEqual *node) final;
// loco::TensorShape visit(const luci::CircleExp *node) final;
// loco::TensorShape visit(const luci::CircleExpandDims *node) final;
+ // loco::TensorShape visit(const luci::CircleFakeQuant *node) final;
// loco::TensorShape visit(const luci::CircleFill *node) final;
// loco::TensorShape visit(const luci::CircleFloor *node) final;
// loco::TensorShape visit(const luci::CircleFloorDiv *node) final;
@@ -106,10 +98,12 @@ public:
// loco::TensorShape visit(const luci::CircleMean *node) final;
// loco::TensorShape visit(const luci::CircleMinimum *node) final;
// loco::TensorShape visit(const luci::CircleMirrorPad *node) final;
+ // loco::TensorShape visit(const luci::CircleMul *node) final;
// loco::TensorShape visit(const luci::CircleNeg *node) final;
// loco::TensorShape visit(const luci::CircleNonMaxSuppressionV4 *node) final;
// loco::TensorShape visit(const luci::CircleNonMaxSuppressionV5 *node) final;
// loco::TensorShape visit(const luci::CircleNotEqual *node) final;
+ // loco::TensorShape visit(const luci::CircleOneHot *node) final;
// loco::TensorShape visit(const luci::CirclePack *node) final;
// loco::TensorShape visit(const luci::CirclePad *node) final;
// loco::TensorShape visit(const luci::CirclePadV2 *node) final;
@@ -117,8 +111,6 @@ public:
// loco::TensorShape visit(const luci::CirclePRelu *node) final;
// loco::TensorShape visit(const luci::CircleRange *node) final;
// loco::TensorShape visit(const luci::CircleRank *node) final;
- // loco::TensorShape visit(const luci::CircleMul *node) final;
- // loco::TensorShape visit(const luci::CircleOneHot *node) final;
// loco::TensorShape visit(const luci::CircleReduceAny *node) final;
// loco::TensorShape visit(const luci::CircleReduceMax *node) final;
// loco::TensorShape visit(const luci::CircleReduceMin *node) final;
@@ -171,14 +163,14 @@ public:
// loco::TensorShape visit(const luci::CircleInstanceNorm *node) final;
// Virtual
+ // loco::TensorShape visit(const luci::CircleCustomOut *node) final;
+ loco::TensorShape visit(const luci::CircleIfOut *node) final;
// loco::TensorShape visit(const luci::CircleInput *node) final;
+ // loco::TensorShape visit(const luci::CircleNonMaxSuppressionV4Out *node) final;
+ // loco::TensorShape visit(const luci::CircleNonMaxSuppressionV5Out *node) final;
// loco::TensorShape visit(const luci::CircleOutput *node) final;
// loco::TensorShape visit(const luci::CircleOutputDummy *node) final;
// loco::TensorShape visit(const luci::CircleOutputExclude *node) final;
- // loco::TensorShape visit(const luci::CircleCustomOut *node) final;
- // loco::TensorShape visit(const luci::CircleIfOut *node) final;
- // loco::TensorShape visit(const luci::CircleNonMaxSuppressionV4Out *node) final;
- // loco::TensorShape visit(const luci::CircleNonMaxSuppressionV5Out *node) final;
// loco::TensorShape visit(const luci::CircleSplitOut *node) final;
// loco::TensorShape visit(const luci::CircleSplitVOut *node) final;
// loco::TensorShape visit(const luci::CircleTopKV2Out *node) final;
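
Until the per-operator visit() overloads are filled in, the default case above falls back to the existing CircleShapeInferenceRule. Callers keep driving inference through the rule interface; a minimal sketch, assuming a valid luci::CircleNode *circle_node and the sinf::Rule entry point defined in CircleShapeInference.cpp:

    luci::sinf::Rule rule;
    loco::TensorShape shape;
    if (rule.infer(circle_node, shape))
    {
      // 'shape' holds the inferred tensor shape; unknown dimensions are
      // logged as "?" by the stream operator in CircleShapeInference.cpp
    }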
diff --git a/compiler/luci/service/include/luci/Service/CircleShapeSignatureInference.h b/compiler/luci/service/include/luci/Service/CircleShapeSignatureInference.h
deleted file mode 100644
index f7ea89bb8..000000000
--- a/compiler/luci/service/include/luci/Service/CircleShapeSignatureInference.h
+++ /dev/null
@@ -1,179 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_H__
-#define __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_H__
-
-#include <luci/IR/CircleNodes.h>
-#include <luci/IR/CircleNodeVisitor.h>
-#include <luci/IR/CircleShapeSignature.h>
-#include <luci/Service/CircleShapeSignatureInferenceHelper.h>
-
-namespace luci
-{
-
-namespace ssinf // namespace for Shape Signature Inference
-{
-
-struct Rule
-{
- bool infer(const luci::CircleNode *, ShapeSignature &) const;
-};
-
-class Algorithm final : public luci::CircleNodeVisitor<ShapeSignature>
-{
-public:
- // TODO Remove this when visit function is implemented for all the operations.
- ShapeSignature visit(const luci::CircleNode *node) final { return node->shape_signature(); }
-
- // ShapeSignature visit(const luci::CircleAbs *node) final;
- // ShapeSignature visit(const luci::CircleAdd *node) final;
- // ShapeSignature visit(const luci::CircleAddN *node) final;
- // ShapeSignature visit(const luci::CircleArgMax *node) final;
- // ShapeSignature visit(const luci::CircleArgMin *node) final;
- // ShapeSignature visit(const luci::CircleAveragePool2D *node) final;
- // ShapeSignature visit(const luci::CircleBatchMatMul *node) final;
- // ShapeSignature visit(const luci::CircleBatchToSpaceND *node) final;
- // ShapeSignature visit(const luci::CircleCast *node) final;
- // ShapeSignature visit(const luci::CircleCeil *node) final;
- // ShapeSignature visit(const luci::CircleConcatenation *node) final;
- // ShapeSignature visit(const luci::CircleConst *node) final;
- // ShapeSignature visit(const luci::CircleConv2D *node) final;
- // ShapeSignature visit(const luci::CircleCos *node) final;
- // ShapeSignature visit(const luci::CircleCustom *node) final;
- // ShapeSignature visit(const luci::CircleDepthToSpace *node) final;
- // ShapeSignature visit(const luci::CircleDepthwiseConv2D *node) final;
- // ShapeSignature visit(const luci::CircleDequantize *node) final;
- // ShapeSignature visit(const luci::CircleDiv *node) final;
- // ShapeSignature visit(const luci::CircleElu *node) final;
- // ShapeSignature visit(const luci::CircleEqual *node) final;
- // ShapeSignature visit(const luci::CircleExp *node) final;
- // ShapeSignature visit(const luci::CircleExpandDims *node) final;
- // ShapeSignature visit(const luci::CircleFill *node) final;
- // ShapeSignature visit(const luci::CircleFloor *node) final;
- // ShapeSignature visit(const luci::CircleFloorDiv *node) final;
- // ShapeSignature visit(const luci::CircleFloorMod *node) final;
- // ShapeSignature visit(const luci::CircleFullyConnected *node) final;
- // ShapeSignature visit(const luci::CircleGather *node) final;
- // ShapeSignature visit(const luci::CircleGatherNd *node) final;
- // ShapeSignature visit(const luci::CircleGreater *node) final;
- // ShapeSignature visit(const luci::CircleGreaterEqual *node) final;
- // ShapeSignature visit(const luci::CircleIf *node) final;
- // ShapeSignature visit(const luci::CircleL2Normalize *node) final;
- // ShapeSignature visit(const luci::CircleL2Pool2D *node) final;
- // ShapeSignature visit(const luci::CircleLeakyRelu *node) final;
- // ShapeSignature visit(const luci::CircleLess *node) final;
- // ShapeSignature visit(const luci::CircleLessEqual *node) final;
- // ShapeSignature visit(const luci::CircleLocalResponseNormalization *node) final;
- // ShapeSignature visit(const luci::CircleLog *node) final;
- // ShapeSignature visit(const luci::CircleLogicalAnd *node) final;
- // ShapeSignature visit(const luci::CircleLogicalNot *node) final;
- // ShapeSignature visit(const luci::CircleLogicalOr *node) final;
- // ShapeSignature visit(const luci::CircleLogistic *node) final;
- // ShapeSignature visit(const luci::CircleLogSoftmax *node) final;
- // ShapeSignature visit(const luci::CircleMatrixDiag *node) final;
- // ShapeSignature visit(const luci::CircleMatrixSetDiag *node) final;
- // ShapeSignature visit(const luci::CircleMaximum *node) final;
- // ShapeSignature visit(const luci::CircleMaxPool2D *node) final;
- ShapeSignature visit(const luci::CircleMean *node) final;
- // ShapeSignature visit(const luci::CircleMinimum *node) final;
- // ShapeSignature visit(const luci::CircleMirrorPad *node) final;
- // ShapeSignature visit(const luci::CircleNeg *node) final;
- // ShapeSignature visit(const luci::CircleNonMaxSuppressionV4 *node) final;
- // ShapeSignature visit(const luci::CircleNonMaxSuppressionV5 *node) final;
- // ShapeSignature visit(const luci::CircleNotEqual *node) final;
- // ShapeSignature visit(const luci::CirclePack *node) final;
- // ShapeSignature visit(const luci::CirclePad *node) final;
- // ShapeSignature visit(const luci::CirclePadV2 *node) final;
- // ShapeSignature visit(const luci::CirclePow *node) final;
- // ShapeSignature visit(const luci::CirclePRelu *node) final;
- // ShapeSignature visit(const luci::CircleRange *node) final;
- // ShapeSignature visit(const luci::CircleRank *node) final;
- // ShapeSignature visit(const luci::CircleMul *node) final;
- // ShapeSignature visit(const luci::CircleOneHot *node) final;
- ShapeSignature visit(const luci::CircleReduceAny *node) final;
- ShapeSignature visit(const luci::CircleReduceMax *node) final;
- ShapeSignature visit(const luci::CircleReduceMin *node) final;
- ShapeSignature visit(const luci::CircleReduceProd *node) final;
- ShapeSignature visit(const luci::CircleRelu *node) final;
- ShapeSignature visit(const luci::CircleRelu6 *node) final;
- ShapeSignature visit(const luci::CircleReluN1To1 *node) final;
- // ShapeSignature visit(const luci::CircleReshape *node) final;
- // ShapeSignature visit(const luci::CircleResizeBilinear *node) final;
- // ShapeSignature visit(const luci::CircleResizeNearestNeighbor *node) final;
- // ShapeSignature visit(const luci::CircleReverseSequence *node) final;
- // ShapeSignature visit(const luci::CircleReverseV2 *node) final;
- // ShapeSignature visit(const luci::CircleRound *node) final;
- // ShapeSignature visit(const luci::CircleRsqrt *node) final;
- // ShapeSignature visit(const luci::CircleScatterNd *node) final;
- // ShapeSignature visit(const luci::CircleSegmentSum *node) final;
- // ShapeSignature visit(const luci::CircleSelect *node) final;
- // ShapeSignature visit(const luci::CircleSelectV2 *node) final;
- // ShapeSignature visit(const luci::CircleShape *node) final;
- // ShapeSignature visit(const luci::CircleSin *node) final;
- // ShapeSignature visit(const luci::CircleSlice *node) final;
- // ShapeSignature visit(const luci::CircleSoftmax *node) final;
- // ShapeSignature visit(const luci::CircleSpaceToBatchND *node) final;
- // ShapeSignature visit(const luci::CircleSpaceToDepth *node) final;
- // ShapeSignature visit(const luci::CircleSparseToDense *node) final;
- // ShapeSignature visit(const luci::CircleSplit *node) final;
- // ShapeSignature visit(const luci::CircleSplitV *node) final;
- // ShapeSignature visit(const luci::CircleSqrt *node) final;
- // ShapeSignature visit(const luci::CircleSquare *node) final;
- // ShapeSignature visit(const luci::CircleSquaredDifference *node) final;
- // ShapeSignature visit(const luci::CircleSqueeze *node) final;
- // ShapeSignature visit(const luci::CircleStridedSlice *node) final;
- // ShapeSignature visit(const luci::CircleSub *node) final;
- ShapeSignature visit(const luci::CircleSum *node) final;
- // ShapeSignature visit(const luci::CircleTanh *node) final;
- // ShapeSignature visit(const luci::CircleTile *node) final;
- // ShapeSignature visit(const luci::CircleTopKV2 *node) final;
- // ShapeSignature visit(const luci::CircleTranspose *node) final;
- // ShapeSignature visit(const luci::CircleTransposeConv *node) final;
- // ShapeSignature visit(const luci::CircleUnidirectionalSequenceLSTM *node) final;
- // ShapeSignature visit(const luci::CircleUnique *node) final;
- // ShapeSignature visit(const luci::CircleUnpack *node) final;
- // ShapeSignature visit(const luci::CircleWhere *node) final ;
- // ShapeSignature visit(const luci::CircleWhile *node) final;
- // ShapeSignature visit(const luci::CircleZerosLike *node) final;
-
- // Circle Only
- // ShapeSignature visit(const luci::CircleBCQFullyConnected *node) final;
- // ShapeSignature visit(const luci::CircleBCQGather *node) final;
- // ShapeSignature visit(const luci::CircleInstanceNorm *node) final;
-
- // Virtual
- ShapeSignature visit(const luci::CircleInput *node) final;
- ShapeSignature visit(const luci::CircleOutput *node) final;
- ShapeSignature visit(const luci::CircleOutputDummy *node) final;
- ShapeSignature visit(const luci::CircleOutputExclude *node) final;
- // ShapeSignature visit(const luci::CircleCustomOut *node) final;
- // ShapeSignature visit(const luci::CircleIfOut *node) final;
- // ShapeSignature visit(const luci::CircleNonMaxSuppressionV4Out *node) final;
- // ShapeSignature visit(const luci::CircleNonMaxSuppressionV5Out *node) final;
- // ShapeSignature visit(const luci::CircleSplitOut *node) final;
- // ShapeSignature visit(const luci::CircleSplitVOut *node) final;
- // ShapeSignature visit(const luci::CircleTopKV2Out *node) final;
- // ShapeSignature visit(const luci::CircleUniqueOut *node) final;
- // ShapeSignature visit(const luci::CircleUnpackOut *node) final;
- // ShapeSignature visit(const luci::CircleWhileOut *node) final;
-};
-
-} // namespace ssinf
-
-} // namespace luci
-
-#endif // __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_H__
diff --git a/compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceHelper.h b/compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceHelper.h
deleted file mode 100644
index fb5b3b302..000000000
--- a/compiler/luci/service/include/luci/Service/CircleShapeSignatureInferenceHelper.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_HELPER_H__
-#define __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_HELPER_H__
-
-#include <luci/IR/CircleNodes.h>
-#include <luci/IR/CircleShapeSignature.h>
-
-namespace luci
-{
-
-namespace ssinf // Namespace for Shape Signature Inference
-{
-
-// Return empty signature if all of dimensions are known.
-// If at least one of dimensions is unknown, return signature without change.
-ShapeSignature legalized_signature(const luci::ShapeSignature &signature);
-
-// Return reduced input_signature with indices and keep_dims.
-// - indices : reduction index
-// - keep_dims : If true, rank is not changed. If false, rank is reduced along indices.
-ShapeSignature reduced_signature(const loco::Node *node, const loco::Node *indices, bool keep_dims);
-
-// Return signature of index-th argument of node.
-ShapeSignature input_arg_signature(const luci::CircleNode *node, uint32_t index);
-
-} // namespace ssinf
-
-} // namespace luci
-
-#endif // __LUCI_CIRCLE_SHAPE_SIGNATURE_INFERENCE_HELPER_H__
diff --git a/compiler/luci/service/include/luci/Service/CircleTypeInference.h b/compiler/luci/service/include/luci/Service/CircleTypeInference.h
index 342214887..8eef469ac 100644
--- a/compiler/luci/service/include/luci/Service/CircleTypeInference.h
+++ b/compiler/luci/service/include/luci/Service/CircleTypeInference.h
@@ -23,24 +23,11 @@
#include <luci/IR/CircleNodes.h>
#include <luci/IR/CircleNodeVisitor.h>
-#include <luci/Service/CircleTypeInferenceHelper.h>
+#include <luci/Service/CircleTypeInferenceRule.h>
namespace luci
{
-/**
- * @brief Get the type of each node as NodeAnnotation
- *
- * HOW TO USE
- *
- * TypeInference::get(g->nodes()->at(0));
- * TypeInference::get(g->nodes()->at(...));
- */
-struct TypeInference
-{
- static circle::TensorType get(loco::Node *node);
-};
-
namespace tinf // namespace for Type Inference
{
@@ -53,7 +40,12 @@ class Algorithm final : public luci::CircleNodeVisitor<loco::DataType>
{
public:
 // TODO Remove this when all visit functions are implemented
- loco::DataType visit(const luci::CircleNode *node) final { return node->dtype(); }
+ loco::DataType visit(const luci::CircleNode *node) final
+ {
+ loco::DataType dtype;
+ luci::CircleTypeInferenceRule().infer(node, dtype);
+ return dtype;
+ }
// loco::DataType visit(const luci::CircleAbs *node) final;
// loco::DataType visit(const luci::CircleAdd *node) final;
@@ -78,6 +70,7 @@ public:
// loco::DataType visit(const luci::CircleEqual *node) final;
// loco::DataType visit(const luci::CircleExp *node) final;
// loco::DataType visit(const luci::CircleExpandDims *node) final;
+ // loco::DataType visit(const luci::CircleFakeQuant *node) final;
// loco::DataType visit(const luci::CircleFill *node) final;
// loco::DataType visit(const luci::CircleFloor *node) final;
// loco::DataType visit(const luci::CircleFloorDiv *node) final;
@@ -177,7 +170,7 @@ public:
// loco::DataType visit(const luci::CircleOutputDummy *node) final;
// loco::DataType visit(const luci::CircleOutputExclude *node) final;
// loco::DataType visit(const luci::CircleCustomOut *node) final;
- // loco::DataType visit(const luci::CircleIfOut *node) final;
+ loco::DataType visit(const luci::CircleIfOut *node) final;
// loco::DataType visit(const luci::CircleNonMaxSuppressionV4Out *node) final;
// loco::DataType visit(const luci::CircleNonMaxSuppressionV5Out *node) final;
// loco::DataType visit(const luci::CircleSplitOut *node) final;
diff --git a/compiler/luci/service/include/luci/Service/Nodes/CircleConst.h b/compiler/luci/service/include/luci/Service/Nodes/CircleConst.h
new file mode 100644
index 000000000..6049b4297
--- /dev/null
+++ b/compiler/luci/service/include/luci/Service/Nodes/CircleConst.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_SERVICE_CIRCLE_CONST_H__
+#define __LUCI_SERVICE_CIRCLE_CONST_H__
+
+#include <luci/IR/Nodes/CircleConst.h>
+
+namespace luci
+{
+
+/**
+ * @brief Return cloned object of CircleConst node
+ */
+luci::CircleConst *clone(luci::CircleConst *node);
+
+} // namespace luci
+
+#endif // __LUCI_SERVICE_CIRCLE_CONST_H__
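
A minimal sketch of the expected use; where the copy is created is an assumption, since the implementation is not part of this hunk:

    auto g = loco::make_graph();
    auto weights = g->nodes()->create<luci::CircleConst>();
    weights->dtype(loco::DataType::FLOAT32);

    // expected: a new CircleConst with the same dtype, shape, and contents
    luci::CircleConst *copy = luci::clone(weights);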
diff --git a/compiler/luci/service/include/luci/Service/ShapeDescription.h b/compiler/luci/service/include/luci/Service/ShapeDescription.h
index 4d92be13f..4671096fd 100644
--- a/compiler/luci/service/include/luci/Service/ShapeDescription.h
+++ b/compiler/luci/service/include/luci/Service/ShapeDescription.h
@@ -37,10 +37,6 @@ struct ShapeDescription
 // TODO remove these when CircleDialect is fully functional
ShapeDescription to_shape_description(const luci::CircleNode *node);
ShapeDescription to_shape_description(const loco::TensorShape &shape);
-ShapeDescription to_shape_description(const loco::FeatureShape &shape);
-ShapeDescription to_shape_description(const loco::FilterShape &shape);
-ShapeDescription to_shape_description(const loco::BiasShape &shape);
-ShapeDescription to_shape_description(const loco::MatrixShape &shape);
ShapeDescription to_shape_description(const loco::NodeShape &shape);
template <typename Permutation> inline bool isNHWC(Permutation *perm);
diff --git a/compiler/luci/service/include/luci/Service/Validate.h b/compiler/luci/service/include/luci/Service/Validate.h
index 4b80d1d16..456d6e504 100644
--- a/compiler/luci/service/include/luci/Service/Validate.h
+++ b/compiler/luci/service/include/luci/Service/Validate.h
@@ -17,6 +17,8 @@
#ifndef __LUCI_SERVICE_VALIDATE_H__
#define __LUCI_SERVICE_VALIDATE_H__
+#include <luci/IR/Module.h>
+
#include <loco.h>
namespace luci
@@ -24,6 +26,17 @@ namespace luci
bool validate(loco::Graph *);
+/**
+ * @brief Return true if all nodes in graph have non-empty names
+ */
+bool validate_name(loco::Graph *);
+
+/**
+ * @brief Return true if all names in the Module are unique
+ * @note CircleOutput nodes may have duplicate names
+ */
+bool validate_unique_name(luci::Module *);
+
} // namespace luci
#endif // __LUCI_SERVICE_VALIDATE_H__
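
A sketch of how the new checks could be chained with the existing validate(); the Module::size()/graph() accessors used here are an assumption:

    bool ok = true;
    for (size_t i = 0; i < m->size(); ++i)
    {
      ok = ok && luci::validate(m->graph(i));      // existing graph-level check
      ok = ok && luci::validate_name(m->graph(i)); // every node has a name
    }
    ok = ok && luci::validate_unique_name(m);      // names unique module-wide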
diff --git a/compiler/luci/service/src/CircleCloneNode.h b/compiler/luci/service/src/CircleCloneNode.h
new file mode 100644
index 000000000..02c7cd256
--- /dev/null
+++ b/compiler/luci/service/src/CircleCloneNode.h
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CIRCLE_CLONE_NODE_H__
+#define __CIRCLE_CLONE_NODE_H__
+
+#include <luci/IR/CircleNodes.h>
+
+#include <luci/IR/CircleNodeVisitor.h>
+
+namespace luci
+{
+
+class CloneNode final : public luci::CircleNodeVisitor<luci::CircleNode *>
+{
+public:
+ CloneNode(loco::Graph *graph) : _graph(graph) {}
+
+public:
+ luci::CircleNode *visit(const luci::CircleAbs *) final;
+ luci::CircleNode *visit(const luci::CircleAdd *) final;
+ luci::CircleNode *visit(const luci::CircleAddN *) final;
+ luci::CircleNode *visit(const luci::CircleArgMax *) final;
+ luci::CircleNode *visit(const luci::CircleArgMin *) final;
+ luci::CircleNode *visit(const luci::CircleAveragePool2D *) final;
+ luci::CircleNode *visit(const luci::CircleBatchMatMul *) final;
+ luci::CircleNode *visit(const luci::CircleBatchToSpaceND *) final;
+ luci::CircleNode *visit(const luci::CircleCast *) final;
+ luci::CircleNode *visit(const luci::CircleCeil *) final;
+ luci::CircleNode *visit(const luci::CircleConcatenation *) final;
+ luci::CircleNode *visit(const luci::CircleConst *) final;
+ luci::CircleNode *visit(const luci::CircleConv2D *) final;
+ luci::CircleNode *visit(const luci::CircleCos *) final;
+ luci::CircleNode *visit(const luci::CircleCustom *) final;
+ luci::CircleNode *visit(const luci::CircleDepthToSpace *) final;
+ luci::CircleNode *visit(const luci::CircleDepthwiseConv2D *) final;
+ luci::CircleNode *visit(const luci::CircleDequantize *) final;
+ luci::CircleNode *visit(const luci::CircleDiv *) final;
+ luci::CircleNode *visit(const luci::CircleElu *) final;
+ luci::CircleNode *visit(const luci::CircleEqual *) final;
+ luci::CircleNode *visit(const luci::CircleExp *) final;
+ luci::CircleNode *visit(const luci::CircleExpandDims *) final;
+ luci::CircleNode *visit(const luci::CircleFakeQuant *) final;
+ luci::CircleNode *visit(const luci::CircleFill *) final;
+ luci::CircleNode *visit(const luci::CircleFloor *) final;
+ luci::CircleNode *visit(const luci::CircleFloorDiv *) final;
+ luci::CircleNode *visit(const luci::CircleFloorMod *) final;
+ luci::CircleNode *visit(const luci::CircleFullyConnected *) final;
+ luci::CircleNode *visit(const luci::CircleGather *) final;
+ luci::CircleNode *visit(const luci::CircleGatherNd *) final;
+ luci::CircleNode *visit(const luci::CircleGreater *) final;
+ luci::CircleNode *visit(const luci::CircleGreaterEqual *) final;
+ // luci::CircleNode *visit(const luci::CircleIf *) final;
+ luci::CircleNode *visit(const luci::CircleL2Normalize *) final;
+ luci::CircleNode *visit(const luci::CircleL2Pool2D *) final;
+ luci::CircleNode *visit(const luci::CircleLeakyRelu *) final;
+ luci::CircleNode *visit(const luci::CircleLess *) final;
+ luci::CircleNode *visit(const luci::CircleLessEqual *) final;
+ luci::CircleNode *visit(const luci::CircleLocalResponseNormalization *) final;
+ luci::CircleNode *visit(const luci::CircleLog *) final;
+ luci::CircleNode *visit(const luci::CircleLogicalAnd *) final;
+ luci::CircleNode *visit(const luci::CircleLogicalNot *) final;
+ luci::CircleNode *visit(const luci::CircleLogicalOr *) final;
+ luci::CircleNode *visit(const luci::CircleLogistic *) final;
+ luci::CircleNode *visit(const luci::CircleLogSoftmax *) final;
+ luci::CircleNode *visit(const luci::CircleMatrixDiag *) final;
+ luci::CircleNode *visit(const luci::CircleMatrixSetDiag *) final;
+ luci::CircleNode *visit(const luci::CircleMaximum *) final;
+ luci::CircleNode *visit(const luci::CircleMaxPool2D *) final;
+ luci::CircleNode *visit(const luci::CircleMean *) final;
+ luci::CircleNode *visit(const luci::CircleMinimum *) final;
+ luci::CircleNode *visit(const luci::CircleMirrorPad *) final;
+ luci::CircleNode *visit(const luci::CircleMul *) final;
+ luci::CircleNode *visit(const luci::CircleNeg *) final;
+ luci::CircleNode *visit(const luci::CircleNonMaxSuppressionV4 *) final;
+ luci::CircleNode *visit(const luci::CircleNonMaxSuppressionV5 *) final;
+ luci::CircleNode *visit(const luci::CircleNotEqual *) final;
+ luci::CircleNode *visit(const luci::CircleOneHot *) final;
+ luci::CircleNode *visit(const luci::CirclePack *) final;
+ luci::CircleNode *visit(const luci::CirclePad *) final;
+ luci::CircleNode *visit(const luci::CirclePadV2 *) final;
+ luci::CircleNode *visit(const luci::CirclePow *) final;
+ luci::CircleNode *visit(const luci::CirclePRelu *) final;
+ luci::CircleNode *visit(const luci::CircleRange *) final;
+ luci::CircleNode *visit(const luci::CircleRank *) final;
+ luci::CircleNode *visit(const luci::CircleReduceAny *) final;
+ luci::CircleNode *visit(const luci::CircleReduceMax *) final;
+ luci::CircleNode *visit(const luci::CircleReduceMin *) final;
+ luci::CircleNode *visit(const luci::CircleReduceProd *) final;
+ luci::CircleNode *visit(const luci::CircleRelu *) final;
+ luci::CircleNode *visit(const luci::CircleRelu6 *) final;
+ luci::CircleNode *visit(const luci::CircleReluN1To1 *) final;
+ luci::CircleNode *visit(const luci::CircleReshape *) final;
+ luci::CircleNode *visit(const luci::CircleResizeBilinear *) final;
+ luci::CircleNode *visit(const luci::CircleResizeNearestNeighbor *) final;
+ luci::CircleNode *visit(const luci::CircleReverseSequence *) final;
+ luci::CircleNode *visit(const luci::CircleReverseV2 *) final;
+ luci::CircleNode *visit(const luci::CircleRound *) final;
+ luci::CircleNode *visit(const luci::CircleRsqrt *) final;
+ luci::CircleNode *visit(const luci::CircleScatterNd *) final;
+ luci::CircleNode *visit(const luci::CircleSegmentSum *) final;
+ luci::CircleNode *visit(const luci::CircleSelect *) final;
+ luci::CircleNode *visit(const luci::CircleSelectV2 *) final;
+ luci::CircleNode *visit(const luci::CircleShape *) final;
+ luci::CircleNode *visit(const luci::CircleSin *) final;
+ luci::CircleNode *visit(const luci::CircleSlice *) final;
+ luci::CircleNode *visit(const luci::CircleSoftmax *) final;
+ luci::CircleNode *visit(const luci::CircleSpaceToBatchND *) final;
+ luci::CircleNode *visit(const luci::CircleSpaceToDepth *) final;
+ luci::CircleNode *visit(const luci::CircleSparseToDense *) final;
+ luci::CircleNode *visit(const luci::CircleSplit *) final;
+ luci::CircleNode *visit(const luci::CircleSplitV *) final;
+ luci::CircleNode *visit(const luci::CircleSqrt *) final;
+ luci::CircleNode *visit(const luci::CircleSquare *) final;
+ luci::CircleNode *visit(const luci::CircleSquaredDifference *) final;
+ luci::CircleNode *visit(const luci::CircleSqueeze *) final;
+ luci::CircleNode *visit(const luci::CircleStridedSlice *) final;
+ luci::CircleNode *visit(const luci::CircleSub *) final;
+ luci::CircleNode *visit(const luci::CircleSum *) final;
+ luci::CircleNode *visit(const luci::CircleTanh *) final;
+ luci::CircleNode *visit(const luci::CircleTile *) final;
+ luci::CircleNode *visit(const luci::CircleTopKV2 *) final;
+ luci::CircleNode *visit(const luci::CircleTranspose *) final;
+ luci::CircleNode *visit(const luci::CircleTransposeConv *) final;
+ luci::CircleNode *visit(const luci::CircleUnidirectionalSequenceLSTM *) final;
+ luci::CircleNode *visit(const luci::CircleUnique *) final;
+ luci::CircleNode *visit(const luci::CircleUnpack *) final;
+ luci::CircleNode *visit(const luci::CircleWhere *) final;
+ // luci::CircleNode *visit(const luci::CircleWhile *) final;
+ luci::CircleNode *visit(const luci::CircleZerosLike *) final;
+
+ // Circle Only
+ luci::CircleNode *visit(const luci::CircleBCQFullyConnected *) final;
+ luci::CircleNode *visit(const luci::CircleBCQGather *) final;
+ luci::CircleNode *visit(const luci::CircleInstanceNorm *) final;
+
+ // Virtual
+ luci::CircleNode *visit(const luci::CircleCustomOut *) final;
+ // luci::CircleNode *visit(const luci::CircleIfOut *) final;
+ // luci::CircleNode *visit(const luci::CircleInput *) final;
+ luci::CircleNode *visit(const luci::CircleNonMaxSuppressionV4Out *) final;
+ luci::CircleNode *visit(const luci::CircleNonMaxSuppressionV5Out *) final;
+ // luci::CircleNode *visit(const luci::CircleOutput *) final;
+ luci::CircleNode *visit(const luci::CircleOutputDummy *) final;
+ luci::CircleNode *visit(const luci::CircleOutputExclude *) final;
+ luci::CircleNode *visit(const luci::CircleSplitOut *) final;
+ luci::CircleNode *visit(const luci::CircleSplitVOut *) final;
+ luci::CircleNode *visit(const luci::CircleTopKV2Out *) final;
+ luci::CircleNode *visit(const luci::CircleUniqueOut *) final;
+ luci::CircleNode *visit(const luci::CircleUnpackOut *) final;
+ // luci::CircleNode *visit(const luci::CircleWhileOut *) final;
+
+ // NOTE CircleNodeVisitor will throw if a node is not supported here
+
+protected:
+ loco::Graph *_graph = nullptr;
+};
+
+} // namespace luci
+
+#endif // __CIRCLE_CLONE_NODE_H__
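
The per-operator visit() implementations live in separate source files that are not part of this hunk. For illustration only, a typical overload might look like the sketch below; the exact attribute set copied per operator is an assumption (common attributes are handled afterwards by copy_common_attributes()):

    luci::CircleNode *luci::CloneNode::visit(const luci::CircleAdd *node)
    {
      auto *cloned = _graph->nodes()->create<luci::CircleAdd>();
      // copy only node-specific attributes here
      cloned->fusedActivationFunction(node->fusedActivationFunction());
      return cloned;
    }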
diff --git a/compiler/luci/service/src/CircleNodeClone.cpp b/compiler/luci/service/src/CircleNodeClone.cpp
new file mode 100644
index 000000000..d2033dd0c
--- /dev/null
+++ b/compiler/luci/service/src/CircleNodeClone.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include "CircleCloneNode.h"
+
+#include <oops/UserExn.h>
+
+#include <cassert>
+
+namespace luci
+{
+
+/**
+ * @note Attributes specific to a node type, such as keep_dims() of CircleSum,
+ * are not copied.
+ */
+void copy_common_attributes(const luci::CircleNode *src, luci::CircleNode *dst)
+{
+ assert(src != nullptr);
+ assert(dst != nullptr);
+
+ dst->name(src->name());
+ dst->dtype(src->dtype());
+
+ dst->rank(src->rank());
+ for (uint32_t i = 0; i < src->rank(); i++)
+ {
+ dst->dim(i) = src->dim(i);
+ }
+ dst->shape_status(src->shape_status());
+
+ // quantparam
+ const auto *quantparam = src->quantparam();
+ if (quantparam != nullptr)
+ {
+ auto qparam = std::make_unique<luci::CircleQuantParam>();
+ qparam->scale = quantparam->scale;
+ qparam->zerop = quantparam->zerop;
+ qparam->min = quantparam->min;
+ qparam->max = quantparam->max;
+ qparam->quantized_dimension = quantparam->quantized_dimension;
+
+ dst->quantparam(std::move(qparam));
+ }
+
+ // sparsity
+ const auto *sparsity = src->sparsityparam();
+ if (sparsity != nullptr)
+ {
+ auto sparam = std::make_unique<luci::SparsityParam>();
+ sparam->traversal_order = sparsity->traversal_order;
+ sparam->block_map = sparsity->block_map;
+ sparam->dim_metadata = sparsity->dim_metadata;
+
+ dst->sparsityparam(std::move(sparam));
+ }
+
+ // op version
+ dst->op_version(src->op_version());
+}
+
+/**
+ * @note Each visit implementation must copy node-specific attributes.
+ */
+luci::CircleNode *clone_node(const luci::CircleNode *node, loco::Graph *graph)
+{
+ if (node == nullptr || graph == nullptr)
+ return nullptr;
+
+ CloneNode cn(graph);
+ auto cloned = node->accept(&cn);
+ if (cloned != nullptr)
+ copy_common_attributes(node, cloned);
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/CircleNodeClone.test.cpp b/compiler/luci/service/src/CircleNodeClone.test.cpp
new file mode 100644
index 000000000..5908eeb82
--- /dev/null
+++ b/compiler/luci/service/src/CircleNodeClone.test.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+// NOTE any node will do for testing
+#include <luci/IR/Nodes/CircleAdd.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+luci::CircleAdd *build_simple_add_graph(loco::Graph *g)
+{
+ auto node = g->nodes()->create<luci::CircleAdd>();
+
+ node->name("name");
+ node->dtype(loco::DataType::FLOAT32);
+ node->rank(1);
+ node->dim(0).set(3);
+ node->shape_status(luci::ShapeStatus::VALID);
+ node->fusedActivationFunction(luci::FusedActFunc::NONE);
+
+ auto qparam = std::make_unique<luci::CircleQuantParam>();
+ qparam->scale = {1.0};
+ qparam->zerop = {0};
+ qparam->min = {0.0};
+ qparam->max = {1.0};
+ qparam->quantized_dimension = 0;
+ node->quantparam(std::move(qparam));
+
+ auto sparam = std::make_unique<luci::SparsityParam>();
+ sparam->traversal_order = {0};
+ sparam->block_map = {0};
+ sparam->dim_metadata = {luci::DimMetaData(luci::DimensionType::DENSE, 1)};
+ node->sparsityparam(std::move(sparam));
+
+ node->op_version(2);
+
+ return node;
+}
+
+} // namespace
+
+TEST(CircleNodeCloneTest, copy_attributes)
+{
+ auto g = loco::make_graph();
+ auto node = build_simple_add_graph(g.get());
+
+ auto copy = g->nodes()->create<luci::CircleAdd>();
+ luci::copy_common_attributes(node, copy);
+
+ ASSERT_EQ(node->name(), copy->name());
+ ASSERT_EQ(node->dtype(), copy->dtype());
+ ASSERT_EQ(node->rank(), copy->rank());
+ ASSERT_EQ(node->shape_status(), copy->shape_status());
+
+ const auto *qparam_node = node->quantparam();
+ const auto *qparam_copy = copy->quantparam();
+ ASSERT_EQ(qparam_node->scale, qparam_copy->scale);
+
+ const auto *sparsity_node = node->sparsityparam();
+ const auto *sparsity_copy = copy->sparsityparam();
+ ASSERT_EQ(sparsity_node->traversal_order, sparsity_copy->traversal_order);
+
+ ASSERT_EQ(node->op_version(), copy->op_version());
+}
+
+TEST(CircleNodeCloneTest, clone_add_node)
+{
+ auto g = loco::make_graph();
+ auto node = build_simple_add_graph(g.get());
+
+ auto cg = loco::make_graph();
+ auto clone = clone_node(node, cg.get());
+
+ ASSERT_NE(nullptr, clone);
+ ASSERT_EQ(cg.get(), clone->graph());
+ ASSERT_EQ(node->name(), clone->name());
+ ASSERT_EQ(node->dtype(), clone->dtype());
+ ASSERT_EQ(node->rank(), clone->rank());
+ ASSERT_EQ(node->shape_status(), clone->shape_status());
+}
+
+TEST(CircleNodeCloneTest, clone_node_NEG)
+{
+ auto g = loco::make_graph();
+ auto node = build_simple_add_graph(g.get());
+
+ auto cg = loco::make_graph();
+ auto clone = luci::clone_node(nullptr, cg.get());
+ ASSERT_EQ(nullptr, clone);
+ auto clone2 = luci::clone_node(node, nullptr);
+ ASSERT_EQ(nullptr, clone2);
+}
diff --git a/compiler/luci/service/src/CircleShapeInference.cpp b/compiler/luci/service/src/CircleShapeInference.cpp
index db8ffd8ad..73472069b 100644
--- a/compiler/luci/service/src/CircleShapeInference.cpp
+++ b/compiler/luci/service/src/CircleShapeInference.cpp
@@ -15,27 +15,16 @@
*/
#include "luci/Service/CircleShapeInference.h"
-#include "luci/Service/ShapeDescription.h"
+
+#include "CircleShapeInferenceHelper.h"
#include <loco.h>
-#include <loco/Service/ShapeInference.h>
#include <luci/Log.h>
#include <cassert>
#include <iostream>
-namespace luci
-{
-
-ShapeDescription ShapeInference::get(loco::Node *node)
-{
- assert(loco::shape_known(node));
- return to_shape_description(loco::shape_get(node));
-}
-
-} // namespace luci
-
namespace
{
@@ -46,7 +35,11 @@ std::ostream &operator<<(std::ostream &os, const loco::TensorShape &tensor_shape
{
if (r)
os << ",";
- os << tensor_shape.dim(r).value();
+
+ if (tensor_shape.dim(r).known())
+ os << tensor_shape.dim(r).value();
+ else
+ os << "?";
}
os << "]";
return os;
@@ -90,5 +83,5 @@ bool Rule::infer(const luci::CircleNode *circle_node, loco::TensorShape &shape)
return true;
}
-} // namespace ssinf
+} // namespace sinf
} // namespace luci
diff --git a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
index f7eb6c3ec..2009aa59f 100644
--- a/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceHelper.cpp
@@ -14,7 +14,24 @@
* limitations under the License.
*/
-#include "luci/Service/CircleShapeInferenceHelper.h"
+#include "CircleShapeInferenceHelper.h"
+
+namespace luci
+{
+
+loco::NodeShape shape_get(const loco::Node *node)
+{
+ assert(luci::shape_known(node));
+ return loco::NodeShape{sinf::circle_shape(loco::must_cast<const luci::CircleNode *>(node))};
+}
+
+bool shape_known(const loco::Node *node)
+{
+ return loco::must_cast<const luci::CircleNode *>(node)->shape_status() !=
+ luci::ShapeStatus::UNDEFINED;
+}
+
+} // namespace luci
namespace luci
{
@@ -26,7 +43,7 @@ loco::TensorShape circle_shape(const luci::CircleNode *node)
loco::TensorShape shape;
shape.rank(node->rank());
for (uint32_t r = 0; r < node->rank(); ++r)
- shape.dim(r) = loco::Dimension(node->dim(r).value());
+ shape.dim(r) = node->dim(r);
return shape;
}
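
With these helpers, unknown dimensions survive shape queries instead of collapsing to a bogus value. A small sketch, assuming dimensions default to unknown after rank() is raised:

    auto g = loco::make_graph();
    auto node = g->nodes()->create<luci::CircleAdd>();
    node->rank(2);
    node->dim(0).set(4);                 // dim(1) stays unknown
    node->shape_status(luci::ShapeStatus::VALID);

    assert(luci::shape_known(node));     // status is not UNDEFINED
    auto shape = luci::shape_get(node).as<loco::TensorShape>();
    assert(shape.dim(0).known());
    assert(!shape.dim(1).known());       // unknown dimension preserved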
diff --git a/compiler/luci/service/include/luci/Service/CircleShapeInferenceHelper.h b/compiler/luci/service/src/CircleShapeInferenceHelper.h
index dd6a5a454..7c7ea496c 100644
--- a/compiler/luci/service/include/luci/Service/CircleShapeInferenceHelper.h
+++ b/compiler/luci/service/src/CircleShapeInferenceHelper.h
@@ -17,10 +17,24 @@
#ifndef __LUCI_CIRCLE_SHAPE_INFERENCE_HELPER_H__
#define __LUCI_CIRCLE_SHAPE_INFERENCE_HELPER_H__
+#include <loco/IR/NodeShape.h>
#include <loco/IR/TensorShape.h>
#include <luci/IR/CircleNodes.h>
-#include <luci/IR/CircleShapeSignature.h>
+
+namespace luci
+{
+
+// NOTE Functions in this namespace will be removed after the new inference
+// algorithms are fully implemented.
+
+// This is a temporary function for deprecating loco::shape_get
+loco::NodeShape shape_get(const loco::Node *node);
+
+// This is a temporary function for deprecating loco::shape_known
+bool shape_known(const loco::Node *node);
+
+} // namespace luci
namespace luci
{
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
index 38ff619ab..c6d8232c3 100644
--- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
@@ -17,6 +17,7 @@
#include "luci/Service/CircleShapeInferenceRule.h"
#include "Check.h"
+#include "CircleShapeInferenceHelper.h"
#include "ShapeInfer_StridedSlice.h"
#include <luci/IR/CircleNodes.h>
@@ -41,7 +42,11 @@ std::ostream &operator<<(std::ostream &os, const loco::TensorShape &tensor_shape
{
if (r)
os << ",";
- os << tensor_shape.dim(r).value();
+
+ if (tensor_shape.dim(r).known())
+ os << tensor_shape.dim(r).value();
+ else
+ os << "?";
}
os << "]";
return os;
@@ -52,7 +57,15 @@ loco::TensorShape own_shape(const luci::CircleNode *node)
loco::TensorShape shape;
shape.rank(node->rank());
for (uint32_t r = 0; r < node->rank(); ++r)
- shape.dim(r) = loco::Dimension(node->dim(r).value());
+ {
+ // The shape inference rules in this file did not consider unknown dimensions.
+ // If a node had an unknown dimension, 0 was inserted and the resulting
+ // shape inference was wrong.
+ // To fix this, a new shape inference algorithm is being implemented.
+ // Until it is fully implemented, an unknown dimension is represented
+ // as 1, following the TFLite convention.
+ shape.dim(r) = node->dim(r).known() ? node->dim(r).value() : 1;
+ }
return shape;
}
@@ -135,10 +148,8 @@ loco::TensorShape expand_dimension(const loco::TensorShape &x, const loco::Tenso
output_shape.rank(rank);
for (uint32_t axis = 0; axis < rank; ++axis)
{
- assert(x.dim(axis).known() && y.dim(axis).known());
-
- auto x_dim = x.dim(axis).value();
- auto y_dim = y.dim(axis).value();
+ auto x_dim = x.dim(axis).known() ? x.dim(axis).value() : 1;
+ auto y_dim = y.dim(axis).known() ? y.dim(axis).value() : 1;
// each dimension of x and y should be same or one must be 1 if different
if (!((x_dim == y_dim) || (x_dim == 1 || y_dim == 1)))
@@ -177,23 +188,29 @@ template <loco::DataType T> std::vector<int64_t> vector_from_constant(luci::Circ
template <class CIRCLENODE> loco::NodeShape broadcast_xy(const CIRCLENODE *node)
{
- auto x_shape = loco::shape_get(node->x()).template as<loco::TensorShape>();
- auto y_shape = loco::shape_get(node->y()).template as<loco::TensorShape>();
+ auto x_shape = luci::shape_get(node->x()).template as<loco::TensorShape>();
+ auto y_shape = luci::shape_get(node->y()).template as<loco::TensorShape>();
auto output_shape = broadcast_shape(x_shape, y_shape);
return loco::NodeShape{output_shape};
}
+template <class CIRCLENODE> loco::NodeShape use_inputs(const CIRCLENODE *node)
+{
+ auto inputs_shape = luci::shape_get(node->inputs()).template as<loco::TensorShape>();
+ return loco::NodeShape{inputs_shape};
+}
+
template <class CIRCLENODE> loco::NodeShape use_x(const CIRCLENODE *node)
{
- auto x_shape = loco::shape_get(node->x()).template as<loco::TensorShape>();
+ auto x_shape = luci::shape_get(node->x()).template as<loco::TensorShape>();
return loco::NodeShape{x_shape};
}
template <class CIRCLENODE> loco::NodeShape use_logits(const CIRCLENODE *node)
{
- auto shape = loco::shape_get(node->logits()).template as<loco::TensorShape>();
+ auto shape = luci::shape_get(node->logits()).template as<loco::TensorShape>();
return loco::NodeShape{shape};
}
@@ -202,7 +219,7 @@ loco::NodeShape use_paddings(const CIRCLENODE *node, const luci::CircleConst *pa
{
const loco::DataType S32 = loco::DataType::S32;
- auto input_shape = loco::shape_get(node->input()).template as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).template as<loco::TensorShape>();
// TODO support other data type
LUCI_ASSERT(paddings->dtype() == S32, "Only support int 32 for now");
@@ -232,11 +249,11 @@ loco::NodeShape use_paddings(const CIRCLENODE *node, const luci::CircleConst *pa
loco::NodeShape infer_add_n(const luci::CircleAddN *node)
{
- auto shape = loco::shape_get(node->inputs(0)).as<loco::TensorShape>();
+ auto shape = luci::shape_get(node->inputs(0)).as<loco::TensorShape>();
for (uint32_t idx = 1; idx < node->arity(); ++idx)
{
- auto shape_idx = loco::shape_get(node->inputs(idx)).as<loco::TensorShape>();
+ auto shape_idx = luci::shape_get(node->inputs(idx)).as<loco::TensorShape>();
if (!(shape == shape_idx))
{
INTERNAL_EXN_V("ADD_N shape not same as the first input: ", idx);
@@ -247,8 +264,8 @@ loco::NodeShape infer_add_n(const luci::CircleAddN *node)
loco::NodeShape infer_arg_max(const luci::CircleArgMax *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
- auto dimension_shape = loco::shape_get(node->dimension()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+ auto dimension_shape = luci::shape_get(node->dimension()).as<loco::TensorShape>();
int64_t select_axis = 0;
{
@@ -286,8 +303,8 @@ loco::NodeShape infer_arg_max(const luci::CircleArgMax *node)
loco::NodeShape infer_arg_min(const luci::CircleArgMin *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
- auto dimension_shape = loco::shape_get(node->dimension()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+ auto dimension_shape = luci::shape_get(node->dimension()).as<loco::TensorShape>();
int64_t select_axis = 0;
{
@@ -326,9 +343,7 @@ loco::NodeShape infer_arg_min(const luci::CircleArgMin *node)
// Call this for CircleAvgPool2D and CircleMaxPool2D only
template <class Pool2DType> loco::NodeShape infer_pool_2d_shape(const Pool2DType *node)
{
- LUCI_ASSERT(loco::shape_known(node->value()), "Shape must be known");
-
- auto ifm_shape = loco::shape_get(node->value()).template as<loco::TensorShape>();
+ auto ifm_shape = luci::shape_get(node->value()).template as<loco::TensorShape>();
assert(ifm_shape.rank() == 4);
uint32_t input_height = ifm_shape.dim(1).value();
@@ -372,7 +387,7 @@ loco::NodeShape infer_batch_to_space_nd(const luci::CircleBatchToSpaceND *node)
{
const loco::DataType S32 = loco::DataType::S32;
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
// Support only input rank is 3 and 4
assert(input_shape.rank() == 3 || input_shape.rank() == 4);
@@ -384,8 +399,8 @@ loco::NodeShape infer_batch_to_space_nd(const luci::CircleBatchToSpaceND *node)
auto const_crops = loco::must_cast<luci::CircleConst *>(node->crops());
LUCI_ASSERT(const_crops->dtype() == loco::DataType::S32, "Only support int32 crops");
- auto const_block_shape_shape = loco::shape_get(const_block_shape).as<loco::TensorShape>();
- auto const_crops_shape = loco::shape_get(const_crops).as<loco::TensorShape>();
+ auto const_block_shape_shape = luci::shape_get(const_block_shape).as<loco::TensorShape>();
+ auto const_crops_shape = luci::shape_get(const_crops).as<loco::TensorShape>();
assert(const_block_shape_shape.rank() == 1);
assert(const_crops_shape.rank() == 2);
@@ -423,8 +438,8 @@ struct OutputSize
template <class Conv2DType> OutputSize infer_conv2d_type(const Conv2DType *node)
{
- auto ifm_shape = loco::shape_get(node->input()).template as<loco::TensorShape>();
- auto ker_shape = loco::shape_get(node->filter()).template as<loco::TensorShape>();
+ auto ifm_shape = luci::shape_get(node->input()).template as<loco::TensorShape>();
+ auto ker_shape = luci::shape_get(node->filter()).template as<loco::TensorShape>();
assert(ifm_shape.rank() == 4);
assert(ker_shape.rank() == 4);
@@ -496,7 +511,7 @@ loco::NodeShape infer_batchmatmul_shape(const loco::TensorShape &x_shape,
loco::Dimension y_lhs = adj_y ? y_shape.dim(y_rank - 1) : y_shape.dim(y_rank - 2);
loco::Dimension y_rhs = adj_y ? y_shape.dim(y_rank - 2) : y_shape.dim(y_rank - 1);
- if (not(x_rhs == y_lhs))
+ if (x_rhs.known() && y_lhs.known() && not(x_rhs == y_lhs))
INTERNAL_EXN("x_rhs and y_lhs should be same");
uint32_t out_rank = output_shape.rank();
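// A minimal standalone sketch of the known()-guard introduced in this hunk;
// Dim and contraction_mismatch are illustrative names, not loco/luci API.
#include <cstdint>

struct Dim
{
  bool known = false;  // whether the extent is statically known
  uint32_t value = 0;  // meaningful only when known is true
};

// Only a pair of known-but-different extents is a hard error; if either
// side is unknown we optimistically assume the shapes are compatible.
inline bool contraction_mismatch(const Dim &x_rhs, const Dim &y_lhs)
{
  return x_rhs.known && y_lhs.known && x_rhs.value != y_lhs.value;
}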
@@ -511,7 +526,7 @@ loco::NodeShape infer_concatenation(const luci::CircleConcatenation *node)
// TODO Support when CircleConcatenation has 0 inputs
assert(node->numValues() > 0);
- auto first_shape = loco::shape_get(node->values(0)).as<loco::TensorShape>();
+ auto first_shape = luci::shape_get(node->values(0)).as<loco::TensorShape>();
auto axis = node->axis();
if (axis < 0)
axis += first_shape.rank();
@@ -527,14 +542,20 @@ loco::NodeShape infer_concatenation(const luci::CircleConcatenation *node)
for (uint32_t i = 1; i < node->numValues(); ++i)
{
- auto input_shape = loco::shape_get(node->values(i)).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->values(i)).as<loco::TensorShape>();
for (uint32_t j = 0; j < output_shape.rank(); ++j)
{
if (j == static_cast<uint32_t>(axis))
+ {
+ // If a dimension is unknown, value() returns 0.
+ // This is wrong, but this code is kept as-is for compatibility until
+ // the new inference algorithm is implemented.
output_shape.dim(j) = output_shape.dim(j).value() + input_shape.dim(j).value();
+ }
else
- assert(output_shape.dim(j) == input_shape.dim(j));
+ assert(!output_shape.dim(j).known() || !input_shape.dim(j).known() ||
+ output_shape.dim(j) == input_shape.dim(j));
}
}
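// Hedged sketch of the alternative the comment above alludes to: propagate
// "unknown" through the concat axis instead of adding value() == 0. AxisDim
// and concat_axis_dim are illustrative names, not the code this patch adopts.
#include <cstdint>

struct AxisDim
{
  bool known;
  uint32_t value;
};

inline AxisDim concat_axis_dim(AxisDim a, AxisDim b)
{
  if (!a.known || !b.known)
    return AxisDim{false, 0}; // any unknown input makes the result unknown
  return AxisDim{true, a.value + b.value};
}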
@@ -545,8 +566,8 @@ loco::NodeShape infer_conv2d(const luci::CircleConv2D *node)
{
LOGGER(l);
- auto ifm_shape = loco::shape_get(node->input()).as<loco::TensorShape>(); // in NHWC
- auto ker_shape = loco::shape_get(node->filter()).as<loco::TensorShape>(); // in OHWI
+ auto ifm_shape = luci::shape_get(node->input()).as<loco::TensorShape>(); // in NHWC
+ auto ker_shape = luci::shape_get(node->filter()).as<loco::TensorShape>(); // in OHWI
INFO(l) << "[luci] CircleConv2D ShapeInf ifm(" << ifm_shape.rank() << ") ker(" << ker_shape.rank()
<< ")" << std::endl;
@@ -569,7 +590,7 @@ loco::NodeShape infer_conv2d(const luci::CircleConv2D *node)
loco::NodeShape infer_depth_to_space(const luci::CircleDepthToSpace *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
LUCI_ASSERT(input_shape.rank() == 4, "Only input rank 4 is supported");
// Only data format NHWC is supported
@@ -601,12 +622,13 @@ loco::NodeShape infer_depth_to_space(const luci::CircleDepthToSpace *node)
loco::NodeShape infer_depthwise_conv2d(const luci::CircleDepthwiseConv2D *node)
{
- auto ifm_shape = loco::shape_get(node->input()).as<loco::TensorShape>(); // in NHWC
- auto ker_shape = loco::shape_get(node->filter()).as<loco::TensorShape>(); // in 1 H W CM
+ auto ifm_shape = luci::shape_get(node->input()).as<loco::TensorShape>(); // in NHWC
+ auto ker_shape = luci::shape_get(node->filter()).as<loco::TensorShape>(); // in 1 H W CM
assert(ifm_shape.rank() == 4);
assert(ker_shape.rank() == 4);
assert(ker_shape.dim(0).value() == 1);
+ assert(ifm_shape.dim(3).value() * node->depthMultiplier() == ker_shape.dim(3).value());
auto os = infer_conv2d_type(node);
@@ -623,7 +645,7 @@ loco::NodeShape infer_depthwise_conv2d(const luci::CircleDepthwiseConv2D *node)
loco::NodeShape infer_expand_dims(const luci::CircleExpandDims *node)
{
const loco::DataType S32 = loco::DataType::S32;
- auto x_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto x_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
if (x_shape.rank() == 0)
{
// This may be for an unknown shape. We use the shape from the node itself.
@@ -637,7 +659,7 @@ loco::NodeShape infer_expand_dims(const luci::CircleExpandDims *node)
}
int32_t axis = const_axis->at<S32>(0);
LUCI_ASSERT((axis <= static_cast<int32_t>(x_shape.rank())) &&
- (axis >= -1 - static_cast<int32_t>(x_shape.rank())),
+ (axis >= -1 - static_cast<int32_t>(x_shape.rank())),
"Axis has to be between [-(D+1), D], where D is rank of input.");
size_t positive_axis = axis < 0 ? x_shape.rank() + axis + 1 : axis;
loco::TensorShape output_shape;
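// Worked example of the axis normalization above for input rank D == 2:
// axis must lie in [-(D+1), D] == [-3, 2], and negative values map to
// positive_axis = D + axis + 1:
//   axis = -3  ->  positive_axis = 2 - 3 + 1 = 0   (new leading dim)
//   axis = -1  ->  positive_axis = 2 - 1 + 1 = 2   (new trailing dim)
//   axis =  2  ->  positive_axis = 2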
@@ -684,8 +706,8 @@ loco::NodeShape infer_fill(const luci::CircleFill *node)
loco::NodeShape infer_fully_connected(const luci::CircleFullyConnected *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
- auto weights_shape = loco::shape_get(node->weights()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+ auto weights_shape = luci::shape_get(node->weights()).as<loco::TensorShape>();
// Checking shape capability for fully connected layer
// Input: a tensor of at least rank 2 [D1, D2, ... Dn]
@@ -715,8 +737,8 @@ loco::NodeShape infer_gather(const luci::CircleGather *node)
{
loco::TensorShape output_shape;
- const auto input_shape = loco::shape_get(node->params()).as<loco::TensorShape>();
- const auto positions_shape = loco::shape_get(node->indices()).as<loco::TensorShape>();
+ const auto input_shape = luci::shape_get(node->params()).as<loco::TensorShape>();
+ const auto positions_shape = luci::shape_get(node->indices()).as<loco::TensorShape>();
int32_t axis = node->axis();
// If CircleGather input has a dynamic shape, it can't infer this shape. So, it returns the
@@ -743,8 +765,8 @@ loco::NodeShape infer_gather_nd(const luci::CircleGatherNd *node)
{
loco::TensorShape output_shape;
- const auto params_shape = loco::shape_get(node->params()).as<loco::TensorShape>();
- const auto indices_shape = loco::shape_get(node->indices()).as<loco::TensorShape>();
+ const auto params_shape = luci::shape_get(node->params()).as<loco::TensorShape>();
+ const auto indices_shape = luci::shape_get(node->indices()).as<loco::TensorShape>();
const auto params_rank = params_shape.rank();
const auto indices_rank = indices_shape.rank();
@@ -791,7 +813,7 @@ loco::NodeShape infer_matrix_diag(const luci::CircleMatrixDiag *node)
{
loco::TensorShape output_shape;
- auto diagonal_shape = loco::shape_get(node->diagonal()).as<loco::TensorShape>();
+ auto diagonal_shape = luci::shape_get(node->diagonal()).as<loco::TensorShape>();
auto rank = diagonal_shape.rank();
output_shape.rank(rank + 1);
@@ -808,8 +830,8 @@ loco::NodeShape infer_matrix_diag(const luci::CircleMatrixDiag *node)
loco::NodeShape infer_matrix_set_diag(const luci::CircleMatrixSetDiag *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
- auto diagonal_shape = loco::shape_get(node->diagonal()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+ auto diagonal_shape = luci::shape_get(node->diagonal()).as<loco::TensorShape>();
auto rank = diagonal_shape.rank();
@@ -831,7 +853,7 @@ loco::TensorShape infer_reducer(const loco::Node *input, const loco::Node *indic
{
const loco::DataType S32 = loco::DataType::S32;
- auto input_shape = loco::shape_get(input).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(input).as<loco::TensorShape>();
auto reduction_indices = loco::must_cast<const luci::CircleConst *>(indices);
{ // Exceptions
@@ -892,7 +914,7 @@ loco::NodeShape infer_mirror_pad(const luci::CircleMirrorPad *node)
loco::NodeShape infer_one_hot(const luci::CircleOneHot *node)
{
const loco::DataType S32 = loco::DataType::S32;
- auto indices_shape = loco::shape_get(node->indices()).as<loco::TensorShape>();
+ auto indices_shape = luci::shape_get(node->indices()).as<loco::TensorShape>();
// Only support OneHot whose depth() is a CircleConst of type S32
// TODO support depth with other types
auto depth = loco::must_cast<luci::CircleConst *>(node->depth());
@@ -925,11 +947,11 @@ loco::NodeShape infer_pack(const luci::CirclePack *node)
{
LUCI_ASSERT(node->values_count() > 0, "Only support one or more inputs");
- auto first_shape = loco::shape_get(node->values(0)).as<loco::TensorShape>();
+ auto first_shape = luci::shape_get(node->values(0)).as<loco::TensorShape>();
// Make sure all inputs have the same shape.
for (uint32_t i = 1; i < node->values_count(); ++i)
{
- auto in_shape = loco::shape_get(node->values(i)).as<loco::TensorShape>();
+ auto in_shape = luci::shape_get(node->values(i)).as<loco::TensorShape>();
LUCI_ASSERT(loco::NodeShape{first_shape} == loco::NodeShape{in_shape},
"All inputs must have the same shape");
}
@@ -985,8 +1007,8 @@ loco::NodeShape infer_pad_v2(const luci::CirclePadV2 *node)
loco::NodeShape infer_p_relu(const luci::CirclePRelu *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
- auto alpha_shape = loco::shape_get(node->alpha()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+ auto alpha_shape = luci::shape_get(node->alpha()).as<loco::TensorShape>();
auto output_shape = broadcast_shape(input_shape, alpha_shape);
@@ -1087,10 +1109,12 @@ loco::NodeShape infer_reshape(const luci::CircleReshape *node)
loco::TensorShape output_shape = shape_by_input;
// One of the dimensions can have special value -1, meaning its actual value should be inferred.
- const auto input_shape = loco::shape_get(node->tensor()).as<loco::TensorShape>();
- const uint32_t input_element_count = loco::element_count(&input_shape);
+ const auto input_shape = luci::shape_get(node->tensor()).as<loco::TensorShape>();
+ uint32_t input_element_count = 1;
uint32_t output_element_count = 1;
uint32_t unknown_dim_index = UINT32_MAX;
+ for (uint32_t i = 0; i < input_shape.rank(); ++i)
+ input_element_count *= (input_shape.dim(i).known() ? input_shape.dim(i).value() : 1);
for (uint32_t dim_index = 0; dim_index < output_shape.rank(); ++dim_index)
{
const uint32_t dim_value = output_shape.dim(dim_index).value();
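// Sketch of the element-count rule used above: unknown extents contribute a
// neutral factor of 1, so a single -1 in the target shape can still be
// resolved. element_count_unknown_as_one is an illustrative helper name.
#include <cstdint>
#include <vector>

struct Extent
{
  bool known;
  uint32_t value;
};

inline uint32_t element_count_unknown_as_one(const std::vector<Extent> &dims)
{
  uint32_t count = 1;
  for (const auto &d : dims)
    count *= (d.known ? d.value : 1); // unknown -> treated as 1
  return count;
}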
@@ -1114,7 +1138,7 @@ loco::NodeShape infer_reshape(const luci::CircleReshape *node)
loco::NodeShape infer_resize_bilinear(const luci::CircleResizeBilinear *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
if (input_shape.rank() != 4)
INTERNAL_EXN("Expected ResizeBilinear input to have rank 4");
@@ -1142,7 +1166,7 @@ loco::NodeShape infer_resize_bilinear(const luci::CircleResizeBilinear *node)
loco::NodeShape infer_resize_nearest_neighbor(const luci::CircleResizeNearestNeighbor *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
if (input_shape.rank() != 4)
INTERNAL_EXN("Expected ResizeNearesNeighbor input to have rank 4");
@@ -1195,8 +1219,8 @@ loco::NodeShape infer_scatter_nd(const luci::CircleScatterNd *node)
loco::NodeShape infer_segment_sum(const luci::CircleSegmentSum *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
- auto segment_shape = loco::shape_get(node->segment_ids()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
+ auto segment_shape = luci::shape_get(node->segment_ids()).as<loco::TensorShape>();
LUCI_ASSERT(segment_shape.rank() == 1, "segment_ids must be 1-D tensor");
LUCI_ASSERT(segment_shape.dim(0).value() == input_shape.dim(0).value(),
@@ -1226,11 +1250,11 @@ loco::NodeShape infer_segment_sum(const luci::CircleSegmentSum *node)
loco::NodeShape infer_select(const luci::CircleSelect *node)
{
- auto t_shape = loco::shape_get(node->t()).as<loco::TensorShape>();
- assert(t_shape == loco::shape_get(node->e()).as<loco::TensorShape>());
+ auto t_shape = luci::shape_get(node->t()).as<loco::TensorShape>();
+ assert(t_shape == luci::shape_get(node->e()).as<loco::TensorShape>());
// condition shape validation
- auto c_shape = loco::shape_get(node->condition()).as<loco::TensorShape>();
+ auto c_shape = luci::shape_get(node->condition()).as<loco::TensorShape>();
if (c_shape.rank() != t_shape.rank())
{
if (c_shape.rank() != 0 && c_shape.rank() != 1)
@@ -1248,9 +1272,9 @@ loco::NodeShape infer_select(const luci::CircleSelect *node)
loco::NodeShape infer_select_v2(const luci::CircleSelectV2 *node)
{
- auto c_shape = loco::shape_get(node->condition()).as<loco::TensorShape>();
- auto t_shape = loco::shape_get(node->t()).as<loco::TensorShape>();
- auto e_shape = loco::shape_get(node->e()).as<loco::TensorShape>();
+ auto c_shape = luci::shape_get(node->condition()).as<loco::TensorShape>();
+ auto t_shape = luci::shape_get(node->t()).as<loco::TensorShape>();
+ auto e_shape = luci::shape_get(node->e()).as<loco::TensorShape>();
// validate ability to broadcast shapes to each other
auto b_shape = broadcast_shape(broadcast_shape(c_shape, t_shape), e_shape);
@@ -1259,7 +1283,7 @@ loco::NodeShape infer_select_v2(const luci::CircleSelectV2 *node)
loco::NodeShape infer_shape(const luci::CircleShape *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
loco::TensorShape output_shape;
@@ -1274,7 +1298,7 @@ loco::NodeShape infer_slice(const luci::CircleSlice *node)
const loco::DataType S32 = loco::DataType::S32;
const loco::DataType S64 = loco::DataType::S64;
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
auto const_begin = loco::must_cast<luci::CircleConst *>(node->begin());
auto const_size = loco::must_cast<luci::CircleConst *>(node->size());
@@ -1318,7 +1342,7 @@ loco::NodeShape infer_space_to_batch_nd(const luci::CircleSpaceToBatchND *node)
{
const loco::DataType S32 = loco::DataType::S32;
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
// Support only input ranks 3 and 4
assert(input_shape.rank() == 3 || input_shape.rank() == 4);
@@ -1330,8 +1354,8 @@ loco::NodeShape infer_space_to_batch_nd(const luci::CircleSpaceToBatchND *node)
auto const_paddings = loco::must_cast<luci::CircleConst *>(node->paddings());
LUCI_ASSERT(const_paddings->dtype() == S32, "Only support int32 paddings");
- auto const_block_shape_shape = loco::shape_get(const_block_shape).as<loco::TensorShape>();
- auto const_paddings_shape = loco::shape_get(const_paddings).as<loco::TensorShape>();
+ auto const_block_shape_shape = luci::shape_get(const_block_shape).as<loco::TensorShape>();
+ auto const_paddings_shape = luci::shape_get(const_paddings).as<loco::TensorShape>();
assert(const_block_shape_shape.rank() == 1);
assert(const_paddings_shape.rank() == 2);
@@ -1374,7 +1398,7 @@ loco::NodeShape infer_space_to_batch_nd(const luci::CircleSpaceToBatchND *node)
loco::NodeShape infer_space_to_depth(const luci::CircleSpaceToDepth *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
LUCI_ASSERT(input_shape.rank() == 4, "Only input rank 4 is supported");
// Only data format NHWC is supported
@@ -1412,19 +1436,33 @@ loco::NodeShape infer_sparse_to_dense(const luci::CircleSparseToDense *node)
auto output_shape_node = dynamic_cast<luci::CircleConst *>(node->output_shape());
if (output_shape_node != nullptr)
{
- // Only support node with S32
- LUCI_ASSERT(output_shape_node->dtype() == loco::DataType::S32,
- "Only support int32 CircleConst");
+ const auto output_shape_type = output_shape_node->dtype();
if (output_shape_node->rank() != 1)
INTERNAL_EXN_V("Only support rank 1 CircleConst",
oops::to_uint32(output_shape_node->rank()));
- shape.rank(output_shape_node->size<loco::DataType::S32>());
+ if (output_shape_type == loco::DataType::S32)
+ {
+ shape.rank(output_shape_node->size<loco::DataType::S32>());
- for (uint32_t axis = 0; axis < shape.rank(); ++axis)
+ for (uint32_t axis = 0; axis < shape.rank(); ++axis)
+ {
+ shape.dim(axis) = output_shape_node->at<loco::DataType::S32>(axis);
+ }
+ }
+ else if (output_shape_type == loco::DataType::S64)
{
- shape.dim(axis) = output_shape_node->at<loco::DataType::S32>(axis);
+ shape.rank(output_shape_node->size<loco::DataType::S64>());
+
+ for (uint32_t axis = 0; axis < shape.rank(); ++axis)
+ {
+ shape.dim(axis) = output_shape_node->at<loco::DataType::S64>(axis);
+ }
+ }
+ else
+ {
+ INTERNAL_EXN("Output shape of SparseToDense must be either int32 or int64");
}
}
else
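// The S32/S64 branches added above differ only in the element type; a hedged
// sketch of folding them with a template. read_const_shape is an illustrative
// name, and this assumes the luci::CircleConst accessors size<DT>() and
// at<DT>() used in this hunk.
template <loco::DataType DT>
void read_const_shape(const luci::CircleConst *n, loco::TensorShape &shape)
{
  shape.rank(n->size<DT>());
  for (uint32_t axis = 0; axis < shape.rank(); ++axis)
    shape.dim(axis) = n->at<DT>(axis); // copy each extent into the shape
}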
@@ -1453,7 +1491,7 @@ loco::NodeShape infer_strided_slice(const luci::CircleStridedSlice *node)
loco::NodeShape infer_squeeze(const luci::CircleSqueeze *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
// TODO input shape may be unknown before runtime
std::vector<bool> do_squeeze(input_shape.rank(), false);
@@ -1508,7 +1546,7 @@ loco::NodeShape infer_tile(const luci::CircleTile *node)
{
const loco::DataType S32 = loco::DataType::S32;
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
auto multiples = loco::must_cast<luci::CircleConst *>(node->multiples());
// TODO support non-const case
@@ -1534,7 +1572,7 @@ loco::NodeShape infer_tile(const luci::CircleTile *node)
loco::NodeShape infer_transpose(const luci::CircleTranspose *node)
{
- auto input_shape = loco::shape_get(node->a()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->a()).as<loco::TensorShape>();
auto perm_node = loco::must_cast<luci::CircleConst *>(node->perm());
@@ -1576,7 +1614,7 @@ loco::NodeShape infer_unpack(const luci::CircleUnpack *node)
// CircleUnpack provides a list (array) of Tensors, each having one less dimension than the input
// We'll set the shape of CircleUnpack to the shape of the actual outputs
// TODO fix this if any problem arises
- auto value_shape = loco::shape_get(node->value()).as<loco::TensorShape>();
+ auto value_shape = luci::shape_get(node->value()).as<loco::TensorShape>();
auto axis = node->axis();
auto num = node->num();
@@ -1610,9 +1648,9 @@ loco::NodeShape infer_unpack(const luci::CircleUnpack *node)
loco::NodeShape infer_unidirectionalsequencelstm(const luci::CircleUnidirectionalSequenceLSTM *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
auto recurrent_to_output_weights =
- loco::shape_get(node->recurrent_to_output_weights()).as<loco::TensorShape>();
+ luci::shape_get(node->recurrent_to_output_weights()).as<loco::TensorShape>();
auto rank = input_shape.rank();
loco::TensorShape output_shape;
output_shape.rank(rank);
@@ -1626,7 +1664,7 @@ loco::NodeShape infer_unidirectionalsequencelstm(const luci::CircleUnidirectiona
loco::NodeShape infer_unique(const luci::CircleUnique *node)
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
assert(input_shape.rank() == 1);
@@ -1641,7 +1679,7 @@ loco::NodeShape infer_bcq_fully_connected(const luci::CircleBCQFullyConnected *n
{
loco::TensorShape out_shape;
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
auto weights_clusters = loco::must_cast<luci::CircleConst *>(node->weights_clusters());
LUCI_ASSERT(input_shape.rank() == 2, "Input rank of BCQFullyConnected should be 2");
@@ -1664,8 +1702,8 @@ loco::NodeShape infer_bcq_gather(const luci::CircleBCQGather *node)
loco::TensorShape input_shape;
loco::TensorShape output_shape;
- const auto input_binary_shape = loco::shape_get(node->input_binary()).as<loco::TensorShape>();
- const auto indices_shape = loco::shape_get(node->indices()).as<loco::TensorShape>();
+ const auto input_binary_shape = luci::shape_get(node->input_binary()).as<loco::TensorShape>();
+ const auto indices_shape = luci::shape_get(node->indices()).as<loco::TensorShape>();
auto axis = node->axis();
auto input_clusters = loco::must_cast<luci::CircleConst *>(node->input_clusters());
@@ -1712,46 +1750,6 @@ loco::NodeShape infer_output(const luci::CircleOutput *node)
return loco::NodeShape{*output_shape};
}
-loco::NodeShape infer_if_out(const luci::CircleIfOut *node)
-{
- /**
- * @note IF operator type and shape are that of the "then" and "else"
- * Graph Outputs.
- */
- auto circle_if = dynamic_cast<const luci::CircleIf *>(node->input());
- if (circle_if == nullptr)
- {
- INTERNAL_EXN("CircleIf IR is not configured correctly");
- }
-
- auto index = node->index();
- auto then_graph = circle_if->then_graph();
- auto else_graph = circle_if->else_graph();
- assert(then_graph != nullptr);
- assert(else_graph != nullptr);
-
- // shape and type are assumed to be same
- // these are checked at post_import_graph() in Import
- auto then_outputs = loco::output_nodes(then_graph);
- auto else_outputs = loco::output_nodes(else_graph);
- assert(then_outputs.size() == else_outputs.size());
- assert(index < static_cast<int32_t>(then_outputs.size()));
-
- auto then_out = loco::must_cast<luci::CircleOutput *>(then_outputs.at(index));
- auto else_out = loco::must_cast<luci::CircleOutput *>(else_outputs.at(index));
-
- auto then_graph_outputs = then_graph->outputs(); // loco::GraphOutput items
- auto else_graph_outputs = else_graph->outputs();
- assert(then_graph_outputs->size() == else_graph_outputs->size());
-
- auto then_graph_output = then_graph_outputs->at(then_out->index());
- auto else_graph_output = else_graph_outputs->at(else_out->index());
- (void)else_graph_output; // make compiler happy for unused variable warnings
- assert(*then_graph_output->shape() == *else_graph_output->shape());
-
- return loco::NodeShape{*then_graph_output->shape()};
-}
-
loco::NodeShape infer_non_max_suppression_v4_out(const luci::CircleNonMaxSuppressionV4Out *node)
{
const loco::DataType S32 = loco::DataType::S32;
@@ -1818,7 +1816,7 @@ loco::NodeShape infer_split_out(const luci::CircleSplitOut *node)
loco::NodeShape unknown;
- auto split_shape = loco::shape_get(split).as<loco::TensorShape>();
+ auto split_shape = luci::shape_get(split).as<loco::TensorShape>();
auto split_dim = dynamic_cast<const luci::CircleConst *>(split->split_dim());
if (split_dim == nullptr)
@@ -1852,7 +1850,7 @@ loco::NodeShape infer_split_v_out(const luci::CircleSplitVOut *node)
loco::NodeShape unknown;
- auto split_shape = loco::shape_get(split).as<loco::TensorShape>();
+ auto split_shape = luci::shape_get(split).as<loco::TensorShape>();
auto size_splits = dynamic_cast<const luci::CircleConst *>(split->size_splits());
if (size_splits == nullptr)
@@ -1913,7 +1911,7 @@ loco::NodeShape infer_top_k_v2_out(const luci::CircleTopKV2Out *node)
INTERNAL_EXN("CircleSplit IR is not configured correctly");
// shape of topkv2 is the same as topkv2->input()
- auto input_shape = loco::shape_get(topkv2).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(topkv2).as<loco::TensorShape>();
auto node_k = loco::must_cast<const luci::CircleConst *>(topkv2->k());
LUCI_ASSERT(node_k->dtype() == S32, "Only support Int32");
@@ -1940,7 +1938,7 @@ loco::NodeShape infer_unique_out(const luci::CircleUniqueOut *node)
}
assert(node->index() == 1);
auto unique = loco::must_cast<luci::CircleUnique *>(node->input());
- auto unique_shape = loco::shape_get(unique->input()).as<loco::TensorShape>();
+ auto unique_shape = luci::shape_get(unique->input()).as<loco::TensorShape>();
assert(unique_shape.rank() == 1);
@@ -1958,7 +1956,7 @@ loco::NodeShape infer_unpack_out(const luci::CircleUnpackOut *node)
INTERNAL_EXN("CircleUnpack IR is not configured correctly");
}
- auto unpack_shape = loco::shape_get(unpack).as<loco::TensorShape>();
+ auto unpack_shape = luci::shape_get(unpack).as<loco::TensorShape>();
return loco::NodeShape{unpack_shape};
}
@@ -2025,8 +2023,8 @@ public:
loco::NodeShape visit(const luci::CircleBatchMatMul *node) final
{
- auto x_shape = loco::shape_get(node->x()).as<loco::TensorShape>();
- auto y_shape = loco::shape_get(node->y()).as<loco::TensorShape>();
+ auto x_shape = luci::shape_get(node->x()).as<loco::TensorShape>();
+ auto y_shape = luci::shape_get(node->y()).as<loco::TensorShape>();
return infer_batchmatmul_shape(x_shape, y_shape, node->adj_x(), node->adj_y());
}
@@ -2065,7 +2063,7 @@ public:
loco::NodeShape visit(const luci::CircleDequantize *node) final
{
- const auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ const auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2073,7 +2071,7 @@ public:
loco::NodeShape visit(const luci::CircleElu *node) final
{
- auto input_shape = loco::shape_get(node->features()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->features()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2087,6 +2085,8 @@ public:
return infer_expand_dims(node);
}
+ loco::NodeShape visit(const luci::CircleFakeQuant *node) final { return use_inputs(node); }
+
loco::NodeShape visit(const luci::CircleFill *node) final { return infer_fill(node); }
loco::NodeShape visit(const luci::CircleFloor *node) final { return use_x(node); }
@@ -2112,7 +2112,7 @@ public:
{
// Shape of CircleIf is not used. Just use input 0
assert(node->input_count() > 0);
- const auto input_shape = loco::shape_get(node->input(0)).as<loco::TensorShape>();
+ const auto input_shape = luci::shape_get(node->input(0)).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2125,7 +2125,7 @@ public:
loco::NodeShape visit(const luci::CircleLeakyRelu *node) final
{
- const auto input_shape = loco::shape_get(node->features()).as<loco::TensorShape>();
+ const auto input_shape = luci::shape_get(node->features()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2135,7 +2135,7 @@ public:
loco::NodeShape visit(const luci::CircleLocalResponseNormalization *node) final
{
- const auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ const auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2184,13 +2184,13 @@ public:
loco::NodeShape visit(const luci::CircleNonMaxSuppressionV4 *node) final
{
- const auto boxes_shape = loco::shape_get(node->boxes()).as<loco::TensorShape>();
+ const auto boxes_shape = luci::shape_get(node->boxes()).as<loco::TensorShape>();
return loco::NodeShape{boxes_shape};
}
loco::NodeShape visit(const luci::CircleNonMaxSuppressionV5 *node) final
{
- const auto boxes_shape = loco::shape_get(node->boxes()).as<loco::TensorShape>();
+ const auto boxes_shape = luci::shape_get(node->boxes()).as<loco::TensorShape>();
return loco::NodeShape{boxes_shape};
}
@@ -2244,21 +2244,21 @@ public:
loco::NodeShape visit(const luci::CircleRelu *node) final
{
- auto input_shape = loco::shape_get(node->features()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->features()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
loco::NodeShape visit(const luci::CircleRelu6 *node) final
{
- auto input_shape = loco::shape_get(node->features()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->features()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
loco::NodeShape visit(const luci::CircleReluN1To1 *node) final
{
- auto input_shape = loco::shape_get(node->features()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->features()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2284,7 +2284,7 @@ public:
loco::NodeShape visit(const luci::CircleReverseSequence *node) final
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2293,9 +2293,9 @@ public:
loco::NodeShape visit(const luci::CircleReverseV2 *node) final
{
- auto input_shape = loco::shape_get(node->tensor()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->tensor()).as<loco::TensorShape>();
- LUCI_ASSERT(loco::shape_get(node->axis()).as<loco::TensorShape>().rank() == 1,
+ LUCI_ASSERT(luci::shape_get(node->axis()).as<loco::TensorShape>().rank() == 1,
"Tensor must be 1-D");
return loco::NodeShape{input_shape};
@@ -2340,14 +2340,14 @@ public:
loco::NodeShape visit(const luci::CircleSplit *node) final
{
// We'll set Split output the same as its input so that SplitOut can handle its own shape
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
loco::NodeShape visit(const luci::CircleSplitV *node) final
{
// We'll set SplitV output the same as its input so that SplitOut can handle its own shape
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2382,7 +2382,7 @@ public:
loco::NodeShape visit(const luci::CircleTopKV2 *node) final
{
// set shape of this node the same as its input
- const auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ const auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2408,13 +2408,13 @@ public:
{
// Shape of CircleWhile is not used. Just use input 0
assert(node->arity() > 0);
- const auto input_shape = loco::shape_get(node->input(0)).as<loco::TensorShape>();
+ const auto input_shape = luci::shape_get(node->input(0)).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
loco::NodeShape visit(const luci::CircleZerosLike *node) final
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2429,7 +2429,7 @@ public:
loco::NodeShape visit(const luci::CircleInstanceNorm *node) final
{
- auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+ auto input_shape = luci::shape_get(node->input()).as<loco::TensorShape>();
return loco::NodeShape{input_shape};
}
@@ -2445,8 +2445,6 @@ public:
loco::NodeShape visit(const luci::CircleCustomOut *node) final { return use_own(node); }
- loco::NodeShape visit(const luci::CircleIfOut *node) final { return infer_if_out(node); }
-
loco::NodeShape visit(const luci::CircleNonMaxSuppressionV4Out *node) final
{
return infer_non_max_suppression_v4_out(node);
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.test.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.test.cpp
deleted file mode 100644
index ac27db3bd..000000000
--- a/compiler/luci/service/src/CircleShapeInferenceRule.test.cpp
+++ /dev/null
@@ -1,626 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "TestGraph.h"
-#include "luci/Service/CircleShapeInferenceRule.h"
-
-#include <luci/IR/CircleNodes.h>
-#include <luci/IR/CircleDialect.h>
-
-#include <loco.h>
-#include <loco/IR/CanonicalDialect.h>
-#include <loco/Service/ShapeInference.h>
-#include <loco/Service/CanonicalShapeInferenceRule.h>
-#include <loco/Service/MultiDialectShapeInferenceRule.h>
-
-#include <oops/InternalExn.h>
-
-#include <gtest/gtest.h>
-
-#include <memory>
-
-namespace
-{
-
-bool shape_pass(loco::Graph *g)
-{
- loco::CanonicalShapeInferenceRule canonical_rule;
- luci::CircleShapeInferenceRule circle_rule;
- loco::MultiDialectShapeInferenceRule rules;
-
- rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(luci::CircleDialect::get(), &circle_rule);
-
- return loco::apply(&rules).to(g);
-}
-
-} // namespace
-
-TEST(CircleShapeInferenceRuleTest, minimal_with_CircleRelu)
-{
- // Create a simple network
- luci::test::TestGraph graph;
- auto relu_node = graph.append<luci::CircleRelu>(graph.input_node);
- graph.complete(relu_node);
-
- // set shape
- {
- graph.input_node->rank(2);
- graph.input_node->dim(0) = 3;
- graph.input_node->dim(1) = 4;
-
- graph.output_node->rank(2);
- graph.output_node->dim(0) = 3;
- graph.output_node->dim(1) = 4;
-
- luci::test::graph_input_shape(graph.input_node);
- luci::test::graph_output_shape(graph.output_node);
- }
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(relu_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(relu_node));
- ASSERT_EQ(loco::Domain::Tensor, loco::shape_get(relu_node).domain());
-
- auto shape = loco::shape_get(relu_node).as<loco::TensorShape>();
- ASSERT_EQ(2, shape.rank());
- ASSERT_EQ(3, shape.dim(0));
- ASSERT_EQ(4, shape.dim(1));
- }
-}
-
-// based on the case shown in
-// https://www.corvil.com/kb/what-is-the-difference-between-same-and-valid-padding-in-tf-nn-max-pool-of-tensorflow
-TEST(CircleShapeInferenceRuleTest, avgpool2d_valid)
-{
- luci::test::TestGraph graph;
- auto avg_node = graph.append<luci::CircleAveragePool2D>(graph.input_node);
- graph.complete();
-
- auto input_node = graph.input_node;
- {
- input_node->shape({1, 4, 3, 1});
- luci::test::graph_input_shape(input_node);
- }
- auto output_node = graph.output_node;
- {
- output_node->shape({1, 2, 1, 1});
- luci::test::graph_output_shape(output_node);
- }
- // setting CircleAveragePool2D
- {
- avg_node->filter()->h(2);
- avg_node->filter()->w(2);
- avg_node->stride()->h(2);
- avg_node->stride()->w(2);
- avg_node->fusedActivationFunction(luci::FusedActFunc::NONE);
- avg_node->padding(luci::Padding::VALID);
- }
- ASSERT_FALSE(loco::shape_known(avg_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(avg_node));
- ASSERT_EQ(loco::Domain::Tensor, loco::shape_get(avg_node).domain());
-
- auto shape = loco::shape_get(avg_node).as<loco::TensorShape>();
- ASSERT_EQ(4, shape.rank());
- ASSERT_EQ(1, shape.dim(0).value());
- ASSERT_EQ(2, shape.dim(1).value());
- ASSERT_EQ(1, shape.dim(2).value());
- ASSERT_EQ(1, shape.dim(3).value());
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, avgpool2d_same)
-{
- luci::test::TestGraph graph;
- auto avg_node = graph.append<luci::CircleAveragePool2D>(graph.input_node);
- graph.complete();
-
- auto input_node = graph.input_node;
- {
- input_node->shape({1, 4, 3, 1});
- luci::test::graph_input_shape(input_node);
- }
- auto output_node = graph.output_node;
- {
- output_node->shape({1, 2, 2, 1});
- luci::test::graph_output_shape(output_node);
- }
-
- // setting CircleAveragePool2D
- {
- avg_node->filter()->h(2);
- avg_node->filter()->w(2);
- avg_node->stride()->h(2);
- avg_node->stride()->w(2);
- avg_node->fusedActivationFunction(luci::FusedActFunc::NONE);
- avg_node->padding(luci::Padding::SAME);
- }
-
- ASSERT_FALSE(loco::shape_known(avg_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(avg_node));
- ASSERT_EQ(loco::Domain::Tensor, loco::shape_get(avg_node).domain());
-
- auto shape = loco::shape_get(avg_node).as<loco::TensorShape>();
- ASSERT_EQ(4, shape.rank());
- ASSERT_EQ(1, shape.dim(0).value());
- ASSERT_EQ(2, shape.dim(1).value());
- ASSERT_EQ(2, shape.dim(2).value());
- ASSERT_EQ(1, shape.dim(3).value());
- }
-}
-
-/**
- * @note Function to test: Shape inference of two different input shapes
- *
- * Rank expansion to higher input side
- * x(2,1,5) + y(3,5) --> x(2,1,5) + y(1,3,5)
- * Do output shape inference like numpy
- * x(2,1,5) + y(1,3,5) --> output(2,3,5)
- * For each axis, dim value should be same OR one of them should be 1
- */
-TEST(CircleShapeInferenceRuleTest, TFAdd_shapeinf_different)
-{
- auto g = loco::make_graph();
-
- auto x_node = g->nodes()->create<luci::CircleInput>();
- {
- x_node->rank(3);
- x_node->dim(0) = 2;
- x_node->dim(1) = 1;
- x_node->dim(2) = 5;
- }
- auto y_node = g->nodes()->create<luci::CircleInput>();
- {
- y_node->rank(2);
- y_node->dim(0) = 3;
- y_node->dim(1) = 5;
- }
- auto add_node = g->nodes()->create<luci::CircleAdd>();
- {
- add_node->x(x_node);
- add_node->y(y_node);
- }
- auto output_node = g->nodes()->create<luci::CircleOutput>();
- {
- output_node->from(add_node);
- }
-
- auto x_input = g->inputs()->create();
- {
- x_input->name("x");
- luci::link(x_input, x_node);
- }
- auto y_input = g->inputs()->create();
- {
- y_input->name("y");
- luci::link(y_input, y_node);
- }
- auto output = g->outputs()->create();
- {
- output->name("output");
- luci::link(output, output_node);
- }
-
- luci::test::graph_input_shape(x_node);
- luci::test::graph_input_shape(y_node);
- luci::test::graph_output_shape(output_node);
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(add_node));
-
- // shape inference
- while (shape_pass(g.get()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(add_node));
- ASSERT_EQ(loco::Domain::Tensor, loco::shape_get(add_node).domain());
-
- auto shape = loco::shape_get(add_node).as<loco::TensorShape>();
- ASSERT_EQ(3, shape.rank());
- ASSERT_EQ(2, shape.dim(0));
- ASSERT_EQ(3, shape.dim(1));
- ASSERT_EQ(5, shape.dim(2));
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleTranspose_simple)
-{
- luci::test::ExampleGraph<luci::test::ExampleGraphType::CircleTranspose> g;
-
- g.input_node->rank(3);
- g.input_node->dim(0) = 3;
- g.input_node->dim(1) = 8;
- g.input_node->dim(2) = 1;
-
- g.const_perm->dtype(loco::DataType::S32);
- g.const_perm->rank(1);
- g.const_perm->dim(0) = 3;
- g.const_perm->size<loco::DataType::S32>(3);
- g.const_perm->at<loco::DataType::S32>(0) = 1;
- g.const_perm->at<loco::DataType::S32>(1) = 2;
- g.const_perm->at<loco::DataType::S32>(2) = 0;
-
- luci::test::graph_input_shape(g.input_node);
- luci::test::graph_output_shape(g.output_node);
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(g.transpose_node));
-
- // shape inference
- while (shape_pass(g.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(g.transpose_node));
-
- auto shape = loco::shape_get(g.transpose_node).as<loco::TensorShape>();
- ASSERT_EQ(3, shape.rank());
- ASSERT_EQ(8, shape.dim(0));
- ASSERT_EQ(1, shape.dim(1));
- ASSERT_EQ(3, shape.dim(2));
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleSqueeze)
-{
- luci::test::TestGraph graph;
- auto squeeze_node = graph.append<luci::CircleSqueeze>(graph.input_node);
- graph.complete();
-
- auto input_node = graph.input_node;
- {
- input_node->shape({1, 4, 3, 1});
- }
- auto output_node = graph.output_node;
- {
- output_node->shape({4, 3, 1});
- }
-
- luci::test::graph_input_shape(input_node);
- luci::test::graph_output_shape(output_node);
-
- squeeze_node->squeeze_dims({0});
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(squeeze_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(squeeze_node));
-
- auto shape = loco::shape_get(squeeze_node).as<loco::TensorShape>();
- ASSERT_EQ(3, shape.rank());
- ASSERT_EQ(4, shape.dim(0));
- ASSERT_EQ(3, shape.dim(1));
- ASSERT_EQ(1, shape.dim(2));
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleExpandDims)
-{
- luci::test::TestGraph graph;
- auto axis = graph.append<luci::CircleConst>();
- axis->dtype(loco::DataType::S32);
- axis->rank(0);
- axis->size<loco::DataType::S32>(1);
- axis->at<loco::DataType::S32>(0) = 1;
-
- auto expand_dims = graph.append<luci::CircleExpandDims>(graph.input_node, axis);
- graph.complete();
-
- auto input_node = graph.input_node;
- {
- input_node->shape({4, 3});
- }
-
- auto output_node = graph.output_node;
- {
- output_node->from(expand_dims);
- }
-
- luci::test::graph_input_shape(input_node);
- luci::test::graph_output_shape(output_node);
-
- // shape inference
- while (shape_pass(graph.graph()))
- ;
-
- // validation
- {
- ASSERT_TRUE(loco::shape_known(expand_dims));
-
- auto shape = loco::shape_get(expand_dims).as<loco::TensorShape>();
-
- ASSERT_EQ(3, shape.rank());
- ASSERT_EQ(4, shape.dim(0));
- ASSERT_EQ(1, shape.dim(1));
- ASSERT_EQ(3, shape.dim(2));
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleSqueezeAll)
-{
- luci::test::TestGraph graph;
- auto squeeze_node = graph.append<luci::CircleSqueeze>(graph.input_node);
- graph.complete();
-
- auto input_node = graph.input_node;
- {
- input_node->shape({1, 4, 3, 1});
- }
- auto output_node = graph.output_node;
- {
- input_node->shape({4, 3});
- }
-
- luci::test::graph_input_shape(input_node);
- luci::test::graph_output_shape(output_node);
-
- squeeze_node->squeeze_dims({});
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(squeeze_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(squeeze_node));
-
- auto shape = loco::shape_get(squeeze_node).as<loco::TensorShape>();
- ASSERT_EQ(2, shape.rank());
- ASSERT_EQ(4, shape.dim(0));
- ASSERT_EQ(3, shape.dim(1));
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleGatherNd_simple)
-{
- luci::test::TestGraph graph;
- auto indices_const = graph.append<luci::CircleConst>();
- auto gather_nd_node = graph.append<luci::CircleGatherNd>(graph.input_node, indices_const);
- graph.complete();
-
- {
- auto input_node = graph.input_node;
- input_node->shape({1, 4, 4, 3});
- luci::test::graph_input_shape(input_node);
- }
- {
- auto output_node = graph.output_node;
- output_node->shape({1, 2, 2, 3});
- luci::test::graph_output_shape(output_node);
- }
-
- {
- indices_const->shape({1, 2, 3});
- }
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(gather_nd_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(gather_nd_node));
-
- auto shape = loco::shape_get(gather_nd_node).as<loco::TensorShape>();
- ASSERT_EQ(3, shape.rank());
- ASSERT_EQ(1, shape.dim(0));
- ASSERT_EQ(2, shape.dim(1));
- ASSERT_EQ(3, shape.dim(2));
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleGatherNd_slices)
-{
- luci::test::TestGraph graph;
- auto indices_const = graph.append<luci::CircleConst>();
- auto gather_nd_node = graph.append<luci::CircleGatherNd>(graph.input_node, indices_const);
- graph.complete();
-
- {
- auto input_node = graph.input_node;
- input_node->shape({1, 4, 4, 3});
- luci::test::graph_input_shape(input_node);
- }
- {
- auto output_node = graph.output_node;
- output_node->shape({1, 2, 4, 4, 3});
- luci::test::graph_output_shape(output_node);
- }
-
- {
- indices_const->shape({1, 2, 1});
- }
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(gather_nd_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(gather_nd_node));
-
- auto shape = loco::shape_get(gather_nd_node).as<loco::TensorShape>();
- ASSERT_EQ(5, shape.rank());
- ASSERT_EQ(1, shape.dim(0));
- ASSERT_EQ(2, shape.dim(1));
- ASSERT_EQ(4, shape.dim(2));
- ASSERT_EQ(4, shape.dim(3));
- ASSERT_EQ(3, shape.dim(4));
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleGatherNd_NEG)
-{
- luci::test::TestGraph graph;
- auto indices_const = graph.append<luci::CircleConst>();
- auto gather_nd_node = graph.append<luci::CircleGatherNd>(graph.input_node, indices_const);
- graph.complete();
-
- {
- auto input_node = graph.input_node;
- input_node->shape({1, 4, 4, 3});
- luci::test::graph_input_shape(input_node);
- }
- {
- // Does not matter, because test should fail anyway
- auto output_node = graph.output_node;
- output_node->shape({0, 0, 0});
- luci::test::graph_output_shape(output_node);
- }
-
- {
- indices_const->shape({1, 2, 5});
- }
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(gather_nd_node));
-
- // had to pack into lambda to check throw
- auto lambda = [&]() {
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
- };
-
- ASSERT_THROW(lambda(), oops::InternalExn);
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleResizeNearestNeighbor)
-{
- luci::test::TestGraph graph;
- auto size_const = graph.append<luci::CircleConst>();
- size_const->dtype(loco::DataType::S32);
- size_const->rank(1);
- size_const->dim(0) = 2;
- size_const->size<loco::DataType::S32>(2);
- size_const->at<loco::DataType::S32>(0) = 16;
- size_const->at<loco::DataType::S32>(1) = 16;
- auto resize_node = graph.append<luci::CircleResizeNearestNeighbor>(graph.input_node, size_const);
- graph.complete();
-
- {
- auto input_node = graph.input_node;
- input_node->shape({1, 4, 4, 3});
- luci::test::graph_input_shape(input_node);
- }
- {
- auto output_node = graph.output_node;
- output_node->from(resize_node);
- luci::test::graph_output_shape(output_node);
- }
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(resize_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(resize_node));
-
- auto shape = loco::shape_get(resize_node).as<loco::TensorShape>();
- ASSERT_EQ(4, shape.rank());
- ASSERT_EQ(1, shape.dim(0));
- ASSERT_EQ(16, shape.dim(1));
- ASSERT_EQ(16, shape.dim(2));
- ASSERT_EQ(3, shape.dim(3));
- }
-}
-
-TEST(CircleShapeInferenceRuleTest, CircleResizeBilinear)
-{
- luci::test::TestGraph graph;
- auto size_const = graph.append<luci::CircleConst>();
- size_const->dtype(loco::DataType::S32);
- size_const->rank(1);
- size_const->dim(0) = 2;
- size_const->size<loco::DataType::S32>(2);
- size_const->at<loco::DataType::S32>(0) = 16;
- size_const->at<loco::DataType::S32>(1) = 16;
- auto resize_node = graph.append<luci::CircleResizeBilinear>(graph.input_node, size_const);
- graph.complete();
-
- {
- auto input_node = graph.input_node;
- input_node->shape({1, 4, 4, 3});
- luci::test::graph_input_shape(input_node);
- }
- {
- auto output_node = graph.output_node;
- output_node->from(resize_node);
- luci::test::graph_output_shape(output_node);
- }
-
- // pre-check
- ASSERT_FALSE(loco::shape_known(resize_node));
-
- // shape inference
- while (shape_pass(graph.graph()) == true)
- ;
-
- // Verify
- {
- ASSERT_TRUE(loco::shape_known(resize_node));
-
- auto shape = loco::shape_get(resize_node).as<loco::TensorShape>();
- ASSERT_EQ(4, shape.rank());
- ASSERT_EQ(1, shape.dim(0));
- ASSERT_EQ(16, shape.dim(1));
- ASSERT_EQ(16, shape.dim(2));
- ASSERT_EQ(3, shape.dim(3));
- }
-}
diff --git a/compiler/luci/service/src/CircleShapeSignatureInference.cpp b/compiler/luci/service/src/CircleShapeSignatureInference.cpp
deleted file mode 100644
index 1ccaa19d5..000000000
--- a/compiler/luci/service/src/CircleShapeSignatureInference.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "luci/Service/CircleShapeSignatureInference.h"
-
-#include <luci/Log.h>
-
-namespace
-{
-
-std::ostream &operator<<(std::ostream &os, const luci::ShapeSignature &shape_signature)
-{
- os << "[";
- for (uint32_t r = 0; r < shape_signature.rank(); ++r)
- {
- if (r)
- os << ",";
- os << shape_signature.dim(r);
- }
- os << "]";
- return os;
-}
-
-} // namespace
-
-namespace luci
-{
-
-namespace ssinf
-{
-
-bool Rule::infer(const luci::CircleNode *circle_node, ShapeSignature &shape_signature) const
-{
- LOGGER(l);
-
- // There is nothing to check before ShapeSignatureInference.
-
- Algorithm alg;
-
- shape_signature = circle_node->accept(&alg);
-
- VERBOSE(l, 1) << "[luci] Shape Signature( " << circle_node->name() << " )";
- VERBOSE(l, 1) << " before: " << circle_node->shape_signature();
- VERBOSE(l, 1) << " after: " << shape_signature;
-
- return true;
-}
-
-} // namespace ssinf
-
-} // namespace luci
diff --git a/compiler/luci/service/src/CircleShapeSignatureInferenceHelper.cpp b/compiler/luci/service/src/CircleShapeSignatureInferenceHelper.cpp
deleted file mode 100644
index d7d1a24e8..000000000
--- a/compiler/luci/service/src/CircleShapeSignatureInferenceHelper.cpp
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "luci/Service/CircleShapeSignatureInferenceHelper.h"
-
-#include <loco.h>
-
-#include <luci/Log.h>
-
-#include <oops/InternalExn.h>
-
-namespace luci
-{
-
-namespace ssinf
-{
-
-luci::ShapeSignature legalized_signature(const luci::ShapeSignature &signature)
-{
- // If shape signature has at least one -1, it is not static.
- for (uint32_t i = 0; i < signature.rank(); ++i)
- if (signature.dim(i) == -1)
- return signature;
-
- // If all dimensions are static, return empty shape signature.
- return luci::ShapeSignature();
-}
-
-ShapeSignature reduced_signature(const loco::Node *node, const loco::Node *indices, bool keep_dims)
-{
- LOGGER(l);
-
- ShapeSignature input_signature;
- ShapeSignature output_signature;
-
- auto circle_node = loco::must_cast<const luci::CircleNode *>(node);
- if (circle_node->shape_signature().rank() > 0)
- input_signature = circle_node->shape_signature();
- else
- {
- input_signature.rank(circle_node->rank());
- for (uint32_t i = 0; i < circle_node->rank(); ++i)
- input_signature.dim(i) = circle_node->dim(i).value();
- }
-
- // If input rank is 0, it means one of the following cases has occurred.
- // - Input is scalar : result is always scalar
- // - Input shape signature is not inferred : cannot infer output shape signature
- // Therefore, when input signature rank is 0, always return an empty signature.
- if (input_signature.rank() == 0)
- return output_signature;
-
- // When reduction_indices is not constant
- auto reduction_indices = dynamic_cast<const luci::CircleConst *>(indices);
- if (reduction_indices == nullptr)
- {
- if (keep_dims)
- {
- // If keep_dims is true, rank is not changed.
- output_signature.rank(input_signature.rank());
- for (uint32_t i = 0; i < output_signature.rank(); ++i)
- output_signature.dim(i) = -1;
- }
- else
- {
- // There is no way to inference for this case.
- // Do nothing to return empty signature.
- INFO(l) << "[CircleShapeSignatureInferenceHelper] " << circle_node->name() << std::endl;
- INFO(l) << " reduced_signature : cannot infer because of non-constant node" << std::endl;
- }
-
- return output_signature;
- }
-
- std::vector<int32_t> reduction_values;
- if (reduction_indices->dtype() == loco::DataType::S32)
- {
- auto reduction_size = reduction_indices->size<loco::DataType::S32>();
- for (uint32_t i = 0; i < reduction_size; ++i)
- {
- int32_t axis = reduction_indices->at<loco::DataType::S32>(i);
- if (axis < 0)
- axis += input_signature.rank();
-
- if (!(0 <= axis && axis < static_cast<int32_t>(input_signature.rank())))
- INTERNAL_EXN_V("Invalid reduction axis for REDUCER", oops::to_uint32(axis));
-
- reduction_values.push_back(axis);
- }
- }
- else if (reduction_indices->dtype() == loco::DataType::S64)
- {
- auto reduction_size = reduction_indices->size<loco::DataType::S64>();
- for (uint32_t i = 0; i < reduction_size; ++i)
- {
- int32_t axis = static_cast<int32_t>(reduction_indices->at<loco::DataType::S64>(i));
- if (axis < 0)
- axis += input_signature.rank();
-
- if (!(0 <= axis && axis < static_cast<int32_t>(input_signature.rank())))
- INTERNAL_EXN_V("Invalid reduction axis for REDUCER", oops::to_uint32(axis));
-
- reduction_values.push_back(axis);
- }
- }
- else
- {
- INTERNAL_EXN("Wrong reduction axis type, Only INT32, INT64 supported.");
- }
-
- if (keep_dims)
- {
- output_signature.rank(input_signature.rank());
- for (uint32_t i = 0; i < input_signature.rank(); ++i)
- output_signature.dim(i) = input_signature.dim(i);
- for (uint32_t i = 0; i < reduction_values.size(); ++i)
- output_signature.dim(reduction_values.at(i)) = 1;
- }
- else
- {
- std::vector<bool> check_reduce(input_signature.rank(), false);
- for (uint32_t i = 0; i < reduction_values.size(); ++i)
- check_reduce.at(reduction_values.at(i)) = true;
-
- uint32_t reduce_cnt = 0;
- for (uint32_t i = 0; i < check_reduce.size(); ++i)
- if (check_reduce.at(i))
- ++reduce_cnt;
-
- output_signature.rank(input_signature.rank() - reduce_cnt);
- for (uint32_t i = 0, j = 0; i < check_reduce.size(); ++i)
- if (check_reduce.at(i) == false)
- output_signature.dim(j++) = input_signature.dim(i);
- }
-
- return output_signature;
-}
-
-ShapeSignature input_arg_signature(const luci::CircleNode *node, uint32_t index)
-{
- auto circle_input = loco::must_cast<luci::CircleNode *>(node->arg(index));
- return circle_input->shape_signature();
-}
-
-} // namespace ssinf
-
-} // namespace luci
diff --git a/compiler/luci/service/src/CircleTypeInference.cpp b/compiler/luci/service/src/CircleTypeInference.cpp
index b4755b51a..db9a37cb0 100644
--- a/compiler/luci/service/src/CircleTypeInference.cpp
+++ b/compiler/luci/service/src/CircleTypeInference.cpp
@@ -15,72 +15,23 @@
*/
#include "luci/Service/CircleTypeInference.h"
+#include "CircleTypeInferenceHelper.h"
#include <luci/Log.h>
#include <loco.h>
-#include <loco/Service/TypeInference.h>
-
-#include <mio/circle/schema_generated.h>
-#include <oops/InternalExn.h>
#include <type_traits>
namespace
{
-circle::TensorType translateLocoTypeToCircle(loco::DataType dtype)
-{
- switch (dtype)
- {
- case loco::DataType::U8:
- return circle::TensorType_UINT8;
- // case loco::DataType::U16: unsupported
- // case loco::DataType::U32: unsupported
- // case loco::DataType::U64: unsupported
- case loco::DataType::S8:
- return circle::TensorType_INT8;
- case loco::DataType::S16:
- return circle::TensorType_INT16;
- case loco::DataType::S32:
- return circle::TensorType_INT32;
- case loco::DataType::S64:
- return circle::TensorType_INT64;
- case loco::DataType::FLOAT16:
- return circle::TensorType_FLOAT16;
- case loco::DataType::FLOAT32:
- return circle::TensorType_FLOAT32;
- // case loco::DataType::FLOAT64: unsupported
- case loco::DataType::BOOL:
- return circle::TensorType_BOOL;
- default:
- break;
- }
-
- INTERNAL_EXN_V("Invalid loco dtype", oops::to_uint32(dtype));
-}
-
-} // namespace
-
-namespace luci
-{
-
-circle::TensorType TypeInference::get(loco::Node *node)
-{
- assert(loco::dtype_known(node));
- return translateLocoTypeToCircle(loco::dtype_get(node));
-}
-
-} // namespace luci
-
-namespace
-{
-
bool inputs_dtype_ready(const luci::CircleNode *node)
{
for (uint32_t arity = 0; arity < node->arity(); ++arity)
{
- if (node->dtype() == loco::DataType::Unknown)
+ auto input_node = loco::must_cast<luci::CircleNode *>(node->arg(arity));
+ if (input_node->dtype() == loco::DataType::Unknown)
return false;
}
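// The one-line fix above matters: the old loop tested the node's own dtype on
// every iteration, so the readiness of the inputs was never actually checked.
// Minimal sketch of the corrected predicate; all_inputs_typed is an
// illustrative name for what inputs_dtype_ready now does.
bool all_inputs_typed(const luci::CircleNode *node)
{
  for (uint32_t i = 0; i < node->arity(); ++i)
  {
    auto input = loco::must_cast<luci::CircleNode *>(node->arg(i));
    if (input->dtype() == loco::DataType::Unknown)
      return false; // at least one input is not yet typed
  }
  return true;
}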
diff --git a/compiler/luci/service/src/CircleTypeInferenceHelper.cpp b/compiler/luci/service/src/CircleTypeInferenceHelper.cpp
index 75cd9f7b2..06edd70f2 100644
--- a/compiler/luci/service/src/CircleTypeInferenceHelper.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceHelper.cpp
@@ -14,7 +14,23 @@
* limitations under the License.
*/
-#include "luci/Service/CircleTypeInferenceHelper.h"
+#include "CircleTypeInferenceHelper.h"
+
+namespace luci
+{
+
+loco::DataType dtype_get(const loco::Node *node)
+{
+ assert(luci::dtype_known(node));
+ return loco::must_cast<const luci::CircleNode *>(node)->dtype();
+}
+
+bool dtype_known(const loco::Node *node)
+{
+ return loco::must_cast<const luci::CircleNode *>(node)->dtype() != loco::DataType::Unknown;
+}
+
+} // namespace luci
namespace luci
{
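
(A minimal usage sketch for the temporary helpers defined above, assuming only the luci/loco APIs visible in this diff: luci::dtype_get() asserts luci::dtype_known(), so callers that cannot guarantee a known dtype should guard first.)

loco::DataType dtype_or_unknown(const loco::Node *node)
{
  // Guard: dtype_get() asserts dtype_known(node) internally
  if (!luci::dtype_known(node))
    return loco::DataType::Unknown;
  return luci::dtype_get(node);
}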
diff --git a/compiler/luci/service/include/luci/Service/CircleTypeInferenceHelper.h b/compiler/luci/service/src/CircleTypeInferenceHelper.h
index 296f99355..751340cc7 100644
--- a/compiler/luci/service/include/luci/Service/CircleTypeInferenceHelper.h
+++ b/compiler/luci/service/src/CircleTypeInferenceHelper.h
@@ -23,6 +23,20 @@
namespace luci
{
+
+// NOTE Functions in this namespace will be removed after new inference
+// algorithms are fully implemented.
+
+// This is a temporary function used while deprecating loco::dtype_get
+loco::DataType dtype_get(const loco::Node *node);
+
+// This is a temporary function used while deprecating loco::dtype_known
+bool dtype_known(const loco::Node *node);
+
+} // namespace luci
+
+namespace luci
+{
namespace tinf // Namespace for Type Inference
{
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index f738ab5a8..0b8d2af9e 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -15,6 +15,7 @@
*/
#include "luci/Service/CircleTypeInferenceRule.h"
+#include "CircleTypeInferenceHelper.h"
#include <luci/IR/CircleDialect.h>
#include <luci/IR/CircleNodeVisitor.h>
@@ -29,24 +30,24 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
{
// TODO Given a tensor x of complex numbers, Abs operation returns a tensor of type float32 or
// float64.
- loco::DataType visit(const luci::CircleAbs *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleAbs *node) final { return luci::dtype_get(node->x()); }
- loco::DataType visit(const luci::CircleAdd *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleAdd *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleAddN *node) final
{
- auto dtype = loco::dtype_get(node->inputs(0));
+ auto dtype = luci::dtype_get(node->inputs(0));
for (uint32_t idx = 1; idx < node->arity(); ++idx)
{
- auto dtype_idx = loco::dtype_get(node->inputs(idx));
+ auto dtype_idx = luci::dtype_get(node->inputs(idx));
if (dtype != dtype_idx)
{
INTERNAL_EXN_V("ADD_N dtype not same as the first input: ", idx);
}
}
- return loco::dtype_get(node->inputs(0));
+ return luci::dtype_get(node->inputs(0));
}
loco::DataType visit(const luci::CircleArgMax *node) final { return node->output_type(); }
@@ -55,22 +56,22 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CircleAveragePool2D *node) final
{
- return loco::dtype_get(node->value());
+ return luci::dtype_get(node->value());
}
loco::DataType visit(const luci::CircleBatchMatMul *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleBatchToSpaceND *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleCast *node) final { return node->dtype(); }
- loco::DataType visit(const luci::CircleCeil *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleCeil *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleConcatenation *node) final
{
@@ -78,87 +79,92 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
assert(node->numValues() > 0);
for (uint32_t i = 1; i < node->numValues(); ++i)
- assert(loco::dtype_get(node->values(i - 1)) == loco::dtype_get(node->values(i)));
+ assert(luci::dtype_get(node->values(i - 1)) == luci::dtype_get(node->values(i)));
- return loco::dtype_get(node->values(0));
+ return luci::dtype_get(node->values(0));
}
loco::DataType visit(const luci::CircleConst *node) final { return node->dtype(); }
loco::DataType visit(const luci::CircleConv2D *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
- loco::DataType visit(const luci::CircleCos *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleCos *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleCustom *node) final
{
if (node->custom_code() == "BatchMatMulV2")
{
- return loco::dtype_get(node->inputs(0));
+ return luci::dtype_get(node->inputs(0));
}
return node->dtype();
}
loco::DataType visit(const luci::CircleDepthToSpace *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleDepthwiseConv2D *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleDequantize *) final { return loco::DataType::FLOAT32; }
- loco::DataType visit(const luci::CircleDiv *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleDiv *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleElu *node) final
{
- return loco::dtype_get(node->features());
+ return luci::dtype_get(node->features());
}
loco::DataType visit(const luci::CircleEqual *) final { return loco::DataType::BOOL; }
- loco::DataType visit(const luci::CircleExp *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleExp *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleExpandDims *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
+ }
+
+ loco::DataType visit(const luci::CircleFakeQuant *node) final
+ {
+ return luci::dtype_get(node->inputs());
}
loco::DataType visit(const luci::CircleFill *node) final
{
- return loco::dtype_get(node->value());
+ return luci::dtype_get(node->value());
}
- loco::DataType visit(const luci::CircleFloor *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleFloor *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleFloorDiv *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleFloorMod *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleFullyConnected *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleGather *node) final
{
- return loco::dtype_get(node->params());
+ return luci::dtype_get(node->params());
}
loco::DataType visit(const luci::CircleGatherNd *node) final
{
- return loco::dtype_get(node->params());
+ return luci::dtype_get(node->params());
}
loco::DataType visit(const luci::CircleGreater *) final { return loco::DataType::BOOL; }
@@ -169,22 +175,22 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
{
// Type of If is not used. Just use input 0
assert(node->input_count() > 0);
- return loco::dtype_get(node->input(0));
+ return luci::dtype_get(node->input(0));
}
loco::DataType visit(const luci::CircleL2Normalize *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleL2Pool2D *node) final
{
- return loco::dtype_get(node->value());
+ return luci::dtype_get(node->value());
}
loco::DataType visit(const luci::CircleLeakyRelu *node) final
{
- return loco::dtype_get(node->features());
+ return luci::dtype_get(node->features());
}
loco::DataType visit(const luci::CircleLess *) final { return loco::DataType::BOOL; }
@@ -193,75 +199,75 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CircleLocalResponseNormalization *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
- loco::DataType visit(const luci::CircleLog *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleLog *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleLogicalAnd *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleLogicalNot *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleLogicalOr *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleLogistic *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleLogSoftmax *node) final
{
- return loco::dtype_get(node->logits());
+ return luci::dtype_get(node->logits());
}
loco::DataType visit(const luci::CircleMatrixDiag *node) final
{
- return loco::dtype_get(node->diagonal());
+ return luci::dtype_get(node->diagonal());
}
loco::DataType visit(const luci::CircleMatrixSetDiag *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
- loco::DataType visit(const luci::CircleMaximum *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleMaximum *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleMaxPool2D *node) final
{
- return loco::dtype_get(node->value());
+ return luci::dtype_get(node->value());
}
loco::DataType visit(const luci::CircleMean *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
- loco::DataType visit(const luci::CircleMinimum *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleMinimum *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleMirrorPad *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
- loco::DataType visit(const luci::CircleNeg *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleNeg *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleNonMaxSuppressionV4 *node) final
{
- return loco::dtype_get(node->boxes());
+ return luci::dtype_get(node->boxes());
}
loco::DataType visit(const luci::CircleNonMaxSuppressionV5 *node) final
{
- return loco::dtype_get(node->boxes());
+ return luci::dtype_get(node->boxes());
}
loco::DataType visit(const luci::CircleNotEqual *) final { return loco::DataType::BOOL; }
@@ -271,25 +277,25 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
// Only support CirclePack with one or more inputs
assert(node->values_count() > 0);
- auto first_value_type = loco::dtype_get(node->values(0));
+ auto first_value_type = luci::dtype_get(node->values(0));
for (uint32_t i = 1; i < node->values_count(); ++i)
- assert(first_value_type == loco::dtype_get(node->values(i)));
+ assert(first_value_type == luci::dtype_get(node->values(i)));
return first_value_type;
}
- loco::DataType visit(const luci::CirclePad *node) final { return loco::dtype_get(node->input()); }
+ loco::DataType visit(const luci::CirclePad *node) final { return luci::dtype_get(node->input()); }
loco::DataType visit(const luci::CirclePadV2 *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CirclePow *node) final
{
// TODO make sure types cannot differ
- auto x_type = loco::dtype_get(node->x());
- auto y_type = loco::dtype_get(node->y());
+ auto x_type = luci::dtype_get(node->x());
+ auto y_type = luci::dtype_get(node->y());
if (x_type != y_type)
INTERNAL_EXN("Different datatype for x and y are not supported");
@@ -299,8 +305,8 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CirclePRelu *node) final
{
- auto input_type = loco::dtype_get(node->input());
- auto alpha_type = loco::dtype_get(node->alpha());
+ auto input_type = luci::dtype_get(node->input());
+ auto alpha_type = luci::dtype_get(node->alpha());
if (input_type != alpha_type)
INTERNAL_EXN("Different datatype for input and alpha are not supported");
@@ -310,201 +316,201 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CircleRange *node) final
{
- return loco::dtype_get(node->start());
+ return luci::dtype_get(node->start());
}
loco::DataType visit(const luci::CircleRank *) final { return loco::DataType::S32; }
- loco::DataType visit(const luci::CircleMul *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleMul *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleOneHot *node) final
{
- return loco::dtype_get(node->on_value());
+ return luci::dtype_get(node->on_value());
}
loco::DataType visit(const luci::CircleReduceAny *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleReduceMax *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleReduceMin *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleReduceProd *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleRelu *node) final
{
- return loco::dtype_get(node->features());
+ return luci::dtype_get(node->features());
}
loco::DataType visit(const luci::CircleRelu6 *node) final
{
- return loco::dtype_get(node->features());
+ return luci::dtype_get(node->features());
}
loco::DataType visit(const luci::CircleReluN1To1 *node) final
{
- return loco::dtype_get(node->features());
+ return luci::dtype_get(node->features());
}
loco::DataType visit(const luci::CircleReshape *node) final
{
- return loco::dtype_get(node->tensor());
+ return luci::dtype_get(node->tensor());
}
loco::DataType visit(const luci::CircleResizeBilinear *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleResizeNearestNeighbor *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleReverseSequence *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleReverseV2 *node) final
{
- return loco::dtype_get(node->tensor());
+ return luci::dtype_get(node->tensor());
}
- loco::DataType visit(const luci::CircleRound *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleRound *node) final { return luci::dtype_get(node->x()); }
- loco::DataType visit(const luci::CircleRsqrt *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleRsqrt *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleScatterNd *node) final
{
- return loco::dtype_get(node->updates());
+ return luci::dtype_get(node->updates());
}
loco::DataType visit(const luci::CircleSegmentSum *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleSelect *node) final
{
- assert(loco::dtype_get(node->t()) == loco::dtype_get(node->e()));
- return loco::dtype_get(node->t());
+ assert(luci::dtype_get(node->t()) == luci::dtype_get(node->e()));
+ return luci::dtype_get(node->t());
}
loco::DataType visit(const luci::CircleSelectV2 *node) final
{
- assert(loco::dtype_get(node->t()) == loco::dtype_get(node->e()));
- return loco::dtype_get(node->t());
+ assert(luci::dtype_get(node->t()) == luci::dtype_get(node->e()));
+ return luci::dtype_get(node->t());
}
loco::DataType visit(const luci::CircleShape *node) final { return node->out_type(); }
- loco::DataType visit(const luci::CircleSin *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleSin *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleSlice *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleSoftmax *node) final
{
- return loco::dtype_get(node->logits());
+ return luci::dtype_get(node->logits());
}
loco::DataType visit(const luci::CircleSpaceToBatchND *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleSpaceToDepth *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleSparseToDense *node) final
{
- return loco::dtype_get(node->values());
+ return luci::dtype_get(node->values());
}
loco::DataType visit(const luci::CircleSplit *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleSplitV *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
- loco::DataType visit(const luci::CircleSqrt *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleSqrt *node) final { return luci::dtype_get(node->x()); }
- loco::DataType visit(const luci::CircleSquare *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleSquare *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleSquaredDifference *node) final
{
- return loco::dtype_get(node->x());
+ return luci::dtype_get(node->x());
}
loco::DataType visit(const luci::CircleSqueeze *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleStridedSlice *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
- loco::DataType visit(const luci::CircleSub *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleSub *node) final { return luci::dtype_get(node->x()); }
- loco::DataType visit(const luci::CircleSum *node) final { return loco::dtype_get(node->input()); }
+ loco::DataType visit(const luci::CircleSum *node) final { return luci::dtype_get(node->input()); }
- loco::DataType visit(const luci::CircleTanh *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleTanh *node) final { return luci::dtype_get(node->x()); }
loco::DataType visit(const luci::CircleTile *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleTopKV2 *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleTranspose *node) final
{
- return loco::dtype_get(node->a());
+ return luci::dtype_get(node->a());
}
loco::DataType visit(const luci::CircleTransposeConv *node) final
{
- return loco::dtype_get(node->outBackprop());
+ return luci::dtype_get(node->outBackprop());
}
loco::DataType visit(const luci::CircleUnidirectionalSequenceLSTM *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleUnique *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleUnpack *node) final
{
- return loco::dtype_get(node->value());
+ return luci::dtype_get(node->value());
}
loco::DataType visit(const luci::CircleWhere *) final { return loco::DataType::S64; }
@@ -513,12 +519,12 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
{
// Type of While is not used. Just use input 0
assert(node->input_count() > 0);
- return loco::dtype_get(node->input(0));
+ return luci::dtype_get(node->input(0));
}
loco::DataType visit(const luci::CircleZerosLike *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
// Circle Only
@@ -531,7 +537,7 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CircleInstanceNorm *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
// Virtual
@@ -548,7 +554,7 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
{
// We don't care about the type when from() is CircleOutputDummy or CircleOutputExclude
// from() type should match that of CircleOutput
- assert(output_dtype == loco::dtype_get(node->from()));
+ assert(output_dtype == luci::dtype_get(node->from()));
}
return output_dtype;
}
@@ -559,46 +565,6 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CircleCustomOut *node) final { return node->dtype(); }
- loco::DataType visit(const luci::CircleIfOut *node) final
- {
- /**
- * @note IF operator type and shape are that of the "then" and "else"
- * Graph Outputs.
- */
- auto circle_if = dynamic_cast<const luci::CircleIf *>(node->input());
- if (circle_if == nullptr)
- {
- INTERNAL_EXN("CircleIf IR is not configured correctly");
- }
-
- auto index = node->index();
- auto then_graph = circle_if->then_graph();
- auto else_graph = circle_if->else_graph();
- assert(then_graph != nullptr);
- assert(else_graph != nullptr);
-
- // shape and type are assumed to be same
- // these are checked at post_import_graph() in Import
- auto then_outputs = loco::output_nodes(then_graph);
- auto else_outputs = loco::output_nodes(else_graph);
- assert(then_outputs.size() == else_outputs.size());
- assert(index < static_cast<int32_t>(then_outputs.size()));
-
- auto then_out = loco::must_cast<luci::CircleOutput *>(then_outputs.at(index));
- auto else_out = loco::must_cast<luci::CircleOutput *>(else_outputs.at(index));
-
- auto then_graph_outputs = then_graph->outputs(); // loco::GraphOutput items
- auto else_graph_outputs = else_graph->outputs();
- assert(then_graph_outputs->size() == else_graph_outputs->size());
-
- auto then_graph_output = then_graph_outputs->at(then_out->index());
- auto else_graph_output = else_graph_outputs->at(else_out->index());
- (void)else_graph_output; // make compiler happy for unused variable warnings
- assert(then_graph_output->dtype() == else_graph_output->dtype());
-
- return then_graph_output->dtype();
- }
-
loco::DataType visit(const luci::CircleNonMaxSuppressionV4Out *node) final
{
(void)node;
@@ -619,19 +585,19 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CircleSplitOut *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleSplitVOut *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleTopKV2Out *node) final
{
// First output is the same as the input
if (node->index() == 0)
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
// Second output is always S32
assert(node->index() == 1);
return loco::DataType::S32;
@@ -641,7 +607,7 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
{
if (node->index() == 0)
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
assert(node->index() == 1);
auto unique = loco::must_cast<luci::CircleUnique *>(node->input());
@@ -650,7 +616,7 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CircleUnpackOut *node) final
{
- return loco::dtype_get(node->input());
+ return luci::dtype_get(node->input());
}
loco::DataType visit(const luci::CircleWhileOut *node) final
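
(The file above is a mechanical rename: every loco::dtype_get() call becomes luci::dtype_get(), which reads the dtype stored on the CircleNode itself instead of going through loco's type-inference service. The rule keeps the usual CircleNodeVisitor shape; a trimmed sketch, using only names that appear in the hunks:)

struct SketchAlgorithm final : public luci::CircleNodeVisitor<loco::DataType>
{
  // Element-wise ops propagate the dtype of their input
  loco::DataType visit(const luci::CircleAbs *node) final
  {
    return luci::dtype_get(node->x());
  }

  // Predicates yield BOOL regardless of input dtype
  loco::DataType visit(const luci::CircleEqual *) final { return loco::DataType::BOOL; }
};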
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.test.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.test.cpp
deleted file mode 100644
index 711a489af..000000000
--- a/compiler/luci/service/src/CircleTypeInferenceRule.test.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "TestGraph.h"
-#include <luci/Service/CircleTypeInferenceRule.h>
-
-#include <luci/IR/CircleNodes.h>
-#include <luci/IR/CircleDialect.h>
-
-#include <loco.h>
-#include <loco/IR/CanonicalDialect.h>
-#include <loco/Service/TypeInference.h>
-
-#include <gtest/gtest.h>
-
-#include <memory>
-
-TEST(CircleTypeInferenceRuleTest, minimal_with_CircleRelu)
-{
- // Create a simple network
- luci::test::TestGraph graph;
- auto relu_node = graph.append<luci::CircleRelu>(graph.input_node);
- graph.complete(relu_node);
-
- // set dtype for nodes; like setting them in import
- graph.input_node->dtype(loco::DataType::S32);
- relu_node->dtype(loco::DataType::S32);
- graph.output_node->dtype(loco::DataType::S32);
-
- luci::test::graph_input_dtype(graph.input_node);
- luci::test::graph_output_dtype(graph.output_node);
-
- // pre-check
- ASSERT_FALSE(loco::dtype_known(relu_node));
-
- // type inference
- luci::CircleTypeInferenceRule circle_rule;
- loco::CanonicalTypeInferenceRule canon_rule;
- loco::MultiDialectTypeInferenceRule rules;
-
- rules.bind(loco::CanonicalDialect::get(), &canon_rule);
- rules.bind(luci::CircleDialect::get(), &circle_rule);
-
- loco::apply(&rules).to(graph.g.get());
-
- // Verify
- ASSERT_TRUE(loco::dtype_known(relu_node));
- auto type = loco::dtype_get(relu_node);
- ASSERT_EQ(loco::DataType::S32, type);
-}
diff --git a/compiler/luci/service/src/Nodes/CircleAbs.cpp b/compiler/luci/service/src/Nodes/CircleAbs.cpp
new file mode 100644
index 000000000..132760957
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleAbs.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleAbs *)
+{
+ return _graph->nodes()->create<luci::CircleAbs>();
+}
+
+} // namespace luci
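
(The per-node sources added below each implement one visit() overload of the CloneNode visitor from CircleCloneNode.h. The visitor itself is not shown in this diff; the sketch below is an assumed reconstruction based on how _graph and visit() are used in these files.)

class CloneNodeSketch final : public luci::CircleNodeVisitor<luci::CircleNode *>
{
public:
  CloneNodeSketch(loco::Graph *graph) : _graph(graph) {}

  // Attribute-free nodes only need a fresh allocation in the target graph
  luci::CircleNode *visit(const luci::CircleAbs *) final
  {
    return _graph->nodes()->create<luci::CircleAbs>();
  }

private:
  loco::Graph *_graph = nullptr;
};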
diff --git a/compiler/luci/service/src/Nodes/CircleAbs.test.cpp b/compiler/luci/service/src/Nodes/CircleAbs.test.cpp
new file mode 100644
index 000000000..885b395b8
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleAbs.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Abs)
+{
+ auto g = loco::make_graph();
+ auto node_abs = g->nodes()->create<luci::CircleAbs>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_abs, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_abs = dynamic_cast<luci::CircleAbs *>(cloned);
+ ASSERT_NE(nullptr, cloned_abs);
+}
diff --git a/compiler/luci/pass/include/luci/Pass/TypeInferencePass.h b/compiler/luci/service/src/Nodes/CircleAdd.cpp
index 9d964bdd6..08634320e 100644
--- a/compiler/luci/pass/include/luci/Pass/TypeInferencePass.h
+++ b/compiler/luci/service/src/Nodes/CircleAdd.cpp
@@ -1,6 +1,5 @@
-
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,29 +14,20 @@
* limitations under the License.
*/
-#ifndef __LUCI_TYPE_INFERENCE_PASS_H__
-#define __LUCI_TYPE_INFERENCE_PASS_H__
-
-#include <loco.h>
-
-#include <luci/ModulePass.h>
+#include "CircleCloneNode.h"
namespace luci
{
-/**
- * @brief Pass to infer type of nodes
- */
-class TypeInferencePass : public luci::Pass
+luci::CircleNode *CloneNode::visit(const luci::CircleAdd *node)
{
-public:
- virtual const char *name(void) const { return "luci::TypeInferencePass"; }
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
-public:
- bool run(luci::Module *m);
- bool run(loco::Graph *graph);
-};
+ auto *cloned = _graph->nodes()->create<luci::CircleAdd>();
+ if (cloned != nullptr)
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ return cloned;
+}
} // namespace luci
-
-#endif //__LUCI_TYPE_INFERENCE_PASS_H__
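
(Note on the hunk above: git pairs the deleted TypeInferencePass.h with the new CircleAdd.cpp as a rename, so the header's removal and the clone source's addition appear in one diff. The visit() itself establishes the validation convention used across these files, restated here with comments:)

luci::CircleNode *CloneNode::visit(const luci::CircleAdd *node)
{
  // Reject nodes whose fused activation was never set; callers treat a
  // nullptr result as "not cloneable" (see clone_Add_NEG below)
  if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
    return nullptr;

  // Create in the target graph, then copy attributes onto the clone
  auto *cloned = _graph->nodes()->create<luci::CircleAdd>();
  if (cloned != nullptr)
    cloned->fusedActivationFunction(node->fusedActivationFunction());
  return cloned;
}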
diff --git a/compiler/luci/service/src/Nodes/CircleAdd.test.cpp b/compiler/luci/service/src/Nodes/CircleAdd.test.cpp
new file mode 100644
index 000000000..41a818b0a
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleAdd.test.cpp
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+/**
+ * @note Function to test: Shape inference of two different input shapes
+ *
+ * Rank expansion to higher input side
+ * x(2,1,5) + y(3,5) --> x(2,1,5) + y(1,3,5)
+ * Do output shape inference like numpy
+ * x(2,1,5) + y(1,3,5) --> output(2,3,5)
+ * For each axis, dim value should be same OR one of them should be 1
+ */
+TEST(ShapeRuleTest, different_input_shapes_add)
+{
+ luci::CircleInput input1;
+ luci::CircleInput input2;
+ luci::CircleAdd add;
+
+ input1.shape({2, 1, 5});
+ input1.shape_status(luci::ShapeStatus::VALID);
+ input2.shape({3, 5});
+ input2.shape_status(luci::ShapeStatus::VALID);
+
+ add.x(&input1);
+ add.y(&input2);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&add, shape));
+ ASSERT_EQ(3, shape.rank());
+ ASSERT_EQ(2, shape.dim(0).value());
+ ASSERT_EQ(3, shape.dim(1).value());
+ ASSERT_EQ(5, shape.dim(2).value());
+}
+
+TEST(CloneNodeTest, clone_Add)
+{
+ auto g = loco::make_graph();
+ auto node_add = g->nodes()->create<luci::CircleAdd>();
+ node_add->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_add, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_add = dynamic_cast<luci::CircleAdd *>(cloned);
+ ASSERT_NE(nullptr, cloned_add);
+ ASSERT_EQ(node_add->fusedActivationFunction(), cloned_add->fusedActivationFunction());
+}
+
+TEST(CloneNodeTest, clone_Add_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_add = g->nodes()->create<luci::CircleAdd>();
+ node_add->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_add, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
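
(The @note in the test above describes NumPy-style broadcasting. A self-contained worked sketch of that rule, independent of luci: align ranks from the right by prepending 1s, then each axis must match or one side must be 1.)

#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

std::vector<uint32_t> broadcast(std::vector<uint32_t> a, std::vector<uint32_t> b)
{
  if (a.size() < b.size())
    std::swap(a, b);
  // Rank expansion to the higher input side: (3,5) -> (1,3,5)
  b.insert(b.begin(), a.size() - b.size(), 1);

  std::vector<uint32_t> out(a.size());
  for (size_t i = 0; i < a.size(); ++i)
  {
    if (a[i] == b[i] || b[i] == 1)
      out[i] = a[i];
    else if (a[i] == 1)
      out[i] = b[i];
    else
      throw std::runtime_error("incompatible shapes");
  }
  return out; // broadcast({2,1,5}, {3,5}) == {2,3,5}, as the test asserts
}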
diff --git a/compiler/luci/service/src/Nodes/CircleAddN.cpp b/compiler/luci/service/src/Nodes/CircleAddN.cpp
new file mode 100644
index 000000000..e536e54bb
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleAddN.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleAddN *node)
+{
+ auto arity = node->arity();
+ return _graph->nodes()->create<luci::CircleAddN>(arity);
+}
+
+} // namespace luci
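
(Unlike the fixed-arity nodes, CircleAddN takes its input count as a constructor argument, so the clone is created with the source arity up front rather than copying attributes afterwards. A plain restatement of the visit() above:)

luci::CircleNode *clone_addn(loco::Graph *g, const luci::CircleAddN *node)
{
  // create<CircleAddN>(n) allocates n input slots; wiring the actual
  // inputs(i) edges is the caller's job and is not part of this patch
  return g->nodes()->create<luci::CircleAddN>(node->arity());
}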
diff --git a/compiler/luci/service/src/Nodes/CircleAddN.test.cpp b/compiler/luci/service/src/Nodes/CircleAddN.test.cpp
new file mode 100644
index 000000000..5d5b82247
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleAddN.test.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_AddN)
+{
+ auto g = loco::make_graph();
+ auto node_addn = g->nodes()->create<luci::CircleAddN>(3);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_addn, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_addn = dynamic_cast<luci::CircleAddN *>(cloned);
+ ASSERT_NE(nullptr, cloned_addn);
+ ASSERT_EQ(node_addn->arity(), cloned_addn->arity());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleArgMax.cpp b/compiler/luci/service/src/Nodes/CircleArgMax.cpp
new file mode 100644
index 000000000..1b3bafa86
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleArgMax.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleArgMax *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleArgMax>();
+ if (cloned != nullptr)
+ cloned->output_type(node->output_type());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleArgMax.test.cpp b/compiler/luci/service/src/Nodes/CircleArgMax.test.cpp
new file mode 100644
index 000000000..bb7588403
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleArgMax.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ArgMax)
+{
+ auto g = loco::make_graph();
+ auto node_argmax = g->nodes()->create<luci::CircleArgMax>();
+ node_argmax->output_type(loco::DataType::FLOAT32);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_argmax, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_argmax = dynamic_cast<luci::CircleArgMax *>(cloned);
+ ASSERT_NE(nullptr, cloned_argmax);
+ ASSERT_EQ(node_argmax->output_type(), cloned_argmax->output_type());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleArgMin.cpp b/compiler/luci/service/src/Nodes/CircleArgMin.cpp
new file mode 100644
index 000000000..fa54f7b76
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleArgMin.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleArgMin *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleArgMin>();
+ if (cloned != nullptr)
+ cloned->output_type(node->output_type());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleArgMin.test.cpp b/compiler/luci/service/src/Nodes/CircleArgMin.test.cpp
new file mode 100644
index 000000000..ca57946f9
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleArgMin.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ArgMin)
+{
+ auto g = loco::make_graph();
+ auto node_argmin = g->nodes()->create<luci::CircleArgMin>();
+ node_argmin->output_type(loco::DataType::FLOAT32);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_argmin, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_argmin = dynamic_cast<luci::CircleArgMin *>(cloned);
+ ASSERT_NE(nullptr, cloned_argmin);
+ ASSERT_EQ(node_argmin->output_type(), cloned_argmin->output_type());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleAveragePool2D.cpp b/compiler/luci/service/src/Nodes/CircleAveragePool2D.cpp
new file mode 100644
index 000000000..4d2791833
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleAveragePool2D.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleAveragePool2D *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+ if (node->padding() == luci::Padding::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleAveragePool2D>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->padding(node->padding());
+ cloned->filter()->h(node->filter()->h());
+ cloned->filter()->w(node->filter()->w());
+ cloned->stride()->h(node->stride()->h());
+ cloned->stride()->w(node->stride()->w());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleAveragePool2D.test.cpp b/compiler/luci/service/src/Nodes/CircleAveragePool2D.test.cpp
new file mode 100644
index 000000000..d048d1426
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleAveragePool2D.test.cpp
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeRuleTest, simple_valid_pad_avgpool2d)
+{
+ luci::CircleInput input;
+ luci::CircleAveragePool2D avgpool_2d;
+
+ input.shape({1, 4, 3, 1});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ avgpool_2d.value(&input);
+ avgpool_2d.filter()->h(2);
+ avgpool_2d.filter()->w(2);
+ avgpool_2d.stride()->h(2);
+ avgpool_2d.stride()->w(2);
+ avgpool_2d.fusedActivationFunction(luci::FusedActFunc::NONE);
+ avgpool_2d.padding(luci::Padding::VALID);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&avgpool_2d, shape));
+ ASSERT_EQ(4, shape.rank());
+ ASSERT_EQ(1, shape.dim(0).value());
+ ASSERT_EQ(2, shape.dim(1).value());
+ ASSERT_EQ(1, shape.dim(2).value());
+ ASSERT_EQ(1, shape.dim(3).value());
+}
+
+TEST(ShapeRuleTest, simple_same_pad_avgpool2d)
+{
+ luci::CircleInput input;
+ luci::CircleAveragePool2D avgpool_2d;
+
+ input.shape({1, 4, 3, 1});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ avgpool_2d.value(&input);
+ avgpool_2d.filter()->h(2);
+ avgpool_2d.filter()->w(2);
+ avgpool_2d.stride()->h(2);
+ avgpool_2d.stride()->w(2);
+ avgpool_2d.fusedActivationFunction(luci::FusedActFunc::NONE);
+ avgpool_2d.padding(luci::Padding::SAME);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&avgpool_2d, shape));
+ ASSERT_EQ(4, shape.rank());
+ ASSERT_EQ(1, shape.dim(0).value());
+ ASSERT_EQ(2, shape.dim(1).value());
+ ASSERT_EQ(2, shape.dim(2).value());
+ ASSERT_EQ(1, shape.dim(3).value());
+}
+
+TEST(CloneNodeTest, clone_AveragePool2D)
+{
+ auto g = loco::make_graph();
+ auto node_avgpool2d = g->nodes()->create<luci::CircleAveragePool2D>();
+ node_avgpool2d->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_avgpool2d->padding(luci::Padding::SAME);
+ node_avgpool2d->filter()->h(1);
+ node_avgpool2d->filter()->w(2);
+ node_avgpool2d->stride()->h(3);
+ node_avgpool2d->stride()->w(4);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_avgpool2d, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_avgpool2d = dynamic_cast<luci::CircleAveragePool2D *>(cloned);
+ ASSERT_NE(nullptr, cloned_avgpool2d);
+ ASSERT_EQ(node_avgpool2d->fusedActivationFunction(), cloned_avgpool2d->fusedActivationFunction());
+ ASSERT_EQ(node_avgpool2d->padding(), cloned_avgpool2d->padding());
+ ASSERT_EQ(node_avgpool2d->filter()->h(), cloned_avgpool2d->filter()->h());
+ ASSERT_EQ(node_avgpool2d->filter()->w(), cloned_avgpool2d->filter()->w());
+ ASSERT_EQ(node_avgpool2d->stride()->h(), cloned_avgpool2d->stride()->h());
+ ASSERT_EQ(node_avgpool2d->stride()->w(), cloned_avgpool2d->stride()->w());
+}
+
+TEST(CloneNodeTest, clone_AveragePool2D_fusedact_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_avgpool2d = g->nodes()->create<luci::CircleAveragePool2D>();
+ node_avgpool2d->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+ node_avgpool2d->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_avgpool2d, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
+
+TEST(CloneNodeTest, clone_AveragePool2D_padding_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_avgpool2d = g->nodes()->create<luci::CircleAveragePool2D>();
+ node_avgpool2d->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_avgpool2d->padding(luci::Padding::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_avgpool2d, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
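
(The two shape tests above pin down the standard pooling output-size arithmetic for a 1x4x3x1 input with a 2x2 filter and 2x2 strides. A self-contained sketch of those formulas; these are the textbook VALID/SAME rules, not code from this patch:)

#include <cstdint>

uint32_t out_dim_valid(uint32_t in, uint32_t filter, uint32_t stride)
{
  return (in - filter) / stride + 1; // VALID: H (4-2)/2+1 = 2, W (3-2)/2+1 = 1
}

uint32_t out_dim_same(uint32_t in, uint32_t stride)
{
  return (in + stride - 1) / stride; // SAME: H ceil(4/2) = 2, W ceil(3/2) = 2
}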
diff --git a/compiler/luci/pass/include/luci/Pass/ShapeSignatureInferencePass.h b/compiler/luci/service/src/Nodes/CircleBCQFullyConnected.cpp
index 2c6ffcf4e..3edc06ab8 100644
--- a/compiler/luci/pass/include/luci/Pass/ShapeSignatureInferencePass.h
+++ b/compiler/luci/service/src/Nodes/CircleBCQFullyConnected.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,29 +14,23 @@
* limitations under the License.
*/
-#ifndef __LUCI_SHAPE_SIGNATURE_INFERENCE_PASS_H__
-#define __LUCI_SHAPE_SIGNATURE_INFERENCE_PASS_H__
-
-#include <loco.h>
-
-#include <luci/ModulePass.h>
+#include "CircleCloneNode.h"
namespace luci
{
-/**
- * @brief Pass to infer shape_signature of nodes
- */
-class ShapeSignatureInferencePass : public luci::Pass
+luci::CircleNode *CloneNode::visit(const luci::CircleBCQFullyConnected *node)
{
-public:
- virtual const char *name(void) const { return "luci::ShapeSignatureInferencePass"; }
-
-public:
- bool run(luci::Module *m);
- bool run(loco::Graph *graph);
-};
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleBCQFullyConnected>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->weights_hidden_size(node->weights_hidden_size());
+ }
+ return cloned;
+}
} // namespace luci
-
-#endif //__LUCI_SHAPE_SIGNATURE_INFERENCE_PASS_H__
diff --git a/compiler/luci/service/src/Nodes/CircleBCQFullyConnected.test.cpp b/compiler/luci/service/src/Nodes/CircleBCQFullyConnected.test.cpp
new file mode 100644
index 000000000..90c192e07
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleBCQFullyConnected.test.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_BCQFullyConnected)
+{
+ auto g = loco::make_graph();
+ auto node_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
+ node_fc->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_fc->weights_hidden_size(3);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_fc, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_fc = dynamic_cast<luci::CircleBCQFullyConnected *>(cloned);
+ ASSERT_NE(nullptr, cloned_fc);
+ ASSERT_EQ(node_fc->fusedActivationFunction(), cloned_fc->fusedActivationFunction());
+ ASSERT_EQ(node_fc->weights_hidden_size(), cloned_fc->weights_hidden_size());
+}
+
+TEST(CloneNodeTest, clone_BCQFullyConnected_fusedact_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
+ node_fc->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_fc, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleBCQGather.cpp b/compiler/luci/service/src/Nodes/CircleBCQGather.cpp
new file mode 100644
index 000000000..35b6be744
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleBCQGather.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleBCQGather *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleBCQGather>();
+ if (cloned != nullptr)
+ {
+ cloned->axis(node->axis());
+ cloned->input_hidden_size(node->input_hidden_size());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleBCQGather.test.cpp b/compiler/luci/service/src/Nodes/CircleBCQGather.test.cpp
new file mode 100644
index 000000000..a3f9e8850
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleBCQGather.test.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_BCQGather)
+{
+ auto g = loco::make_graph();
+ auto node_gat = g->nodes()->create<luci::CircleBCQGather>();
+ node_gat->axis(3);
+ node_gat->input_hidden_size(5);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_gat, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_gat = dynamic_cast<luci::CircleBCQGather *>(cloned);
+ ASSERT_NE(nullptr, cloned_gat);
+ ASSERT_EQ(node_gat->axis(), cloned_gat->axis());
+ ASSERT_EQ(node_gat->input_hidden_size(), cloned_gat->input_hidden_size());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleBatchMatMul.cpp b/compiler/luci/service/src/Nodes/CircleBatchMatMul.cpp
new file mode 100644
index 000000000..c7a8bbd52
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleBatchMatMul.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleBatchMatMul *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleBatchMatMul>();
+ if (cloned != nullptr)
+ {
+ cloned->adj_x(node->adj_x());
+ cloned->adj_y(node->adj_y());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleBatchMatMul.test.cpp b/compiler/luci/service/src/Nodes/CircleBatchMatMul.test.cpp
new file mode 100644
index 000000000..e013feae8
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleBatchMatMul.test.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_BatchMatMul)
+{
+ auto g = loco::make_graph();
+ auto node_bmm = g->nodes()->create<luci::CircleBatchMatMul>();
+ node_bmm->adj_x(true);
+ node_bmm->adj_y(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_bmm, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_bmm = dynamic_cast<luci::CircleBatchMatMul *>(cloned);
+ ASSERT_NE(nullptr, cloned_bmm);
+ ASSERT_EQ(node_bmm->adj_x(), cloned_bmm->adj_x());
+ ASSERT_EQ(node_bmm->adj_y(), cloned_bmm->adj_y());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleBatchToSpaceND.cpp b/compiler/luci/service/src/Nodes/CircleBatchToSpaceND.cpp
new file mode 100644
index 000000000..70aa05f72
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleBatchToSpaceND.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleBatchToSpaceND *)
+{
+ return _graph->nodes()->create<luci::CircleBatchToSpaceND>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleBatchToSpaceND.test.cpp b/compiler/luci/service/src/Nodes/CircleBatchToSpaceND.test.cpp
new file mode 100644
index 000000000..a45039fc7
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleBatchToSpaceND.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_BatchToSpaceND)
+{
+ auto g = loco::make_graph();
+ auto node_b2s = g->nodes()->create<luci::CircleBatchToSpaceND>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_b2s, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_b2s = dynamic_cast<luci::CircleBatchToSpaceND *>(cloned);
+ ASSERT_NE(nullptr, cloned_b2s);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleCast.cpp b/compiler/luci/service/src/Nodes/CircleCast.cpp
new file mode 100644
index 000000000..75f15f9de
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCast.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleCast *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleCast>();
+ if (cloned != nullptr)
+ {
+ cloned->in_data_type(node->in_data_type());
+ cloned->out_data_type(node->out_data_type());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleCast.test.cpp b/compiler/luci/service/src/Nodes/CircleCast.test.cpp
new file mode 100644
index 000000000..1c4bacb73
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCast.test.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Cast)
+{
+ auto g = loco::make_graph();
+ auto node_cast = g->nodes()->create<luci::CircleCast>();
+ node_cast->in_data_type(loco::DataType::U16);
+ node_cast->out_data_type(loco::DataType::S32);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_cast, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_cast = dynamic_cast<luci::CircleCast *>(cloned);
+ ASSERT_NE(nullptr, cloned_cast);
+ ASSERT_EQ(node_cast->in_data_type(), cloned_cast->in_data_type());
+ ASSERT_EQ(node_cast->out_data_type(), cloned_cast->out_data_type());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleCeil.cpp b/compiler/luci/service/src/Nodes/CircleCeil.cpp
new file mode 100644
index 000000000..92d039a7d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCeil.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleCeil *)
+{
+ return _graph->nodes()->create<luci::CircleCeil>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleCeil.test.cpp b/compiler/luci/service/src/Nodes/CircleCeil.test.cpp
new file mode 100644
index 000000000..b182127d9
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCeil.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Ceil)
+{
+ auto g = loco::make_graph();
+ auto node_ceil = g->nodes()->create<luci::CircleCeil>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_ceil, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_ceil = dynamic_cast<luci::CircleCeil *>(cloned);
+ ASSERT_NE(nullptr, cloned_ceil);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleConcatenation.cpp b/compiler/luci/service/src/Nodes/CircleConcatenation.cpp
new file mode 100644
index 000000000..75d6a53e6
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleConcatenation.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleConcatenation *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleConcatenation>(node->numValues());
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->axis(node->axis());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleConcatenation.test.cpp b/compiler/luci/service/src/Nodes/CircleConcatenation.test.cpp
new file mode 100644
index 000000000..270068cf0
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleConcatenation.test.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Concatenation)
+{
+ auto g = loco::make_graph();
+ auto node_concat = g->nodes()->create<luci::CircleConcatenation>(3);
+ node_concat->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_concat->axis(7);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_concat, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_concat = dynamic_cast<luci::CircleConcatenation *>(cloned);
+ ASSERT_NE(nullptr, cloned_concat);
+ ASSERT_EQ(node_concat->numValues(), cloned_concat->numValues());
+ ASSERT_EQ(node_concat->fusedActivationFunction(), cloned_concat->fusedActivationFunction());
+ ASSERT_EQ(node_concat->axis(), cloned_concat->axis());
+}
+
+TEST(CloneNodeTest, clone_Concatenation_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_concat = g->nodes()->create<luci::CircleConcatenation>(3);
+ node_concat->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_concat, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
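
[Editor's note — illustrative, not part of this commit] The UNDEFINED guard in the Concatenation visit() above is the convention these implementations use to reject nodes whose required enum attributes were never set: clone_node() then yields nullptr instead of a half-initialized clone, which is exactly what the _NEG test checks. A hedged sketch of a caller honoring that contract:

    #include "luci/Service/CircleNodeClone.h"

    luci::CircleNode *checked_clone(luci::CircleNode *node, loco::Graph *dst)
    {
      auto *cloned = luci::clone_node(node, dst);
      if (cloned == nullptr)
      {
        // source node carries an UNDEFINED attribute (activation, padding, ...):
        // treat as an error instead of inserting it into dst
      }
      return cloned;
    }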
diff --git a/compiler/luci/service/src/Nodes/CircleConst.cpp b/compiler/luci/service/src/Nodes/CircleConst.cpp
new file mode 100644
index 000000000..0306ef4eb
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleConst.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/Nodes/CircleConst.h>
+
+#include <loco.h>
+#include <loco/IR/Graph.h>
+
+#include <oops/UserExn.h>
+
+#include <cassert>
+
+namespace
+{
+
+template <loco::DataType T>
+void copy_values(const luci::CircleConst *node, luci::CircleConst *cloned)
+{
+ assert(T == node->dtype());
+ assert(T == cloned->dtype());
+
+ const auto size = node->size<T>();
+ cloned->size<T>(size);
+ for (uint32_t i = 0; i < size; i++)
+ cloned->at<T>(i) = node->at<T>(i);
+}
+
+luci::CircleConst *clone_circleconst(const luci::CircleConst *node, loco::Graph *graph)
+{
+ auto cloned = graph->nodes()->create<luci::CircleConst>();
+
+ if (cloned != nullptr)
+ {
+ // dtype/shape
+ cloned->dtype(node->dtype());
+ cloned->rank(node->rank());
+
+ // values
+ switch (node->dtype())
+ {
+ case loco::DataType::FLOAT32:
+ copy_values<loco::DataType::FLOAT32>(node, cloned);
+ break;
+
+ case loco::DataType::U8:
+ copy_values<loco::DataType::U8>(node, cloned);
+ break;
+
+ case loco::DataType::S8:
+ copy_values<loco::DataType::S8>(node, cloned);
+ break;
+
+ case loco::DataType::S16:
+ copy_values<loco::DataType::S16>(node, cloned);
+ break;
+
+ case loco::DataType::S32:
+ copy_values<loco::DataType::S32>(node, cloned);
+ break;
+
+ case loco::DataType::S64:
+ copy_values<loco::DataType::S64>(node, cloned);
+ break;
+
+ case loco::DataType::BOOL:
+ copy_values<loco::DataType::BOOL>(node, cloned);
+ break;
+
+ default:
+ throw oops::UserExn("Unsupported tensor dtype");
+ }
+ }
+
+ return cloned;
+}
+
+} // namespace
+
+namespace luci
+{
+
+luci::CircleConst *clone(luci::CircleConst *node)
+{
+ auto *cloned = clone_circleconst(node, node->graph());
+
+ copy_common_attributes(node, cloned);
+
+ return cloned;
+}
+
+} // namespace luci
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleConst *node)
+{
+ return clone_circleconst(node, _graph);
+}
+
+} // namespace luci
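
[Editor's note — illustrative, not part of this commit] CircleConst is the one node here with a payload, so it gets two entry points: luci::clone() makes a same-graph copy and runs copy_common_attributes() itself, while CloneNode::visit() copies only dtype/rank/values and leaves the common attributes to clone_node() (the clone_Const test below confirms quantparam/sparsityparam survive that path). The typed payload copy goes through the copy_values<T> template, instantiated once per supported dtype by the switch. A minimal sketch using the clone() declared in luci/Service/Nodes/CircleConst.h:

    #include "luci/Service/Nodes/CircleConst.h"

    // Duplicate constant weights inside their own graph; dtype, shape,
    // payload, and quant/sparsity parameters are all carried over.
    luci::CircleConst *duplicate_const(luci::CircleConst *weights)
    {
      return luci::clone(weights);
    }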
diff --git a/compiler/luci/service/src/Nodes/CircleConst.test.cpp b/compiler/luci/service/src/Nodes/CircleConst.test.cpp
new file mode 100644
index 000000000..5d94798f4
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleConst.test.cpp
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/Nodes/CircleConst.h"
+#include "luci/Service/CircleNodeClone.h"
+
+#include <loco.h>
+#include <loco/IR/Graph.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+luci::CircleConst *new_const_s32(loco::Graph *g)
+{
+ // prepare source CircleConst
+ auto circle_const = g->nodes()->create<luci::CircleConst>();
+
+ const auto size = 2;
+
+ circle_const->dtype(loco::DataType::S32);
+ circle_const->rank(1);
+ circle_const->dim(0).set(size);
+ circle_const->shape_status(luci::ShapeStatus::VALID);
+
+ circle_const->size<loco::DataType::S32>(size);
+ for (uint32_t i = 0; i < size; i++)
+ circle_const->at<loco::DataType::S32>(i) = i;
+
+ // quantparam
+ auto quantparam = std::make_unique<luci::CircleQuantParam>();
+ quantparam->scale = {1.0};
+ quantparam->zerop = {0};
+ quantparam->min = {-127.0};
+ quantparam->max = {127.0};
+ quantparam->quantized_dimension = 1;
+ circle_const->quantparam(std::move(quantparam));
+
+ // sparsityparam
+ auto sparam = std::make_unique<luci::SparsityParam>();
+ sparam->traversal_order = {1};
+ sparam->block_map = {1};
+ sparam->dim_metadata = {};
+ circle_const->sparsityparam(std::move(sparam));
+
+ return circle_const;
+}
+
+template <loco::DataType DT> luci::CircleConst *new_empty_const(loco::Graph *g)
+{
+ auto circle_const = g->nodes()->create<luci::CircleConst>();
+
+ const auto size = 0;
+
+ circle_const->dtype(DT);
+ circle_const->rank(1);
+ circle_const->dim(0).set(size);
+ circle_const->shape_status(luci::ShapeStatus::VALID);
+ circle_const->size<DT>(size);
+
+ return circle_const;
+}
+
+} // namespace
+
+TEST(CircleConstTest, clone)
+{
+ auto g = loco::make_graph();
+
+ // prepare source CircleConst
+ auto circle_const = new_const_s32(g.get());
+
+ // make a clone
+ auto const_cloned = luci::clone(circle_const);
+
+ // check attributes
+ ASSERT_EQ(loco::DataType::S32, const_cloned->dtype());
+ ASSERT_EQ(1, const_cloned->rank());
+ ASSERT_EQ(2, const_cloned->dim(0).value());
+ ASSERT_EQ(2, const_cloned->size<loco::DataType::S32>());
+ ASSERT_EQ(0, const_cloned->at<loco::DataType::S32>(0));
+ ASSERT_EQ(1, const_cloned->at<loco::DataType::S32>(1));
+ ASSERT_NE(nullptr, const_cloned->quantparam());
+ ASSERT_NE(nullptr, const_cloned->sparsityparam());
+}
+
+TEST(CircleConstTest, clone_U8)
+{
+ auto g = loco::make_graph();
+
+ // prepare source CircleConst
+ auto circle_const = new_empty_const<loco::DataType::U8>(g.get());
+
+ // make a clone
+ auto const_cloned = luci::clone(circle_const);
+
+ // check attributes
+ ASSERT_EQ(loco::DataType::U8, const_cloned->dtype());
+}
+
+TEST(CircleConstTest, clone_S8)
+{
+ auto g = loco::make_graph();
+
+ // prepare source CircleConst
+ auto circle_const = new_empty_const<loco::DataType::S8>(g.get());
+
+ // make a clone
+ auto const_cloned = luci::clone(circle_const);
+
+ // check attributes
+ ASSERT_EQ(loco::DataType::S8, const_cloned->dtype());
+}
+
+TEST(CircleConstTest, clone_S64)
+{
+ auto g = loco::make_graph();
+
+ // prepare source CircleConst
+ auto circle_const = new_empty_const<loco::DataType::S64>(g.get());
+
+ // make a clone
+ auto const_cloned = luci::clone(circle_const);
+
+ // check attributes
+ ASSERT_EQ(loco::DataType::S64, const_cloned->dtype());
+}
+
+TEST(CircleConstTest, clone_BOOL)
+{
+ auto g = loco::make_graph();
+
+ // prepare source CircleConst
+ auto circle_const = new_empty_const<loco::DataType::BOOL>(g.get());
+
+ // make a clone
+ auto const_cloned = luci::clone(circle_const);
+
+ // check attributes
+ ASSERT_EQ(loco::DataType::BOOL, const_cloned->dtype());
+}
+
+TEST(CloneNodeTest, clone_Const)
+{
+ auto g = loco::make_graph();
+ auto node_const = new_const_s32(g.get());
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_const, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_const = dynamic_cast<luci::CircleConst *>(cloned);
+ ASSERT_NE(nullptr, cloned_const);
+ ASSERT_EQ(loco::DataType::S32, cloned_const->dtype());
+ ASSERT_EQ(1, cloned_const->rank());
+ ASSERT_EQ(2, cloned_const->dim(0).value());
+ ASSERT_EQ(2, cloned_const->size<loco::DataType::S32>());
+ ASSERT_EQ(0, cloned_const->at<loco::DataType::S32>(0));
+ ASSERT_EQ(1, cloned_const->at<loco::DataType::S32>(1));
+ ASSERT_NE(nullptr, cloned_const->quantparam());
+ ASSERT_NE(nullptr, cloned_const->sparsityparam());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleConv2D.cpp b/compiler/luci/service/src/Nodes/CircleConv2D.cpp
new file mode 100644
index 000000000..08cd87ef7
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleConv2D.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleConv2D *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+ if (node->padding() == luci::Padding::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleConv2D>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->padding(node->padding());
+ cloned->stride()->h(node->stride()->h());
+ cloned->stride()->w(node->stride()->w());
+ cloned->dilation()->h(node->dilation()->h());
+ cloned->dilation()->w(node->dilation()->w());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleConv2D.test.cpp b/compiler/luci/service/src/Nodes/CircleConv2D.test.cpp
new file mode 100644
index 000000000..c265d6cd1
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleConv2D.test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Conv2D)
+{
+ auto g = loco::make_graph();
+ auto node_conv2d = g->nodes()->create<luci::CircleConv2D>();
+ node_conv2d->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_conv2d->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_conv2d, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_conv2d = dynamic_cast<luci::CircleConv2D *>(cloned);
+ ASSERT_NE(nullptr, cloned_conv2d);
+ ASSERT_EQ(node_conv2d->fusedActivationFunction(), cloned_conv2d->fusedActivationFunction());
+ ASSERT_EQ(node_conv2d->padding(), cloned_conv2d->padding());
+}
+
+TEST(CloneNodeTest, clone_Conv2D_fusedact_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_conv2d = g->nodes()->create<luci::CircleConv2D>();
+ node_conv2d->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+ node_conv2d->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_conv2d, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
+
+TEST(CloneNodeTest, clone_Conv2D_padding_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_conv2d = g->nodes()->create<luci::CircleConv2D>();
+ node_conv2d->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_conv2d->padding(luci::Padding::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_conv2d, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleCos.cpp b/compiler/luci/service/src/Nodes/CircleCos.cpp
new file mode 100644
index 000000000..c46e3741b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCos.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleCos *)
+{
+ return _graph->nodes()->create<luci::CircleCos>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleCos.test.cpp b/compiler/luci/service/src/Nodes/CircleCos.test.cpp
new file mode 100644
index 000000000..a25943b98
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCos.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Cos)
+{
+ auto g = loco::make_graph();
+ auto node_cos = g->nodes()->create<luci::CircleCos>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_cos, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_cos = dynamic_cast<luci::CircleCos *>(cloned);
+ ASSERT_NE(nullptr, cloned_cos);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleCustom.cpp b/compiler/luci/service/src/Nodes/CircleCustom.cpp
new file mode 100644
index 000000000..a9764c373
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCustom.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleCustom *node)
+{
+ uint32_t num_in = node->numInputs();
+ uint32_t num_out = node->numOutputs();
+ auto *cloned = _graph->nodes()->create<luci::CircleCustom>(num_in, num_out);
+ if (cloned != nullptr)
+ {
+ cloned->custom_options(node->custom_options());
+ cloned->custom_code(node->custom_code());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleCustom.test.cpp b/compiler/luci/service/src/Nodes/CircleCustom.test.cpp
new file mode 100644
index 000000000..6fee68e71
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCustom.test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <vector>
+
+TEST(CloneNodeTest, clone_Custom)
+{
+ auto g = loco::make_graph();
+ auto node_custom = g->nodes()->create<luci::CircleCustom>(2, 3);
+ std::vector<uint8_t> options({0x55, 0x56, 0x57});
+ std::string code = "hello";
+ node_custom->custom_options(options);
+ node_custom->custom_code(code);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_custom, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_custom = dynamic_cast<luci::CircleCustom *>(cloned);
+ ASSERT_NE(nullptr, cloned_custom);
+ auto cloned_options = cloned_custom->custom_options();
+ ASSERT_EQ(options.size(), cloned_options.size());
+ auto size = options.size();
+ for (size_t s = 0; s < size; ++s)
+ ASSERT_EQ(options.at(s), cloned_options.at(s));
+ ASSERT_TRUE(node_custom->custom_code() == cloned_custom->custom_code());
+}
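
[Editor's note — illustrative, not part of this commit] CircleCustom is the variable-arity case: the node is created with explicit input/output counts and carries its operator as an opaque custom_code string plus serialized custom_options bytes, all of which visit() preserves. A sketch of building one; the op name and option bytes are hypothetical placeholders:

    #include "luci/Service/CircleNodeClone.h"

    #include <cstdint>
    #include <vector>

    luci::CircleCustom *make_custom(loco::Graph *g)
    {
      // 2 inputs, 1 output; the counts are fixed at creation time
      auto *custom = g->nodes()->create<luci::CircleCustom>(2, 1);
      custom->custom_code("MyCustomOp");                  // hypothetical op name
      custom->custom_options(std::vector<uint8_t>{0x01}); // hypothetical payload
      return custom;
    }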
diff --git a/compiler/luci/service/src/Nodes/CircleCustomOut.cpp b/compiler/luci/service/src/Nodes/CircleCustomOut.cpp
new file mode 100644
index 000000000..84577f529
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCustomOut.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleCustomOut *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleCustomOut>();
+ if (cloned != nullptr)
+ cloned->index(node->index());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleCustomOut.test.cpp b/compiler/luci/service/src/Nodes/CircleCustomOut.test.cpp
new file mode 100644
index 000000000..15121bab6
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleCustomOut.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_CustomOut)
+{
+ auto g = loco::make_graph();
+ auto node_cout = g->nodes()->create<luci::CircleCustomOut>();
+ node_cout->index(1);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_cout, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_cout = dynamic_cast<luci::CircleCustomOut *>(cloned);
+ ASSERT_NE(nullptr, cloned_cout);
+ ASSERT_EQ(node_cout->index(), cloned_cout->index());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleDepthToSpace.cpp b/compiler/luci/service/src/Nodes/CircleDepthToSpace.cpp
new file mode 100644
index 000000000..7e0bc7d74
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDepthToSpace.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleDepthToSpace *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleDepthToSpace>();
+ if (cloned != nullptr)
+ cloned->block_size(node->block_size());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleDepthToSpace.test.cpp b/compiler/luci/service/src/Nodes/CircleDepthToSpace.test.cpp
new file mode 100644
index 000000000..192b10b90
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDepthToSpace.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_DepthToSpace)
+{
+ auto g = loco::make_graph();
+ auto node_d2s = g->nodes()->create<luci::CircleDepthToSpace>();
+ node_d2s->block_size(32);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_d2s, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_d2s = dynamic_cast<luci::CircleDepthToSpace *>(cloned);
+ ASSERT_NE(nullptr, cloned_d2s);
+ ASSERT_EQ(node_d2s->block_size(), cloned_d2s->block_size());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleDepthwiseConv2D.cpp b/compiler/luci/service/src/Nodes/CircleDepthwiseConv2D.cpp
new file mode 100644
index 000000000..8e0b23d94
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDepthwiseConv2D.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleDepthwiseConv2D *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+ if (node->padding() == luci::Padding::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleDepthwiseConv2D>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->padding(node->padding());
+ cloned->stride()->h(node->stride()->h());
+ cloned->stride()->w(node->stride()->w());
+ cloned->depthMultiplier(node->depthMultiplier());
+ cloned->dilation()->h(node->dilation()->h());
+ cloned->dilation()->w(node->dilation()->w());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleDepthwiseConv2D.test.cpp b/compiler/luci/service/src/Nodes/CircleDepthwiseConv2D.test.cpp
new file mode 100644
index 000000000..8657464bc
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDepthwiseConv2D.test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_DepthwiseConv2D)
+{
+ auto g = loco::make_graph();
+ auto node_dwconv2d = g->nodes()->create<luci::CircleDepthwiseConv2D>();
+ node_dwconv2d->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_dwconv2d->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_dwconv2d, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_dwconv2d = dynamic_cast<luci::CircleDepthwiseConv2D *>(cloned);
+ ASSERT_NE(nullptr, cloned_dwconv2d);
+ ASSERT_EQ(node_dwconv2d->fusedActivationFunction(), cloned_dwconv2d->fusedActivationFunction());
+ ASSERT_EQ(node_dwconv2d->padding(), cloned_dwconv2d->padding());
+}
+
+TEST(CloneNodeTest, clone_DepthwiseConv2D_fusedact_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_dwconv2d = g->nodes()->create<luci::CircleDepthwiseConv2D>();
+ node_dwconv2d->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+ node_dwconv2d->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_dwconv2d, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
+
+TEST(CloneNodeTest, clone_DepthwiseConv2D_padding_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_dwconv2d = g->nodes()->create<luci::CircleDepthwiseConv2D>();
+ node_dwconv2d->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_dwconv2d->padding(luci::Padding::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_dwconv2d, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleDequantize.cpp b/compiler/luci/service/src/Nodes/CircleDequantize.cpp
new file mode 100644
index 000000000..79983e4d3
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDequantize.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleDequantize *)
+{
+ return _graph->nodes()->create<luci::CircleDequantize>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleDequantize.test.cpp b/compiler/luci/service/src/Nodes/CircleDequantize.test.cpp
new file mode 100644
index 000000000..e1c563acf
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDequantize.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Dequantize)
+{
+ auto g = loco::make_graph();
+ auto node_dq = g->nodes()->create<luci::CircleDequantize>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_dq, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_dq = dynamic_cast<luci::CircleDequantize *>(cloned);
+ ASSERT_NE(nullptr, cloned_dq);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleDiv.cpp b/compiler/luci/service/src/Nodes/CircleDiv.cpp
new file mode 100644
index 000000000..7c48d8b76
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDiv.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleDiv *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleDiv>();
+ if (cloned != nullptr)
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleDiv.test.cpp b/compiler/luci/service/src/Nodes/CircleDiv.test.cpp
new file mode 100644
index 000000000..5182ac908
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleDiv.test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Div)
+{
+ auto g = loco::make_graph();
+ auto node_div = g->nodes()->create<luci::CircleDiv>();
+ node_div->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_div, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_div = dynamic_cast<luci::CircleDiv *>(cloned);
+ ASSERT_NE(nullptr, cloned_div);
+ ASSERT_EQ(node_div->fusedActivationFunction(), cloned_div->fusedActivationFunction());
+}
+
+TEST(CloneNodeTest, clone_Div_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_div = g->nodes()->create<luci::CircleDiv>();
+ node_div->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_div, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleElu.cpp b/compiler/luci/service/src/Nodes/CircleElu.cpp
new file mode 100644
index 000000000..e2df30285
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleElu.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleElu *)
+{
+ return _graph->nodes()->create<luci::CircleElu>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleElu.test.cpp b/compiler/luci/service/src/Nodes/CircleElu.test.cpp
new file mode 100644
index 000000000..e75b3bcb1
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleElu.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Elu)
+{
+ auto g = loco::make_graph();
+ auto node_elu = g->nodes()->create<luci::CircleElu>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_elu, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_elu = dynamic_cast<luci::CircleElu *>(cloned);
+ ASSERT_NE(nullptr, cloned_elu);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleEqual.cpp b/compiler/luci/service/src/Nodes/CircleEqual.cpp
new file mode 100644
index 000000000..5dd382d0b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleEqual.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleEqual *)
+{
+ return _graph->nodes()->create<luci::CircleEqual>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleEqual.test.cpp b/compiler/luci/service/src/Nodes/CircleEqual.test.cpp
new file mode 100644
index 000000000..99a5535fc
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleEqual.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Equal)
+{
+ auto g = loco::make_graph();
+ auto node_eq = g->nodes()->create<luci::CircleEqual>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_eq, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_eq = dynamic_cast<luci::CircleEqual *>(cloned);
+ ASSERT_NE(nullptr, cloned_eq);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleExp.cpp b/compiler/luci/service/src/Nodes/CircleExp.cpp
new file mode 100644
index 000000000..3d4918320
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleExp.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleExp *)
+{
+ return _graph->nodes()->create<luci::CircleExp>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleExp.test.cpp b/compiler/luci/service/src/Nodes/CircleExp.test.cpp
new file mode 100644
index 000000000..ff2bb65db
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleExp.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Exp)
+{
+ auto g = loco::make_graph();
+ auto node_exp = g->nodes()->create<luci::CircleExp>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_exp, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_exp = dynamic_cast<luci::CircleExp *>(cloned);
+ ASSERT_NE(nullptr, cloned_exp);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleExpandDims.cpp b/compiler/luci/service/src/Nodes/CircleExpandDims.cpp
new file mode 100644
index 000000000..4dd1cec86
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleExpandDims.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleExpandDims *)
+{
+ return _graph->nodes()->create<luci::CircleExpandDims>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleExpandDims.test.cpp b/compiler/luci/service/src/Nodes/CircleExpandDims.test.cpp
new file mode 100644
index 000000000..e3481bccd
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleExpandDims.test.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeRuleTest, simple_expand_dims)
+{
+ luci::CircleInput input;
+ luci::CircleConst axis;
+ luci::CircleExpandDims expand_dims;
+
+ input.shape({4, 3});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ axis.dtype(loco::DataType::S32);
+ axis.rank(0);
+ axis.size<loco::DataType::S32>(1);
+ axis.at<loco::DataType::S32>(0) = 1;
+ axis.shape_status(luci::ShapeStatus::VALID);
+
+ expand_dims.input(&input);
+ expand_dims.axis(&axis);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&expand_dims, shape));
+ ASSERT_EQ(3, shape.rank());
+ ASSERT_EQ(4, shape.dim(0).value());
+ ASSERT_EQ(1, shape.dim(1).value());
+ ASSERT_EQ(3, shape.dim(2).value());
+}
+
+TEST(CloneNodeTest, clone_ExpandDims)
+{
+ auto g = loco::make_graph();
+ auto node_ed = g->nodes()->create<luci::CircleExpandDims>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_ed, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_ed = dynamic_cast<luci::CircleExpandDims *>(cloned);
+ ASSERT_NE(nullptr, cloned_ed);
+}
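
[Editor's note — illustrative, not part of this commit] The ExpandDims test above also exercises shape inference: luci::sinf::Rule::infer() fills a loco::TensorShape and returns whether inference succeeded. A minimal sketch of that API outside gtest, assuming only what the test uses:

    #include <luci/IR/CircleNodes.h>
    #include <luci/Service/CircleShapeInference.h>

    #include <loco/IR/TensorShape.h>

    // Returns true and sets rank if the node's output shape can be inferred.
    bool try_infer_rank(luci::CircleNode *node, uint32_t &rank)
    {
      loco::TensorShape shape;
      luci::sinf::Rule rule;
      if (!rule.infer(node, shape))
        return false; // shape not yet derivable for this node
      rank = shape.rank();
      return true;
    }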
diff --git a/compiler/luci/service/src/Nodes/CircleFakeQuant.cpp b/compiler/luci/service/src/Nodes/CircleFakeQuant.cpp
new file mode 100644
index 000000000..7abaca685
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFakeQuant.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleFakeQuant *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleFakeQuant>();
+ if (cloned != nullptr)
+ {
+ cloned->min(node->min());
+ cloned->max(node->max());
+ cloned->num_bits(node->num_bits());
+ cloned->narrow_range(node->narrow_range());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleFakeQuant.test.cpp b/compiler/luci/service/src/Nodes/CircleFakeQuant.test.cpp
new file mode 100644
index 000000000..2c4e3b836
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFakeQuant.test.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_FakeQuant)
+{
+ auto g = loco::make_graph();
+ auto node_fq = g->nodes()->create<luci::CircleFakeQuant>();
+ node_fq->min(1.0f);
+ node_fq->max(2.0f);
+ node_fq->num_bits(8);
+ node_fq->narrow_range(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_fq, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_fq = dynamic_cast<luci::CircleFakeQuant *>(cloned);
+ ASSERT_NE(nullptr, cloned_fq);
+ ASSERT_EQ(node_fq->min(), cloned_fq->min());
+ ASSERT_EQ(node_fq->max(), cloned_fq->max());
+ ASSERT_EQ(node_fq->num_bits(), cloned_fq->num_bits());
+ ASSERT_EQ(node_fq->narrow_range(), cloned_fq->narrow_range());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleFill.cpp b/compiler/luci/service/src/Nodes/CircleFill.cpp
new file mode 100644
index 000000000..d9b74c63a
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFill.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleFill *)
+{
+ return _graph->nodes()->create<luci::CircleFill>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleFill.test.cpp b/compiler/luci/service/src/Nodes/CircleFill.test.cpp
new file mode 100644
index 000000000..56c807585
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFill.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Fill)
+{
+ auto g = loco::make_graph();
+ auto node_fill = g->nodes()->create<luci::CircleFill>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_fill, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_fill = dynamic_cast<luci::CircleFill *>(cloned);
+ ASSERT_NE(nullptr, cloned_fill);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleFloor.cpp b/compiler/luci/service/src/Nodes/CircleFloor.cpp
new file mode 100644
index 000000000..532808bc8
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFloor.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleFloor *)
+{
+ return _graph->nodes()->create<luci::CircleFloor>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleFloor.test.cpp b/compiler/luci/service/src/Nodes/CircleFloor.test.cpp
new file mode 100644
index 000000000..3d53fd2c3
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFloor.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Floor)
+{
+ auto g = loco::make_graph();
+ auto node_floor = g->nodes()->create<luci::CircleFloor>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_floor, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_floor = dynamic_cast<luci::CircleFloor *>(cloned);
+ ASSERT_NE(nullptr, cloned_floor);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleFloorDiv.cpp b/compiler/luci/service/src/Nodes/CircleFloorDiv.cpp
new file mode 100644
index 000000000..65be3e868
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFloorDiv.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleFloorDiv *)
+{
+ return _graph->nodes()->create<luci::CircleFloorDiv>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleFloorDiv.test.cpp b/compiler/luci/service/src/Nodes/CircleFloorDiv.test.cpp
new file mode 100644
index 000000000..6365ccd3b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFloorDiv.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_FloorDiv)
+{
+ auto g = loco::make_graph();
+ auto node_floordiv = g->nodes()->create<luci::CircleFloorDiv>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_floordiv, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_floordiv = dynamic_cast<luci::CircleFloorDiv *>(cloned);
+ ASSERT_NE(nullptr, cloned_floordiv);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleFloorMod.cpp b/compiler/luci/service/src/Nodes/CircleFloorMod.cpp
new file mode 100644
index 000000000..00e6a0499
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFloorMod.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleFloorMod *)
+{
+ return _graph->nodes()->create<luci::CircleFloorMod>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleFloorMod.test.cpp b/compiler/luci/service/src/Nodes/CircleFloorMod.test.cpp
new file mode 100644
index 000000000..ce91d5881
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFloorMod.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_FloorMod)
+{
+ auto g = loco::make_graph();
+ auto node_floormod = g->nodes()->create<luci::CircleFloorMod>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_floormod, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_floormod = dynamic_cast<luci::CircleFloorMod *>(cloned);
+ ASSERT_NE(nullptr, cloned_floormod);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleFullyConnected.cpp b/compiler/luci/service/src/Nodes/CircleFullyConnected.cpp
new file mode 100644
index 000000000..8acb35cbf
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFullyConnected.cpp
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleFullyConnected *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+ if (node->weights_format() == luci::CircleFullyConnected::WeightsFormat::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleFullyConnected>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->weights_format(node->weights_format());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp b/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp
new file mode 100644
index 000000000..965b59130
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleFullyConnected.test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_FullyConnected)
+{
+ auto g = loco::make_graph();
+ auto node_fc = g->nodes()->create<luci::CircleFullyConnected>();
+ node_fc->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_fc->weights_format(luci::CircleFullyConnected::WeightsFormat::DEFAULT);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_fc, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_fc = dynamic_cast<luci::CircleFullyConnected *>(cloned);
+ ASSERT_NE(nullptr, cloned_fc);
+ ASSERT_EQ(node_fc->fusedActivationFunction(), cloned_fc->fusedActivationFunction());
+ ASSERT_EQ(node_fc->weights_format(), cloned_fc->weights_format());
+}
+
+TEST(CloneNodeTest, clone_FullyConnected_fusedact_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_fc = g->nodes()->create<luci::CircleFullyConnected>();
+ node_fc->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+ node_fc->weights_format(luci::CircleFullyConnected::WeightsFormat::DEFAULT);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_fc, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
+
+TEST(CloneNodeTest, clone_FullyConnected_wf_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_fc = g->nodes()->create<luci::CircleFullyConnected>();
+ node_fc->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_fc->weights_format(luci::CircleFullyConnected::WeightsFormat::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_fc, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
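
Both _NEG tests rely on the same convention: CloneNode::visit returns nullptr when a required attribute is UNDEFINED, and luci::clone_node propagates that to the caller. A hedged sketch of the resulting calling pattern (the wrapper function is illustrative; the tests above clone isolated nodes, so it is assumed here that input edges are not copied and must be rebuilt by the caller):

    #include "luci/Service/CircleNodeClone.h"

    // Sketch only: clone `src` into `dst`, treating nullptr as "not clonable".
    luci::CircleNode *try_clone(const luci::CircleNode *src, loco::Graph *dst)
    {
      auto *cloned = luci::clone_node(src, dst);
      if (cloned == nullptr)
      {
        // e.g. a CircleFullyConnected whose activation or weights format
        // is UNDEFINED, as exercised by the _NEG tests above
        return nullptr;
      }
      // Attributes are copied; input connections are not (assumption), so the
      // caller is responsible for reconnecting inputs in `dst`.
      return cloned;
    }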
diff --git a/compiler/luci/service/src/Nodes/CircleGather.cpp b/compiler/luci/service/src/Nodes/CircleGather.cpp
new file mode 100644
index 000000000..072bdeabc
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleGather.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleGather *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleGather>();
+ if (cloned != nullptr)
+ cloned->axis(node->axis());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleGather.test.cpp b/compiler/luci/service/src/Nodes/CircleGather.test.cpp
new file mode 100644
index 000000000..f48dbdb67
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleGather.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Gather)
+{
+ auto g = loco::make_graph();
+ auto node_gat = g->nodes()->create<luci::CircleGather>();
+ node_gat->axis(3);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_gat, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_gat = dynamic_cast<luci::CircleGather *>(cloned);
+ ASSERT_NE(nullptr, cloned_gat);
+ ASSERT_EQ(node_gat->axis(), cloned_gat->axis());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleGatherNd.cpp b/compiler/luci/service/src/Nodes/CircleGatherNd.cpp
new file mode 100644
index 000000000..df7ae6e79
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleGatherNd.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleGatherNd *)
+{
+ return _graph->nodes()->create<luci::CircleGatherNd>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleGatherNd.test.cpp b/compiler/luci/service/src/Nodes/CircleGatherNd.test.cpp
new file mode 100644
index 000000000..3a705710c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleGatherNd.test.cpp
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <oops/InternalExn.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeRuleTest, gather_nd_simple)
+{
+ luci::CircleInput input;
+ luci::CircleConst indices_const;
+ luci::CircleGatherNd gather_nd;
+
+ input.shape({1, 4, 4, 3});
+ indices_const.shape({1, 2, 3});
+
+ input.shape_status(luci::ShapeStatus::VALID);
+ indices_const.shape_status(luci::ShapeStatus::VALID);
+
+ gather_nd.params(&input);
+ gather_nd.indices(&indices_const);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&gather_nd, shape));
+ ASSERT_EQ(3, shape.rank());
+ ASSERT_EQ(1, shape.dim(0).value());
+ ASSERT_EQ(2, shape.dim(1).value());
+ ASSERT_EQ(3, shape.dim(2).value());
+}
+
+TEST(ShapeRuleTest, gather_nd_slices)
+{
+ luci::CircleInput input;
+ luci::CircleConst indices_const;
+ luci::CircleGatherNd gather_nd;
+
+ input.shape({1, 4, 4, 3});
+ indices_const.shape({1, 2, 1});
+
+ input.shape_status(luci::ShapeStatus::VALID);
+ indices_const.shape_status(luci::ShapeStatus::VALID);
+
+ gather_nd.params(&input);
+ gather_nd.indices(&indices_const);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&gather_nd, shape));
+ ASSERT_EQ(5, shape.rank());
+ ASSERT_EQ(1, shape.dim(0).value());
+ ASSERT_EQ(2, shape.dim(1).value());
+ ASSERT_EQ(4, shape.dim(2).value());
+ ASSERT_EQ(4, shape.dim(3).value());
+ ASSERT_EQ(3, shape.dim(4).value());
+}
+
+TEST(ShapeRuleTest, gather_nd_NEG)
+{
+ luci::CircleInput input;
+ luci::CircleConst indices_const;
+ luci::CircleGatherNd gather_nd;
+
+ input.shape({1, 4, 4, 3});
+ indices_const.shape({1, 2, 5});
+
+ input.shape_status(luci::ShapeStatus::VALID);
+ indices_const.shape_status(luci::ShapeStatus::VALID);
+
+ gather_nd.params(&input);
+ gather_nd.indices(&indices_const);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_THROW(shape_inf_rule.infer(&gather_nd, shape), oops::InternalExn);
+}
+
+TEST(CloneNodeTest, clone_GatherNd)
+{
+ auto g = loco::make_graph();
+ auto node_gtnd = g->nodes()->create<luci::CircleGatherNd>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_gtnd, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_gtnd = dynamic_cast<luci::CircleGatherNd *>(cloned);
+ ASSERT_NE(nullptr, cloned_gtnd);
+}
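
Taken together, the three ShapeRuleTests encode the standard GatherNd output-shape rule: the last dimension of indices is the index depth, the result is indices.shape[:-1] concatenated with params.shape[depth:], and a depth larger than the params rank is rejected (hence oops::InternalExn in the _NEG case). A standalone sketch under those assumptions (helper name is illustrative):

    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    // output = indices[0 .. n-2] ++ params[depth .. end], where depth = indices[n-1]
    std::vector<uint32_t> gather_nd_shape(const std::vector<uint32_t> &params,
                                          const std::vector<uint32_t> &indices)
    {
      const uint32_t depth = indices.back();
      if (depth > params.size())
        throw std::runtime_error("GatherNd: index depth exceeds params rank");

      std::vector<uint32_t> output(indices.begin(), indices.end() - 1);
      output.insert(output.end(), params.begin() + depth, params.end());
      return output;
    }

    // gather_nd_shape({1, 4, 4, 3}, {1, 2, 3}) == {1, 2, 3}        (simple case)
    // gather_nd_shape({1, 4, 4, 3}, {1, 2, 1}) == {1, 2, 4, 4, 3}  (slices case)
    // gather_nd_shape({1, 4, 4, 3}, {1, 2, 5}) throws              (NEG case)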
diff --git a/compiler/luci/service/src/Nodes/CircleGreater.cpp b/compiler/luci/service/src/Nodes/CircleGreater.cpp
new file mode 100644
index 000000000..366d955bf
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleGreater.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleGreater *)
+{
+ return _graph->nodes()->create<luci::CircleGreater>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleGreater.test.cpp b/compiler/luci/service/src/Nodes/CircleGreater.test.cpp
new file mode 100644
index 000000000..6d2df61f0
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleGreater.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Greater)
+{
+ auto g = loco::make_graph();
+ auto node_gt = g->nodes()->create<luci::CircleGreater>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_gt, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_gt = dynamic_cast<luci::CircleGreater *>(cloned);
+ ASSERT_NE(nullptr, cloned_gt);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleGreaterEqual.cpp b/compiler/luci/service/src/Nodes/CircleGreaterEqual.cpp
new file mode 100644
index 000000000..9705bbe1e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleGreaterEqual.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleGreaterEqual *)
+{
+ return _graph->nodes()->create<luci::CircleGreaterEqual>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleGreaterEqual.test.cpp b/compiler/luci/service/src/Nodes/CircleGreaterEqual.test.cpp
new file mode 100644
index 000000000..10387df3a
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleGreaterEqual.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_GreaterEqual)
+{
+ auto g = loco::make_graph();
+ auto node_ge = g->nodes()->create<luci::CircleGreaterEqual>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_ge, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_ge = dynamic_cast<luci::CircleGreaterEqual *>(cloned);
+ ASSERT_NE(nullptr, cloned_ge);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleIfOut.cpp b/compiler/luci/service/src/Nodes/CircleIfOut.cpp
new file mode 100644
index 000000000..31ad7203f
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleIfOut.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <luci/Service/CircleShapeInference.h>
+#include <luci/Service/CircleTypeInference.h>
+
+namespace
+{
+
+struct CircleIfOutGraphs
+{
+ loco::GraphOutput *then_graph_output;
+ loco::GraphOutput *else_graph_output;
+};
+
+} // namespace
+
+namespace
+{
+
+CircleIfOutGraphs get_out_graphs(const luci::CircleIfOut *node)
+{
+ CircleIfOutGraphs ret_out;
+
+ /**
+ * @note The IF operator's output type and shape are those of the
+ * corresponding "then" and "else" Graph Outputs.
+ */
+ auto circle_if = loco::must_cast<const luci::CircleIf *>(node->input());
+
+ auto index = node->index();
+ auto then_graph = circle_if->then_graph();
+ auto else_graph = circle_if->else_graph();
+ assert(then_graph != nullptr);
+ assert(else_graph != nullptr);
+
+ // shape and type of both graph outputs are assumed to be the same;
+ // this is verified at post_import_graph() in Import
+ auto then_outputs = loco::output_nodes(then_graph);
+ auto else_outputs = loco::output_nodes(else_graph);
+ assert(then_outputs.size() == else_outputs.size());
+ assert(index < static_cast<int32_t>(then_outputs.size()));
+
+ auto then_out = loco::must_cast<luci::CircleOutput *>(then_outputs.at(index));
+ auto else_out = loco::must_cast<luci::CircleOutput *>(else_outputs.at(index));
+
+ auto then_graph_outputs = then_graph->outputs(); // loco::GraphOutput items
+ auto else_graph_outputs = else_graph->outputs();
+ assert(then_graph_outputs->size() == else_graph_outputs->size());
+
+ ret_out.then_graph_output = then_graph_outputs->at(then_out->index());
+ ret_out.else_graph_output = else_graph_outputs->at(else_out->index());
+
+ return ret_out;
+}
+
+} // namespace
+
+namespace luci
+{
+
+loco::TensorShape sinf::Algorithm::visit(const luci::CircleIfOut *node)
+{
+ auto graphs = get_out_graphs(node);
+ assert(*graphs.then_graph_output->shape() == *graphs.else_graph_output->shape());
+ return *graphs.then_graph_output->shape();
+}
+
+loco::DataType tinf::Algorithm::visit(const luci::CircleIfOut *node)
+{
+ auto graphs = get_out_graphs(node);
+ assert(graphs.then_graph_output->dtype() == graphs.else_graph_output->dtype());
+ return graphs.then_graph_output->dtype();
+}
+
+} // namespace luci
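
The comment in get_out_graphs() defers the real shape/dtype agreement check to post_import_graph() in Import; the asserts above only guard debug builds. A minimal sketch of what that agreement amounts to, using only the loco::GraphOutput accessors already used in this file (the free function itself is illustrative, not the Import module's API):

    #include <loco.h>

    // Both branches of a CircleIf must expose the same shape and dtype at
    // each output index; otherwise CircleIfOut inference is ill-defined.
    bool if_output_agrees(loco::GraphOutput *then_out, loco::GraphOutput *else_out)
    {
      return *then_out->shape() == *else_out->shape() &&
             then_out->dtype() == else_out->dtype();
    }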
diff --git a/compiler/luci/service/src/Nodes/CircleInstanceNorm.cpp b/compiler/luci/service/src/Nodes/CircleInstanceNorm.cpp
new file mode 100644
index 000000000..d9e49d8ed
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleInstanceNorm.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleInstanceNorm *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleInstanceNorm>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->epsilon(node->epsilon());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleInstanceNorm.test.cpp b/compiler/luci/service/src/Nodes/CircleInstanceNorm.test.cpp
new file mode 100644
index 000000000..bae92b1ae
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleInstanceNorm.test.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_InstanceNorm)
+{
+ auto g = loco::make_graph();
+ auto node_inst = g->nodes()->create<luci::CircleInstanceNorm>();
+ node_inst->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_inst->epsilon(3);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_inst, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_inst = dynamic_cast<luci::CircleInstanceNorm *>(cloned);
+ ASSERT_NE(nullptr, cloned_inst);
+ ASSERT_EQ(node_inst->fusedActivationFunction(), cloned_inst->fusedActivationFunction());
+ ASSERT_EQ(node_inst->epsilon(), cloned_inst->epsilon());
+}
+
+TEST(CloneNodeTest, clone_InstanceNorm_fusedact_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_inst = g->nodes()->create<luci::CircleInstanceNorm>();
+ node_inst->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_inst, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleL2Normalize.cpp b/compiler/luci/service/src/Nodes/CircleL2Normalize.cpp
new file mode 100644
index 000000000..afa2a6acb
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleL2Normalize.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleL2Normalize *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleL2Normalize>();
+ if (cloned != nullptr)
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleL2Normalize.test.cpp b/compiler/luci/service/src/Nodes/CircleL2Normalize.test.cpp
new file mode 100644
index 000000000..0f148797e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleL2Normalize.test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_L2Normalize)
+{
+ auto g = loco::make_graph();
+ auto node_l2n = g->nodes()->create<luci::CircleL2Normalize>();
+ node_l2n->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_l2n, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_l2n = dynamic_cast<luci::CircleL2Normalize *>(cloned);
+ ASSERT_NE(nullptr, cloned_l2n);
+ ASSERT_EQ(node_l2n->fusedActivationFunction(), cloned_l2n->fusedActivationFunction());
+}
+
+TEST(CloneNodeTest, clone_L2Normalize_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_l2n = g->nodes()->create<luci::CircleL2Normalize>();
+ node_l2n->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_l2n, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleL2Pool2D.cpp b/compiler/luci/service/src/Nodes/CircleL2Pool2D.cpp
new file mode 100644
index 000000000..2d876c5bc
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleL2Pool2D.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleL2Pool2D *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+ if (node->padding() == luci::Padding::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleL2Pool2D>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->padding(node->padding());
+ cloned->filter()->h(node->filter()->h());
+ cloned->filter()->w(node->filter()->w());
+ cloned->stride()->h(node->stride()->h());
+ cloned->stride()->w(node->stride()->w());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleL2Pool2D.test.cpp b/compiler/luci/service/src/Nodes/CircleL2Pool2D.test.cpp
new file mode 100644
index 000000000..37344fd9a
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleL2Pool2D.test.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_L2Pool2D)
+{
+ auto g = loco::make_graph();
+ auto node_l2p = g->nodes()->create<luci::CircleL2Pool2D>();
+ node_l2p->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_l2p->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_l2p, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_l2p = dynamic_cast<luci::CircleL2Pool2D *>(cloned);
+ ASSERT_NE(nullptr, cloned_l2p);
+ ASSERT_EQ(node_l2p->fusedActivationFunction(), cloned_l2p->fusedActivationFunction());
+ ASSERT_EQ(node_l2p->padding(), cloned_l2p->padding());
+}
+
+TEST(CloneNodeTest, clone_L2Pool2D_fusedact_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_l2p = g->nodes()->create<luci::CircleL2Pool2D>();
+ node_l2p->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+ node_l2p->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_l2p, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
+
+TEST(CloneNodeTest, clone_L2Pool2D_padding_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_l2p = g->nodes()->create<luci::CircleL2Pool2D>();
+ node_l2p->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_l2p->padding(luci::Padding::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_l2p, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLeakyRelu.cpp b/compiler/luci/service/src/Nodes/CircleLeakyRelu.cpp
new file mode 100644
index 000000000..91030618c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLeakyRelu.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLeakyRelu *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleLeakyRelu>();
+ if (cloned != nullptr)
+ cloned->alpha(node->alpha());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLeakyRelu.test.cpp b/compiler/luci/service/src/Nodes/CircleLeakyRelu.test.cpp
new file mode 100644
index 000000000..17fc1442a
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLeakyRelu.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_LeakyRelu)
+{
+ auto g = loco::make_graph();
+ auto node_lr = g->nodes()->create<luci::CircleLeakyRelu>();
+ node_lr->alpha(1.2f);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_lr, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_lr = dynamic_cast<luci::CircleLeakyRelu *>(cloned);
+ ASSERT_NE(nullptr, cloned_lr);
+ ASSERT_EQ(node_lr->alpha(), cloned_lr->alpha());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLess.cpp b/compiler/luci/service/src/Nodes/CircleLess.cpp
new file mode 100644
index 000000000..33b70b735
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLess.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLess *)
+{
+ return _graph->nodes()->create<luci::CircleLess>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLess.test.cpp b/compiler/luci/service/src/Nodes/CircleLess.test.cpp
new file mode 100644
index 000000000..43248948d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLess.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Less)
+{
+ auto g = loco::make_graph();
+ auto node_less = g->nodes()->create<luci::CircleLess>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_less, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_less = dynamic_cast<luci::CircleLess *>(cloned);
+ ASSERT_NE(nullptr, cloned_less);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLessEqual.cpp b/compiler/luci/service/src/Nodes/CircleLessEqual.cpp
new file mode 100644
index 000000000..22491365a
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLessEqual.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLessEqual *)
+{
+ return _graph->nodes()->create<luci::CircleLessEqual>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLessEqual.test.cpp b/compiler/luci/service/src/Nodes/CircleLessEqual.test.cpp
new file mode 100644
index 000000000..0a87daf5d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLessEqual.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_LessEqual)
+{
+ auto g = loco::make_graph();
+ auto node_le = g->nodes()->create<luci::CircleLessEqual>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_le, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_le = dynamic_cast<luci::CircleLessEqual *>(cloned);
+ ASSERT_NE(nullptr, cloned_le);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLocalResponseNormalization.cpp b/compiler/luci/service/src/Nodes/CircleLocalResponseNormalization.cpp
new file mode 100644
index 000000000..bf69b5ef5
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLocalResponseNormalization.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLocalResponseNormalization *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleLocalResponseNormalization>();
+ if (cloned != nullptr)
+ {
+ cloned->radius(node->radius());
+ cloned->bias(node->bias());
+ cloned->alpha(node->alpha());
+ cloned->beta(node->beta());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLocalResponseNormalization.test.cpp b/compiler/luci/service/src/Nodes/CircleLocalResponseNormalization.test.cpp
new file mode 100644
index 000000000..262b119bb
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLocalResponseNormalization.test.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_LocalResponseNormalization)
+{
+ auto g = loco::make_graph();
+ auto node_lrn = g->nodes()->create<luci::CircleLocalResponseNormalization>();
+ node_lrn->radius(32);
+ node_lrn->bias(1.2f);
+ node_lrn->alpha(3.4f);
+ node_lrn->beta(5.7f);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_lrn, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_lrn = dynamic_cast<luci::CircleLocalResponseNormalization *>(cloned);
+ ASSERT_NE(nullptr, cloned_lrn);
+ ASSERT_EQ(node_lrn->radius(), cloned_lrn->radius());
+ ASSERT_EQ(node_lrn->bias(), cloned_lrn->bias());
+ ASSERT_EQ(node_lrn->alpha(), cloned_lrn->alpha());
+ ASSERT_EQ(node_lrn->beta(), cloned_lrn->beta());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLog.cpp b/compiler/luci/service/src/Nodes/CircleLog.cpp
new file mode 100644
index 000000000..5788f129f
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLog.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLog *)
+{
+ return _graph->nodes()->create<luci::CircleLog>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLog.test.cpp b/compiler/luci/service/src/Nodes/CircleLog.test.cpp
new file mode 100644
index 000000000..d1ee1428e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLog.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Log)
+{
+ auto g = loco::make_graph();
+ auto node_log = g->nodes()->create<luci::CircleLog>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_log, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_log = dynamic_cast<luci::CircleLog *>(cloned);
+ ASSERT_NE(nullptr, cloned_log);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLogSoftmax.cpp b/compiler/luci/service/src/Nodes/CircleLogSoftmax.cpp
new file mode 100644
index 000000000..352160aff
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogSoftmax.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLogSoftmax *)
+{
+ return _graph->nodes()->create<luci::CircleLogSoftmax>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLogSoftmax.test.cpp b/compiler/luci/service/src/Nodes/CircleLogSoftmax.test.cpp
new file mode 100644
index 000000000..feebb79cb
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogSoftmax.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_LogSoftmax)
+{
+ auto g = loco::make_graph();
+ auto node_logs = g->nodes()->create<luci::CircleLogSoftmax>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_logs, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_logs = dynamic_cast<luci::CircleLogSoftmax *>(cloned);
+ ASSERT_NE(nullptr, cloned_logs);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLogicalAnd.cpp b/compiler/luci/service/src/Nodes/CircleLogicalAnd.cpp
new file mode 100644
index 000000000..5df62b951
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogicalAnd.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLogicalAnd *)
+{
+ return _graph->nodes()->create<luci::CircleLogicalAnd>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLogicalAnd.test.cpp b/compiler/luci/service/src/Nodes/CircleLogicalAnd.test.cpp
new file mode 100644
index 000000000..aa811edfa
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogicalAnd.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_LogicalAnd)
+{
+ auto g = loco::make_graph();
+ auto node_logand = g->nodes()->create<luci::CircleLogicalAnd>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_logand, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_logand = dynamic_cast<luci::CircleLogicalAnd *>(cloned);
+ ASSERT_NE(nullptr, cloned_logand);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLogicalNot.cpp b/compiler/luci/service/src/Nodes/CircleLogicalNot.cpp
new file mode 100644
index 000000000..ac982829d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogicalNot.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLogicalNot *)
+{
+ return _graph->nodes()->create<luci::CircleLogicalNot>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLogicalNot.test.cpp b/compiler/luci/service/src/Nodes/CircleLogicalNot.test.cpp
new file mode 100644
index 000000000..9e55be944
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogicalNot.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_LogicalNot)
+{
+ auto g = loco::make_graph();
+ auto node_lognot = g->nodes()->create<luci::CircleLogicalNot>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_lognot, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_lognot = dynamic_cast<luci::CircleLogicalNot *>(cloned);
+ ASSERT_NE(nullptr, cloned_lognot);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLogicalOr.cpp b/compiler/luci/service/src/Nodes/CircleLogicalOr.cpp
new file mode 100644
index 000000000..1201d6f34
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogicalOr.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLogicalOr *)
+{
+ return _graph->nodes()->create<luci::CircleLogicalOr>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLogicalOr.test.cpp b/compiler/luci/service/src/Nodes/CircleLogicalOr.test.cpp
new file mode 100644
index 000000000..19b706dcd
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogicalOr.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_LogicalOr)
+{
+ auto g = loco::make_graph();
+ auto node_logor = g->nodes()->create<luci::CircleLogicalOr>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_logor, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_logor = dynamic_cast<luci::CircleLogicalOr *>(cloned);
+ ASSERT_NE(nullptr, cloned_logor);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleLogistic.cpp b/compiler/luci/service/src/Nodes/CircleLogistic.cpp
new file mode 100644
index 000000000..b21b187e9
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogistic.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleLogistic *)
+{
+ return _graph->nodes()->create<luci::CircleLogistic>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleLogistic.test.cpp b/compiler/luci/service/src/Nodes/CircleLogistic.test.cpp
new file mode 100644
index 000000000..05dbe46e4
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleLogistic.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Logistic)
+{
+ auto g = loco::make_graph();
+ auto node_log = g->nodes()->create<luci::CircleLogistic>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_log, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_log = dynamic_cast<luci::CircleLogistic *>(cloned);
+ ASSERT_NE(nullptr, cloned_log);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleMatrixDiag.cpp b/compiler/luci/service/src/Nodes/CircleMatrixDiag.cpp
new file mode 100644
index 000000000..2bffa07b1
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMatrixDiag.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleMatrixDiag *)
+{
+ return _graph->nodes()->create<luci::CircleMatrixDiag>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleMatrixDiag.test.cpp b/compiler/luci/service/src/Nodes/CircleMatrixDiag.test.cpp
new file mode 100644
index 000000000..c08c4cb94
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMatrixDiag.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_MatrixDiag)
+{
+ auto g = loco::make_graph();
+ auto node_md = g->nodes()->create<luci::CircleMatrixDiag>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_md, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_md = dynamic_cast<luci::CircleMatrixDiag *>(cloned);
+ ASSERT_NE(nullptr, cloned_md);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleMatrixSetDiag.cpp b/compiler/luci/service/src/Nodes/CircleMatrixSetDiag.cpp
new file mode 100644
index 000000000..5ea2a5339
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMatrixSetDiag.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleMatrixSetDiag *)
+{
+ return _graph->nodes()->create<luci::CircleMatrixSetDiag>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleMatrixSetDiag.test.cpp b/compiler/luci/service/src/Nodes/CircleMatrixSetDiag.test.cpp
new file mode 100644
index 000000000..5ea77ba75
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMatrixSetDiag.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_MatrixSetDiag)
+{
+ auto g = loco::make_graph();
+ auto node_msd = g->nodes()->create<luci::CircleMatrixSetDiag>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_msd, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_msd = dynamic_cast<luci::CircleMatrixSetDiag *>(cloned);
+ ASSERT_NE(nullptr, cloned_msd);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleMaxPool2D.cpp b/compiler/luci/service/src/Nodes/CircleMaxPool2D.cpp
new file mode 100644
index 000000000..b21610c7f
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMaxPool2D.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleMaxPool2D *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+ if (node->padding() == luci::Padding::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleMaxPool2D>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->padding(node->padding());
+ cloned->filter()->h(node->filter()->h());
+ cloned->filter()->w(node->filter()->w());
+ cloned->stride()->h(node->stride()->h());
+ cloned->stride()->w(node->stride()->w());
+ }
+ return cloned;
+}
+
+} // namespace luci
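
Note: unlike the attribute-free ops above, CircleMaxPool2D::visit() refuses to clone a node whose fusedActivationFunction() or padding() is still UNDEFINED, and signals this with nullptr rather than an exception — the *_NEG tests below pin that behavior down. A hypothetical caller (copy_into and its error handling are illustrative, not part of this patch) would check for that sentinel:

    // Illustrative helper, not in this commit: clone src into dst and
    // treat a nullptr result (UNDEFINED enum attribute) as an error.
    luci::CircleNode *copy_into(loco::Graph *dst, const luci::CircleNode *src)
    {
      auto *cloned = luci::clone_node(src, dst);
      if (cloned == nullptr)
        throw std::runtime_error("clone_node failed for " + src->name());
      return cloned;
    }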
diff --git a/compiler/luci/service/src/Nodes/CircleMaxPool2D.test.cpp b/compiler/luci/service/src/Nodes/CircleMaxPool2D.test.cpp
new file mode 100644
index 000000000..415cf7c44
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMaxPool2D.test.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_MaxPool2D)
+{
+ auto g = loco::make_graph();
+ auto node_mp = g->nodes()->create<luci::CircleMaxPool2D>();
+ node_mp->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_mp->padding(luci::Padding::SAME);
+ node_mp->filter()->h(1);
+ node_mp->filter()->w(2);
+ node_mp->stride()->h(3);
+ node_mp->stride()->w(4);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_mp, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_mp = dynamic_cast<luci::CircleMaxPool2D *>(cloned);
+ ASSERT_NE(nullptr, cloned_mp);
+ ASSERT_EQ(node_mp->fusedActivationFunction(), cloned_mp->fusedActivationFunction());
+ ASSERT_EQ(node_mp->padding(), cloned_mp->padding());
+ ASSERT_EQ(node_mp->filter()->h(), cloned_mp->filter()->h());
+ ASSERT_EQ(node_mp->filter()->w(), cloned_mp->filter()->w());
+ ASSERT_EQ(node_mp->stride()->h(), cloned_mp->stride()->h());
+ ASSERT_EQ(node_mp->stride()->w(), cloned_mp->stride()->w());
+}
+
+TEST(CloneNodeTest, clone_MaxPool2D_fusedact_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_mp = g->nodes()->create<luci::CircleMaxPool2D>();
+ node_mp->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+ node_mp->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_mp, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
+
+TEST(CloneNodeTest, clone_MaxPool2D_padding_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_mp = g->nodes()->create<luci::CircleMaxPool2D>();
+ node_mp->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_mp->padding(luci::Padding::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_mp, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleMaximum.cpp b/compiler/luci/service/src/Nodes/CircleMaximum.cpp
new file mode 100644
index 000000000..545f4ca21
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMaximum.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleMaximum *)
+{
+ return _graph->nodes()->create<luci::CircleMaximum>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleMaximum.test.cpp b/compiler/luci/service/src/Nodes/CircleMaximum.test.cpp
new file mode 100644
index 000000000..6f1ada060
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMaximum.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Maximum)
+{
+ auto g = loco::make_graph();
+ auto node_max = g->nodes()->create<luci::CircleMaximum>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_max, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_max = dynamic_cast<luci::CircleMaximum *>(cloned);
+ ASSERT_NE(nullptr, cloned_max);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleMean.cpp b/compiler/luci/service/src/Nodes/CircleMean.cpp
index a78713698..95bc54532 100644
--- a/compiler/luci/service/src/Nodes/CircleMean.cpp
+++ b/compiler/luci/service/src/Nodes/CircleMean.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,15 +14,17 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleMean *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleMean *node)
{
- return legalized_signature(
- reduced_signature(node->input(), node->reduction_indices(), node->keep_dims()));
+ auto *cloned = _graph->nodes()->create<luci::CircleMean>();
+ if (cloned != nullptr)
+ cloned->keep_dims(node->keep_dims());
+ return cloned;
}

} // namespace luci
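
Note: this file previously held the ShapeSignature inference rule for CircleMean and is repurposed here for cloning. Only keep_dims() is copied: operand links such as input() and reduction_indices() are graph connections, which the caller re-wires in the destination graph. An illustrative follow-up under that assumption (variable names are not from this patch):

    // Sketch: after cloning, the caller reconnects operands by hand.
    auto *mean = loco::must_cast<luci::CircleMean *>(luci::clone_node(src_mean, graph));
    mean->input(new_input);               // operand links are not cloned
    mean->reduction_indices(new_indices);
    assert(mean->keep_dims() == src_mean->keep_dims()); // attribute is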
diff --git a/compiler/luci/service/src/Nodes/CircleMean.test.cpp b/compiler/luci/service/src/Nodes/CircleMean.test.cpp
new file mode 100644
index 000000000..aa1b88f13
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMean.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Mean)
+{
+ auto g = loco::make_graph();
+ auto node_mean = g->nodes()->create<luci::CircleMean>();
+ node_mean->keep_dims(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_mean, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_mean = dynamic_cast<luci::CircleMean *>(cloned);
+ ASSERT_NE(nullptr, cloned_mean);
+ ASSERT_EQ(node_mean->keep_dims(), cloned_mean->keep_dims());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleMinimum.cpp b/compiler/luci/service/src/Nodes/CircleMinimum.cpp
new file mode 100644
index 000000000..2c2755c55
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMinimum.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleMinimum *)
+{
+ return _graph->nodes()->create<luci::CircleMinimum>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleMinimum.test.cpp b/compiler/luci/service/src/Nodes/CircleMinimum.test.cpp
new file mode 100644
index 000000000..0a54be71c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMinimum.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Minimum)
+{
+ auto g = loco::make_graph();
+ auto node_min = g->nodes()->create<luci::CircleMinimum>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_min, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_min = dynamic_cast<luci::CircleMinimum *>(cloned);
+ ASSERT_NE(nullptr, cloned_min);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleMirrorPad.cpp b/compiler/luci/service/src/Nodes/CircleMirrorPad.cpp
new file mode 100644
index 000000000..919221a0b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMirrorPad.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleMirrorPad *node)
+{
+ if (node->mode() == luci::MirrorPadMode::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleMirrorPad>();
+ if (cloned != nullptr)
+ cloned->mode(node->mode());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleMirrorPad.test.cpp b/compiler/luci/service/src/Nodes/CircleMirrorPad.test.cpp
new file mode 100644
index 000000000..911cf6d3b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMirrorPad.test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_MirrorPad)
+{
+ auto g = loco::make_graph();
+ auto node_mp = g->nodes()->create<luci::CircleMirrorPad>();
+ node_mp->mode(luci::MirrorPadMode::REFLECT);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_mp, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_mp = dynamic_cast<luci::CircleMirrorPad *>(cloned);
+ ASSERT_NE(nullptr, cloned_mp);
+ ASSERT_EQ(node_mp->mode(), cloned_mp->mode());
+}
+
+TEST(CloneNodeTest, clone_MirrorPad_mode_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_mp = g->nodes()->create<luci::CircleMirrorPad>();
+ node_mp->mode(luci::MirrorPadMode::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_mp, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleMul.cpp b/compiler/luci/service/src/Nodes/CircleMul.cpp
new file mode 100644
index 000000000..096aed196
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMul.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleMul *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleMul>();
+ if (cloned != nullptr)
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleMul.test.cpp b/compiler/luci/service/src/Nodes/CircleMul.test.cpp
new file mode 100644
index 000000000..dc5565f11
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleMul.test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Mul)
+{
+ auto g = loco::make_graph();
+ auto node_mul = g->nodes()->create<luci::CircleMul>();
+ node_mul->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_mul, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_mul = dynamic_cast<luci::CircleMul *>(cloned);
+ ASSERT_NE(nullptr, cloned_mul);
+ ASSERT_EQ(node_mul->fusedActivationFunction(), cloned_mul->fusedActivationFunction());
+}
+
+TEST(CloneNodeTest, clone_Mul_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_mul = g->nodes()->create<luci::CircleMul>();
+ node_mul->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_mul, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleNeg.cpp b/compiler/luci/service/src/Nodes/CircleNeg.cpp
new file mode 100644
index 000000000..312189e77
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNeg.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleNeg *)
+{
+ return _graph->nodes()->create<luci::CircleNeg>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleNeg.test.cpp b/compiler/luci/service/src/Nodes/CircleNeg.test.cpp
new file mode 100644
index 000000000..8c2880324
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNeg.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Neg)
+{
+ auto g = loco::make_graph();
+ auto node_neg = g->nodes()->create<luci::CircleNeg>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_neg, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_neg = dynamic_cast<luci::CircleNeg *>(cloned);
+ ASSERT_NE(nullptr, cloned_neg);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4.cpp b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4.cpp
new file mode 100644
index 000000000..4757e8314
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleNonMaxSuppressionV4 *)
+{
+ return _graph->nodes()->create<luci::CircleNonMaxSuppressionV4>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4.test.cpp b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4.test.cpp
new file mode 100644
index 000000000..34f5b0325
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_NonMaxSuppressionV4)
+{
+ auto g = loco::make_graph();
+ auto node_nms = g->nodes()->create<luci::CircleNonMaxSuppressionV4>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_nms, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_nms = dynamic_cast<luci::CircleNonMaxSuppressionV4 *>(cloned);
+ ASSERT_NE(nullptr, cloned_nms);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4Out.cpp b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4Out.cpp
new file mode 100644
index 000000000..2a12f2a45
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4Out.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleNonMaxSuppressionV4Out *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleNonMaxSuppressionV4Out>();
+ if (cloned != nullptr)
+ cloned->index(node->index());
+ return cloned;
+}
+
+} // namespace luci
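
Note: CircleNonMaxSuppressionV4Out is a virtual node standing for one result of the multi-output NMS op, so its clone must preserve index(). A sketch of how such a pair is typically materialized (wiring shown for illustration only; it is not part of this patch):

    // Sketch: one virtual Out node per NMS result, tagged by index().
    auto *nms = g->nodes()->create<luci::CircleNonMaxSuppressionV4>();
    for (int32_t i = 0; i < 2; ++i) // V4 produces two outputs
    {
      auto *out = g->nodes()->create<luci::CircleNonMaxSuppressionV4Out>();
      out->input(nms); // connect the Out node to its parent op
      out->index(i);   // which result this virtual node carries
    }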
diff --git a/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp
new file mode 100644
index 000000000..ed9e0e019
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_NonMaxSuppressionV4Out)
+{
+ auto g = loco::make_graph();
+ auto node_nout = g->nodes()->create<luci::CircleNonMaxSuppressionV4Out>();
+ node_nout->index(1);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_nout, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_nout = dynamic_cast<luci::CircleNonMaxSuppressionV4Out *>(cloned);
+ ASSERT_NE(nullptr, cloned_nout);
+ ASSERT_EQ(node_nout->index(), cloned_nout->index());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5.cpp b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5.cpp
new file mode 100644
index 000000000..34d128072
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleNonMaxSuppressionV5 *)
+{
+ return _graph->nodes()->create<luci::CircleNonMaxSuppressionV5>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5.test.cpp b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5.test.cpp
new file mode 100644
index 000000000..faaee969e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_NonMaxSuppressionV5)
+{
+ auto g = loco::make_graph();
+ auto node_nms = g->nodes()->create<luci::CircleNonMaxSuppressionV5>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_nms, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_nms = dynamic_cast<luci::CircleNonMaxSuppressionV5 *>(cloned);
+ ASSERT_NE(nullptr, cloned_nms);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5Out.cpp b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5Out.cpp
new file mode 100644
index 000000000..e1d7875e7
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5Out.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleNonMaxSuppressionV5Out *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleNonMaxSuppressionV5Out>();
+ if (cloned != nullptr)
+ cloned->index(node->index());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5Out.test.cpp b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5Out.test.cpp
new file mode 100644
index 000000000..ef0f766b9
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNonMaxSuppressionV5Out.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_NonMaxSuppressionV5Out)
+{
+ auto g = loco::make_graph();
+ auto node_nout = g->nodes()->create<luci::CircleNonMaxSuppressionV5Out>();
+ node_nout->index(1);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_nout, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_nout = dynamic_cast<luci::CircleNonMaxSuppressionV5Out *>(cloned);
+ ASSERT_NE(nullptr, cloned_nout);
+ ASSERT_EQ(node_nout->index(), cloned_nout->index());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleNotEqual.cpp b/compiler/luci/service/src/Nodes/CircleNotEqual.cpp
new file mode 100644
index 000000000..4cb5320e8
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNotEqual.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleNotEqual *)
+{
+ return _graph->nodes()->create<luci::CircleNotEqual>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleNotEqual.test.cpp b/compiler/luci/service/src/Nodes/CircleNotEqual.test.cpp
new file mode 100644
index 000000000..20f7dbc4b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleNotEqual.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_NotEqual)
+{
+ auto g = loco::make_graph();
+ auto node_ne = g->nodes()->create<luci::CircleNotEqual>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_ne, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_ne = dynamic_cast<luci::CircleNotEqual *>(cloned);
+ ASSERT_NE(nullptr, cloned_ne);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleOneHot.cpp b/compiler/luci/service/src/Nodes/CircleOneHot.cpp
new file mode 100644
index 000000000..a33c8ff26
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleOneHot.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleOneHot *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleOneHot>();
+ if (cloned != nullptr)
+ cloned->axis(node->axis());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleOneHot.test.cpp b/compiler/luci/service/src/Nodes/CircleOneHot.test.cpp
new file mode 100644
index 000000000..dea927d1b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleOneHot.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_OneHot)
+{
+ auto g = loco::make_graph();
+ auto node_oh = g->nodes()->create<luci::CircleOneHot>();
+ node_oh->axis(3);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_oh, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_oh = dynamic_cast<luci::CircleOneHot *>(cloned);
+ ASSERT_NE(nullptr, cloned_oh);
+ ASSERT_EQ(node_oh->axis(), cloned_oh->axis());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleOutputDummy.cpp b/compiler/luci/service/src/Nodes/CircleOutputDummy.cpp
index e0f13c439..ce94dff94 100644
--- a/compiler/luci/service/src/Nodes/CircleOutputDummy.cpp
+++ b/compiler/luci/service/src/Nodes/CircleOutputDummy.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,11 +14,14 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleOutputDummy *) { return ShapeSignature(); }
+luci::CircleNode *CloneNode::visit(const luci::CircleOutputDummy *)
+{
+ return _graph->nodes()->create<luci::CircleOutputDummy>();
+}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleOutputDummy.test.cpp b/compiler/luci/service/src/Nodes/CircleOutputDummy.test.cpp
new file mode 100644
index 000000000..6170c7c41
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleOutputDummy.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_OutputDummy)
+{
+ auto g = loco::make_graph();
+ auto node_dummy = g->nodes()->create<luci::CircleOutputDummy>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_dummy, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_dummy = dynamic_cast<luci::CircleOutputDummy *>(cloned);
+ ASSERT_NE(nullptr, cloned_dummy);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleOutputExclude.cpp b/compiler/luci/service/src/Nodes/CircleOutputExclude.cpp
index 75bbbb3c0..1b0f919c3 100644
--- a/compiler/luci/service/src/Nodes/CircleOutputExclude.cpp
+++ b/compiler/luci/service/src/Nodes/CircleOutputExclude.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,14 +14,14 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleOutputExclude *)
+luci::CircleNode *CloneNode::visit(const luci::CircleOutputExclude *)
{
- return ShapeSignature();
+ return _graph->nodes()->create<luci::CircleOutputExclude>();
}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleOutputExclude.test.cpp b/compiler/luci/service/src/Nodes/CircleOutputExclude.test.cpp
new file mode 100644
index 000000000..120ffe86b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleOutputExclude.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_OutputExclude)
+{
+ auto g = loco::make_graph();
+ auto node_outex = g->nodes()->create<luci::CircleOutputExclude>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_outex, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_outex = dynamic_cast<luci::CircleOutputExclude *>(cloned);
+ ASSERT_NE(nullptr, cloned_outex);
+}
diff --git a/compiler/luci/service/src/Nodes/CirclePRelu.cpp b/compiler/luci/service/src/Nodes/CirclePRelu.cpp
new file mode 100644
index 000000000..8a34e507e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePRelu.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CirclePRelu *)
+{
+ return _graph->nodes()->create<luci::CirclePRelu>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CirclePRelu.test.cpp b/compiler/luci/service/src/Nodes/CirclePRelu.test.cpp
new file mode 100644
index 000000000..1150e3fa4
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePRelu.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_PRelu)
+{
+ auto g = loco::make_graph();
+ auto node_pr = g->nodes()->create<luci::CirclePRelu>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_pr, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_pr = dynamic_cast<luci::CirclePRelu *>(cloned);
+ ASSERT_NE(nullptr, cloned_pr);
+}
diff --git a/runtime/onert/core/src/compiler/ParamChecker.cc b/compiler/luci/service/src/Nodes/CirclePack.cpp
index c4f80f087..a3cee0bfd 100644
--- a/runtime/onert/core/src/compiler/ParamChecker.cc
+++ b/compiler/luci/service/src/Nodes/CirclePack.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,20 +14,17 @@
 * limitations under the License.
 */

-#include "ParamChecker.h"
+#include "CircleCloneNode.h"

-#include "ir/Graph.h"
-
-namespace onert
-{
-namespace compiler
+namespace luci
{

-void ParamChecker::operator()()
+luci::CircleNode *CloneNode::visit(const luci::CirclePack *node)
{
- _model->operations().iterate(
- [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); });
+ auto *cloned = _graph->nodes()->create<luci::CirclePack>(node->values_count());
+ if (cloned != nullptr)
+ cloned->axis(node->axis());
+ return cloned;
}

-} // namespace compiler
-} // namespace onert
+} // namespace luci
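
CirclePack is the first clone in this series with real state to carry over: values_count() is a constructor argument of the node, so it is forwarded into create<>(), while axis() is copied as a plain attribute afterwards. Note that visit() duplicates attributes only; judging by the tests, input edges such as values(i) are not cloned and are presumably reconnected by the caller. A hedged usage sketch:

  // Sketch: attribute-preserving clone of CirclePack, per the visit() above.
  #include "luci/Service/CircleNodeClone.h"

  #include <cassert>

  void clone_pack_example()
  {
    auto src = loco::make_graph();
    auto *pack = src->nodes()->create<luci::CirclePack>(3); // values_count == 3
    pack->axis(7);

    auto dst = loco::make_graph();
    auto *copy = dynamic_cast<luci::CirclePack *>(luci::clone_node(pack, dst.get()));
    assert(copy != nullptr);
    assert(copy->values_count() == 3 && copy->axis() == 7);
    // values(i) inputs of the copy start out unconnected in the new graph.
  }
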
diff --git a/compiler/luci/service/src/Nodes/CirclePack.test.cpp b/compiler/luci/service/src/Nodes/CirclePack.test.cpp
new file mode 100644
index 000000000..b808956dc
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePack.test.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Pack)
+{
+ auto g = loco::make_graph();
+ auto node_pack = g->nodes()->create<luci::CirclePack>(3);
+ node_pack->axis(7);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_pack, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_pack = dynamic_cast<luci::CirclePack *>(cloned);
+ ASSERT_NE(nullptr, cloned_pack);
+ ASSERT_EQ(node_pack->values_count(), cloned_pack->values_count());
+ ASSERT_EQ(node_pack->axis(), cloned_pack->axis());
+}
diff --git a/compiler/luci/service/src/Nodes/CirclePad.cpp b/compiler/luci/service/src/Nodes/CirclePad.cpp
new file mode 100644
index 000000000..425bdce4d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePad.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CirclePad *)
+{
+ return _graph->nodes()->create<luci::CirclePad>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CirclePad.test.cpp b/compiler/luci/service/src/Nodes/CirclePad.test.cpp
new file mode 100644
index 000000000..1d5f8375e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePad.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Pad)
+{
+ auto g = loco::make_graph();
+ auto node_pad = g->nodes()->create<luci::CirclePad>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_pad, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_pad = dynamic_cast<luci::CirclePad *>(cloned);
+ ASSERT_NE(nullptr, cloned_pad);
+}
diff --git a/compiler/luci/service/src/Nodes/CirclePadV2.cpp b/compiler/luci/service/src/Nodes/CirclePadV2.cpp
new file mode 100644
index 000000000..0e93869b6
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePadV2.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CirclePadV2 *)
+{
+ return _graph->nodes()->create<luci::CirclePadV2>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CirclePadV2.test.cpp b/compiler/luci/service/src/Nodes/CirclePadV2.test.cpp
new file mode 100644
index 000000000..d011f69f8
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePadV2.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_PadV2)
+{
+ auto g = loco::make_graph();
+ auto node_pad = g->nodes()->create<luci::CirclePadV2>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_pad, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_pad = dynamic_cast<luci::CirclePadV2 *>(cloned);
+ ASSERT_NE(nullptr, cloned_pad);
+}
diff --git a/compiler/luci/service/src/Nodes/CirclePow.cpp b/compiler/luci/service/src/Nodes/CirclePow.cpp
new file mode 100644
index 000000000..bf9388913
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePow.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CirclePow *)
+{
+ return _graph->nodes()->create<luci::CirclePow>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CirclePow.test.cpp b/compiler/luci/service/src/Nodes/CirclePow.test.cpp
new file mode 100644
index 000000000..946298932
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CirclePow.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Pow)
+{
+ auto g = loco::make_graph();
+ auto node_pow = g->nodes()->create<luci::CirclePow>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_pow, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_pow = dynamic_cast<luci::CirclePow *>(cloned);
+ ASSERT_NE(nullptr, cloned_pow);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleRange.cpp b/compiler/luci/service/src/Nodes/CircleRange.cpp
new file mode 100644
index 000000000..9c6f7b494
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRange.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleRange *)
+{
+ return _graph->nodes()->create<luci::CircleRange>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleRange.test.cpp b/compiler/luci/service/src/Nodes/CircleRange.test.cpp
new file mode 100644
index 000000000..b2fb29617
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRange.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Range)
+{
+ auto g = loco::make_graph();
+ auto node_range = g->nodes()->create<luci::CircleRange>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_range, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_range = dynamic_cast<luci::CircleRange *>(cloned);
+ ASSERT_NE(nullptr, cloned_range);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleRank.cpp b/compiler/luci/service/src/Nodes/CircleRank.cpp
new file mode 100644
index 000000000..db8171c51
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRank.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleRank *)
+{
+ return _graph->nodes()->create<luci::CircleRank>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleRank.test.cpp b/compiler/luci/service/src/Nodes/CircleRank.test.cpp
new file mode 100644
index 000000000..0e81fb254
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRank.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Rank)
+{
+ auto g = loco::make_graph();
+ auto node_rank = g->nodes()->create<luci::CircleRank>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rank, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rank = dynamic_cast<luci::CircleRank *>(cloned);
+ ASSERT_NE(nullptr, cloned_rank);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReduceAny.cpp b/compiler/luci/service/src/Nodes/CircleReduceAny.cpp
index 27da81466..3ab0b3b59 100644
--- a/compiler/luci/service/src/Nodes/CircleReduceAny.cpp
+++ b/compiler/luci/service/src/Nodes/CircleReduceAny.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,15 +14,17 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleReduceAny *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleReduceAny *node)
{
- return legalized_signature(
- reduced_signature(node->input(), node->reduction_indices(), node->keep_dims()));
+ auto *cloned = _graph->nodes()->create<luci::CircleReduceAny>();
+ if (cloned != nullptr)
+ cloned->keep_dims(node->keep_dims());
+ return cloned;
}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleReduceAny.test.cpp b/compiler/luci/service/src/Nodes/CircleReduceAny.test.cpp
new file mode 100644
index 000000000..904b5a139
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReduceAny.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ReduceAny)
+{
+ auto g = loco::make_graph();
+ auto node_ra = g->nodes()->create<luci::CircleReduceAny>();
+ node_ra->keep_dims(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_ra, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_ra = dynamic_cast<luci::CircleReduceAny *>(cloned);
+ ASSERT_NE(nullptr, cloned_ra);
+ ASSERT_EQ(node_ra->keep_dims(), cloned_ra->keep_dims());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReduceMax.cpp b/compiler/luci/service/src/Nodes/CircleReduceMax.cpp
index 48d9cb970..c026905ca 100644
--- a/compiler/luci/service/src/Nodes/CircleReduceMax.cpp
+++ b/compiler/luci/service/src/Nodes/CircleReduceMax.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,15 +14,17 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleReduceMax *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleReduceMax *node)
{
- return legalized_signature(
- reduced_signature(node->input(), node->reduction_indices(), node->keep_dims()));
+ auto *cloned = _graph->nodes()->create<luci::CircleReduceMax>();
+ if (cloned != nullptr)
+ cloned->keep_dims(node->keep_dims());
+ return cloned;
}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleReduceMax.test.cpp b/compiler/luci/service/src/Nodes/CircleReduceMax.test.cpp
new file mode 100644
index 000000000..b3f3c881e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReduceMax.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ReduceMax)
+{
+ auto g = loco::make_graph();
+ auto node_rmax = g->nodes()->create<luci::CircleReduceMax>();
+ node_rmax->keep_dims(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rmax, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rmax = dynamic_cast<luci::CircleReduceMax *>(cloned);
+ ASSERT_NE(nullptr, cloned_rmax);
+ ASSERT_EQ(node_rmax->keep_dims(), cloned_rmax->keep_dims());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReduceMin.cpp b/compiler/luci/service/src/Nodes/CircleReduceMin.cpp
index 9a9997118..3dfa19680 100644
--- a/compiler/luci/service/src/Nodes/CircleReduceMin.cpp
+++ b/compiler/luci/service/src/Nodes/CircleReduceMin.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,15 +14,17 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleReduceMin *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleReduceMin *node)
{
- return legalized_signature(
- reduced_signature(node->input(), node->reduction_indices(), node->keep_dims()));
+ auto *cloned = _graph->nodes()->create<luci::CircleReduceMin>();
+ if (cloned != nullptr)
+ cloned->keep_dims(node->keep_dims());
+ return cloned;
}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleReduceMin.test.cpp b/compiler/luci/service/src/Nodes/CircleReduceMin.test.cpp
new file mode 100644
index 000000000..b3faa68da
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReduceMin.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ReduceMin)
+{
+ auto g = loco::make_graph();
+ auto node_rmin = g->nodes()->create<luci::CircleReduceMin>();
+ node_rmin->keep_dims(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rmin, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rmin = dynamic_cast<luci::CircleReduceMin *>(cloned);
+ ASSERT_NE(nullptr, cloned_rmin);
+ ASSERT_EQ(node_rmin->keep_dims(), cloned_rmin->keep_dims());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReduceProd.cpp b/compiler/luci/service/src/Nodes/CircleReduceProd.cpp
index a9d381a74..418a8ce32 100644
--- a/compiler/luci/service/src/Nodes/CircleReduceProd.cpp
+++ b/compiler/luci/service/src/Nodes/CircleReduceProd.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,15 +14,17 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleReduceProd *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleReduceProd *node)
{
- return legalized_signature(
- reduced_signature(node->input(), node->reduction_indices(), node->keep_dims()));
+ auto *cloned = _graph->nodes()->create<luci::CircleReduceProd>();
+ if (cloned != nullptr)
+ cloned->keep_dims(node->keep_dims());
+ return cloned;
}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleReduceProd.test.cpp b/compiler/luci/service/src/Nodes/CircleReduceProd.test.cpp
new file mode 100644
index 000000000..8caf8e91f
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReduceProd.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ReduceProd)
+{
+ auto g = loco::make_graph();
+ auto node_rp = g->nodes()->create<luci::CircleReduceProd>();
+ node_rp->keep_dims(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rp, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rp = dynamic_cast<luci::CircleReduceProd *>(cloned);
+ ASSERT_NE(nullptr, cloned_rp);
+ ASSERT_EQ(node_rp->keep_dims(), cloned_rp->keep_dims());
+}
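
ReduceAny, ReduceMax, ReduceMin and ReduceProd above all received the identical two-line clone body: create the node, copy keep_dims(). The reduction_indices() operand is an input edge rather than an attribute, so it never appears in these visits. The repetition could in principle be folded into one helper, sketched below (hypothetical, not in the codebase):

  // Hypothetical helper: the four reduce visits differ only in the
  // concrete node type they create.
  template <typename ReduceT>
  luci::CircleNode *clone_reduce(loco::Graph *graph, const ReduceT *node)
  {
    auto *cloned = graph->nodes()->create<ReduceT>();
    if (cloned != nullptr)
      cloned->keep_dims(node->keep_dims());
    return cloned;
  }

  // Each visit() could then be a one-liner:
  //   luci::CircleNode *CloneNode::visit(const luci::CircleReduceMax *node)
  //   {
  //     return clone_reduce(_graph, node);
  //   }
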
diff --git a/compiler/luci/service/src/Nodes/CircleRelu.cpp b/compiler/luci/service/src/Nodes/CircleRelu.cpp
index a7a7f6f0a..7447eea0c 100644
--- a/compiler/luci/service/src/Nodes/CircleRelu.cpp
+++ b/compiler/luci/service/src/Nodes/CircleRelu.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,14 +14,14 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleRelu *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleRelu *)
{
- return input_arg_signature(node, 0);
+ return _graph->nodes()->create<luci::CircleRelu>();
}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleRelu.test.cpp b/compiler/luci/service/src/Nodes/CircleRelu.test.cpp
new file mode 100644
index 000000000..6154376ba
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRelu.test.cpp
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+#include <luci/Service/CircleTypeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeRuleTest, simple_relu)
+{
+ luci::CircleInput input;
+ luci::CircleRelu relu;
+
+ input.shape({3, 4});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ relu.features(&input);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&relu, shape));
+ ASSERT_EQ(2, shape.rank());
+ ASSERT_EQ(3, shape.dim(0).value());
+ ASSERT_EQ(4, shape.dim(1).value());
+}
+
+TEST(DataTypeRuleTest, simple_relu)
+{
+ luci::CircleInput input;
+ luci::CircleRelu relu;
+
+ input.dtype(loco::DataType::S32);
+
+ relu.features(&input);
+
+ loco::DataType dtype;
+ luci::tinf::Rule type_inf_rule;
+
+ ASSERT_TRUE(type_inf_rule.infer(&relu, dtype));
+ ASSERT_EQ(loco::DataType::S32, dtype);
+}
+
+TEST(CloneNodeTest, clone_Relu)
+{
+ auto g = loco::make_graph();
+ auto node_relu = g->nodes()->create<luci::CircleRelu>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_relu, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_relu = dynamic_cast<luci::CircleRelu *>(cloned);
+ ASSERT_NE(nullptr, cloned_relu);
+}
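
Unlike the neighbouring test files, CircleRelu.test.cpp also pins down the inference rules: Relu forwards both the shape and the dtype of its features() input unchanged, which is exactly what ShapeRuleTest and DataTypeRuleTest assert. A condensed sketch of driving those rule objects directly, under the same assumptions as the tests:

  // Sketch: Relu's shape and dtype are forwarded from features().
  #include <luci/IR/CircleNodes.h>
  #include <luci/Service/CircleShapeInference.h>
  #include <luci/Service/CircleTypeInference.h>

  #include <loco/IR/TensorShape.h>

  #include <cassert>

  void relu_inference_example()
  {
    luci::CircleInput input;
    input.shape({3, 4});
    input.shape_status(luci::ShapeStatus::VALID);
    input.dtype(loco::DataType::S32);

    luci::CircleRelu relu;
    relu.features(&input);

    loco::TensorShape shape;
    loco::DataType dtype = loco::DataType::Unknown;
    assert(luci::sinf::Rule().infer(&relu, shape) && shape.rank() == 2); // {3, 4}
    assert(luci::tinf::Rule().infer(&relu, dtype) && dtype == loco::DataType::S32);
  }
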
diff --git a/compiler/luci/service/src/Nodes/CircleRelu6.cpp b/compiler/luci/service/src/Nodes/CircleRelu6.cpp
index 92a596d08..7b98311ed 100644
--- a/compiler/luci/service/src/Nodes/CircleRelu6.cpp
+++ b/compiler/luci/service/src/Nodes/CircleRelu6.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,14 +14,14 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleRelu6 *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleRelu6 *)
{
- return input_arg_signature(node, 0);
+ return _graph->nodes()->create<luci::CircleRelu6>();
}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleRelu6.test.cpp b/compiler/luci/service/src/Nodes/CircleRelu6.test.cpp
new file mode 100644
index 000000000..213dbcb09
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRelu6.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Relu6)
+{
+ auto g = loco::make_graph();
+ auto node_relu6 = g->nodes()->create<luci::CircleRelu6>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_relu6, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_relu6 = dynamic_cast<luci::CircleRelu6 *>(cloned);
+ ASSERT_NE(nullptr, cloned_relu6);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReluN1To1.cpp b/compiler/luci/service/src/Nodes/CircleReluN1To1.cpp
index 1e8d9971d..4efedb9fc 100644
--- a/compiler/luci/service/src/Nodes/CircleReluN1To1.cpp
+++ b/compiler/luci/service/src/Nodes/CircleReluN1To1.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,14 +14,14 @@
 * limitations under the License.
 */

-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"

namespace luci
{

-ShapeSignature ssinf::Algorithm::visit(const luci::CircleReluN1To1 *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleReluN1To1 *)
{
- return input_arg_signature(node, 0);
+ return _graph->nodes()->create<luci::CircleReluN1To1>();
}

} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleReluN1To1.test.cpp b/compiler/luci/service/src/Nodes/CircleReluN1To1.test.cpp
new file mode 100644
index 000000000..b828e795c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReluN1To1.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ReluN1To1)
+{
+ auto g = loco::make_graph();
+ auto node_relun1 = g->nodes()->create<luci::CircleReluN1To1>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_relun1, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_relun1 = dynamic_cast<luci::CircleReluN1To1 *>(cloned);
+ ASSERT_NE(nullptr, cloned_relun1);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReshape.cpp b/compiler/luci/service/src/Nodes/CircleReshape.cpp
new file mode 100644
index 000000000..07a81b306
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReshape.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleReshape *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleReshape>();
+ if (cloned != nullptr)
+ {
+ uint32_t rank = node->newShape()->rank();
+ cloned->newShape()->rank(rank);
+ for (uint32_t r = 0; r < rank; ++r)
+ {
+ cloned->newShape()->dim(r) = node->newShape()->dim(r);
+ }
+ }
+ return cloned;
+}
+
+} // namespace luci
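
CircleReshape is the one node in this series whose attribute is itself a small structure: newShape() carries a rank plus per-dimension values, so visit() copies it element by element instead of calling a single setter. The practical effect, sketched under the same assumptions as the test that follows:

  // Sketch: the copy loop in visit() above gives the clone an independent
  // newShape() with identical rank and dims.
  #include "luci/Service/CircleNodeClone.h"

  #include <cassert>

  void clone_reshape_example()
  {
    auto g = loco::make_graph();
    auto *reshape = g->nodes()->create<luci::CircleReshape>();
    reshape->newShape()->rank(2);
    reshape->newShape()->dim(0) = 3;
    reshape->newShape()->dim(1) = 4;

    auto gc = loco::make_graph();
    auto *copy = dynamic_cast<luci::CircleReshape *>(luci::clone_node(reshape, gc.get()));
    assert(copy != nullptr && copy->newShape()->rank() == 2);
    assert(copy->newShape()->dim(0) == 3 && copy->newShape()->dim(1) == 4);

    copy->newShape()->dim(0) = 5;             // mutating the clone ...
    assert(reshape->newShape()->dim(0) == 3); // ... leaves the original intact
  }
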
diff --git a/compiler/luci/service/src/Nodes/CircleReshape.test.cpp b/compiler/luci/service/src/Nodes/CircleReshape.test.cpp
new file mode 100644
index 000000000..ca92b717d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReshape.test.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Reshape)
+{
+ auto g = loco::make_graph();
+ auto node_reshape = g->nodes()->create<luci::CircleReshape>();
+ node_reshape->newShape()->rank(2);
+ node_reshape->newShape()->dim(0) = 3;
+ node_reshape->newShape()->dim(1) = 4;
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_reshape, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_reshape = dynamic_cast<luci::CircleReshape *>(cloned);
+ ASSERT_NE(nullptr, cloned_reshape);
+ ASSERT_EQ(node_reshape->newShape()->rank(), cloned_reshape->newShape()->rank());
+ ASSERT_EQ(node_reshape->newShape()->dim(0), cloned_reshape->newShape()->dim(0));
+ ASSERT_EQ(node_reshape->newShape()->dim(1), cloned_reshape->newShape()->dim(1));
+}
diff --git a/compiler/luci/service/src/Nodes/CircleResizeBilinear.cpp b/compiler/luci/service/src/Nodes/CircleResizeBilinear.cpp
new file mode 100644
index 000000000..55d21af45
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleResizeBilinear.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleResizeBilinear *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleResizeBilinear>();
+ if (cloned != nullptr)
+ {
+ cloned->align_corners(node->align_corners());
+ cloned->half_pixel_centers(node->half_pixel_centers());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleResizeBilinear.test.cpp b/compiler/luci/service/src/Nodes/CircleResizeBilinear.test.cpp
new file mode 100644
index 000000000..bff71261d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleResizeBilinear.test.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeRuleTest, resize_bilinear_simple)
+{
+ luci::CircleInput input;
+ luci::CircleConst rb_size;
+ luci::CircleResizeBilinear rb;
+
+ input.shape({1, 4, 4, 3});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ rb_size.dtype(loco::DataType::S32);
+ rb_size.rank(1);
+ rb_size.dim(0).set(2);
+ rb_size.size<loco::DataType::S32>(2);
+ rb_size.at<loco::DataType::S32>(0) = 16;
+ rb_size.at<loco::DataType::S32>(1) = 16;
+ rb_size.shape_status(luci::ShapeStatus::VALID);
+
+ rb.input(&input);
+ rb.size(&rb_size);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&rb, shape));
+ ASSERT_EQ(4, shape.rank());
+ ASSERT_EQ(1, shape.dim(0).value());
+ ASSERT_EQ(16, shape.dim(1).value());
+ ASSERT_EQ(16, shape.dim(2).value());
+ ASSERT_EQ(3, shape.dim(3).value());
+}
+
+TEST(CloneNodeTest, clone_ResizeBilinear)
+{
+ auto g = loco::make_graph();
+ auto node_rb = g->nodes()->create<luci::CircleResizeBilinear>();
+ node_rb->align_corners(true);
+ node_rb->half_pixel_centers(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rb, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rb = dynamic_cast<luci::CircleResizeBilinear *>(cloned);
+ ASSERT_NE(nullptr, cloned_rb);
+ ASSERT_EQ(node_rb->align_corners(), cloned_rb->align_corners());
+ ASSERT_EQ(node_rb->half_pixel_centers(), cloned_rb->half_pixel_centers());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleResizeNearestNeighbor.cpp b/compiler/luci/service/src/Nodes/CircleResizeNearestNeighbor.cpp
new file mode 100644
index 000000000..5727786a7
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleResizeNearestNeighbor.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleResizeNearestNeighbor *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleResizeNearestNeighbor>();
+ if (cloned != nullptr)
+ cloned->align_corners(node->align_corners());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleResizeNearestNeighbor.test.cpp b/compiler/luci/service/src/Nodes/CircleResizeNearestNeighbor.test.cpp
new file mode 100644
index 000000000..a1d781c65
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleResizeNearestNeighbor.test.cpp
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeRuleTest, resize_nearest_neighbor_simple)
+{
+ luci::CircleInput input;
+ luci::CircleConst rnn_size;
+ luci::CircleResizeNearestNeighbor rnn;
+
+ input.shape({1, 4, 4, 3});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ rnn_size.dtype(loco::DataType::S32);
+ rnn_size.rank(1);
+ rnn_size.dim(0).set(2);
+ rnn_size.size<loco::DataType::S32>(2);
+ rnn_size.at<loco::DataType::S32>(0) = 16;
+ rnn_size.at<loco::DataType::S32>(1) = 16;
+ rnn_size.shape_status(luci::ShapeStatus::VALID);
+
+ rnn.input(&input);
+ rnn.size(&rnn_size);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&rnn, shape));
+ ASSERT_EQ(4, shape.rank());
+ ASSERT_EQ(1, shape.dim(0).value());
+ ASSERT_EQ(16, shape.dim(1).value());
+ ASSERT_EQ(16, shape.dim(2).value());
+ ASSERT_EQ(3, shape.dim(3).value());
+}
+
+TEST(CloneNodeTest, clone_ResizeNearestNeighbor)
+{
+ auto g = loco::make_graph();
+ auto node_rnn = g->nodes()->create<luci::CircleResizeNearestNeighbor>();
+ node_rnn->align_corners(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rnn, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rnn = dynamic_cast<luci::CircleResizeNearestNeighbor *>(cloned);
+ ASSERT_NE(nullptr, cloned_rnn);
+ ASSERT_EQ(node_rnn->align_corners(), cloned_rnn->align_corners());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReverseSequence.cpp b/compiler/luci/service/src/Nodes/CircleReverseSequence.cpp
new file mode 100644
index 000000000..6e6919b0c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReverseSequence.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleReverseSequence *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleReverseSequence>();
+ if (cloned != nullptr)
+ {
+ cloned->seq_axis(node->seq_axis());
+ cloned->batch_axis(node->batch_axis());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleReverseSequence.test.cpp b/compiler/luci/service/src/Nodes/CircleReverseSequence.test.cpp
new file mode 100644
index 000000000..a7a8e3949
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReverseSequence.test.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ReverseSequence)
+{
+ auto g = loco::make_graph();
+ auto node_rs = g->nodes()->create<luci::CircleReverseSequence>();
+ node_rs->seq_axis(1);
+ node_rs->batch_axis(2);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rs, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rs = dynamic_cast<luci::CircleReverseSequence *>(cloned);
+ ASSERT_NE(nullptr, cloned_rs);
+ ASSERT_EQ(node_rs->seq_axis(), cloned_rs->seq_axis());
+ ASSERT_EQ(node_rs->batch_axis(), cloned_rs->batch_axis());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleReverseV2.cpp b/compiler/luci/service/src/Nodes/CircleReverseV2.cpp
new file mode 100644
index 000000000..e8fee6c3e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReverseV2.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleReverseV2 *)
+{
+ return _graph->nodes()->create<luci::CircleReverseV2>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleReverseV2.test.cpp b/compiler/luci/service/src/Nodes/CircleReverseV2.test.cpp
new file mode 100644
index 000000000..0e5ff933c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleReverseV2.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ReverseV2)
+{
+ auto g = loco::make_graph();
+ auto node_rev = g->nodes()->create<luci::CircleReverseV2>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rev, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rev = dynamic_cast<luci::CircleReverseV2 *>(cloned);
+ ASSERT_NE(nullptr, cloned_rev);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleRound.cpp b/compiler/luci/service/src/Nodes/CircleRound.cpp
new file mode 100644
index 000000000..2c23f2df6
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRound.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleRound *)
+{
+ return _graph->nodes()->create<luci::CircleRound>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleRound.test.cpp b/compiler/luci/service/src/Nodes/CircleRound.test.cpp
new file mode 100644
index 000000000..2c2c3a9d0
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRound.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Round)
+{
+ auto g = loco::make_graph();
+ auto node_rnd = g->nodes()->create<luci::CircleRound>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rnd, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rnd = dynamic_cast<luci::CircleRound *>(cloned);
+ ASSERT_NE(nullptr, cloned_rnd);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleRsqrt.cpp b/compiler/luci/service/src/Nodes/CircleRsqrt.cpp
new file mode 100644
index 000000000..aca702fe1
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRsqrt.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleRsqrt *)
+{
+ return _graph->nodes()->create<luci::CircleRsqrt>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleRsqrt.test.cpp b/compiler/luci/service/src/Nodes/CircleRsqrt.test.cpp
new file mode 100644
index 000000000..3e4ced562
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleRsqrt.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Rsqrt)
+{
+ auto g = loco::make_graph();
+ auto node_rsqrt = g->nodes()->create<luci::CircleRsqrt>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_rsqrt, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_rsqrt = dynamic_cast<luci::CircleRsqrt *>(cloned);
+ ASSERT_NE(nullptr, cloned_rsqrt);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleScatterNd.cpp b/compiler/luci/service/src/Nodes/CircleScatterNd.cpp
new file mode 100644
index 000000000..6c477a598
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleScatterNd.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleScatterNd *)
+{
+ return _graph->nodes()->create<luci::CircleScatterNd>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleScatterNd.test.cpp b/compiler/luci/service/src/Nodes/CircleScatterNd.test.cpp
new file mode 100644
index 000000000..ce63603cc
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleScatterNd.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ScatterNd)
+{
+ auto g = loco::make_graph();
+ auto node_snd = g->nodes()->create<luci::CircleScatterNd>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_snd, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_snd = dynamic_cast<luci::CircleScatterNd *>(cloned);
+ ASSERT_NE(nullptr, cloned_snd);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSegmentSum.cpp b/compiler/luci/service/src/Nodes/CircleSegmentSum.cpp
new file mode 100644
index 000000000..aa4001f57
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSegmentSum.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSegmentSum *)
+{
+ return _graph->nodes()->create<luci::CircleSegmentSum>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSegmentSum.test.cpp b/compiler/luci/service/src/Nodes/CircleSegmentSum.test.cpp
new file mode 100644
index 000000000..ff17b0745
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSegmentSum.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SegmentSum)
+{
+ auto g = loco::make_graph();
+ auto node_ss = g->nodes()->create<luci::CircleSegmentSum>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_ss, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_ss = dynamic_cast<luci::CircleSegmentSum *>(cloned);
+ ASSERT_NE(nullptr, cloned_ss);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSelect.cpp b/compiler/luci/service/src/Nodes/CircleSelect.cpp
new file mode 100644
index 000000000..71b31d33f
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSelect.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSelect *)
+{
+ return _graph->nodes()->create<luci::CircleSelect>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSelect.test.cpp b/compiler/luci/service/src/Nodes/CircleSelect.test.cpp
new file mode 100644
index 000000000..e8d631618
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSelect.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Select)
+{
+ auto g = loco::make_graph();
+ auto node_sel = g->nodes()->create<luci::CircleSelect>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sel, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sel = dynamic_cast<luci::CircleSelect *>(cloned);
+ ASSERT_NE(nullptr, cloned_sel);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSelectV2.cpp b/compiler/luci/service/src/Nodes/CircleSelectV2.cpp
new file mode 100644
index 000000000..07af40c40
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSelectV2.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSelectV2 *)
+{
+ return _graph->nodes()->create<luci::CircleSelectV2>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSelectV2.test.cpp b/compiler/luci/service/src/Nodes/CircleSelectV2.test.cpp
new file mode 100644
index 000000000..253dba555
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSelectV2.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SelectV2)
+{
+ auto g = loco::make_graph();
+ auto node_sel = g->nodes()->create<luci::CircleSelectV2>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sel, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sel = dynamic_cast<luci::CircleSelectV2 *>(cloned);
+ ASSERT_NE(nullptr, cloned_sel);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleShape.cpp b/compiler/luci/service/src/Nodes/CircleShape.cpp
new file mode 100644
index 000000000..e5b5fa28f
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleShape.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleShape *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleShape>();
+ if (cloned != nullptr)
+ cloned->out_type(node->out_type());
+ return cloned;
+}
+
+} // namespace luci
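
CircleShape is the first node in this batch whose visit() copies an attribute (out_type) onto the fresh node. Note what is not copied: operands. clone_node duplicates a single node, so the clone's input is left unset and must be re-wired by whoever clones the surrounding graph. A sketch of that caveat (the CircleShape::input() accessor is assumed from the luci IR, not shown in this diff):

auto g = loco::make_graph();
auto in = g->nodes()->create<luci::CircleInput>();
auto shape = g->nodes()->create<luci::CircleShape>();
shape->input(in); // accessor name assumed
shape->out_type(loco::DataType::S32);

auto gc = loco::make_graph();
auto cloned = dynamic_cast<luci::CircleShape *>(luci::clone_node(shape, gc.get()));
// cloned->out_type() is S32, but cloned->input() is still unset:
// connectivity is the caller's job, not the visitor's.
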
diff --git a/compiler/luci/service/src/Nodes/CircleShape.test.cpp b/compiler/luci/service/src/Nodes/CircleShape.test.cpp
new file mode 100644
index 000000000..ec057bd05
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleShape.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Shape)
+{
+ auto g = loco::make_graph();
+ auto node_shape = g->nodes()->create<luci::CircleShape>();
+ node_shape->out_type(loco::DataType::S32);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_shape, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_shape = dynamic_cast<luci::CircleShape *>(cloned);
+ ASSERT_NE(nullptr, cloned_shape);
+ ASSERT_EQ(node_shape->out_type(), cloned_shape->out_type());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSin.cpp b/compiler/luci/service/src/Nodes/CircleSin.cpp
new file mode 100644
index 000000000..46a07d21d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSin.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSin *)
+{
+ return _graph->nodes()->create<luci::CircleSin>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSin.test.cpp b/compiler/luci/service/src/Nodes/CircleSin.test.cpp
new file mode 100644
index 000000000..b072e7e2c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSin.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Sin)
+{
+ auto g = loco::make_graph();
+ auto node_sin = g->nodes()->create<luci::CircleSin>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sin, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sin = dynamic_cast<luci::CircleSin *>(cloned);
+ ASSERT_NE(nullptr, cloned_sin);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSlice.cpp b/compiler/luci/service/src/Nodes/CircleSlice.cpp
new file mode 100644
index 000000000..6b2f4a591
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSlice.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSlice *)
+{
+ return _graph->nodes()->create<luci::CircleSlice>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSlice.test.cpp b/compiler/luci/service/src/Nodes/CircleSlice.test.cpp
new file mode 100644
index 000000000..48ec20304
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSlice.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Slice)
+{
+ auto g = loco::make_graph();
+ auto node_slice = g->nodes()->create<luci::CircleSlice>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_slice, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_slice = dynamic_cast<luci::CircleSlice *>(cloned);
+ ASSERT_NE(nullptr, cloned_slice);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSoftmax.cpp b/compiler/luci/service/src/Nodes/CircleSoftmax.cpp
new file mode 100644
index 000000000..359d1000c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSoftmax.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSoftmax *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleSoftmax>();
+ if (cloned != nullptr)
+ cloned->beta(node->beta());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSoftmax.test.cpp b/compiler/luci/service/src/Nodes/CircleSoftmax.test.cpp
new file mode 100644
index 000000000..c80b44d69
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSoftmax.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Softmax)
+{
+ auto g = loco::make_graph();
+ auto node_sm = g->nodes()->create<luci::CircleSoftmax>();
+ node_sm->beta(2.3f);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sm, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sm = dynamic_cast<luci::CircleSoftmax *>(cloned);
+ ASSERT_NE(nullptr, cloned_sm);
+ ASSERT_EQ(node_sm->beta(), cloned_sm->beta());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSpaceToBatchND.cpp b/compiler/luci/service/src/Nodes/CircleSpaceToBatchND.cpp
new file mode 100644
index 000000000..feb4f3e37
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSpaceToBatchND.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSpaceToBatchND *)
+{
+ return _graph->nodes()->create<luci::CircleSpaceToBatchND>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSpaceToBatchND.test.cpp b/compiler/luci/service/src/Nodes/CircleSpaceToBatchND.test.cpp
new file mode 100644
index 000000000..eb743795d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSpaceToBatchND.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SpaceToBatchND)
+{
+ auto g = loco::make_graph();
+ auto node_s2bnd = g->nodes()->create<luci::CircleSpaceToBatchND>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_s2bnd, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_s2bnd = dynamic_cast<luci::CircleSpaceToBatchND *>(cloned);
+ ASSERT_NE(nullptr, cloned_s2bnd);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSpaceToDepth.cpp b/compiler/luci/service/src/Nodes/CircleSpaceToDepth.cpp
new file mode 100644
index 000000000..3a82f5c7a
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSpaceToDepth.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSpaceToDepth *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleSpaceToDepth>();
+ if (cloned != nullptr)
+ cloned->block_size(node->block_size());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSpaceToDepth.test.cpp b/compiler/luci/service/src/Nodes/CircleSpaceToDepth.test.cpp
new file mode 100644
index 000000000..fb544e6d7
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSpaceToDepth.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SpaceToDepth)
+{
+ auto g = loco::make_graph();
+ auto node_s2d = g->nodes()->create<luci::CircleSpaceToDepth>();
+ node_s2d->block_size(32);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_s2d, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_s2d = dynamic_cast<luci::CircleSpaceToDepth *>(cloned);
+ ASSERT_NE(nullptr, cloned_s2d);
+ ASSERT_EQ(node_s2d->block_size(), cloned_s2d->block_size());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSparseToDense.cpp b/compiler/luci/service/src/Nodes/CircleSparseToDense.cpp
new file mode 100644
index 000000000..3dba1a542
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSparseToDense.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSparseToDense *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleSparseToDense>();
+ if (cloned != nullptr)
+ cloned->validate_indices(node->validate_indices());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSparseToDense.test.cpp b/compiler/luci/service/src/Nodes/CircleSparseToDense.test.cpp
new file mode 100644
index 000000000..177a469cd
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSparseToDense.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SparseToDense)
+{
+ auto g = loco::make_graph();
+ auto node_s2d = g->nodes()->create<luci::CircleSparseToDense>();
+ node_s2d->validate_indices(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_s2d, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_s2d = dynamic_cast<luci::CircleSparseToDense *>(cloned);
+ ASSERT_NE(nullptr, cloned_s2d);
+ ASSERT_EQ(node_s2d->validate_indices(), cloned_s2d->validate_indices());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSplit.cpp b/compiler/luci/service/src/Nodes/CircleSplit.cpp
new file mode 100644
index 000000000..e68a24a1f
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSplit.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSplit *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleSplit>();
+ if (cloned != nullptr)
+ cloned->num_split(node->num_split());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSplit.test.cpp b/compiler/luci/service/src/Nodes/CircleSplit.test.cpp
new file mode 100644
index 000000000..9ee26b425
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSplit.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Split)
+{
+ auto g = loco::make_graph();
+ auto node_split = g->nodes()->create<luci::CircleSplit>();
+ node_split->num_split(5);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_split, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_split = dynamic_cast<luci::CircleSplit *>(cloned);
+ ASSERT_NE(nullptr, cloned_split);
+ ASSERT_EQ(node_split->num_split(), cloned_split->num_split());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSplitOut.cpp b/compiler/luci/service/src/Nodes/CircleSplitOut.cpp
new file mode 100644
index 000000000..024598892
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSplitOut.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSplitOut *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleSplitOut>();
+ if (cloned != nullptr)
+ cloned->index(node->index());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSplitOut.test.cpp b/compiler/luci/service/src/Nodes/CircleSplitOut.test.cpp
new file mode 100644
index 000000000..deec08804
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSplitOut.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SplitOut)
+{
+ auto g = loco::make_graph();
+ auto node_sout = g->nodes()->create<luci::CircleSplitOut>();
+ node_sout->index(1);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sout, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sout = dynamic_cast<luci::CircleSplitOut *>(cloned);
+ ASSERT_NE(nullptr, cloned_sout);
+ ASSERT_EQ(node_sout->index(), cloned_sout->index());
+}
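
CircleSplitOut is one of the virtual output nodes luci uses to give a multi-result operator one IR node per result: index() selects which of the producer's num_split() results the node stands for, and that is the only attribute the clone needs to carry. A sketch of the producer/out pairing as it would appear in a graph (the input() wiring accessor is assumed from the luci IR):

auto g = loco::make_graph();
auto split = g->nodes()->create<luci::CircleSplit>();
split->num_split(2);

// One CircleSplitOut per result; cloning preserves index(), so the
// k-th output stays the k-th output in the cloned graph.
for (int32_t k = 0; k < 2; ++k)
{
  auto out = g->nodes()->create<luci::CircleSplitOut>();
  out->input(split); // accessor name assumed
  out->index(k);
}
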
diff --git a/compiler/luci/service/src/Nodes/CircleSplitV.cpp b/compiler/luci/service/src/Nodes/CircleSplitV.cpp
new file mode 100644
index 000000000..de6c6cce6
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSplitV.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSplitV *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleSplitV>();
+ if (cloned != nullptr)
+ cloned->num_split(node->num_split());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSplitV.test.cpp b/compiler/luci/service/src/Nodes/CircleSplitV.test.cpp
new file mode 100644
index 000000000..d109a64aa
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSplitV.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SplitV)
+{
+ auto g = loco::make_graph();
+ auto node_split = g->nodes()->create<luci::CircleSplitV>();
+ node_split->num_split(5);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_split, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_split = dynamic_cast<luci::CircleSplitV *>(cloned);
+ ASSERT_NE(nullptr, cloned_split);
+ ASSERT_EQ(node_split->num_split(), cloned_split->num_split());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSplitVOut.cpp b/compiler/luci/service/src/Nodes/CircleSplitVOut.cpp
new file mode 100644
index 000000000..f40eb0a47
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSplitVOut.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSplitVOut *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleSplitVOut>();
+ if (cloned != nullptr)
+ cloned->index(node->index());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSplitVOut.test.cpp b/compiler/luci/service/src/Nodes/CircleSplitVOut.test.cpp
new file mode 100644
index 000000000..ab5e9d6be
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSplitVOut.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SplitVOut)
+{
+ auto g = loco::make_graph();
+ auto node_sout = g->nodes()->create<luci::CircleSplitVOut>();
+ node_sout->index(1);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sout, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sout = dynamic_cast<luci::CircleSplitVOut *>(cloned);
+ ASSERT_NE(nullptr, cloned_sout);
+ ASSERT_EQ(node_sout->index(), cloned_sout->index());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSqrt.cpp b/compiler/luci/service/src/Nodes/CircleSqrt.cpp
new file mode 100644
index 000000000..a3e63684b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSqrt.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSqrt *)
+{
+ return _graph->nodes()->create<luci::CircleSqrt>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSqrt.test.cpp b/compiler/luci/service/src/Nodes/CircleSqrt.test.cpp
new file mode 100644
index 000000000..dbef839d6
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSqrt.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Sqrt)
+{
+ auto g = loco::make_graph();
+ auto node_sqrt = g->nodes()->create<luci::CircleSqrt>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sqrt, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sqrt = dynamic_cast<luci::CircleSqrt *>(cloned);
+ ASSERT_NE(nullptr, cloned_sqrt);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSquare.cpp b/compiler/luci/service/src/Nodes/CircleSquare.cpp
new file mode 100644
index 000000000..88bbed76c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSquare.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSquare *)
+{
+ return _graph->nodes()->create<luci::CircleSquare>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSquare.test.cpp b/compiler/luci/service/src/Nodes/CircleSquare.test.cpp
new file mode 100644
index 000000000..67ac21210
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSquare.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Square)
+{
+ auto g = loco::make_graph();
+ auto node_squ = g->nodes()->create<luci::CircleSquare>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_squ, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_squ = dynamic_cast<luci::CircleSquare *>(cloned);
+ ASSERT_NE(nullptr, cloned_squ);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSquaredDifference.cpp b/compiler/luci/service/src/Nodes/CircleSquaredDifference.cpp
new file mode 100644
index 000000000..6becdf1c9
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSquaredDifference.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSquaredDifference *)
+{
+ return _graph->nodes()->create<luci::CircleSquaredDifference>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSquaredDifference.test.cpp b/compiler/luci/service/src/Nodes/CircleSquaredDifference.test.cpp
new file mode 100644
index 000000000..26099612b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSquaredDifference.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_SquaredDifference)
+{
+ auto g = loco::make_graph();
+ auto node_sd = g->nodes()->create<luci::CircleSquaredDifference>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sd, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sd = dynamic_cast<luci::CircleSquaredDifference *>(cloned);
+ ASSERT_NE(nullptr, cloned_sd);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSqueeze.cpp b/compiler/luci/service/src/Nodes/CircleSqueeze.cpp
new file mode 100644
index 000000000..02ba5020c
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSqueeze.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSqueeze *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleSqueeze>();
+ if (cloned != nullptr)
+ cloned->squeeze_dims(node->squeeze_dims());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSqueeze.test.cpp b/compiler/luci/service/src/Nodes/CircleSqueeze.test.cpp
new file mode 100644
index 000000000..bc73eafa7
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSqueeze.test.cpp
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeRuleTest, squeeze_simple)
+{
+ luci::CircleInput input;
+ luci::CircleSqueeze squeeze;
+
+ input.shape({1, 4, 3, 1});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ squeeze.input(&input);
+ squeeze.squeeze_dims({0});
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&squeeze, shape));
+ ASSERT_EQ(3, shape.rank());
+ ASSERT_EQ(4, shape.dim(0).value());
+ ASSERT_EQ(3, shape.dim(1).value());
+ ASSERT_EQ(1, shape.dim(2).value());
+}
+
+TEST(ShapeRuleTest, squeeze_all)
+{
+ luci::CircleInput input;
+ luci::CircleSqueeze squeeze;
+
+ input.shape({1, 4, 3, 1});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ squeeze.input(&input);
+ squeeze.squeeze_dims({});
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&squeeze, shape));
+ ASSERT_EQ(2, shape.rank());
+ ASSERT_EQ(4, shape.dim(0).value());
+ ASSERT_EQ(3, shape.dim(1).value());
+}
+
+TEST(CloneNodeTest, clone_Squeeze)
+{
+ auto g = loco::make_graph();
+ auto node_squ = g->nodes()->create<luci::CircleSqueeze>();
+ node_squ->squeeze_dims({2, 3});
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_squ, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_squ = dynamic_cast<luci::CircleSqueeze *>(cloned);
+ ASSERT_NE(nullptr, cloned_squ);
+ ASSERT_EQ(node_squ->squeeze_dims().size(), cloned_squ->squeeze_dims().size());
+ for (size_t s = 0; s < node_squ->squeeze_dims().size(); ++s)
+ ASSERT_EQ(node_squ->squeeze_dims().at(s), cloned_squ->squeeze_dims().at(s));
+}
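
The two ShapeRuleTest cases above pin down the squeeze semantics the shape rule implements; read directly off the assertions:

// Worked shapes, as asserted above:
//   input {1, 4, 3, 1}, squeeze_dims {0} -> {4, 3, 1}  (only axis 0 removed)
//   input {1, 4, 3, 1}, squeeze_dims {}  -> {4, 3}     (every size-1 axis removed)
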
diff --git a/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp b/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp
new file mode 100644
index 000000000..c4d199316
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleStridedSlice.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleStridedSlice *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleStridedSlice>();
+ if (cloned != nullptr)
+ {
+ cloned->begin_mask(node->begin_mask());
+ cloned->end_mask(node->end_mask());
+ cloned->ellipsis_mask(node->ellipsis_mask());
+ cloned->new_axis_mask(node->new_axis_mask());
+ cloned->shrink_axis_mask(node->shrink_axis_mask());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleStridedSlice.test.cpp b/compiler/luci/service/src/Nodes/CircleStridedSlice.test.cpp
new file mode 100644
index 000000000..d633f3022
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleStridedSlice.test.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_StridedSlice)
+{
+ auto g = loco::make_graph();
+ auto node_ss = g->nodes()->create<luci::CircleStridedSlice>();
+ node_ss->begin_mask(1);
+ node_ss->end_mask(2);
+ node_ss->ellipsis_mask(3);
+ node_ss->new_axis_mask(4);
+ node_ss->shrink_axis_mask(5);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_ss, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_ss = dynamic_cast<luci::CircleStridedSlice *>(cloned);
+ ASSERT_NE(nullptr, cloned_ss);
+ ASSERT_EQ(node_ss->begin_mask(), cloned_ss->begin_mask());
+ ASSERT_EQ(node_ss->end_mask(), cloned_ss->end_mask());
+ ASSERT_EQ(node_ss->ellipsis_mask(), cloned_ss->ellipsis_mask());
+ ASSERT_EQ(node_ss->new_axis_mask(), cloned_ss->new_axis_mask());
+ ASSERT_EQ(node_ss->shrink_axis_mask(), cloned_ss->shrink_axis_mask());
+}
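
StridedSlice is the one operator here with five attributes. To the clone they are plain integers, though in the underlying TFLite semantics each is a per-axis bitmask; that reading is stated below as assumed context, not something this diff establishes:

// TFLite-style reading of the copied attributes (assumed context):
//   begin_mask bit i set   -> ignore begin[i], start axis i at its beginning
//   end_mask bit i set     -> ignore end[i], run axis i to its end
//   shrink_axis_mask bit i -> drop axis i from the result rank
// The clone only round-trips the integers, so 1..5 work as arbitrary
// test values.
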
diff --git a/compiler/luci/service/src/Nodes/CircleSub.cpp b/compiler/luci/service/src/Nodes/CircleSub.cpp
new file mode 100644
index 000000000..fb4bab19a
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSub.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleSub *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleSub>();
+ if (cloned != nullptr)
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSub.test.cpp b/compiler/luci/service/src/Nodes/CircleSub.test.cpp
new file mode 100644
index 000000000..e6bd7b8ff
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSub.test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Sub)
+{
+ auto g = loco::make_graph();
+ auto node_sub = g->nodes()->create<luci::CircleSub>();
+ node_sub->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sub, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sub = dynamic_cast<luci::CircleSub *>(cloned);
+ ASSERT_NE(nullptr, cloned_sub);
+ ASSERT_EQ(node_sub->fusedActivationFunction(), cloned_sub->fusedActivationFunction());
+}
+
+TEST(CloneNodeTest, clone_Sub_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_sub = g->nodes()->create<luci::CircleSub>();
+ node_sub->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sub, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleSum.cpp b/compiler/luci/service/src/Nodes/CircleSum.cpp
index 9ef90e8e0..29e6ee5f1 100644
--- a/compiler/luci/service/src/Nodes/CircleSum.cpp
+++ b/compiler/luci/service/src/Nodes/CircleSum.cpp
@@ -1,11 +1,11 @@
/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
- * http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
@@ -14,15 +14,17 @@
* limitations under the License.
*/
-#include <luci/Service/CircleShapeSignatureInference.h>
+#include "CircleCloneNode.h"
namespace luci
{
-ShapeSignature ssinf::Algorithm::visit(const luci::CircleSum *node)
+luci::CircleNode *CloneNode::visit(const luci::CircleSum *node)
{
- return legalized_signature(
- reduced_signature(node->input(), node->reduction_indices(), node->keep_dims()));
+ auto *cloned = _graph->nodes()->create<luci::CircleSum>();
+ if (cloned != nullptr)
+ cloned->keep_dims(node->keep_dims());
+ return cloned;
}
} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleSum.test.cpp b/compiler/luci/service/src/Nodes/CircleSum.test.cpp
new file mode 100644
index 000000000..aa1b0d128
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleSum.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Sum)
+{
+ auto g = loco::make_graph();
+ auto node_sum = g->nodes()->create<luci::CircleSum>();
+ node_sum->keep_dims(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_sum, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_sum = dynamic_cast<luci::CircleSum *>(cloned);
+ ASSERT_NE(nullptr, cloned_sum);
+ ASSERT_EQ(node_sum->keep_dims(), cloned_sum->keep_dims());
+}
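
For reference, keep_dims is the one attribute Sum must carry across a clone because it changes the inferred output rank. A worked example:

    // keep_dims on a Sum reduction (sketch, values for illustration only):
    //   input shape [2, 3, 4], reduction_indices = {1}
    //   keep_dims = false -> output shape [2, 4]
    //   keep_dims = true  -> output shape [2, 1, 4]
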
diff --git a/compiler/luci/service/src/Nodes/CircleTanh.cpp b/compiler/luci/service/src/Nodes/CircleTanh.cpp
new file mode 100644
index 000000000..9cb35932f
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTanh.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleTanh *)
+{
+ return _graph->nodes()->create<luci::CircleTanh>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleTanh.test.cpp b/compiler/luci/service/src/Nodes/CircleTanh.test.cpp
new file mode 100644
index 000000000..0215b42ca
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTanh.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Tanh)
+{
+ auto g = loco::make_graph();
+ auto node_tanh = g->nodes()->create<luci::CircleTanh>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_tanh, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_tanh = dynamic_cast<luci::CircleTanh *>(cloned);
+ ASSERT_NE(nullptr, cloned_tanh);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleTile.cpp b/compiler/luci/service/src/Nodes/CircleTile.cpp
new file mode 100644
index 000000000..21c32e021
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTile.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleTile *)
+{
+ return _graph->nodes()->create<luci::CircleTile>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleTile.test.cpp b/compiler/luci/service/src/Nodes/CircleTile.test.cpp
new file mode 100644
index 000000000..089c86ccb
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTile.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Tile)
+{
+ auto g = loco::make_graph();
+ auto node_tile = g->nodes()->create<luci::CircleTile>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_tile, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_tile = dynamic_cast<luci::CircleTile *>(cloned);
+ ASSERT_NE(nullptr, cloned_tile);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleTopKV2.cpp b/compiler/luci/service/src/Nodes/CircleTopKV2.cpp
new file mode 100644
index 000000000..e940c03dd
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTopKV2.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleTopKV2 *)
+{
+ return _graph->nodes()->create<luci::CircleTopKV2>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleTopKV2.test.cpp b/compiler/luci/service/src/Nodes/CircleTopKV2.test.cpp
new file mode 100644
index 000000000..7f68a408d
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTopKV2.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_TopKV2)
+{
+ auto g = loco::make_graph();
+ auto node_top = g->nodes()->create<luci::CircleTopKV2>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_top, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_top = dynamic_cast<luci::CircleTopKV2 *>(cloned);
+ ASSERT_NE(nullptr, cloned_top);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleTopKV2Out.cpp b/compiler/luci/service/src/Nodes/CircleTopKV2Out.cpp
new file mode 100644
index 000000000..5c13f2be1
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTopKV2Out.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleTopKV2Out *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleTopKV2Out>();
+ if (cloned != nullptr)
+ cloned->index(node->index());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleTopKV2Out.test.cpp b/compiler/luci/service/src/Nodes/CircleTopKV2Out.test.cpp
new file mode 100644
index 000000000..cfba61f10
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTopKV2Out.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_TopKV2Out)
+{
+ auto g = loco::make_graph();
+ auto node_tout = g->nodes()->create<luci::CircleTopKV2Out>();
+ node_tout->index(1);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_tout, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_tout = dynamic_cast<luci::CircleTopKV2Out *>(cloned);
+ ASSERT_NE(nullptr, cloned_tout);
+ ASSERT_EQ(node_tout->index(), cloned_tout->index());
+}
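
TopKV2Out is one of several virtual nodes (UniqueOut and UnpackOut below follow suit) that stand for a single output of a multi-output operator, so the only state a clone must carry is index(), the position of the output it selects. Illustrated as a sketch:

    // Sketch: a two-output TopKV2 is modeled as one op node plus two *Out nodes.
    //   CircleTopKV2       -> produces (values, indices)
    //   CircleTopKV2Out #0 -> selects values  (index() == 0)
    //   CircleTopKV2Out #1 -> selects indices (index() == 1)
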
diff --git a/compiler/luci/service/src/Nodes/CircleTranspose.cpp b/compiler/luci/service/src/Nodes/CircleTranspose.cpp
new file mode 100644
index 000000000..81db55269
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTranspose.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleTranspose *)
+{
+ return _graph->nodes()->create<luci::CircleTranspose>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleTranspose.test.cpp b/compiler/luci/service/src/Nodes/CircleTranspose.test.cpp
new file mode 100644
index 000000000..9447d1a5b
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTranspose.test.cpp
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/Service/CircleShapeInference.h>
+
+#include <loco/IR/TensorShape.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeRuleTest, transpose_simple)
+{
+ luci::CircleInput input;
+ luci::CircleConst perm;
+ luci::CircleTranspose transpose;
+
+ input.shape({3, 8, 1});
+ input.shape_status(luci::ShapeStatus::VALID);
+
+ perm.dtype(loco::DataType::S32);
+ perm.rank(1);
+ perm.dim(0).set(3);
+ perm.size<loco::DataType::S32>(3);
+ perm.at<loco::DataType::S32>(0) = 1;
+ perm.at<loco::DataType::S32>(1) = 2;
+ perm.at<loco::DataType::S32>(2) = 0;
+ perm.shape_status(luci::ShapeStatus::VALID);
+
+ transpose.a(&input);
+ transpose.perm(&perm);
+
+ loco::TensorShape shape;
+ luci::sinf::Rule shape_inf_rule;
+
+ ASSERT_TRUE(shape_inf_rule.infer(&transpose, shape));
+ ASSERT_EQ(3, shape.rank());
+ ASSERT_EQ(8, shape.dim(0).value());
+ ASSERT_EQ(1, shape.dim(1).value());
+ ASSERT_EQ(3, shape.dim(2).value());
+}
+
+TEST(CloneNodeTest, clone_Transpose)
+{
+ auto g = loco::make_graph();
+ auto node_tr = g->nodes()->create<luci::CircleTranspose>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_tr, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_tr = dynamic_cast<luci::CircleTranspose *>(cloned);
+ ASSERT_NE(nullptr, cloned_tr);
+}
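
The transpose_simple assertions can be checked by hand: output dimension i of a Transpose is input dimension perm[i], so input {3, 8, 1} with perm = [1, 2, 0] gives {8, 1, 3}. A self-contained sketch of that rule, independent of the luci types:

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Sketch only: applies a permutation the way the Transpose shape rule does.
    template <std::size_t N>
    std::array<uint32_t, N> permute_shape(const std::array<uint32_t, N> &in,
                                          const std::array<std::size_t, N> &perm)
    {
      std::array<uint32_t, N> out{};
      for (std::size_t i = 0; i < N; ++i)
        out[i] = in[perm[i]]; // out[i] takes the size of input axis perm[i]
      return out;
    }

    // permute_shape<3>({3, 8, 1}, {1, 2, 0}) yields {8, 1, 3}
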
diff --git a/compiler/luci/service/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/service/src/Nodes/CircleTransposeConv.cpp
new file mode 100644
index 000000000..1fe41bdb2
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTransposeConv.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleTransposeConv *node)
+{
+ if (node->padding() == luci::Padding::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleTransposeConv>();
+ if (cloned != nullptr)
+ {
+ cloned->padding(node->padding());
+ cloned->stride()->h(node->stride()->h());
+ cloned->stride()->w(node->stride()->w());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleTransposeConv.test.cpp b/compiler/luci/service/src/Nodes/CircleTransposeConv.test.cpp
new file mode 100644
index 000000000..29a656c03
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleTransposeConv.test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_TransposeConv)
+{
+ auto g = loco::make_graph();
+ auto node_trconv = g->nodes()->create<luci::CircleTransposeConv>();
+ node_trconv->padding(luci::Padding::SAME);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_trconv, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_trconv = dynamic_cast<luci::CircleTransposeConv *>(cloned);
+ ASSERT_NE(nullptr, cloned_trconv);
+ ASSERT_EQ(node_trconv->padding(), cloned_trconv->padding());
+}
+
+TEST(CloneNodeTest, clone_TransposeConv_padding_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_trconv = g->nodes()->create<luci::CircleTransposeConv>();
+ node_trconv->padding(luci::Padding::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_trconv, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
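
As with CircleSub earlier, operators whose required enum attribute may be UNDEFINED guard the clone and return nullptr rather than copy an invalid value, and the *_NEG tests pin that contract down. A caller is expected to treat the nullptr as a failed clone (sketch, assuming a caller outside this patch):

    // Sketch only: caller-side handling of the nullptr contract.
    auto cloned = luci::clone_node(node, graph);
    if (cloned == nullptr)
      return false; // source node carried an UNDEFINED attribute
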
diff --git a/compiler/luci/service/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp b/compiler/luci/service/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
new file mode 100644
index 000000000..12205f3b0
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUnidirectionalSequenceLSTM.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleUnidirectionalSequenceLSTM *node)
+{
+ if (node->fusedActivationFunction() == luci::FusedActFunc::UNDEFINED)
+ return nullptr;
+
+ auto *cloned = _graph->nodes()->create<luci::CircleUnidirectionalSequenceLSTM>();
+ if (cloned != nullptr)
+ {
+ cloned->fusedActivationFunction(node->fusedActivationFunction());
+ cloned->cell_clip(node->cell_clip());
+ cloned->proj_clip(node->proj_clip());
+ cloned->time_major(node->time_major());
+ cloned->asymmetric_quantize_inputs(node->asymmetric_quantize_inputs());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp b/compiler/luci/service/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp
new file mode 100644
index 000000000..c3816ab27
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUnidirectionalSequenceLSTM.test.cpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_UnidirectionalSequenceLSTM)
+{
+ auto g = loco::make_graph();
+ auto node_uslstm = g->nodes()->create<luci::CircleUnidirectionalSequenceLSTM>();
+ node_uslstm->fusedActivationFunction(luci::FusedActFunc::RELU);
+ node_uslstm->cell_clip(1.1f);
+ node_uslstm->proj_clip(2.2f);
+ node_uslstm->time_major(true);
+ node_uslstm->asymmetric_quantize_inputs(true);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_uslstm, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_uslstm = dynamic_cast<luci::CircleUnidirectionalSequenceLSTM *>(cloned);
+ ASSERT_NE(nullptr, cloned_uslstm);
+ ASSERT_EQ(node_uslstm->fusedActivationFunction(), cloned_uslstm->fusedActivationFunction());
+ ASSERT_EQ(node_uslstm->cell_clip(), cloned_uslstm->cell_clip());
+ ASSERT_EQ(node_uslstm->proj_clip(), cloned_uslstm->proj_clip());
+ ASSERT_EQ(node_uslstm->time_major(), cloned_uslstm->time_major());
+ ASSERT_EQ(node_uslstm->asymmetric_quantize_inputs(), cloned_uslstm->asymmetric_quantize_inputs());
+}
+
+TEST(CloneNodeTest, clone_UnidirectionalSequenceLSTM_NEG)
+{
+ auto g = loco::make_graph();
+ auto node_uslstm = g->nodes()->create<luci::CircleUnidirectionalSequenceLSTM>();
+ node_uslstm->fusedActivationFunction(luci::FusedActFunc::UNDEFINED);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_uslstm, gc.get());
+ ASSERT_EQ(nullptr, cloned);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleUnique.cpp b/compiler/luci/service/src/Nodes/CircleUnique.cpp
new file mode 100644
index 000000000..bde2ea0dc
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUnique.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleUnique *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleUnique>();
+ if (cloned != nullptr)
+ cloned->idx_out_type(node->idx_out_type());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleUnique.test.cpp b/compiler/luci/service/src/Nodes/CircleUnique.test.cpp
new file mode 100644
index 000000000..a8ff9eade
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUnique.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Unique)
+{
+ auto g = loco::make_graph();
+ auto node_uniq = g->nodes()->create<luci::CircleUnique>();
+ node_uniq->idx_out_type(loco::DataType::S32);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_uniq, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_uniq = dynamic_cast<luci::CircleUnique *>(cloned);
+ ASSERT_NE(nullptr, cloned_uniq);
+ ASSERT_EQ(node_uniq->idx_out_type(), cloned_uniq->idx_out_type());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleUniqueOut.cpp b/compiler/luci/service/src/Nodes/CircleUniqueOut.cpp
new file mode 100644
index 000000000..30093f9db
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUniqueOut.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleUniqueOut *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleUniqueOut>();
+ if (cloned != nullptr)
+ cloned->index(node->index());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleUniqueOut.test.cpp b/compiler/luci/service/src/Nodes/CircleUniqueOut.test.cpp
new file mode 100644
index 000000000..780ad4b78
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUniqueOut.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_UniqueOut)
+{
+ auto g = loco::make_graph();
+ auto node_uout = g->nodes()->create<luci::CircleUniqueOut>();
+ node_uout->index(1);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_uout, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_uout = dynamic_cast<luci::CircleUniqueOut *>(cloned);
+ ASSERT_NE(nullptr, cloned_uout);
+ ASSERT_EQ(node_uout->index(), cloned_uout->index());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleUnpack.cpp b/compiler/luci/service/src/Nodes/CircleUnpack.cpp
new file mode 100644
index 000000000..f9d61c426
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUnpack.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleUnpack *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleUnpack>();
+ if (cloned != nullptr)
+ {
+ cloned->num(node->num());
+ cloned->axis(node->axis());
+ }
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleUnpack.test.cpp b/compiler/luci/service/src/Nodes/CircleUnpack.test.cpp
new file mode 100644
index 000000000..6559a9276
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUnpack.test.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Unpack)
+{
+ auto g = loco::make_graph();
+ auto node_unp = g->nodes()->create<luci::CircleUnpack>();
+ node_unp->num(1);
+ node_unp->axis(2);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_unp, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_unp = dynamic_cast<luci::CircleUnpack *>(cloned);
+ ASSERT_NE(nullptr, cloned_unp);
+ ASSERT_EQ(node_unp->num(), cloned_unp->num());
+ ASSERT_EQ(node_unp->axis(), cloned_unp->axis());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleUnpackOut.cpp b/compiler/luci/service/src/Nodes/CircleUnpackOut.cpp
new file mode 100644
index 000000000..342d5daca
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUnpackOut.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleUnpackOut *node)
+{
+ auto *cloned = _graph->nodes()->create<luci::CircleUnpackOut>();
+ if (cloned != nullptr)
+ cloned->index(node->index());
+ return cloned;
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleUnpackOut.test.cpp b/compiler/luci/service/src/Nodes/CircleUnpackOut.test.cpp
new file mode 100644
index 000000000..ec9bb974e
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleUnpackOut.test.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_UnpackOut)
+{
+ auto g = loco::make_graph();
+ auto node_uout = g->nodes()->create<luci::CircleUnpackOut>();
+ node_uout->index(1);
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_uout, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_uout = dynamic_cast<luci::CircleUnpackOut *>(cloned);
+ ASSERT_NE(nullptr, cloned_uout);
+ ASSERT_EQ(node_uout->index(), cloned_uout->index());
+}
diff --git a/compiler/luci/service/src/Nodes/CircleWhere.cpp b/compiler/luci/service/src/Nodes/CircleWhere.cpp
new file mode 100644
index 000000000..73f4b64ac
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleWhere.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleWhere *)
+{
+ return _graph->nodes()->create<luci::CircleWhere>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleWhere.test.cpp b/compiler/luci/service/src/Nodes/CircleWhere.test.cpp
new file mode 100644
index 000000000..352719d85
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleWhere.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_Where)
+{
+ auto g = loco::make_graph();
+ auto node_wh = g->nodes()->create<luci::CircleWhere>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_wh, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_wh = dynamic_cast<luci::CircleWhere *>(cloned);
+ ASSERT_NE(nullptr, cloned_wh);
+}
diff --git a/compiler/luci/service/src/Nodes/CircleZerosLike.cpp b/compiler/luci/service/src/Nodes/CircleZerosLike.cpp
new file mode 100644
index 000000000..2ee455857
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleZerosLike.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "CircleCloneNode.h"
+
+namespace luci
+{
+
+luci::CircleNode *CloneNode::visit(const luci::CircleZerosLike *)
+{
+ return _graph->nodes()->create<luci::CircleZerosLike>();
+}
+
+} // namespace luci
diff --git a/compiler/luci/service/src/Nodes/CircleZerosLike.test.cpp b/compiler/luci/service/src/Nodes/CircleZerosLike.test.cpp
new file mode 100644
index 000000000..6e0a4b3be
--- /dev/null
+++ b/compiler/luci/service/src/Nodes/CircleZerosLike.test.cpp
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/CircleNodeClone.h"
+
+#include <gtest/gtest.h>
+
+TEST(CloneNodeTest, clone_ZerosLike)
+{
+ auto g = loco::make_graph();
+ auto node_zl = g->nodes()->create<luci::CircleZerosLike>();
+
+ auto gc = loco::make_graph();
+ auto cloned = luci::clone_node(node_zl, gc.get());
+ ASSERT_NE(nullptr, cloned);
+ ASSERT_EQ(gc.get(), cloned->graph());
+
+ auto cloned_zl = dynamic_cast<luci::CircleZerosLike *>(cloned);
+ ASSERT_NE(nullptr, cloned_zl);
+}
diff --git a/compiler/luci/service/src/ShapeDescription.cpp b/compiler/luci/service/src/ShapeDescription.cpp
index 01a638f8f..adfb7e342 100644
--- a/compiler/luci/service/src/ShapeDescription.cpp
+++ b/compiler/luci/service/src/ShapeDescription.cpp
@@ -31,7 +31,7 @@ ShapeDescription to_shape_description(const luci::CircleNode *circle_node)
res._dims.resize(circle_node->rank());
for (uint32_t i = 0; i < circle_node->rank(); ++i)
- res._dims.at(i) = circle_node->dim(i).value();
+ res._dims.at(i) = circle_node->dim(i).known() ? circle_node->dim(i).value() : -1;
return res;
}
@@ -53,95 +53,12 @@ ShapeDescription to_shape_description(const loco::TensorShape &shape)
return res;
}
-ShapeDescription to_shape_description(const loco::FeatureShape &shape)
-{
- ShapeDescription res;
-
- res._rank_known = true;
-
- // T/F Lite encodes a feature map as a NHWC tensor
- res._dims.resize(4);
- res._dims.at(0) = shape.count().value();
- res._dims.at(1) = shape.height().value();
- res._dims.at(2) = shape.width().value();
- res._dims.at(3) = shape.depth().value();
-
- return res;
-}
-
-ShapeDescription to_shape_description(const loco::FilterShape &shape)
-{
- ShapeDescription res;
-
- res._rank_known = true;
-
- // T/F Lite encodes a convolution filter as a NHWC tensor
- res._dims.resize(4);
- res._dims.at(0) = shape.count().value();
- res._dims.at(1) = shape.height().value();
- res._dims.at(2) = shape.width().value();
- res._dims.at(3) = shape.depth().value();
-
- return res;
-}
-
-ShapeDescription to_shape_description(const loco::DepthwiseFilterShape &shape)
-{
- ShapeDescription res;
-
- res._rank_known = true;
-
- // T/F Lite encodes a depthwise convolution filter as a [1, H, W, C*M] tensor
- res._dims.resize(4);
- res._dims.at(0) = 1;
- res._dims.at(1) = shape.height().value();
- res._dims.at(2) = shape.width().value();
- res._dims.at(3) = shape.depth().value() * shape.multiplier().value();
-
- return res;
-}
-
-ShapeDescription to_shape_description(const loco::BiasShape &shape)
-{
- ShapeDescription res;
-
- res._rank_known = true;
-
- res._dims.resize(1);
- res._dims.at(0) = shape.length().value();
-
- return res;
-}
-
-ShapeDescription to_shape_description(const loco::MatrixShape &shape)
-{
- ShapeDescription res;
-
- res._rank_known = true;
-
- res._dims.resize(2);
- res._dims.at(0) = shape.height().value();
- res._dims.at(1) = shape.width().value();
-
- return res;
-}
-
ShapeDescription to_shape_description(const loco::NodeShape &shape)
{
switch (shape.domain())
{
case loco::Domain::Tensor:
return to_shape_description(shape.as<loco::TensorShape>());
- case loco::Domain::Feature:
- return to_shape_description(shape.as<loco::FeatureShape>());
- case loco::Domain::Filter:
- return to_shape_description(shape.as<loco::FilterShape>());
- case loco::Domain::DepthwiseFilter:
- return to_shape_description(shape.as<loco::DepthwiseFilterShape>());
- case loco::Domain::Bias:
- return to_shape_description(shape.as<loco::BiasShape>());
- case loco::Domain::Matrix:
- return to_shape_description(shape.as<loco::MatrixShape>());
default:
break;
}
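
Two things change in this file: unknown dimensions are now encoded as -1 (the T/F Lite convention for a dynamic dimension) instead of calling value() on an unknown loco::Dimension, and the overloads for the non-tensor domains (Feature, Filter, DepthwiseFilter, Bias, Matrix) are dropped, so those domains now reach the default branch and end in the error path that the BiasShape_NEG test below expects to throw. A minimal sketch of the dimension mapping, assuming only loco's known()/value() semantics:

    #include <cstdint>
    #include <loco/IR/Dimension.h>

    // Sketch only: mirrors the known()/value() handling added above.
    int32_t to_signed_dim(const loco::Dimension &dim)
    {
      return dim.known() ? static_cast<int32_t>(dim.value()) : -1; // -1 = dynamic
    }
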
diff --git a/compiler/luci/service/src/ShapeDescription.test.cpp b/compiler/luci/service/src/ShapeDescription.test.cpp
new file mode 100644
index 000000000..6e53aac75
--- /dev/null
+++ b/compiler/luci/service/src/ShapeDescription.test.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/ShapeDescription.h"
+
+#include <luci/IR/CircleNode.h>
+#include <luci/IR/Nodes/CircleConst.h>
+
+#include <gtest/gtest.h>
+
+TEST(ShapeDescriptionTest, CircleNode)
+{
+ // Use CircleConst as CircleNode
+ luci::CircleConst circle_const;
+ circle_const.shape({1, 2, 3, 4});
+
+ auto sd = luci::to_shape_description(&circle_const);
+
+ ASSERT_EQ(4, sd._dims.size());
+ ASSERT_EQ(1, sd._dims.at(0));
+ ASSERT_TRUE(sd._rank_known);
+}
+
+TEST(ShapeDescriptionTest, TensorShape)
+{
+ loco::TensorShape tensor_shape{1, 2, 3, 4};
+ loco::NodeShape node_shape(tensor_shape);
+
+ auto sd = luci::to_shape_description(node_shape);
+
+ ASSERT_EQ(4, sd._dims.size());
+ ASSERT_EQ(1, sd._dims.at(0));
+ ASSERT_TRUE(sd._rank_known);
+}
+
+TEST(ShapeDescriptionTest, BiasShape_NEG)
+{
+ loco::BiasShape bias_shape;
+ bias_shape.length() = 1;
+ loco::NodeShape node_shape(bias_shape);
+
+ EXPECT_THROW(luci::to_shape_description(node_shape), std::exception);
+}
diff --git a/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp b/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp
index 341201148..c5864f938 100644
--- a/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp
+++ b/compiler/luci/service/src/ShapeInfer_StridedSlice.cpp
@@ -17,12 +17,12 @@
#include "ShapeInfer_StridedSlice.h"
#include "Check.h"
+#include "CircleShapeInferenceHelper.h"
#include <luci/IR/CircleNode.h>
#include <loco/IR/DataType.h>
#include <loco/IR/NodeShape.h>
#include <oops/InternalExn.h>
-#include <loco/Service/ShapeInference.h>
#include <cmath>
#include <cstdint>
@@ -245,7 +245,7 @@ loco::TensorShape infer_output_shape(const CircleStridedSlice *node)
assert(node->new_axis_mask() == 0);
auto op_params = BuildStridedSliceParams(node);
- loco::TensorShape input_shape = loco::shape_get(input_node).as<loco::TensorShape>();
+ loco::TensorShape input_shape = luci::shape_get(input_node).as<loco::TensorShape>();
uint32_t num_input_axes = input_shape.rank();
assert(begin_node->size<S32>() <= num_input_axes);
diff --git a/compiler/luci/service/src/Validate.cpp b/compiler/luci/service/src/Validate.cpp
index 3f732b6fe..7ed14c356 100644
--- a/compiler/luci/service/src/Validate.cpp
+++ b/compiler/luci/service/src/Validate.cpp
@@ -20,10 +20,9 @@
#include <luci/Log.h>
#include <loco/IR/NodeShape.h>
-#include <loco/Service/ShapeInference.h>
-#include <loco/Service/TypeInference.h>
#include <cassert>
+#include <unordered_map>
#include <vector>
namespace
@@ -36,7 +35,11 @@ std::ostream &operator<<(std::ostream &os, const loco::TensorShape &tensor_shape
{
if (r)
os << ",";
- os << tensor_shape.dim(r).value();
+
+ if (tensor_shape.dim(r).known())
+ os << tensor_shape.dim(r).value();
+ else
+ os << "?";
}
os << "]";
return os;
@@ -49,7 +52,11 @@ std::ostream &operator<<(std::ostream &os, const luci::CircleNode *circle_node)
{
if (r)
os << ",";
- os << circle_node->dim(r).value();
+
+ if (circle_node->dim(r).known())
+ os << circle_node->dim(r).value();
+ else
+ os << "?";
}
os << "]";
return os;
@@ -99,10 +106,24 @@ bool validate_shape_dtype(loco::Graph *g)
auto go_tensor_shape = graph_out->shape();
assert(go_tensor_shape);
+ // NOTE Even if the shape of a graph output is [] (which means "shape inference was impossible")
+ //      while the shape of the CircleNode is not, it can still be a valid case, because the
+ //      shape inference algorithm of CircleNode may have been upgraded since then. The opposite
+ //      is also possible. If such cases appear, the validation code below should be fixed.
bool is_shape_valid = (circle_node->rank() == go_tensor_shape->rank());
for (uint32_t i = 0; is_shape_valid && i < circle_node->rank(); ++i)
- if (circle_node->dim(i).value() != go_tensor_shape->dim(i).value())
+ {
+ if (!circle_node->dim(i).known() || !go_tensor_shape->dim(i).known())
+ {
+ // If at least one of two dimensions is unknown,
+ // the unknown dimension can accept any value.
+ INFO(l) << "Unknown dimension is matched with known dimension" << std::endl;
+ }
+ else if (circle_node->dim(i).value() != go_tensor_shape->dim(i).value())
+ {
is_shape_valid = false;
+ }
+ }
if (is_shape_valid == false)
{
@@ -124,72 +145,62 @@ bool validate_shape_dtype(loco::Graph *g)
return true;
}
-bool validate_shape_signature(loco::Graph *g)
-{
- LOGGER(l);
-
- for (auto node : loco::postorder_traversal(loco::output_nodes(g)))
- {
- auto circle_node = loco::must_cast<luci::CircleNode *>(node);
- const auto shape_signature = circle_node->shape_signature();
+} // namespace
- if (shape_signature.rank() == 0)
- continue;
+namespace luci
+{
- // Rank of shape and shape signature should be same
- if (circle_node->rank() != shape_signature.rank())
- {
- INFO(l) << "[luci] Rank of shape signature for " << circle_node->name() << " do not match"
- << std::endl;
- return false;
- }
+bool validate(loco::Graph *g)
+{
+ if (!loco::valid(g))
+ return false;
- bool has_unknown = false;
+ if (!validate_shape_dtype(g))
+ return false;
- // If shape siganture is not -1, dimension value should be same
- for (uint32_t d = 0; d < shape_signature.rank(); ++d)
- {
- if (shape_signature.dim(d) != -1 &&
- shape_signature.dim(d) != (int32_t)(circle_node->dim(d).value()))
- {
- INFO(l) << "[luci] Dimension " << d << "of shape signature for " << circle_node->name()
- << " do not match" << std::endl;
- return false;
- }
+ // TODO add more validation
- if (shape_signature.dim(d) == -1)
- has_unknown = true;
- }
+ return true;
+}
- // Shape signature should have at least one -1 value.
- if (!has_unknown)
- {
- INFO(l) << "[luci] Shape signature in " << circle_node->name()
- << " do not have unknown dimension" << std::endl;
+bool validate_name(loco::Graph *g)
+{
+ auto nodes = g->nodes();
+ for (uint32_t n = 0; n < nodes->size(); ++n)
+ {
+ auto node = loco::must_cast<luci::CircleNode *>(nodes->at(n));
+ auto name = node->name();
+ if (name.empty())
return false;
- }
}
return true;
}
-} // namespace
-
-namespace luci
+bool validate_unique_name(luci::Module *m)
{
+ std::unordered_map<std::string, bool> names_col;
-bool validate(loco::Graph *g)
-{
- if (!loco::valid(g))
- return false;
-
- if (!validate_shape_dtype(g))
- return false;
-
- if (!validate_shape_signature(g))
- return false;
+ for (size_t g = 0; g < m->size(); ++g)
+ {
+ auto graph = m->graph(g);
+ auto nodes = graph->nodes();
+ for (uint32_t n = 0; n < nodes->size(); ++n)
+ {
+ auto node = loco::must_cast<luci::CircleNode *>(nodes->at(n));
+ // skip CircleOutput as it may have the same name as its from() node
+ auto output = dynamic_cast<luci::CircleOutput *>(node);
+ if (output != nullptr)
+ continue;
+
+ auto name = node->name();
+ auto it = names_col.find(name);
+ if (it != names_col.end())
+ return false;
- // TODO add more validation
+ names_col[name] = true;
+ }
+ }
return true;
}
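
The reworked per-dimension check above deliberately lets an unknown dimension match anything; only two known-but-different dimensions invalidate the shape. The predicate it implements, as a standalone sketch:

    #include <loco/IR/Dimension.h>

    // Sketch only: per-dimension compatibility as used in validate_shape_dtype.
    bool dims_compatible(const loco::Dimension &a, const loco::Dimension &b)
    {
      if (!a.known() || !b.known())
        return true; // an unknown dimension accepts any value
      return a.value() == b.value();
    }
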
diff --git a/compiler/luci/service/src/Validate.test.cpp b/compiler/luci/service/src/Validate.test.cpp
new file mode 100644
index 000000000..8ce6d895b
--- /dev/null
+++ b/compiler/luci/service/src/Validate.test.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Service/Validate.h"
+
+#include <luci/test/TestIOGraph.h>
+
+#include <luci/IR/Nodes/CircleAdd.h>
+#include <luci/IR/Nodes/CircleSqrt.h>
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class SqrtGraphlet
+{
+public:
+ SqrtGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 input_shape)
+ {
+ _sqrt = g->nodes()->create<luci::CircleSqrt>();
+ _sqrt->dtype(loco::DataType::S32);
+ _sqrt->name("sqrt");
+ }
+
+protected:
+ luci::CircleSqrt *_sqrt = nullptr;
+};
+
+class SqrtGraph : public TestIOGraph, public SqrtGraphlet
+{
+public:
+ SqrtGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIOGraph::init(shape, shape);
+ SqrtGraphlet::init(g(), shape);
+
+ _sqrt->x(input());
+
+ output()->from(_sqrt);
+
+ // set output name to _sqrt's name: CircleOutput may have a duplicate name
+ output()->name(_sqrt->name());
+ }
+};
+
+class Sqrt2xGraphlet
+{
+public:
+ Sqrt2xGraphlet() = default;
+
+public:
+ void init(loco::Graph *g, const ShapeU32 input_shape)
+ {
+ _sqrt1 = g->nodes()->create<luci::CircleSqrt>();
+ _sqrt1->dtype(loco::DataType::S32);
+ _sqrt1->name("sqrt");
+
+ _sqrt2 = g->nodes()->create<luci::CircleSqrt>();
+ _sqrt2->dtype(loco::DataType::S32);
+ _sqrt2->name("sqrt");
+ }
+
+protected:
+ luci::CircleSqrt *_sqrt1 = nullptr;
+ luci::CircleSqrt *_sqrt2 = nullptr;
+};
+
+class Sqrt2xGraph : public TestIOGraph, public Sqrt2xGraphlet
+{
+public:
+ Sqrt2xGraph() = default;
+
+public:
+ void init(const ShapeU32 shape)
+ {
+ TestIOGraph::init(shape, shape);
+ Sqrt2xGraphlet::init(g(), shape);
+
+ _sqrt1->x(input());
+
+ _sqrt2->x(_sqrt1);
+
+ output()->from(_sqrt2);
+ }
+};
+
+} // namespace
+
+TEST(ValidateTest, non_empty_name)
+{
+ SqrtGraph g;
+ g.init({3, 3});
+
+ ASSERT_TRUE(luci::validate_name(g.g()));
+}
+
+TEST(ValidateTest, unique_name)
+{
+ luci::Module module;
+
+ SqrtGraph g;
+ g.init({3, 3});
+ g.transfer_to(&module);
+
+ ASSERT_TRUE(luci::validate_unique_name(&module));
+}
+
+TEST(ValidateTest, unique_name_NEG)
+{
+ luci::Module module;
+
+ Sqrt2xGraph g;
+ g.init({3, 3});
+ g.transfer_to(&module);
+
+ ASSERT_FALSE(luci::validate_unique_name(&module));
+}
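
One small design note on validate_unique_name: names_col is an std::unordered_map<std::string, bool> whose mapped value is never read, so it acts as a plain set; std::unordered_set would state the intent more directly. Equivalent duplicate detection as a sketch:

    #include <string>
    #include <unordered_set>
    #include <vector>

    // Sketch only: set-based duplicate-name check, same semantics as names_col.
    bool all_names_unique(const std::vector<std::string> &names)
    {
      std::unordered_set<std::string> seen;
      for (const auto &name : names)
        if (!seen.insert(name).second) // insertion fails on a duplicate
          return false;
      return true;
    }
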
diff --git a/compiler/luci/tester/CMakeLists.txt b/compiler/luci/tester/CMakeLists.txt
index 3ac06ef3a..13aab11e7 100644
--- a/compiler/luci/tester/CMakeLists.txt
+++ b/compiler/luci/tester/CMakeLists.txt
@@ -6,6 +6,7 @@ TargetRequire_Return(${REQUIRED_TARGETS})
set(SRCS_READ_TESTER
src/ReadTester.cpp
+ src/ReadModule.cpp
)
add_executable(luci_readtester "${SRCS_READ_TESTER}")
@@ -18,6 +19,7 @@ target_link_libraries(luci_readtester PRIVATE safemain)
set(SRCS_WRITE_TESTER
src/WriteTester.cpp
+ src/ReadModule.cpp
)
add_executable(luci_writetester "${SRCS_WRITE_TESTER}")
@@ -28,3 +30,22 @@ target_link_libraries(luci_writetester PRIVATE luci_export)
target_link_libraries(luci_writetester PRIVATE foder)
target_link_libraries(luci_writetester PRIVATE oops)
target_link_libraries(luci_writetester PRIVATE safemain)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(luci_readtester_test src/ReadTester.test.cpp ${SRCS_READ_TESTER})
+target_link_libraries(luci_readtester_test luci_import)
+target_link_libraries(luci_readtester_test luci_service)
+target_link_libraries(luci_readtester_test luci_pass)
+target_link_libraries(luci_readtester_test foder)
+
+GTest_AddTest(luci_writetester_test src/WriteTester.test.cpp ${SRCS_WRITE_TESTER})
+target_link_libraries(luci_writetester_test luci_import)
+target_link_libraries(luci_writetester_test luci_service)
+target_link_libraries(luci_writetester_test luci_pass)
+target_link_libraries(luci_writetester_test luci_export)
+target_link_libraries(luci_writetester_test foder)
diff --git a/compiler/luci/tester/src/ReadModule.cpp b/compiler/luci/tester/src/ReadModule.cpp
new file mode 100644
index 000000000..87c1233f0
--- /dev/null
+++ b/compiler/luci/tester/src/ReadModule.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ReadModule.h"
+
+#include <luci/Pass/CircleShapeInferencePass.h>
+#include <luci/Pass/CircleTypeInferencePass.h>
+#include <luci/Service/Validate.h>
+
+#include <logo/Phase.h>
+
+#include <iostream>
+#include <string>
+#include <vector>
+
+std::unique_ptr<luci::Module> ReadModule(std::string &input_path)
+{
+ // Load model from the file
+ foder::FileLoader file_loader{input_path};
+ std::vector<char> model_data = file_loader.load();
+ const circle::Model *circle_model = circle::GetModel(model_data.data());
+ if (circle_model == nullptr)
+ {
+ std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
+ return nullptr;
+ }
+
+ luci::Importer importer;
+ auto module = importer.importModule(circle_model);
+ assert(module->size() > 0);
+
+ for (size_t g = 0; g < module->size(); ++g)
+ {
+ auto graph = module->graph(g);
+ if (graph == nullptr)
+ return nullptr;
+
+ {
+ logo::Phase phase;
+
+ phase.emplace_back(std::make_unique<luci::CircleShapeInferencePass>());
+ phase.emplace_back(std::make_unique<luci::CircleTypeInferencePass>());
+
+ logo::PhaseRunner<logo::PhaseStrategy::Saturate> phase_runner{graph};
+ phase_runner.run(phase);
+ }
+
+ if (!luci::validate(graph))
+ return nullptr;
+ }
+ return module;
+}
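To make the refactoring easier to review, a minimal sketch of how a tester's `entry()` is expected to consume `ReadModule` after this patch (it mirrors the ReadTester change below; `entry_sketch` is an illustrative name, and the "use the module" step is a placeholder):

```cpp
#include "ReadModule.h"

#include <cstdlib> // EXIT_FAILURE
#include <string>

int entry_sketch(const std::string &path_arg)
{
  std::string input_path = path_arg; // assumes argc/argv were validated by the caller
  // ReadModule performs load, shape/type inference and validation in one step
  auto module = ReadModule(input_path);
  if (module == nullptr)
    return EXIT_FAILURE; // any failure is now reported uniformly by ReadModule
  // ... use the module, e.g. hand it to luci::CircleExporter ...
  return 0;
}
```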
diff --git a/compiler/luci/tester/src/ReadModule.h b/compiler/luci/tester/src/ReadModule.h
new file mode 100644
index 000000000..dfa9bad6b
--- /dev/null
+++ b/compiler/luci/tester/src/ReadModule.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_TESTER_READ_MODULE_H__
+#define __LUCI_TESTER_READ_MODULE_H__
+
+#include <luci/Importer.h>
+#include <foder/FileLoader.h>
+
+#include <memory>
+#include <string>
+
+std::unique_ptr<luci::Module> ReadModule(std::string &input_path);
+
+#endif // __LUCI_TESTER_READ_MODULE_H__
diff --git a/compiler/luci/tester/src/ReadTester.cpp b/compiler/luci/tester/src/ReadTester.cpp
index f270a232c..864343e43 100644
--- a/compiler/luci/tester/src/ReadTester.cpp
+++ b/compiler/luci/tester/src/ReadTester.cpp
@@ -14,18 +14,9 @@
* limitations under the License.
*/
-#include <foder/FileLoader.h>
-
-#include <luci/Importer.h>
-#include <luci/Service/Validate.h>
-#include <luci/Pass/ShapeInferencePass.h>
-#include <luci/Pass/TypeInferencePass.h>
-
-// Following passes will be removed after refactoring is finished
-#include <luci/Pass/MigrateLegacyShapeDtypePass.h>
+#include "ReadModule.h"
#include <iostream>
-#include <map>
#include <string>
namespace
@@ -68,45 +59,9 @@ int entry(int argc, char **argv)
std::cout << "[INFO] Circle is '" << input_path << "'" << std::endl;
- // Load model from the file
- foder::FileLoader file_loader{input_path};
- std::vector<char> model_data = file_loader.load();
- const circle::Model *circle_model = circle::GetModel(model_data.data());
- if (circle_model == nullptr)
- {
- std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
+ auto module = ReadModule(input_path);
+ if (module == nullptr)
return EXIT_FAILURE;
- }
-
- luci::Importer importer;
- auto module = importer.importModule(circle_model);
- assert(module->size() > 0);
- for (size_t g = 0; g < module->size(); ++g)
- {
- auto graph = module->graph(g);
- if (graph == nullptr)
- return 255;
-
- {
- luci::ShapeInferencePass pass;
- while (pass.run(graph) == true)
- ;
- }
- {
- luci::TypeInferencePass pass;
- while (pass.run(graph) == true)
- ;
- }
- {
- // This pass will be removed after refactoring is finished
- luci::MigrateLegacyShapeDtypePass pass;
- while (pass.run(graph) == true)
- ;
- }
-
- if (!luci::validate(graph))
- return 255;
- }
return 0;
}
diff --git a/compiler/luci/tester/src/ReadTester.test.cpp b/compiler/luci/tester/src/ReadTester.test.cpp
new file mode 100644
index 000000000..f3850d517
--- /dev/null
+++ b/compiler/luci/tester/src/ReadTester.test.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstring> // strcpy
+
+// From ReadTester.cpp
+int entry(int argc, char **argv);
+
+TEST(ReadTesterTest, invalid_argc_NEG)
+{
+ char argv_1[20];
+ strcpy(argv_1, "ReadTesterTest");
+
+ int argc = 1;
+ char *argv[] = {argv_1};
+
+ ASSERT_NE(0, entry(argc, argv));
+}
+
+TEST(ReadTesterTest, invalid_file_NEG)
+{
+ char argv_1[20], argv_2[20];
+ strcpy(argv_1, "ReadTesterTest");
+ strcpy(argv_2, "not_a_file");
+
+ int argc = 2;
+ char *argv[] = {argv_1, argv_2};
+
+ EXPECT_THROW(entry(argc, argv), std::runtime_error);
+}
diff --git a/compiler/luci/tester/src/WriteTester.cpp b/compiler/luci/tester/src/WriteTester.cpp
index 9a6e8de05..0d3a1efa2 100644
--- a/compiler/luci/tester/src/WriteTester.cpp
+++ b/compiler/luci/tester/src/WriteTester.cpp
@@ -14,21 +14,13 @@
* limitations under the License.
*/
-#include <foder/FileLoader.h>
+#include "ReadModule.h"
-#include <luci/Importer.h>
-#include <luci/Pass/ShapeInferencePass.h>
-#include <luci/Pass/TypeInferencePass.h>
-#include <luci/Service/Validate.h>
#include <luci/CircleExporter.h>
#include <oops/InternalExn.h>
-// Following passes will be removed after refactoring is finished
-#include <luci/Pass/MigrateLegacyShapeDtypePass.h>
-
#include <fstream>
#include <iostream>
-#include <map>
#include <string>
namespace
@@ -51,12 +43,12 @@ struct CircleExpContract : public luci::CircleExporter::Contract
{
public:
CircleExpContract(loco::Graph *graph, const std::string &filename)
- : _graph(graph), _filepath(filename)
+ : _graph(graph), _filepath(filename)
{
// NOTHING TO DO
}
CircleExpContract(luci::Module *module, const std::string &filename)
- : _module(module), _filepath(filename)
+ : _module(module), _filepath(filename)
{
// NOTHING TO DO
}
@@ -111,47 +103,9 @@ int entry(int argc, char **argv)
std::cout << "[INFO] Circle from '" << input_path << "' to '" << output_path << "'" << std::endl;
- // Load model from the file
- foder::FileLoader file_loader{input_path};
- std::vector<char> model_data = file_loader.load();
- const circle::Model *circle_model = circle::GetModel(model_data.data());
- if (circle_model == nullptr)
- {
- std::cerr << "ERROR: Failed to load circle '" << input_path << "'" << std::endl;
+ auto module = ReadModule(input_path);
+ if (module == nullptr)
return EXIT_FAILURE;
- }
-
- // Import from input Circle file
- luci::Importer importer;
- auto module = importer.importModule(circle_model);
- assert(module->size() > 0);
-
- for (size_t g = 0; g < module->size(); ++g)
- {
- auto graph = module->graph(g);
- if (graph == nullptr)
- return 255;
-
- {
- luci::ShapeInferencePass pass;
- while (pass.run(graph) == true)
- ;
- }
- {
- luci::TypeInferencePass pass;
- while (pass.run(graph) == true)
- ;
- }
- {
- // This pass will be removed after refactoring is finished
- luci::MigrateLegacyShapeDtypePass pass;
- while (pass.run(graph) == true)
- ;
- }
-
- if (!luci::validate(graph))
- return 255;
- }
// Export to output Circle file
luci::CircleExporter exporter;
diff --git a/compiler/luci/tester/src/WriteTester.test.cpp b/compiler/luci/tester/src/WriteTester.test.cpp
new file mode 100644
index 000000000..9d34c5f98
--- /dev/null
+++ b/compiler/luci/tester/src/WriteTester.test.cpp
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include <cstring> // strcpy
+
+// From WriteTester.cpp
+int entry(int argc, char **argv);
+
+TEST(WriteTesterTest, invalid_argc_NEG)
+{
+ char argv_1[20];
+ strcpy(argv_1, "WriteTesterTest");
+
+ int argc = 1;
+ char *argv[] = {argv_1};
+
+ ASSERT_NE(0, entry(argc, argv));
+}
+
+TEST(WriteTesterTest, invalid_file_NEG)
+{
+ char argv_1[20], argv_2[20], argv_3[20];
+ strcpy(argv_1, "WriteTesterTest");
+ strcpy(argv_2, "not_a_file");
+ strcpy(argv_3, "not_a_file");
+
+ int argc = 3;
+ char *argv[] = {argv_1, argv_2, argv_3};
+
+ EXPECT_THROW(entry(argc, argv), std::runtime_error);
+}
diff --git a/compiler/luci/testhelper/CMakeLists.txt b/compiler/luci/testhelper/CMakeLists.txt
new file mode 100644
index 000000000..86aa66225
--- /dev/null
+++ b/compiler/luci/testhelper/CMakeLists.txt
@@ -0,0 +1,25 @@
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+# NOTE: sources are named "*.test.cpp" so that they are NOT picked up by static analysis tools
+
+# testhelper library itself
+set(HELPER_SOURCE
+ src/TestShape.test.cpp
+ )
+
+add_library(luci_testhelper STATIC ${HELPER_SOURCE})
+target_include_directories(luci_testhelper PRIVATE src)
+target_include_directories(luci_testhelper PUBLIC include)
+target_link_libraries(luci_testhelper luci_lang)
+
+# test for testhelper library
+set(TESTER_SOURCE
+ src/TestIOGraph.test.cpp
+ )
+
+GTest_AddTest(luci_testhelper_test ${TESTER_SOURCE})
+target_link_libraries(luci_testhelper_test luci_testhelper)
diff --git a/compiler/luci/testhelper/README.md b/compiler/luci/testhelper/README.md
new file mode 100644
index 000000000..6bdb92aa4
--- /dev/null
+++ b/compiler/luci/testhelper/README.md
@@ -0,0 +1,3 @@
+# luci-testhelper
+
+_luci-testhelper_ provides helper classes for unit testing.
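A minimal usage sketch, distilled from this patch's own `TestIOGraph.test.cpp` (the class name `MyTestGraph` is illustrative):

```cpp
// Build a test graph with one input, one Sqrt node and one output.
class MyTestGraph : public luci::test::TestIOGraph
{
public:
  void init(void)
  {
    TestIOGraph::init({1}, {1}); // one input and one output, both of shape {1}
    _sqrt = g()->nodes()->create<luci::CircleSqrt>();
    _sqrt->name("sqrt");
    _sqrt->x(input());     // wire the graph input into the node under test
    output()->from(_sqrt); // wire the node into the graph output
  }

protected:
  luci::CircleSqrt *_sqrt = nullptr;
};
```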
diff --git a/compiler/luci/testhelper/include/luci/test/TestIOGraph.h b/compiler/luci/testhelper/include/luci/test/TestIOGraph.h
new file mode 100644
index 000000000..ae04f4dbc
--- /dev/null
+++ b/compiler/luci/testhelper/include/luci/test/TestIOGraph.h
@@ -0,0 +1,198 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_TESTHELPER_TEST_IO_GRAPH_H__
+#define __LUCI_TESTHELPER_TEST_IO_GRAPH_H__
+
+#include "TestShape.h"
+
+#include <luci/IR/CircleNodes.h>
+#include <luci/IR/Module.h>
+
+#include <memory>
+#include <stdexcept>
+
+namespace luci
+{
+namespace test
+{
+
+/**
+ * @brief Graphlet with Inputs and a loco::Graph, for multiple inputs
+ * @note Every Graph will have Input(s) and Output(s).
+ * We keep the loco::Graph only in TestIsGraphlet to avoid declaring
+ * a separate class for the loco::Graph itself.
+ */
+template <unsigned N> class TestIsGraphlet
+{
+public:
+ TestIsGraphlet()
+ {
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _graph_inputs[n] = nullptr;
+ _inputs[n] = nullptr;
+ }
+ _g = loco::make_graph();
+ }
+
+public:
+ virtual void init(loco::Graph *g, const std::initializer_list<ShapeU32> shape_in)
+ {
+ if (shape_in.size() != N)
+ throw std::runtime_error("Failed to init TestIsGraphlet");
+
+ auto shpin = shape_in.begin();
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _graph_inputs[n] = g->inputs()->create();
+
+ _inputs[n] = g->nodes()->create<luci::CircleInput>();
+ _inputs[n]->shape(*shpin);
+ _inputs[n]->shape_status(luci::ShapeStatus::VALID);
+ _inputs[n]->dtype(loco::DataType::FLOAT32);
+ _inputs[n]->name("input_" + std::to_string(n));
+
+ _inputs[n]->index(_graph_inputs[n]->index());
+
+ auto input_shape = std::make_unique<loco::TensorShape>();
+ set_shape_vector(input_shape.get(), *shpin);
+ _graph_inputs[n]->shape(std::move(input_shape));
+ _graph_inputs[n]->dtype(loco::DataType::FLOAT32);
+
+ shpin++;
+ }
+ }
+
+public:
+ loco::Graph *g(void) { return _g.get(); }
+ luci::CircleInput *input(int idx) { return _inputs[idx]; }
+ uint32_t num_inputs(void) { return N; }
+
+public:
+ void transfer_to(luci::Module *module)
+ {
+ // WARNING: after _g is transferred, _graph_inputs, _inputs
+ // and _graph_outputs, _outputs in TestOsGraphlet become invalid.
+ // The arrays are not cleared since these are just helpers for unit tests.
+ module->add(std::move(_g));
+ }
+
+protected:
+ std::unique_ptr<loco::Graph> _g;
+ std::array<loco::GraphInput *, N> _graph_inputs;
+ std::array<luci::CircleInput *, N> _inputs;
+};
+
+/**
+ * @brief Graphlet with one Input
+ */
+class TestIGraphlet : public TestIsGraphlet<1>
+{
+public:
+ virtual void init(loco::Graph *g, const ShapeU32 shape_in)
+ {
+ TestIsGraphlet<1>::init(g, {shape_in});
+ }
+
+ luci::CircleInput *input() { return _inputs[0]; }
+};
+
+/**
+ * @brief Graphlet with Outputs for multiple outputs
+ */
+template <unsigned N> class TestOsGraphlet
+{
+public:
+ TestOsGraphlet()
+ {
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _graph_outputs[n] = nullptr;
+ _outputs[n] = nullptr;
+ }
+ }
+
+public:
+ virtual void init(loco::Graph *g, const std::initializer_list<ShapeU32> shape_out)
+ {
+ if (shape_out.size() != N)
+ throw std::runtime_error("Failed to init TestOsGraphlet");
+
+ auto shpout = shape_out.begin();
+ for (uint32_t n = 0; n < N; ++n)
+ {
+ _graph_outputs[n] = g->outputs()->create();
+
+ _outputs[n] = g->nodes()->create<luci::CircleOutput>();
+ _outputs[n]->shape(*shpout);
+ _outputs[n]->shape_status(luci::ShapeStatus::VALID);
+ _outputs[n]->dtype(loco::DataType::FLOAT32);
+ _outputs[n]->name("output_" + std::to_string(n));
+
+ _outputs[n]->index(_graph_outputs[n]->index());
+
+ auto output_shape = std::make_unique<loco::TensorShape>();
+ set_shape_vector(output_shape.get(), *shpout);
+ _graph_outputs[n]->shape(std::move(output_shape));
+ _graph_outputs[n]->dtype(loco::DataType::FLOAT32);
+
+ shpout++;
+ }
+ }
+
+public:
+ luci::CircleOutput *output(int idx) { return _outputs[idx]; }
+
+protected:
+ std::array<loco::GraphOutput *, N> _graph_outputs;
+ std::array<luci::CircleOutput *, N> _outputs;
+};
+
+/**
+ * @brief Graphlet with one Output
+ */
+class TestOGraphlet : public TestOsGraphlet<1>
+{
+public:
+ virtual void init(loco::Graph *g, const ShapeU32 shape_out)
+ {
+ TestOsGraphlet<1>::init(g, {shape_out});
+ }
+
+ luci::CircleOutput *output() { return _outputs[0]; }
+};
+
+/**
+ * @brief Graph with Input and Output
+ */
+class TestIOGraph : public TestIGraphlet, public TestOGraphlet
+{
+public:
+ TestIOGraph() = default;
+
+public:
+ virtual void init(const ShapeU32 shape_in, const ShapeU32 shape_out)
+ {
+ TestIGraphlet::init(g(), shape_in);
+ TestOGraphlet::init(g(), shape_out);
+ }
+};
+
+} // namespace test
+} // namespace luci
+
+#endif // __LUCI_TESTHELPER_TEST_IO_GRAPH_H__
diff --git a/compiler/luci/testhelper/include/luci/test/TestShape.h b/compiler/luci/testhelper/include/luci/test/TestShape.h
new file mode 100644
index 000000000..1a5adf7d6
--- /dev/null
+++ b/compiler/luci/testhelper/include/luci/test/TestShape.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_TESTHELPER_TEST_SHAPE_H__
+#define __LUCI_TESTHELPER_TEST_SHAPE_H__
+
+#include <luci/IR/CircleNode.h>
+
+#include <initializer_list>
+
+namespace luci
+{
+namespace test
+{
+
+using ShapeU32 = std::initializer_list<uint32_t>;
+using ShapeI32 = std::initializer_list<int32_t>;
+
+void set_shape_vector(loco::TensorShape *shape, const ShapeU32 &values);
+void set_shape_vector(luci::CircleConst *const_node, const ShapeI32 &values);
+
+uint32_t num_elements(const ShapeU32 shape);
+
+} // namespace test
+} // namespace luci
+
+#endif // __LUCI_TESTHELPER_TEST_SHAPE_H__
diff --git a/compiler/luci/testhelper/src/TestIOGraph.test.cpp b/compiler/luci/testhelper/src/TestIOGraph.test.cpp
new file mode 100644
index 000000000..8a7d1e060
--- /dev/null
+++ b/compiler/luci/testhelper/src/TestIOGraph.test.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/test/TestIOGraph.h"
+
+#include <gtest/gtest.h>
+
+namespace
+{
+
+using namespace luci::test;
+
+class SqrtGraphlet
+{
+public:
+ SqrtGraphlet() = default;
+
+ void init(loco::Graph *g)
+ {
+ _sqrt = g->nodes()->create<luci::CircleSqrt>();
+ _sqrt->name("sqrt");
+ }
+
+protected:
+ luci::CircleSqrt *_sqrt = nullptr;
+};
+
+class AddGraphlet
+{
+public:
+ AddGraphlet() = default;
+
+ void init(loco::Graph *g)
+ {
+ _add = g->nodes()->create<luci::CircleAdd>();
+ _add->name("add");
+ }
+
+protected:
+ luci::CircleAdd *_add = nullptr;
+};
+
+class ConvGraphlet
+{
+public:
+ ConvGraphlet() = default;
+
+ void init(loco::Graph *g)
+ {
+ _conv = g->nodes()->create<luci::CircleConv2D>();
+ _conv->name("conv");
+ }
+
+protected:
+ luci::CircleConv2D *_conv = nullptr;
+};
+
+} // namespace
+
+namespace
+{
+
+class TestOfTestIOGraph : public TestIOGraph, public SqrtGraphlet
+{
+public:
+ TestOfTestIOGraph() = default;
+
+public:
+ void init(void)
+ {
+ TestIOGraph::init({1}, {1});
+ SqrtGraphlet::init(g());
+
+ _sqrt->x(input());
+
+ output()->from(_sqrt);
+ }
+};
+
+class TestOfTestI2OGraph : public TestIsGraphlet<2>, public TestOGraphlet, public AddGraphlet
+{
+public:
+ TestOfTestI2OGraph() = default;
+
+public:
+ void init(void)
+ {
+ TestIsGraphlet<2>::init(g(), {{2, 3}, {2, 3}});
+ TestOsGraphlet<1>::init(g(), {{2, 3}});
+ AddGraphlet::init(g());
+
+ _add->x(input(0));
+ _add->y(input(1));
+
+ output()->from(_add);
+ }
+};
+
+class TestOfTestI3OGraph : public TestIsGraphlet<3>, public TestOGraphlet, public ConvGraphlet
+{
+public:
+ TestOfTestI3OGraph() = default;
+
+public:
+ void init(void)
+ {
+ TestIsGraphlet<3>::init(g(), {{2, 3, 3, 4}, {1, 1}, {4}});
+ TestOsGraphlet<1>::init(g(), {{2, 3, 3, 4}});
+ ConvGraphlet::init(g());
+
+ _conv->input(input(0));
+ _conv->filter(input(1));
+ _conv->bias(input(2));
+
+ output()->from(_conv);
+ }
+};
+
+class FailOfTestI3OGraph : public TestIsGraphlet<3>, public TestOGraphlet, public ConvGraphlet
+{
+public:
+ FailOfTestI3OGraph() = default;
+
+public:
+ void init(void)
+ {
+ TestIsGraphlet<3>::init(g(), {{2, 3, 3, 4}, {1, 1}});
+ TestOsGraphlet<1>::init(g(), {{2, 3, 3, 4}});
+ ConvGraphlet::init(g());
+
+ _conv->input(input(0));
+ _conv->filter(input(1));
+ _conv->bias(input(2));
+
+ output()->from(_conv);
+ }
+};
+
+} // namespace
+
+TEST(TestIOGraphTest, IOGraph_init)
+{
+ TestOfTestIOGraph tg;
+ tg.init();
+
+ SUCCEED();
+}
+
+TEST(TestIOGraphTest, I2OGraph_init)
+{
+ TestOfTestI2OGraph tg;
+ tg.init();
+
+ SUCCEED();
+}
+
+TEST(TestIOGraphTest, I3OGraph_init)
+{
+ TestOfTestI3OGraph tg;
+ tg.init();
+
+ SUCCEED();
+}
+
+TEST(TestIOGraphTest, I3OGraph_input_number_mismatch_NEG)
+{
+ FailOfTestI3OGraph fg;
+ EXPECT_THROW(fg.init(), std::runtime_error);
+}
diff --git a/compiler/luci/testhelper/src/TestShape.test.cpp b/compiler/luci/testhelper/src/TestShape.test.cpp
new file mode 100644
index 000000000..9838c6182
--- /dev/null
+++ b/compiler/luci/testhelper/src/TestShape.test.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/test/TestShape.h"
+
+/**
+ * @note This file holds no test cases; it provides the helper methods used by tests
+ */
+
+namespace luci
+{
+namespace test
+{
+
+void set_shape_vector(loco::TensorShape *shape, const ShapeU32 &values)
+{
+ uint32_t r = 0;
+ shape->rank(values.size());
+ for (auto v : values)
+ shape->dim(r++).set(v);
+}
+
+void set_shape_vector(luci::CircleConst *const_node, const ShapeI32 &values)
+{
+ const_node->rank(1);
+ const_node->dim(0).set(values.size());
+ const_node->shape_status(luci::ShapeStatus::VALID);
+ const_node->dtype(loco::DataType::S32);
+ const_node->size<loco::DataType::S32>(values.size());
+ uint32_t idx = 0;
+ for (auto val : values)
+ const_node->at<loco::DataType::S32>(idx++) = val;
+}
+
+uint32_t num_elements(const ShapeU32 shape)
+{
+ uint32_t result = 1;
+ for (auto val : shape)
+ result = result * val;
+ return result;
+}
+
+} // namespace test
+} // namespace luci
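A short usage sketch of the helpers above (the shapes are illustrative; this mirrors how `TestIsGraphlet::init` in `TestIOGraph.h` calls `set_shape_vector`):

```cpp
#include "luci/test/TestShape.h"

#include <cassert>
#include <memory>

void sketch(void)
{
  luci::test::ShapeU32 shape{2, 3};

  // Fill a loco::TensorShape from the initializer list: rank 2, dims [2, 3]
  auto tensor_shape = std::make_unique<loco::TensorShape>();
  luci::test::set_shape_vector(tensor_shape.get(), shape);

  // num_elements is the product of all dimensions
  assert(luci::test::num_elements(shape) == 2 * 3);
}
```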
diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst
index 897d41983..a278fa256 100644
--- a/compiler/luci/tests/test.lst
+++ b/compiler/luci/tests/test.lst
@@ -51,6 +51,8 @@ addread(ExpandDims_000)
addread(ExpandDims_001)
addread(ExpandDims_002)
addread(ExpandDims_003)
+addread(ExpandDims_004)
+addread(FakeQuant_000)
addread(Fill_000)
addread(Fill_001)
addread(Floor_000)
@@ -151,6 +153,7 @@ addread(SelectV2_002)
addread(Shape_000)
addread(Sin_000)
addread(Slice_000)
+addread(Slice_001)
addread(Softmax_000)
addread(Softmax_U8_000)
addread(SpaceToBatchND_000)
@@ -166,6 +169,7 @@ addread(Sqrt_000)
addread(Square_000)
addread(SquaredDifference_000)
addread(Squeeze_000)
+addread(Squeeze_001)
addread(StridedSlice_000)
addread(StridedSlice_001)
addread(StridedSlice_002)
@@ -268,6 +272,8 @@ addwrite(ExpandDims_000)
addwrite(ExpandDims_001)
addwrite(ExpandDims_002)
addwrite(ExpandDims_003)
+addwrite(ExpandDims_004)
+addwrite(FakeQuant_000)
addwrite(Fill_000)
addwrite(Fill_001)
addwrite(Floor_000)
@@ -367,6 +373,7 @@ addwrite(SelectV2_002)
addwrite(Shape_000)
addwrite(Sin_000)
addwrite(Slice_000)
+addwrite(Slice_001)
addwrite(Softmax_000)
addwrite(Softmax_U8_000)
addwrite(SpaceToBatchND_000)
@@ -382,6 +389,7 @@ addwrite(Sqrt_000)
addwrite(Square_000)
addwrite(SquaredDifference_000)
addwrite(Squeeze_000)
+addwrite(Squeeze_001)
addwrite(StridedSlice_000)
addwrite(StridedSlice_001)
addwrite(StridedSlice_002)
diff --git a/compiler/mir-interpreter/src/ops/Add.cpp b/compiler/mir-interpreter/src/ops/Add.cpp
index 631b854b7..f80c63c15 100644
--- a/compiler/mir-interpreter/src/ops/Add.cpp
+++ b/compiler/mir-interpreter/src/ops/Add.cpp
@@ -106,13 +106,13 @@ void AddImpl<uint8_t>::run(const TensorVariant &lhs, const TensorVariant &rhs, T
const int32_t shifted_lhs_val = lhs_val * (1 << left_shift);
const int32_t shifted_rhs_val = rhs_val * (1 << left_shift);
const int32_t scaled_lhs_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(shifted_lhs_val, lhs_multiplier, lhs_shift);
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(shifted_lhs_val, lhs_multiplier, lhs_shift);
const int32_t scaled_rhs_val =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(shifted_rhs_val, rhs_multiplier, rhs_shift);
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(shifted_rhs_val, rhs_multiplier, rhs_shift);
const int32_t raw_sum = scaled_lhs_val + scaled_rhs_val;
const int32_t raw_output =
- MultiplyByQuantizedMultiplierSmallerThanOneExp(raw_sum, output_multiplier, output_shift) +
- output_offset;
+ MultiplyByQuantizedMultiplierSmallerThanOneExp(raw_sum, output_multiplier, output_shift) +
+ output_offset;
const int32_t clamped_output = std::min(output_max, std::max(output_min, raw_output));
res_accessor.at(index) = static_cast<uint8_t>(clamped_output);
}
diff --git a/compiler/mir-interpreter/src/ops/AvgPool2D.cpp b/compiler/mir-interpreter/src/ops/AvgPool2D.cpp
index 3f1d65100..3f74cd1e8 100644
--- a/compiler/mir-interpreter/src/ops/AvgPool2D.cpp
+++ b/compiler/mir-interpreter/src/ops/AvgPool2D.cpp
@@ -72,7 +72,7 @@ void AvgPool2DImpl<T>::run(const ops::AvgPool2DOp &op, const TensorVariant &inpu
// Assuming NHWC format.
for (int i = 0; i < num_spatial_dims; ++i)
in_index.at(1 + i) =
- out_index.at(1 + i) * strides[i] + window_index.at(i) - padding_before[i];
+ out_index.at(1 + i) * strides[i] + window_index.at(i) - padding_before[i];
if (in_range.contains(in_index))
{
@@ -145,7 +145,7 @@ void AvgPool2DImpl<uint8_t>::run(const ops::AvgPool2DOp &op, const TensorVariant
// Assuming NHWC format.
for (int i = 0; i < num_spatial_dims; ++i)
in_index.at(1 + i) =
- out_index.at(1 + i) * strides[i] + window_index.at(i) - padding_before[i];
+ out_index.at(1 + i) * strides[i] + window_index.at(i) - padding_before[i];
if (in_range.contains(in_index))
{
diff --git a/compiler/mir-interpreter/src/ops/CappedReLU.cpp b/compiler/mir-interpreter/src/ops/CappedReLU.cpp
index 1ac95ac16..5b348d463 100644
--- a/compiler/mir-interpreter/src/ops/CappedReLU.cpp
+++ b/compiler/mir-interpreter/src/ops/CappedReLU.cpp
@@ -68,7 +68,7 @@ template <> struct CappedReLUImpl<uint8_t>
{
auto value = dequantize(arg_accessor.at(index), quant_info);
auto out_value =
- quantize(std::min(std::max(value, 0.0f), cap), result.getType().getQuantization());
+ quantize(std::min(std::max(value, 0.0f), cap), result.getType().getQuantization());
res_accessor.at(index) = out_value;
}
}
diff --git a/compiler/mir-interpreter/src/ops/Concat.cpp b/compiler/mir-interpreter/src/ops/Concat.cpp
index 99fe00c31..3c71709e6 100644
--- a/compiler/mir-interpreter/src/ops/Concat.cpp
+++ b/compiler/mir-interpreter/src/ops/Concat.cpp
@@ -90,8 +90,8 @@ template <> struct ConcatImpl<uint8_t>
};
void ConcatImpl<uint8_t>::run(
- const std::vector<std::reference_wrapper<const mir::TensorVariant>> &inputs, int axis,
- mir::TensorVariant &output)
+ const std::vector<std::reference_wrapper<const mir::TensorVariant>> &inputs, int axis,
+ mir::TensorVariant &output)
{
const size_t inputs_count = inputs.size();
std::vector<int32_t> input_zeropoints(inputs_count);
@@ -154,7 +154,7 @@ void ConcatImpl<uint8_t>::run(
for (int j = 0; j < copy_size; ++j)
{
const int32_t value =
- static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
+ static_cast<int32_t>(std::round(input_ptr[j] * scale + bias)) + output_zeropoint;
output_ptr[j] = static_cast<uint8_t>(std::max(std::min(255, value), 0));
}
}
diff --git a/compiler/mir-interpreter/src/ops/Conv2D.cpp b/compiler/mir-interpreter/src/ops/Conv2D.cpp
index c9b98a56f..9f4339bda 100644
--- a/compiler/mir-interpreter/src/ops/Conv2D.cpp
+++ b/compiler/mir-interpreter/src/ops/Conv2D.cpp
@@ -109,9 +109,9 @@ void Conv2DImpl<T>::run(const TensorVariant &input, const TensorVariant &kernel,
if ((in_y >= 0 && in_y < input_height) && (in_x >= 0 && in_x < input_width))
{
const std::int32_t in_offset =
- calcOffset(input_shape, batch, in_y, in_x, in_group_offset + in_c);
- const std::int32_t kernel_offset = calcOffset(
- kernel_shape, out_group_offset + out_c, kernel_y, kernel_x, in_c);
+ calcOffset(input_shape, batch, in_y, in_x, in_group_offset + in_c);
+ const std::int32_t kernel_offset =
+ calcOffset(kernel_shape, out_group_offset + out_c, kernel_y, kernel_x, in_c);
const T input_val = input_data[in_offset];
const T kernel_val = kernel_data[kernel_offset];
sum += kernel_val * input_val;
@@ -121,7 +121,7 @@ void Conv2DImpl<T>::run(const TensorVariant &input, const TensorVariant &kernel,
}
const std::int32_t out_offset =
- calcOffset(output_shape, batch, out_y, out_x, out_group_offset + out_c);
+ calcOffset(output_shape, batch, out_y, out_x, out_group_offset + out_c);
result_data[out_offset] = sum;
}
}
diff --git a/compiler/mir-interpreter/src/ops/DeConv2D.cpp b/compiler/mir-interpreter/src/ops/DeConv2D.cpp
index 746d8c87c..f9e837ddb 100644
--- a/compiler/mir-interpreter/src/ops/DeConv2D.cpp
+++ b/compiler/mir-interpreter/src/ops/DeConv2D.cpp
@@ -98,9 +98,9 @@ void DeConv2DImpl<T>::run(const TensorVariant &input, const TensorVariant &kerne
for (int32_t out_c = 0; out_c < num_out_channels; ++out_c)
{
const int32_t kernel_offset =
- calcOffset(kernel_shape, in_c, kernel_y, kernel_x, out_c);
+ calcOffset(kernel_shape, in_c, kernel_y, kernel_x, out_c);
const int32_t output_offset =
- calcOffset(output_shape, batch, out_y, out_x, out_c);
+ calcOffset(output_shape, batch, out_y, out_x, out_c);
const T kernel_val = kernel_data[kernel_offset];
output_data[output_offset] += input_val * kernel_val;
}
diff --git a/compiler/mir-interpreter/src/ops/Gather.cpp b/compiler/mir-interpreter/src/ops/Gather.cpp
index 4328c26b2..11bffd411 100644
--- a/compiler/mir-interpreter/src/ops/Gather.cpp
+++ b/compiler/mir-interpreter/src/ops/Gather.cpp
@@ -64,7 +64,7 @@ void GatherImpl<T, IndicesT>::run(const TensorVariant &datav, const TensorVarian
for (int32_t inner = 0; inner < inner_size; inner++)
{
output.atOffset((outer * num_indices + i) * inner_size + inner) =
- data.atOffset((outer * axis_size + index) * inner_size + inner);
+ data.atOffset((outer * axis_size + index) * inner_size + inner);
}
}
}
diff --git a/compiler/mir-interpreter/src/ops/MaxPool2D.cpp b/compiler/mir-interpreter/src/ops/MaxPool2D.cpp
index cec2f5984..6be1ccf08 100644
--- a/compiler/mir-interpreter/src/ops/MaxPool2D.cpp
+++ b/compiler/mir-interpreter/src/ops/MaxPool2D.cpp
@@ -72,7 +72,7 @@ void MaxPool2DImpl<T>::run(const TensorVariant &inputv, const ops::MaxPool2DOp &
// Assuming NHWC format.
for (int i = 0; i < num_spatial_dims; ++i)
in_index.at(1 + i) =
- out_index.at(1 + i) * strides[i] + window_index.at(i) - padding_before[i];
+ out_index.at(1 + i) * strides[i] + window_index.at(i) - padding_before[i];
if (in_range.contains(in_index))
{
@@ -137,7 +137,7 @@ void MaxPool2DImpl<uint8_t>::run(const TensorVariant &input, const ops::MaxPool2
// Assuming NHWC format.
for (int i = 0; i < num_spatial_dims; ++i)
in_index.at(1 + i) =
- out_index.at(1 + i) * strides[i] + window_index.at(i) - padding_before[i];
+ out_index.at(1 + i) * strides[i] + window_index.at(i) - padding_before[i];
if (in_range.contains(in_index))
{
diff --git a/compiler/mir-interpreter/src/ops/QuantizationHelpers.h b/compiler/mir-interpreter/src/ops/QuantizationHelpers.h
index 8faeffbd3..3ab6f1edc 100644
--- a/compiler/mir-interpreter/src/ops/QuantizationHelpers.h
+++ b/compiler/mir-interpreter/src/ops/QuantizationHelpers.h
@@ -110,7 +110,7 @@ inline int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multip
int left_shift = shift > 0 ? shift : 0;
int right_shift = shift > 0 ? 0 : -shift;
return RoundingDivideByPOT(
- SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), right_shift);
+ SaturatingRoundingDoublingHighMul(x * (1 << left_shift), quantized_multiplier), right_shift);
}
inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
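Only the line wrapping changes in this hunk; as review context, a reference sketch of the value `MultiplyByQuantizedMultiplier` approximates. This assumes the usual gemmlowp-style semantics and is not the library code:

```cpp
#include <cmath>
#include <cstdint>

// The pair (quantized_multiplier, shift) encodes a real multiplier
// M = quantized_multiplier * 2^(shift - 31); the fixed-point routine above
// approximates round(x * M), up to rounding-tie and saturation details.
int32_t reference_multiply(int32_t x, int32_t quantized_multiplier, int shift)
{
  const double m = static_cast<double>(quantized_multiplier) * std::pow(2.0, shift - 31);
  return static_cast<int32_t>(std::lround(static_cast<double>(x) * m));
}
```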
diff --git a/compiler/mir-interpreter/src/ops/Softmax.cpp b/compiler/mir-interpreter/src/ops/Softmax.cpp
index f263f967d..554f8c371 100644
--- a/compiler/mir-interpreter/src/ops/Softmax.cpp
+++ b/compiler/mir-interpreter/src/ops/Softmax.cpp
@@ -70,7 +70,7 @@ void SoftmaxImpl<T>::run(const mir::TensorVariant &arg, int axis, mir::TensorVar
mir::Index expsum_index = res_index;
expsum_index.at(axis) = 0;
res_accessor.at(res_index) =
- std::exp(arg_accessor.at(res_index)) / expsum_accessor.at(expsum_index);
+ std::exp(arg_accessor.at(res_index)) / expsum_accessor.at(expsum_index);
}
}
@@ -140,7 +140,7 @@ void SoftmaxImpl<uint8_t>::run(const mir::TensorVariant &input, int axis,
const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp;
const int32_t prob_quantized = static_cast<int32_t>(prob_rescaled + 0.5);
output_data[j] =
- static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
+ static_cast<uint8_t>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
}
input_data += last_dim;
output_data += last_dim;
diff --git a/compiler/mir/include/mir/Quantization.h b/compiler/mir/include/mir/Quantization.h
index d266ee00d..901915a74 100644
--- a/compiler/mir/include/mir/Quantization.h
+++ b/compiler/mir/include/mir/Quantization.h
@@ -26,7 +26,7 @@ public:
AffineQuantization() = default;
AffineQuantization(float scale, int zero_point)
- : _scale(scale), _zero_point(zero_point), _empty(false)
+ : _scale(scale), _zero_point(zero_point), _empty(false)
{
}
diff --git a/compiler/mir/include/mir/ShapeRange.h b/compiler/mir/include/mir/ShapeRange.h
index a450bf090..70b29715f 100644
--- a/compiler/mir/include/mir/ShapeRange.h
+++ b/compiler/mir/include/mir/ShapeRange.h
@@ -26,7 +26,7 @@ namespace mir
{
class ShapeIter
- : public std::iterator<std::forward_iterator_tag, Index, std::size_t, Index *, Index &>
+ : public std::iterator<std::forward_iterator_tag, Index, std::size_t, Index *, Index &>
{
public:
ShapeIter &operator++()
diff --git a/compiler/mir/include/mir/TensorType.h b/compiler/mir/include/mir/TensorType.h
index 98797d687..b94a26eeb 100644
--- a/compiler/mir/include/mir/TensorType.h
+++ b/compiler/mir/include/mir/TensorType.h
@@ -34,7 +34,7 @@ public:
}
TensorType(DataType element_type, const Shape &shape, const AffineQuantization &quant)
- : _element_type(element_type), _shape(shape), _quantization(quant)
+ : _element_type(element_type), _shape(shape), _quantization(quant)
{
}
diff --git a/compiler/mir/include/mir/ops/AvgPool2DOp.h b/compiler/mir/include/mir/ops/AvgPool2DOp.h
index 47fe058ee..37fb66437 100644
--- a/compiler/mir/include/mir/ops/AvgPool2DOp.h
+++ b/compiler/mir/include/mir/ops/AvgPool2DOp.h
@@ -32,7 +32,7 @@ class AvgPool2DOp : public Operation
{
public:
AvgPool2DOp(Output *arg, const AvgPool2DOpAttributes &attributes)
- : Operation(Type::avgPool2D, {arg}), _attributes(attributes)
+ : Operation(Type::avgPool2D, {arg}), _attributes(attributes)
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/ConcatOp.h b/compiler/mir/include/mir/ops/ConcatOp.h
index 4f46d4449..d1f9142fa 100644
--- a/compiler/mir/include/mir/ops/ConcatOp.h
+++ b/compiler/mir/include/mir/ops/ConcatOp.h
@@ -31,7 +31,7 @@ class ConcatOp : public Operation
{
public:
ConcatOp(const std::vector<Output *> &args, int32_t axis)
- : Operation(Type::concat, args), _axis(axis)
+ : Operation(Type::concat, args), _axis(axis)
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/Conv2DOp.h b/compiler/mir/include/mir/ops/Conv2DOp.h
index ec818dae5..f8590a947 100644
--- a/compiler/mir/include/mir/ops/Conv2DOp.h
+++ b/compiler/mir/include/mir/ops/Conv2DOp.h
@@ -30,13 +30,13 @@ class Conv2DOp : public Operation
{
public:
Conv2DOp(Output *input, Output *kernel, const Conv2DOpAttributes &attributes)
- : Operation(Type::conv2D, {input, kernel}), _attributes(attributes)
+ : Operation(Type::conv2D, {input, kernel}), _attributes(attributes)
{
inferOutputTypes();
}
Conv2DOp(Output *input, Output *kernel, Output *bias, const Conv2DOpAttributes &attributes)
- : Operation(Type::conv2D, {input, kernel, bias}), _attributes(attributes)
+ : Operation(Type::conv2D, {input, kernel, bias}), _attributes(attributes)
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/Deconv2DOp.h b/compiler/mir/include/mir/ops/Deconv2DOp.h
index a7b548028..9565eeb37 100644
--- a/compiler/mir/include/mir/ops/Deconv2DOp.h
+++ b/compiler/mir/include/mir/ops/Deconv2DOp.h
@@ -33,14 +33,14 @@ class DeConv2DOp : public Operation
{
public:
DeConv2DOp(Output *input, Output *kernel, const Deconv2DOpAttributes &attributes)
- : Operation(Type::deConv2D, {input, kernel}), _attributes(attributes)
+ : Operation(Type::deConv2D, {input, kernel}), _attributes(attributes)
{
inferOutputTypes();
}
DeConv2DOp(Output *input, Output *kernel, const Deconv2DOpAttributes &attributes,
const Shape &output_shape)
- : Operation(Type::deConv2D, {input, kernel}), _attributes(attributes)
+ : Operation(Type::deConv2D, {input, kernel}), _attributes(attributes)
{
assert(input->getElementType() == kernel->getElementType());
setOutputType(0, {input->getElementType(), output_shape});
diff --git a/compiler/mir/include/mir/ops/DepthwiseConv2DOp.h b/compiler/mir/include/mir/ops/DepthwiseConv2DOp.h
index 347b8e94f..558d60a4a 100644
--- a/compiler/mir/include/mir/ops/DepthwiseConv2DOp.h
+++ b/compiler/mir/include/mir/ops/DepthwiseConv2DOp.h
@@ -30,14 +30,14 @@ class DepthwiseConv2DOp : public Operation
{
public:
DepthwiseConv2DOp(Output *input, Output *kernel, const Conv2DOpAttributes &attributes)
- : Operation(Type::depthwiseConv, {input, kernel}), _attributes(attributes)
+ : Operation(Type::depthwiseConv, {input, kernel}), _attributes(attributes)
{
inferOutputTypes();
}
DepthwiseConv2DOp(Output *input, Output *kernel, Output *bias,
const Conv2DOpAttributes &attributes)
- : Operation(Type::depthwiseConv, {input, kernel, bias}), _attributes(attributes)
+ : Operation(Type::depthwiseConv, {input, kernel, bias}), _attributes(attributes)
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/FullyConnectedOp.h b/compiler/mir/include/mir/ops/FullyConnectedOp.h
index 589c42df9..f937df539 100644
--- a/compiler/mir/include/mir/ops/FullyConnectedOp.h
+++ b/compiler/mir/include/mir/ops/FullyConnectedOp.h
@@ -29,13 +29,13 @@ class FullyConnectedOp : public Operation
{
public:
FullyConnectedOp(Output *input, Output *weights)
- : Operation(Type::fullyConnected, {input, weights})
+ : Operation(Type::fullyConnected, {input, weights})
{
inferOutputTypes();
}
FullyConnectedOp(Output *input, Output *weights, Output *bias)
- : Operation(Type::fullyConnected, {input, weights, bias})
+ : Operation(Type::fullyConnected, {input, weights, bias})
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/GatherOp.h b/compiler/mir/include/mir/ops/GatherOp.h
index 899c9f169..58ea04074 100644
--- a/compiler/mir/include/mir/ops/GatherOp.h
+++ b/compiler/mir/include/mir/ops/GatherOp.h
@@ -33,7 +33,7 @@ class GatherOp : public Operation
{
public:
GatherOp(Output *data, Output *indices, int32_t axis)
- : Operation(Type::gather, {data, indices}), _axis(axis)
+ : Operation(Type::gather, {data, indices}), _axis(axis)
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/MaxPool2DOp.h b/compiler/mir/include/mir/ops/MaxPool2DOp.h
index 7c5df4a53..4345cfc18 100644
--- a/compiler/mir/include/mir/ops/MaxPool2DOp.h
+++ b/compiler/mir/include/mir/ops/MaxPool2DOp.h
@@ -32,7 +32,7 @@ class MaxPool2DOp : public Operation
{
public:
MaxPool2DOp(Output *arg, const MaxPool2DOpAttributes &attributes)
- : Operation(Type::maxPool2D, {arg}), _attributes(attributes)
+ : Operation(Type::maxPool2D, {arg}), _attributes(attributes)
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/PadOp.h b/compiler/mir/include/mir/ops/PadOp.h
index 76453acec..d229a97bd 100644
--- a/compiler/mir/include/mir/ops/PadOp.h
+++ b/compiler/mir/include/mir/ops/PadOp.h
@@ -29,7 +29,7 @@ class PadOp : public Operation
{
public:
PadOp(Output *arg, const PadOpAttributes &attributes)
- : Operation(Type::pad, {arg}), _attributes(attributes)
+ : Operation(Type::pad, {arg}), _attributes(attributes)
{
assert(_attributes.padding_before.size() == _attributes.padding_after.size());
inferOutputTypes();
diff --git a/compiler/mir/include/mir/ops/ReduceMeanOp.h b/compiler/mir/include/mir/ops/ReduceMeanOp.h
index add47ac75..5759b845e 100644
--- a/compiler/mir/include/mir/ops/ReduceMeanOp.h
+++ b/compiler/mir/include/mir/ops/ReduceMeanOp.h
@@ -29,7 +29,7 @@ class ReduceMeanOp : public ReduceOp
{
public:
ReduceMeanOp(Output *arg, const std::vector<int> &reduction_dims, bool keep_dims)
- : ReduceOp(Type::reduceMean, arg, reduction_dims, keep_dims)
+ : ReduceOp(Type::reduceMean, arg, reduction_dims, keep_dims)
{
}
diff --git a/compiler/mir/include/mir/ops/ReduceOp.h b/compiler/mir/include/mir/ops/ReduceOp.h
index 0f46a4596..5204a0903 100644
--- a/compiler/mir/include/mir/ops/ReduceOp.h
+++ b/compiler/mir/include/mir/ops/ReduceOp.h
@@ -29,7 +29,7 @@ class ReduceOp : public Operation
{
protected:
ReduceOp(Type type, Output *arg, const std::vector<int> &reduction_dims, bool keep_dims)
- : Operation(type, {arg}), _reduction_dims(reduction_dims), _keep_dims(keep_dims)
+ : Operation(type, {arg}), _reduction_dims(reduction_dims), _keep_dims(keep_dims)
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/ResizeOp.h b/compiler/mir/include/mir/ops/ResizeOp.h
index 51e1b0b76..62743e396 100644
--- a/compiler/mir/include/mir/ops/ResizeOp.h
+++ b/compiler/mir/include/mir/ops/ResizeOp.h
@@ -40,7 +40,7 @@ public:
};
ResizeOp(Output *arg, ResizeMethod mode, const std::vector<float> &scales)
- : Operation(Type::resizeIm, {arg}), _mode(mode), _scales(scales)
+ : Operation(Type::resizeIm, {arg}), _mode(mode), _scales(scales)
{
// Infer output shape based on given scales.
auto &input_shape = getInputShape(0);
@@ -61,7 +61,7 @@ public:
}
ResizeOp(Output *arg, ResizeMethod mode, const Shape &output_shape)
- : Operation(Type::resizeIm, {arg}), _mode(mode)
+ : Operation(Type::resizeIm, {arg}), _mode(mode)
{
// Calculate scales based on given shape.
auto &input_shape = getInputShape(0);
diff --git a/compiler/mir/include/mir/ops/SliceOp.h b/compiler/mir/include/mir/ops/SliceOp.h
index 6370de4fa..1627d4b82 100644
--- a/compiler/mir/include/mir/ops/SliceOp.h
+++ b/compiler/mir/include/mir/ops/SliceOp.h
@@ -28,7 +28,7 @@ class SliceOp : public Operation
{
public:
SliceOp(Output *arg, const Shape &starts, const Shape &sizes)
- : Operation(Type::slice, {arg}), _starts(starts), _sizes(sizes)
+ : Operation(Type::slice, {arg}), _starts(starts), _sizes(sizes)
{
inferOutputTypes();
}
diff --git a/compiler/mir/include/mir/ops/SqueezeOp.h b/compiler/mir/include/mir/ops/SqueezeOp.h
index 8ef2a78bb..735b7d86d 100644
--- a/compiler/mir/include/mir/ops/SqueezeOp.h
+++ b/compiler/mir/include/mir/ops/SqueezeOp.h
@@ -29,7 +29,7 @@ class SqueezeOp : public Operation
{
public:
SqueezeOp(Output *arg, const std::vector<std::int32_t> &dims_to_squeeze)
- : Operation(Type::squeeze, {arg}), _dims_to_squeeze(dims_to_squeeze)
+ : Operation(Type::squeeze, {arg}), _dims_to_squeeze(dims_to_squeeze)
{
// Infer output shape.
inferOutputTypes();
diff --git a/compiler/mir/src/Graph.cpp b/compiler/mir/src/Graph.cpp
index 0eccdac2b..04b005de4 100644
--- a/compiler/mir/src/Graph.cpp
+++ b/compiler/mir/src/Graph.cpp
@@ -123,11 +123,11 @@ void Graph::removeNode(Operation *op)
if (op->getType() == Operation::Type::input)
_inputs.erase(
- std::remove(_inputs.begin(), _inputs.end(), op)); // NOLINT(bugprone-inaccurate-erase)
+ std::remove(_inputs.begin(), _inputs.end(), op)); // NOLINT(bugprone-inaccurate-erase)
if (op->getType() == Operation::Type::output)
_outputs.erase(
- std::remove(_outputs.begin(), _outputs.end(), op)); // NOLINT(bugprone-inaccurate-erase)
+ std::remove(_outputs.begin(), _outputs.end(), op)); // NOLINT(bugprone-inaccurate-erase)
_ops.erase(op);
delete op;
diff --git a/compiler/mir/src/Operation.cpp b/compiler/mir/src/Operation.cpp
index 6f72acbf6..9ba395f94 100644
--- a/compiler/mir/src/Operation.cpp
+++ b/compiler/mir/src/Operation.cpp
@@ -40,7 +40,7 @@ void Operation::Output::replaceAllUsesWith(mir::Operation::Output *new_def)
}
Operation::Operation(Type type, const std::vector<Output *> &inputs, std::size_t num_outputs)
- : _type(type)
+ : _type(type)
{
for (std::size_t i = 0; i < inputs.size(); ++i)
{
diff --git a/compiler/mir/src/Shape.cpp b/compiler/mir/src/Shape.cpp
index 825420cd6..06dae0c54 100644
--- a/compiler/mir/src/Shape.cpp
+++ b/compiler/mir/src/Shape.cpp
@@ -48,9 +48,9 @@ Shape broadcastShapes(const Shape &lhs_shape, const Shape &rhs_shape)
for (int i = 0; i < num_dims; ++i)
{
const std::int32_t lhs_dim =
- (i >= num_dims - lhs_shape.rank()) ? lhs_shape.dim(i - (num_dims - lhs_shape.rank())) : 1;
+ (i >= num_dims - lhs_shape.rank()) ? lhs_shape.dim(i - (num_dims - lhs_shape.rank())) : 1;
const std::int32_t rhs_dim =
- (i >= num_dims - rhs_shape.rank()) ? rhs_shape.dim(i - (num_dims - rhs_shape.rank())) : 1;
+ (i >= num_dims - rhs_shape.rank()) ? rhs_shape.dim(i - (num_dims - rhs_shape.rank())) : 1;
if (lhs_dim == 1)
{
result_shape.dim(i) = rhs_dim;
diff --git a/compiler/mir/src/TensorVariant.cpp b/compiler/mir/src/TensorVariant.cpp
index 9e57dbaf0..516c0df73 100644
--- a/compiler/mir/src/TensorVariant.cpp
+++ b/compiler/mir/src/TensorVariant.cpp
@@ -35,7 +35,7 @@ TensorVariant::TensorVariant(const TensorType &type) : _type(type), _strides(typ
}
TensorVariant::TensorVariant(DataType element_type, const Shape &shape)
- : TensorVariant(TensorType(element_type, shape))
+ : TensorVariant(TensorType(element_type, shape))
{
}
@@ -46,7 +46,7 @@ TensorVariant::TensorVariant(const TensorType &type, const void *data) : TensorV
}
TensorVariant::TensorVariant(DataType element_type, const Shape &shape, const void *data)
- : TensorVariant(TensorType(element_type, shape), data)
+ : TensorVariant(TensorType(element_type, shape), data)
{
}
@@ -57,8 +57,8 @@ TensorVariant::TensorVariant(DataType element_type, const Shape &shape, const vo
* @param shape shape to broadcast to
*/
TensorVariant::TensorVariant(const TensorVariant &t_old, const Shape &shape)
- : _type(t_old.getType().getElementType(), shape), _data(t_old._data),
- _strides(static_cast<size_t>(shape.rank())), _element_size(t_old._element_size)
+ : _type(t_old.getType().getElementType(), shape), _data(t_old._data),
+ _strides(static_cast<size_t>(shape.rank())), _element_size(t_old._element_size)
{
int axis_old = t_old.getShape().rank() - 1;
for (int d = shape.rank() - 1; d >= 0; d--)
diff --git a/compiler/mir/src/mir_caffe2_importer/caffe2_importer.cpp b/compiler/mir/src/mir_caffe2_importer/caffe2_importer.cpp
index 812fcc5cc..abecfc88a 100644
--- a/compiler/mir/src/mir_caffe2_importer/caffe2_importer.cpp
+++ b/compiler/mir/src/mir_caffe2_importer/caffe2_importer.cpp
@@ -99,7 +99,7 @@ using mir::Shape;
Caffe2Importer::Caffe2Importer(std::string predict_net, std::string init_net,
const std::vector<std::vector<int>> &input_shapes)
- : _predictNet(std::move(predict_net)), _initNet(std::move(init_net))
+ : _predictNet(std::move(predict_net)), _initNet(std::move(init_net))
{
for (auto &shape : input_shapes)
_inputShapes.emplace_back(shape);
@@ -308,27 +308,27 @@ void Caffe2Importer::setGraphOutputs()
}
const std::map<std::string, SupportedCaffe2OpType> Caffe2Importer::_operatorTypes = {
- {"Add", SupportedCaffe2OpType::add},
- {"AveragePool", SupportedCaffe2OpType::averagePool},
- {"Conv", SupportedCaffe2OpType::conv},
- {"Concat", SupportedCaffe2OpType::concat},
- {"ConstantFill", SupportedCaffe2OpType::constantFill},
- {"Dropout", SupportedCaffe2OpType::dropout},
- {"FC", SupportedCaffe2OpType::FC},
- {"GivenTensorFill", SupportedCaffe2OpType::givenTensorFill},
- {"MaxPool", SupportedCaffe2OpType::maxPool},
- {"Mul", SupportedCaffe2OpType::mul},
- {"Relu", SupportedCaffe2OpType::relu},
- {"ResizeNearest", SupportedCaffe2OpType::resizeNearest},
- {"Sigmoid", SupportedCaffe2OpType::sigmoid},
- {"Softmax", SupportedCaffe2OpType::softmax},
- {"SpatialBN", SupportedCaffe2OpType::spatialBN},
- {"Sum", SupportedCaffe2OpType::sum},
- {"Clip", SupportedCaffe2OpType::clip},
- {"Reshape", SupportedCaffe2OpType::reshape},
- {"GivenTensorInt64Fill", SupportedCaffe2OpType::givenTensorInt64Fill},
+ {"Add", SupportedCaffe2OpType::add},
+ {"AveragePool", SupportedCaffe2OpType::averagePool},
+ {"Conv", SupportedCaffe2OpType::conv},
+ {"Concat", SupportedCaffe2OpType::concat},
+ {"ConstantFill", SupportedCaffe2OpType::constantFill},
+ {"Dropout", SupportedCaffe2OpType::dropout},
+ {"FC", SupportedCaffe2OpType::FC},
+ {"GivenTensorFill", SupportedCaffe2OpType::givenTensorFill},
+ {"MaxPool", SupportedCaffe2OpType::maxPool},
+ {"Mul", SupportedCaffe2OpType::mul},
+ {"Relu", SupportedCaffe2OpType::relu},
+ {"ResizeNearest", SupportedCaffe2OpType::resizeNearest},
+ {"Sigmoid", SupportedCaffe2OpType::sigmoid},
+ {"Softmax", SupportedCaffe2OpType::softmax},
+ {"SpatialBN", SupportedCaffe2OpType::spatialBN},
+ {"Sum", SupportedCaffe2OpType::sum},
+ {"Clip", SupportedCaffe2OpType::clip},
+ {"Reshape", SupportedCaffe2OpType::reshape},
+ {"GivenTensorInt64Fill", SupportedCaffe2OpType::givenTensorInt64Fill},
};
-}
+} // namespace
namespace mir_caffe2
{
diff --git a/compiler/mir/src/mir_caffe2_importer/caffe2_op_creator.cpp b/compiler/mir/src/mir_caffe2_importer/caffe2_op_creator.cpp
index 3390f4482..de0762dfa 100644
--- a/compiler/mir/src/mir_caffe2_importer/caffe2_op_creator.cpp
+++ b/compiler/mir/src/mir_caffe2_importer/caffe2_op_creator.cpp
@@ -125,7 +125,7 @@ static std::vector<std::int32_t> getWindowSize(const ::caffe2::OperatorDef &op,
{
int is_global_pooling = getSingleArgument(op, "global_pooling", 0);
bool has_custom_kernel_size =
- hasArgument(op.arg(), "kernel_h") || hasArgument(op.arg(), "kernel_w");
+ hasArgument(op.arg(), "kernel_h") || hasArgument(op.arg(), "kernel_w");
bool has_custom_kernels_size = hasArgument(op.arg(), "kernels");
int kernel_h(0), kernel_w(0);
@@ -186,14 +186,13 @@ static void checkConvLikeOp(const ::caffe2::OperatorDef &op)
if (has_custom_pad && hasArgument(op.arg(), "pad"))
throw std::runtime_error("Custom pad can't be combined with overall pad");
- if (has_custom_pad &&
- !(hasArgument(op.arg(), "pad_l") && hasArgument(op.arg(), "pad_r") &&
- hasArgument(op.arg(), "pad_t") && hasArgument(op.arg(), "pad_b")))
+ if (has_custom_pad && !(hasArgument(op.arg(), "pad_l") && hasArgument(op.arg(), "pad_r") &&
+ hasArgument(op.arg(), "pad_t") && hasArgument(op.arg(), "pad_b")))
throw std::runtime_error("If one custom pad specified - all custom pads must be specified");
// Kernel size
bool has_custom_kernel_size =
- hasArgument(op.arg(), "kernel_h") || hasArgument(op.arg(), "kernel_w");
+ hasArgument(op.arg(), "kernel_h") || hasArgument(op.arg(), "kernel_w");
if (has_custom_kernel_size && hasArgument(op.arg(), "kernel"))
throw std::runtime_error("Custom kernel size can't be combined with overall kernel size");
@@ -201,7 +200,7 @@ static void checkConvLikeOp(const ::caffe2::OperatorDef &op)
if (has_custom_kernel_size &&
!(hasArgument(op.arg(), "kernel_h") && hasArgument(op.arg(), "kernel_w")))
throw std::runtime_error(
- "If one custom kernel size specified - all custom kernel sizes must be specified");
+ "If one custom kernel size specified - all custom kernel sizes must be specified");
}
static mir::TensorVariant createTensor(const OperatorDef &op)
@@ -356,7 +355,7 @@ Caffe2OpCreator::convertFC(const std::vector<mir::Operation::Output *> &inputs,
auto reshape = createOp<ops::ReshapeOp>(inputs[0], shape)->getOutput(0);
auto weights =
- createOp<ops::TransposeOp>(inputs[1], std::vector<std::size_t>{1, 0})->getOutput(0);
+ createOp<ops::TransposeOp>(inputs[1], std::vector<std::size_t>{1, 0})->getOutput(0);
auto result = createOp<ops::FullyConnectedOp>(reshape, weights)->getOutput(0);
result = createOp<ops::AddOp>(result, inputs[2])->getOutput(0);
@@ -420,8 +419,8 @@ Caffe2OpCreator::convertResizeNearest(const std::vector<mir::Operation::Output *
scales[2] = getSingleArgument(op, "height_scale", 1.0f);
scales[3] = getSingleArgument(op, "width_scale", 1.0f);
auto result =
- createOp<ops::ResizeOp>(inputs[0], ops::ResizeOp::ResizeMethod::nearestNeighbor, scales)
- ->getOutput(0);
+ createOp<ops::ResizeOp>(inputs[0], ops::ResizeOp::ResizeMethod::nearestNeighbor, scales)
+ ->getOutput(0);
return {result};
}
@@ -450,7 +449,7 @@ Caffe2OpCreator::convertSpatialBN(const std::vector<mir::Operation::Output *> &i
// Sanity checks
if (op.input_size() != 5)
throw std::runtime_error(
- "SpatialBN must have exactly 5 inputs ('sums' and 'sumsq' are not supported yet)");
+ "SpatialBN must have exactly 5 inputs ('sums' and 'sumsq' are not supported yet)");
if (getSingleArgument(op, "is_test", 1) != 1)
throw std::runtime_error("SpatialBN: only test mode supported");
@@ -462,7 +461,7 @@ Caffe2OpCreator::convertSpatialBN(const std::vector<mir::Operation::Output *> &i
auto var_op = dynamic_cast<mir::ops::ConstantOp *>(inputs[4]->getNode());
if (scale_op == nullptr || bias_op == nullptr || mean_op == nullptr || var_op == nullptr)
throw std::runtime_error(
- "SpatialBN: non-constant 'scale', 'bias', 'mean' and 'var' inputs are not supported yet.");
+ "SpatialBN: non-constant 'scale', 'bias', 'mean' and 'var' inputs are not supported yet.");
const auto &scale_tensor = scale_op->getValue();
const auto &bias_tensor = bias_op->getValue();
diff --git a/compiler/mir/src/mir_caffe_importer/caffe_importer.cpp b/compiler/mir/src/mir_caffe_importer/caffe_importer.cpp
index 49f13fbd8..c74658299 100644
--- a/compiler/mir/src/mir_caffe_importer/caffe_importer.cpp
+++ b/compiler/mir/src/mir_caffe_importer/caffe_importer.cpp
@@ -357,66 +357,66 @@ void CaffeImporter::setGraphOutputs(mir::Graph *graph)
}
const std::map<std::string, CaffeOpType> CaffeImporter::_operatorTypes = {
- {"AbsVal", CaffeOpType::absVal},
- {"Accuracy", CaffeOpType::accuracy},
- {"ArgMax", CaffeOpType::argMax},
- {"BatchNorm", CaffeOpType::batchNorm},
- {"BatchReindex", CaffeOpType::batchReindex},
- {"Bias", CaffeOpType::bias},
- {"BNLL", CaffeOpType::BNLL},
- {"Clip", CaffeOpType::clip},
- {"Concat", CaffeOpType::concat},
- {"ContrastiveLoss", CaffeOpType::contrastiveLoss},
- {"Convolution", CaffeOpType::convolution},
- {"Crop", CaffeOpType::crop},
- {"Data", CaffeOpType::data},
- {"Deconvolution", CaffeOpType::deconvolution},
- {"Dropout", CaffeOpType::dropout},
- {"DummyData", CaffeOpType::dummyData},
- {"Eltwise", CaffeOpType::eltwise},
- {"ELU", CaffeOpType::ELU},
- {"Embed", CaffeOpType::embed},
- {"EuclidianLoss", CaffeOpType::euclidianLoss},
- {"Exp", CaffeOpType::exp},
- {"Filter", CaffeOpType::filter},
- {"Flatten", CaffeOpType::flatten},
- {"HDF5Data", CaffeOpType::HDF5Data},
- {"HDF5Output", CaffeOpType::HDF5Output},
- {"HingeLoss", CaffeOpType::hingeLoss},
- {"Im2Col", CaffeOpType::im2Col},
- {"ImageData", CaffeOpType::imageData},
- {"InfogainLoss", CaffeOpType::infogainLoss},
- {"InnerProduct", CaffeOpType::innerProduct},
- {"Input", CaffeOpType::input},
- {"Log", CaffeOpType::log},
- {"LRN", CaffeOpType::LRN},
- {"LSTM", CaffeOpType::LSTM},
- {"MemoryData", CaffeOpType::memoryData},
- {"MultinomialLogisticLoss", CaffeOpType::multinomialLogisticLoss},
- {"MVN", CaffeOpType::MVN},
- {"Parameter", CaffeOpType::parameter},
- {"Pooling", CaffeOpType::pooling},
- {"Power", CaffeOpType::power},
- {"PReLU", CaffeOpType::PReLU},
- {"Python", CaffeOpType::python},
- {"Recurrent", CaffeOpType::recurrent},
- {"Reduction", CaffeOpType::reduction},
- {"ReLU", CaffeOpType::ReLU},
- {"Reshape", CaffeOpType::reshape},
- {"RNN", CaffeOpType::RNN},
- {"Scale", CaffeOpType::scale},
- {"SigmoidCrossEntropyLoss", CaffeOpType::sigmoidCrossEntropyLoss},
- {"Sigmoid", CaffeOpType::sigmoid},
- {"Silence", CaffeOpType::silence},
- {"Softmax", CaffeOpType::softmax},
- {"SoftmaxWithLoss", CaffeOpType::softmaxWithLoss},
- {"SPP", CaffeOpType::SPP},
- {"Split", CaffeOpType::split},
- {"Slice", CaffeOpType::slice},
- {"TanH", CaffeOpType::tanh},
- {"Threshold", CaffeOpType::threshold},
- {"Tile", CaffeOpType::tile},
- {"WindowData", CaffeOpType::windowData}};
+ {"AbsVal", CaffeOpType::absVal},
+ {"Accuracy", CaffeOpType::accuracy},
+ {"ArgMax", CaffeOpType::argMax},
+ {"BatchNorm", CaffeOpType::batchNorm},
+ {"BatchReindex", CaffeOpType::batchReindex},
+ {"Bias", CaffeOpType::bias},
+ {"BNLL", CaffeOpType::BNLL},
+ {"Clip", CaffeOpType::clip},
+ {"Concat", CaffeOpType::concat},
+ {"ContrastiveLoss", CaffeOpType::contrastiveLoss},
+ {"Convolution", CaffeOpType::convolution},
+ {"Crop", CaffeOpType::crop},
+ {"Data", CaffeOpType::data},
+ {"Deconvolution", CaffeOpType::deconvolution},
+ {"Dropout", CaffeOpType::dropout},
+ {"DummyData", CaffeOpType::dummyData},
+ {"Eltwise", CaffeOpType::eltwise},
+ {"ELU", CaffeOpType::ELU},
+ {"Embed", CaffeOpType::embed},
+ {"EuclidianLoss", CaffeOpType::euclidianLoss},
+ {"Exp", CaffeOpType::exp},
+ {"Filter", CaffeOpType::filter},
+ {"Flatten", CaffeOpType::flatten},
+ {"HDF5Data", CaffeOpType::HDF5Data},
+ {"HDF5Output", CaffeOpType::HDF5Output},
+ {"HingeLoss", CaffeOpType::hingeLoss},
+ {"Im2Col", CaffeOpType::im2Col},
+ {"ImageData", CaffeOpType::imageData},
+ {"InfogainLoss", CaffeOpType::infogainLoss},
+ {"InnerProduct", CaffeOpType::innerProduct},
+ {"Input", CaffeOpType::input},
+ {"Log", CaffeOpType::log},
+ {"LRN", CaffeOpType::LRN},
+ {"LSTM", CaffeOpType::LSTM},
+ {"MemoryData", CaffeOpType::memoryData},
+ {"MultinomialLogisticLoss", CaffeOpType::multinomialLogisticLoss},
+ {"MVN", CaffeOpType::MVN},
+ {"Parameter", CaffeOpType::parameter},
+ {"Pooling", CaffeOpType::pooling},
+ {"Power", CaffeOpType::power},
+ {"PReLU", CaffeOpType::PReLU},
+ {"Python", CaffeOpType::python},
+ {"Recurrent", CaffeOpType::recurrent},
+ {"Reduction", CaffeOpType::reduction},
+ {"ReLU", CaffeOpType::ReLU},
+ {"Reshape", CaffeOpType::reshape},
+ {"RNN", CaffeOpType::RNN},
+ {"Scale", CaffeOpType::scale},
+ {"SigmoidCrossEntropyLoss", CaffeOpType::sigmoidCrossEntropyLoss},
+ {"Sigmoid", CaffeOpType::sigmoid},
+ {"Silence", CaffeOpType::silence},
+ {"Softmax", CaffeOpType::softmax},
+ {"SoftmaxWithLoss", CaffeOpType::softmaxWithLoss},
+ {"SPP", CaffeOpType::SPP},
+ {"Split", CaffeOpType::split},
+ {"Slice", CaffeOpType::slice},
+ {"TanH", CaffeOpType::tanh},
+ {"Threshold", CaffeOpType::threshold},
+ {"Tile", CaffeOpType::tile},
+ {"WindowData", CaffeOpType::windowData}};
} // namespace
std::unique_ptr<mir::Graph> importModelFromBinaryFile(const std::string &filename)
diff --git a/compiler/mir/src/mir_caffe_importer/caffe_op_creator.cpp b/compiler/mir/src/mir_caffe_importer/caffe_op_creator.cpp
index 37edc69c4..a2c881b82 100644
--- a/compiler/mir/src/mir_caffe_importer/caffe_op_creator.cpp
+++ b/compiler/mir/src/mir_caffe_importer/caffe_op_creator.cpp
@@ -374,7 +374,7 @@ static void convertPoolingParam(const caffe::PoolingParameter &params,
{
// Assuming NCHW format.
const std::int32_t padded_input =
- input_shape.dim(2 + i) + attributes.padding_before[i] + attributes.padding_after[i];
+ input_shape.dim(2 + i) + attributes.padding_before[i] + attributes.padding_after[i];
if ((padded_input - attributes.window[i]) % attributes.strides[i] != 0)
++attributes.padding_after[i];
}
@@ -449,7 +449,7 @@ CaffeOpCreator::convertSoftmax(const caffe::LayerParameter &layer,
auto input = createOp<ops::TransposeOp>(inputs[0], std::vector<std::size_t>{0, 2, 3, 1});
auto softmax = createOp<ops::SoftmaxOp>(input->getOutput(0), axis);
auto result =
- createOp<ops::TransposeOp>(softmax->getOutput(0), std::vector<std::size_t>{0, 3, 1, 2});
+ createOp<ops::TransposeOp>(softmax->getOutput(0), std::vector<std::size_t>{0, 3, 1, 2});
return {result->getOutput(0)};
}
@@ -823,7 +823,7 @@ CaffeOpCreator::convertLSTM(const caffe::LayerParameter &layer,
c_t = createOp<ops::AddOp>(createOp<ops::MulOp>(c_cont_t, f_t)->getOutput(0),
createOp<ops::MulOp>(i_t, g_t)->getOutput(0))
- ->getOutput(0);
+ ->getOutput(0);
h_t = createOp<ops::MulOp>(createOp<ops::TanhOp>(c_t)->getOutput(0), o_t)->getOutput(0);
h_slices[t] = h_t;
diff --git a/compiler/mir/src/mir_onnx_importer/AttributeHelpers.h b/compiler/mir/src/mir_onnx_importer/AttributeHelpers.h
index 9a93b5b7d..ac1c3cfad 100644
--- a/compiler/mir/src/mir_onnx_importer/AttributeHelpers.h
+++ b/compiler/mir/src/mir_onnx_importer/AttributeHelpers.h
@@ -76,8 +76,8 @@ inline const onnx::AttributeProto *findAttribute(const onnx::NodeProto &node,
{
const auto &attributes = node.attribute();
const auto it = std::find_if(
- attributes.cbegin(), attributes.cend(),
- [&name](const onnx::AttributeProto &attribute) { return attribute.name() == name; });
+ attributes.cbegin(), attributes.cend(),
+ [&name](const onnx::AttributeProto &attribute) { return attribute.name() == name; });
if (it == attributes.cend())
return nullptr;
return &*it;
diff --git a/compiler/mir/src/mir_onnx_importer/ConvPoolHelpers.cpp b/compiler/mir/src/mir_onnx_importer/ConvPoolHelpers.cpp
index d98e6deae..2091968d8 100644
--- a/compiler/mir/src/mir_onnx_importer/ConvPoolHelpers.cpp
+++ b/compiler/mir/src/mir_onnx_importer/ConvPoolHelpers.cpp
@@ -55,7 +55,7 @@ void inferAutoPadding(const std::string &pad_type, const mir::Shape &input_shape
// Assuming input has NCHW format.
const std::int32_t residual = input_shape.dim(2 + i) % strides[i];
const std::int32_t total_pad = std::max(
- INT32_C(0), residual == 0 ? eff_window_size - strides[i] : eff_window_size - residual);
+ INT32_C(0), residual == 0 ? eff_window_size - strides[i] : eff_window_size - residual);
if (pad_type == "SAME_UPPER")
{
padding_before[i] = total_pad / 2;
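
The SAME_UPPER arithmetic reflowed in this hunk is easy to sanity-check outside the importer. A minimal standalone sketch of the same computation (the concrete values and the eff_window_size derivation are my assumptions, not taken from the hunk):

#include <algorithm>
#include <cassert>
#include <cstdint>

int main()
{
  // One spatial dimension: input extent 13, window 3, stride 2, dilation 1.
  const std::int32_t in = 13, window = 3, stride = 2, dilation = 1;
  // Effective window size under dilation, consistent with the name used above.
  const std::int32_t eff_window_size = (window - 1) * dilation + 1;
  const std::int32_t residual = in % stride;
  const std::int32_t total_pad = std::max<std::int32_t>(
    0, residual == 0 ? eff_window_size - stride : eff_window_size - residual);
  // SAME_UPPER: the smaller half goes before the data, the remainder after.
  const std::int32_t before = total_pad / 2;
  const std::int32_t after = total_pad - before;
  assert(before + after == total_pad);
  // With SAME padding the output extent is ceil(in / stride), independent of the window.
  assert((in + total_pad - eff_window_size) / stride + 1 == (in + stride - 1) / stride);
  return 0;
}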
diff --git a/compiler/mir/src/mir_onnx_importer/ONNXHelpers.cpp b/compiler/mir/src/mir_onnx_importer/ONNXHelpers.cpp
index f3a9d182d..77656cf48 100644
--- a/compiler/mir/src/mir_onnx_importer/ONNXHelpers.cpp
+++ b/compiler/mir/src/mir_onnx_importer/ONNXHelpers.cpp
@@ -166,9 +166,9 @@ mir::Operation *foldConstants(mir::Graph *graph, mir::Operation *op)
}
bool is_foldable =
- std::all_of(op->getInputs().begin(), op->getInputs().end(), [](mir::Operation::Output *out) {
- return out->getNode()->getType() == mir::Operation::Type::constant;
- });
+ std::all_of(op->getInputs().begin(), op->getInputs().end(), [](mir::Operation::Output *out) {
+ return out->getNode()->getType() == mir::Operation::Type::constant;
+ });
if (!is_foldable)
return op;
diff --git a/compiler/mir/src/mir_onnx_importer/ONNXImporterImpl.cpp b/compiler/mir/src/mir_onnx_importer/ONNXImporterImpl.cpp
index 8b996244f..6379b6c87 100644
--- a/compiler/mir/src/mir_onnx_importer/ONNXImporterImpl.cpp
+++ b/compiler/mir/src/mir_onnx_importer/ONNXImporterImpl.cpp
@@ -134,7 +134,7 @@ void ONNXImporterImpl::collectUnsupportedOps()
auto opset = _modelCtx->getDomainOpsetVersion(onnx_node.domain());
NodeConverterRegistry::ConverterFunc converter =
- NodeConverterRegistry::getInstance().lookup(op_type, opset);
+ NodeConverterRegistry::getInstance().lookup(op_type, opset);
if (converter == nullptr)
problems_op_set.emplace(op_type, opset);
@@ -176,7 +176,7 @@ void ONNXImporterImpl::createGraphInputs()
}
auto elem_type = onnxDataTypeToMirDataType(
- (onnx::TensorProto_DataType)input.type().tensor_type().elem_type());
+ (onnx::TensorProto_DataType)input.type().tensor_type().elem_type());
mir::TensorType type{elem_type, shape};
auto *op = _graph->create<mir::ops::InputOp>(type);
_converterCtx->setOutput(input.name(), op->getOutput(0));
@@ -199,7 +199,7 @@ std::unique_ptr<mir::Graph> ONNXImporterImpl::createIR()
auto opset = _modelCtx->getDomainOpsetVersion(onnx_node.domain());
// Get converter
NodeConverterRegistry::ConverterFunc converter =
- NodeConverterRegistry::getInstance().lookup(op_type, opset);
+ NodeConverterRegistry::getInstance().lookup(op_type, opset);
assert(converter != nullptr);
converter(onnx_node, _converterCtx.get());
}
diff --git a/compiler/mir/src/mir_onnx_importer/ONNXNodeConverterRegistry.cpp b/compiler/mir/src/mir_onnx_importer/ONNXNodeConverterRegistry.cpp
index a11b18e89..573b41468 100644
--- a/compiler/mir/src/mir_onnx_importer/ONNXNodeConverterRegistry.cpp
+++ b/compiler/mir/src/mir_onnx_importer/ONNXNodeConverterRegistry.cpp
@@ -117,8 +117,8 @@ NodeConverterRegistry::ConverterFunc NodeConverterRegistry::lookup(const std::st
const VersionMap &conv_map = it->second;
auto res = std::lower_bound(
- conv_map.crbegin(), conv_map.crend(), opset,
- [](const VersionMap::value_type &pair, int64_t opset) { return pair.first > opset; });
+ conv_map.crbegin(), conv_map.crend(), opset,
+ [](const VersionMap::value_type &pair, int64_t opset) { return pair.first > opset; });
if (res == conv_map.crend())
{
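
The reversed lower_bound in this hunk selects the newest converter whose registration opset does not exceed the requested one. A self-contained sketch of the same lookup over a toy map (names and values mine):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <map>

int main()
{
  // Converters registered for opsets 1, 7 and 9 (stand-in for VersionMap).
  std::map<std::int64_t, const char *> conv_map{{1, "v1"}, {7, "v7"}, {9, "v9"}};
  const std::int64_t opset = 8;
  // Walking the map newest-first, 'pair.first > opset' holds for 9 and fails at 7,
  // so lower_bound stops at the largest registered opset <= the requested one.
  auto res = std::lower_bound(conv_map.crbegin(), conv_map.crend(), opset,
                              [](const auto &pair, std::int64_t o) { return pair.first > o; });
  assert(res != conv_map.crend() && res->first == 7);
  return 0;
}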
diff --git a/compiler/mir/src/mir_onnx_importer/Op/AveragePool.cpp b/compiler/mir/src/mir_onnx_importer/Op/AveragePool.cpp
index 503feffc8..1ee136ea6 100644
--- a/compiler/mir/src/mir_onnx_importer/Op/AveragePool.cpp
+++ b/compiler/mir/src/mir_onnx_importer/Op/AveragePool.cpp
@@ -40,7 +40,7 @@ void convertAveragePoolV1(const onnx::NodeProto &onnx_node, ConverterContext *co
constexpr int num_spatial_dims = 2;
const auto strides =
- getAttributeValue(onnx_node, "strides", std::vector<std::int32_t>(num_spatial_dims, 1));
+ getAttributeValue(onnx_node, "strides", std::vector<std::int32_t>(num_spatial_dims, 1));
if (strides.size() != num_spatial_dims)
throw std::runtime_error("AveragePool: attribute 'strides' has incorrect size.");
diff --git a/compiler/mir/src/mir_onnx_importer/Op/BatchNormalization.cpp b/compiler/mir/src/mir_onnx_importer/Op/BatchNormalization.cpp
index 8a6d8cc51..c743ee9e0 100644
--- a/compiler/mir/src/mir_onnx_importer/Op/BatchNormalization.cpp
+++ b/compiler/mir/src/mir_onnx_importer/Op/BatchNormalization.cpp
@@ -81,7 +81,7 @@ void convertBatchNormalizationV9(const onnx::NodeProto &onnx_node, ConverterCont
if (scale_op == nullptr || mean_op == nullptr || var_op == nullptr)
throw std::runtime_error(
- "BatchNormalization: only constant 'scale', 'mean' and 'variance' inputs are supported.");
+ "BatchNormalization: only constant 'scale', 'mean' and 'variance' inputs are supported.");
mir::Tensor<float> scale_accessor(scale_op->getValue());
mir::Tensor<float> mean_accessor(mean_op->getValue());
diff --git a/compiler/mir/src/mir_onnx_importer/Op/Conv.cpp b/compiler/mir/src/mir_onnx_importer/Op/Conv.cpp
index 7dc6ce818..7d78826a6 100644
--- a/compiler/mir/src/mir_onnx_importer/Op/Conv.cpp
+++ b/compiler/mir/src/mir_onnx_importer/Op/Conv.cpp
@@ -139,7 +139,7 @@ void convertConvV1(const onnx::NodeProto &onnx_node, ConverterContext *context)
{
auto bias = inputs[2];
bias = createOp<mir::ops::ReshapeOp>(graph, bias, mir::Shape{1, bias->getShape().dim(0), 1, 1})
- ->getOutput(0);
+ ->getOutput(0);
result = createOp<mir::ops::AddOp>(graph, result, bias)->getOutput(0);
}
diff --git a/compiler/mir/src/mir_onnx_importer/Op/ConvTranspose.cpp b/compiler/mir/src/mir_onnx_importer/Op/ConvTranspose.cpp
index 3078a1959..ea0b6fa5e 100644
--- a/compiler/mir/src/mir_onnx_importer/Op/ConvTranspose.cpp
+++ b/compiler/mir/src/mir_onnx_importer/Op/ConvTranspose.cpp
@@ -49,19 +49,19 @@ void convertConvTransposeV1(const onnx::NodeProto &onnx_node, ConverterContext *
constexpr int num_spatial_dims = 2;
const auto dilations =
- getAttributeValue(onnx_node, "dilations", std::vector<std::int32_t>(num_spatial_dims, 1));
+ getAttributeValue(onnx_node, "dilations", std::vector<std::int32_t>(num_spatial_dims, 1));
if (dilations.size() != num_spatial_dims)
throw std::runtime_error("ConvTranspose: attribute 'dilations' has incorrect size.");
if (!std::all_of(dilations.cbegin(), dilations.cend(), [](std::int32_t x) { return x == 1; }))
throw std::runtime_error("ConvTranspose: attribute 'dilations' has unsupported value.");
const auto strides =
- getAttributeValue(onnx_node, "strides", std::vector<std::int32_t>(num_spatial_dims, 1));
+ getAttributeValue(onnx_node, "strides", std::vector<std::int32_t>(num_spatial_dims, 1));
if (strides.size() != num_spatial_dims)
throw std::runtime_error("ConvTranspose: attribute 'strides' has incorrect size.");
- const auto output_padding = getAttributeValue(onnx_node, "output_padding",
- std::vector<std::int32_t>(num_spatial_dims, 0));
+ const auto output_padding =
+ getAttributeValue(onnx_node, "output_padding", std::vector<std::int32_t>(num_spatial_dims, 0));
if (output_padding.size() != num_spatial_dims)
throw std::runtime_error("ConvTranspose: attribute 'output_padding' has incorrect size.");
if (!std::all_of(output_padding.cbegin(), output_padding.cend(),
@@ -71,8 +71,8 @@ void convertConvTransposeV1(const onnx::NodeProto &onnx_node, ConverterContext *
// Assuming kernel has IOHW format.
assert(kernel->getShape().rank() == 4);
const auto kernel_size = getAttributeValue(
- onnx_node, "kernel_shape",
- std::vector<std::int32_t>{kernel->getShape().dim(2), kernel->getShape().dim(3)});
+ onnx_node, "kernel_shape",
+ std::vector<std::int32_t>{kernel->getShape().dim(2), kernel->getShape().dim(3)});
if (kernel_size.size() != num_spatial_dims)
throw std::runtime_error("ConvTranspose: attribute 'kernel_shape' has incorrect size.");
@@ -92,14 +92,14 @@ void convertConvTransposeV1(const onnx::NodeProto &onnx_node, ConverterContext *
attributes.strides = strides;
attributes.data_format = mir::DataFormat::NCHW;
attributes.padding_type = mir::ops::PaddingType::SameUpper;
- result = createOp<mir::ops::DeConv2DOp>(graph, input, kernel, attributes, output_shape)
- ->getOutput(0);
+ result =
+ createOp<mir::ops::DeConv2DOp>(graph, input, kernel, attributes, output_shape)->getOutput(0);
}
else
{
// TODO This code was not tested.
throw std::runtime_error(
- "ConvTranspose: absence of attribute 'output_shape' is not supported.");
+ "ConvTranspose: absence of attribute 'output_shape' is not supported.");
std::vector<std::int32_t> padding_before(num_spatial_dims, 0);
std::vector<std::int32_t> padding_after(num_spatial_dims, 0);
if (const auto *pads_attr = findAttribute(onnx_node, "pads"))
@@ -128,7 +128,7 @@ void convertConvTransposeV1(const onnx::NodeProto &onnx_node, ConverterContext *
{
auto bias = inputs[2];
bias = createOp<mir::ops::ReshapeOp>(graph, bias, mir::Shape{1, bias->getShape().dim(0), 1, 1})
- ->getOutput(0);
+ ->getOutput(0);
result = createOp<mir::ops::AddOp>(graph, result, bias)->getOutput(0);
}
diff --git a/compiler/mir/src/mir_onnx_importer/Op/MaxPool.cpp b/compiler/mir/src/mir_onnx_importer/Op/MaxPool.cpp
index 53e6e1556..6c9ef6621 100644
--- a/compiler/mir/src/mir_onnx_importer/Op/MaxPool.cpp
+++ b/compiler/mir/src/mir_onnx_importer/Op/MaxPool.cpp
@@ -40,7 +40,7 @@ void convertMaxPoolV1(const onnx::NodeProto &onnx_node, ConverterContext *contex
constexpr int num_spatial_dims = 2;
const auto strides =
- getAttributeValue(onnx_node, "strides", std::vector<std::int32_t>(num_spatial_dims, 1));
+ getAttributeValue(onnx_node, "strides", std::vector<std::int32_t>(num_spatial_dims, 1));
if (strides.size() != num_spatial_dims)
throw std::runtime_error("MaxPool: attribute 'strides' has incorrect size.");
diff --git a/compiler/mir/src/mir_onnx_importer/Op/ReduceMean.cpp b/compiler/mir/src/mir_onnx_importer/Op/ReduceMean.cpp
index ec43bffb4..9bfe16282 100644
--- a/compiler/mir/src/mir_onnx_importer/Op/ReduceMean.cpp
+++ b/compiler/mir/src/mir_onnx_importer/Op/ReduceMean.cpp
@@ -52,7 +52,7 @@ void convertReduceMeanV1(const onnx::NodeProto &onnx_node, ConverterContext *con
mir::Graph *graph = context->getGraph();
auto result =
- createOp<mir::ops::ReduceMeanOp>(graph, inputs[0], reduce_dims, keep_dims)->getOutput(0);
+ createOp<mir::ops::ReduceMeanOp>(graph, inputs[0], reduce_dims, keep_dims)->getOutput(0);
context->setNodeOutputs(onnx_node, {result});
}
diff --git a/compiler/mir/src/mir_onnx_importer/Op/Upsample.cpp b/compiler/mir/src/mir_onnx_importer/Op/Upsample.cpp
index 346e22cc2..881ec89d3 100644
--- a/compiler/mir/src/mir_onnx_importer/Op/Upsample.cpp
+++ b/compiler/mir/src/mir_onnx_importer/Op/Upsample.cpp
@@ -52,9 +52,9 @@ void convertUpsampleV1(const onnx::NodeProto &onnx_node, ConverterContext *conte
scales_vector.at(3) = w_scale;
auto result =
- createOp<mir::ops::ResizeOp>(graph, inputs[0],
- mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, scales_vector)
- ->getOutput(0);
+ createOp<mir::ops::ResizeOp>(graph, inputs[0],
+ mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, scales_vector)
+ ->getOutput(0);
context->setNodeOutputs(onnx_node, {result});
}
@@ -74,7 +74,7 @@ void convertUpsampleV7(const onnx::NodeProto &onnx_node, ConverterContext *conte
if (scales_attr->floats_size() != inputs[0]->getShape().rank())
throw std::runtime_error(
- "Number of elements of scales should be the same as the rank of input");
+ "Number of elements of scales should be the same as the rank of input");
assert(inputs[0]->getShape().rank() == 4 && "Only rank 4 is supported");
std::vector<float> scales_vector(4);
@@ -85,9 +85,9 @@ void convertUpsampleV7(const onnx::NodeProto &onnx_node, ConverterContext *conte
scales_vector.at(3) = scales_attr->floats(3);
auto result =
- createOp<mir::ops::ResizeOp>(graph, inputs[0],
- mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, scales_vector)
- ->getOutput(0);
+ createOp<mir::ops::ResizeOp>(graph, inputs[0],
+ mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, scales_vector)
+ ->getOutput(0);
context->setNodeOutputs(onnx_node, {result});
}
@@ -117,9 +117,9 @@ void convertUpsampleV9(const onnx::NodeProto &onnx_node, ConverterContext *conte
scales_vector[i] = scales_tensor.atOffset(i);
auto result =
- createOp<mir::ops::ResizeOp>(graph, inputs[0],
- mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, scales_vector)
- ->getOutput(0);
+ createOp<mir::ops::ResizeOp>(graph, inputs[0],
+ mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, scales_vector)
+ ->getOutput(0);
context->setNodeOutputs(onnx_node, {result});
}
diff --git a/compiler/mir/src/mir_tflite_importer/tflite_importer.cpp b/compiler/mir/src/mir_tflite_importer/tflite_importer.cpp
index 3f245d2d4..7b91bf0ba 100644
--- a/compiler/mir/src/mir_tflite_importer/tflite_importer.cpp
+++ b/compiler/mir/src/mir_tflite_importer/tflite_importer.cpp
@@ -105,37 +105,37 @@ void TfliteImporter::import()
}
static const std::set<tflite::BuiltinOperator> supportedOperators = {
- tflite::BuiltinOperator_ADD,
- tflite::BuiltinOperator_AVERAGE_POOL_2D,
- tflite::BuiltinOperator_CONCATENATION,
- tflite::BuiltinOperator_CONV_2D,
- tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
- tflite::BuiltinOperator_DIV,
- tflite::BuiltinOperator_FULLY_CONNECTED,
- tflite::BuiltinOperator_HARD_SWISH,
- tflite::BuiltinOperator_LEAKY_RELU,
- tflite::BuiltinOperator_LOGISTIC,
- tflite::BuiltinOperator_MAX_POOL_2D,
- tflite::BuiltinOperator_MAXIMUM,
- tflite::BuiltinOperator_MEAN,
- tflite::BuiltinOperator_MUL,
- tflite::BuiltinOperator_PAD,
- tflite::BuiltinOperator_RELU,
- tflite::BuiltinOperator_RELU6,
- tflite::BuiltinOperator_RESHAPE,
- tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
- tflite::BuiltinOperator_RSQRT,
- tflite::BuiltinOperator_SHAPE,
- tflite::BuiltinOperator_SLICE,
- tflite::BuiltinOperator_SOFTMAX,
- tflite::BuiltinOperator_SQRT,
- tflite::BuiltinOperator_SQUARED_DIFFERENCE,
- tflite::BuiltinOperator_SQUEEZE,
- tflite::BuiltinOperator_STRIDED_SLICE,
- tflite::BuiltinOperator_SUB,
- tflite::BuiltinOperator_TANH,
- tflite::BuiltinOperator_TRANSPOSE,
- tflite::BuiltinOperator_TRANSPOSE_CONV,
+ tflite::BuiltinOperator_ADD,
+ tflite::BuiltinOperator_AVERAGE_POOL_2D,
+ tflite::BuiltinOperator_CONCATENATION,
+ tflite::BuiltinOperator_CONV_2D,
+ tflite::BuiltinOperator_DEPTHWISE_CONV_2D,
+ tflite::BuiltinOperator_DIV,
+ tflite::BuiltinOperator_FULLY_CONNECTED,
+ tflite::BuiltinOperator_HARD_SWISH,
+ tflite::BuiltinOperator_LEAKY_RELU,
+ tflite::BuiltinOperator_LOGISTIC,
+ tflite::BuiltinOperator_MAX_POOL_2D,
+ tflite::BuiltinOperator_MAXIMUM,
+ tflite::BuiltinOperator_MEAN,
+ tflite::BuiltinOperator_MUL,
+ tflite::BuiltinOperator_PAD,
+ tflite::BuiltinOperator_RELU,
+ tflite::BuiltinOperator_RELU6,
+ tflite::BuiltinOperator_RESHAPE,
+ tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+ tflite::BuiltinOperator_RSQRT,
+ tflite::BuiltinOperator_SHAPE,
+ tflite::BuiltinOperator_SLICE,
+ tflite::BuiltinOperator_SOFTMAX,
+ tflite::BuiltinOperator_SQRT,
+ tflite::BuiltinOperator_SQUARED_DIFFERENCE,
+ tflite::BuiltinOperator_SQUEEZE,
+ tflite::BuiltinOperator_STRIDED_SLICE,
+ tflite::BuiltinOperator_SUB,
+ tflite::BuiltinOperator_TANH,
+ tflite::BuiltinOperator_TRANSPOSE,
+ tflite::BuiltinOperator_TRANSPOSE_CONV,
};
void TfliteImporter::collectUnsupportedOps()
@@ -268,8 +268,8 @@ void TfliteImporter::walkOperator(const tflite::SubGraphT *subgraph, const tflit
outputs = _opCreator->convertConv2D(op->builtin_options.AsConv2DOptions(), inputs);
break;
case tflite::BuiltinOperator_DEPTHWISE_CONV_2D:
- outputs = _opCreator->convertDepthwiseConv2D(op->builtin_options.AsDepthwiseConv2DOptions(),
- inputs);
+ outputs =
+ _opCreator->convertDepthwiseConv2D(op->builtin_options.AsDepthwiseConv2DOptions(), inputs);
break;
case tflite::BuiltinOperator_MAX_POOL_2D:
outputs = _opCreator->convertMaxPool2D(op->builtin_options.AsPool2DOptions(), inputs);
@@ -279,21 +279,21 @@ void TfliteImporter::walkOperator(const tflite::SubGraphT *subgraph, const tflit
break;
case tflite::BuiltinOperator_CONCATENATION:
outputs =
- _opCreator->convertConcatenation(op->builtin_options.AsConcatenationOptions(), inputs);
+ _opCreator->convertConcatenation(op->builtin_options.AsConcatenationOptions(), inputs);
break;
case tflite::BuiltinOperator_RESHAPE:
outputs = _opCreator->convertReshape(op->builtin_options.AsReshapeOptions(), inputs);
break;
case tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR:
outputs = _opCreator->convertResizeNearestNeighbor(
- op->builtin_options.AsResizeNearestNeighborOptions(), inputs);
+ op->builtin_options.AsResizeNearestNeighborOptions(), inputs);
break;
case tflite::BuiltinOperator_MEAN:
outputs = _opCreator->convertMean(op->builtin_options.AsReducerOptions(), inputs);
break;
case tflite::BuiltinOperator_FULLY_CONNECTED:
outputs =
- _opCreator->convertFullyConnected(op->builtin_options.AsFullyConnectedOptions(), inputs);
+ _opCreator->convertFullyConnected(op->builtin_options.AsFullyConnectedOptions(), inputs);
break;
case tflite::BuiltinOperator_SOFTMAX:
outputs = _opCreator->convertSoftmax(op->builtin_options.AsSoftmaxOptions(), inputs);
@@ -333,7 +333,7 @@ void TfliteImporter::walkOperator(const tflite::SubGraphT *subgraph, const tflit
break;
case tflite::BuiltinOperator_TRANSPOSE_CONV:
outputs =
- _opCreator->convertTransposeConv(op->builtin_options.AsTransposeConvOptions(), inputs);
+ _opCreator->convertTransposeConv(op->builtin_options.AsTransposeConvOptions(), inputs);
break;
case tflite::BuiltinOperator_PAD:
outputs = _opCreator->convertPad(op->builtin_options.AsPadOptions(), inputs);
@@ -352,7 +352,7 @@ void TfliteImporter::walkOperator(const tflite::SubGraphT *subgraph, const tflit
break;
case tflite::BuiltinOperator_STRIDED_SLICE:
outputs =
- _opCreator->convertStridedSlice(op->builtin_options.AsStridedSliceOptions(), inputs);
+ _opCreator->convertStridedSlice(op->builtin_options.AsStridedSliceOptions(), inputs);
break;
case tflite::BuiltinOperator_LEAKY_RELU:
outputs = _opCreator->convertLeakyReLU(op->builtin_options.AsLeakyReluOptions(), inputs);
diff --git a/compiler/mir/src/mir_tflite_importer/tflite_op_creator.cpp b/compiler/mir/src/mir_tflite_importer/tflite_op_creator.cpp
index d9f98da55..58425e9a9 100644
--- a/compiler/mir/src/mir_tflite_importer/tflite_op_creator.cpp
+++ b/compiler/mir/src/mir_tflite_importer/tflite_op_creator.cpp
@@ -92,9 +92,9 @@ static void calculatePadding(mir::ops::PaddingType padding_type, const mir::Shap
{
// Assuming NHWC format.
const std::int32_t total_padding =
- (input_shape.dim(1 + i) % strides[i] == 0)
- ? std::max(0, window_size[i] - strides[i])
- : std::max(0, window_size[i] - input_shape.dim(1 + i) % strides[i]);
+ (input_shape.dim(1 + i) % strides[i] == 0)
+ ? std::max(0, window_size[i] - strides[i])
+ : std::max(0, window_size[i] - input_shape.dim(1 + i) % strides[i]);
padding_before[i] = total_padding / 2;
padding_after[i] = total_padding - padding_before[i];
}
@@ -332,7 +332,7 @@ TFLiteOpCreator::convertResizeNearestNeighbor(const tflite::ResizeNearestNeighbo
Shape res_shape{input_shape.dim(0), size_tensor.at(mir::Index{0}), size_tensor.at(mir::Index{1}),
input_shape.dim(3)};
auto result =
- createOp<ops::ResizeOp>(input, ops::ResizeOp::ResizeMethod::nearestNeighbor, res_shape);
+ createOp<ops::ResizeOp>(input, ops::ResizeOp::ResizeMethod::nearestNeighbor, res_shape);
return {result->getOutput(0)};
}
diff --git a/compiler/mir/src/ops/AvgPool2DOp.cpp b/compiler/mir/src/ops/AvgPool2DOp.cpp
index 52b67303f..945917208 100644
--- a/compiler/mir/src/ops/AvgPool2DOp.cpp
+++ b/compiler/mir/src/ops/AvgPool2DOp.cpp
@@ -50,7 +50,7 @@ void AvgPool2DOp::inferOutputTypes()
// (in_size - window_size + 1 + stride - 1) / stride =
// (in_size - window_size) / stride + 1
output_shape.dim(spatial_dim_index) =
- (padded_input - _attributes.window[i]) / _attributes.strides[i] + 1;
+ (padded_input - _attributes.window[i]) / _attributes.strides[i] + 1;
}
setOutputType(0, {getInput(0)->getElementType(), output_shape});
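
The comment in this hunk compresses a small derivation: counting the valid window positions directly gives the same number as the closed form (in_size - window_size) / stride + 1. A minimal check (values mine):

#include <cassert>

int main()
{
  // Padded extent 7, window 3, stride 2: valid window positions are 0, 2, 4.
  const int padded_input = 7, window = 3, stride = 2;
  int count = 0;
  for (int pos = 0; pos + window <= padded_input; pos += stride)
    ++count;
  // Matches the closed form from the comment above.
  assert(count == (padded_input - window) / stride + 1);
  return 0;
}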
diff --git a/compiler/mir/src/ops/Conv2DOp.cpp b/compiler/mir/src/ops/Conv2DOp.cpp
index 1addc5734..1de73b62d 100644
--- a/compiler/mir/src/ops/Conv2DOp.cpp
+++ b/compiler/mir/src/ops/Conv2DOp.cpp
@@ -54,7 +54,7 @@ void Conv2DOp::inferOutputTypes()
// (in_size - kernel_size + 1 + stride - 1) / stride =
// (in_size - kernel_size) / stride + 1
output_shape.dim(spatial_dim_index) =
- (padded_input - kernel_shape.dim(1 + i)) / _attributes.strides[i] + 1;
+ (padded_input - kernel_shape.dim(1 + i)) / _attributes.strides[i] + 1;
}
auto dt = getInput(0)->getElementType();
diff --git a/compiler/mir/src/ops/DeConv2DOp.cpp b/compiler/mir/src/ops/DeConv2DOp.cpp
index 35b111bc0..08829d327 100644
--- a/compiler/mir/src/ops/DeConv2DOp.cpp
+++ b/compiler/mir/src/ops/DeConv2DOp.cpp
@@ -36,8 +36,8 @@ void DeConv2DOp::inferPaddings()
{
const int spatial_dim_index = getDataSpatialDimIndex(_attributes.data_format, i);
const std::int32_t total_padding =
- (input_shape.dim(spatial_dim_index) - 1) * _attributes.strides[i] + kernel_shape.dim(i) -
- output_shape.dim(spatial_dim_index);
+ (input_shape.dim(spatial_dim_index) - 1) * _attributes.strides[i] + kernel_shape.dim(i) -
+ output_shape.dim(spatial_dim_index);
switch (_attributes.padding_type)
{
@@ -85,8 +85,8 @@ void DeConv2DOp::inferOutputTypes()
{
const int spatial_dim_index = getDataSpatialDimIndex(_attributes.data_format, i);
output_shape.dim(spatial_dim_index) =
- (input_shape.dim(spatial_dim_index) - 1) * _attributes.strides[i] + kernel_shape.dim(i) -
- (_attributes.padding_before.at(i) + _attributes.padding_after.at(i));
+ (input_shape.dim(spatial_dim_index) - 1) * _attributes.strides[i] + kernel_shape.dim(i) -
+ (_attributes.padding_before.at(i) + _attributes.padding_after.at(i));
}
setOutputType(0, {getInput(0)->getElementType(), output_shape});
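
The reflowed expression is the inverse of the convolution output-size formula: a transposed convolution maps in_size to (in_size - 1) * stride + kernel_size minus the total padding. A small numeric round-trip check (values mine):

#include <cassert>

int main()
{
  const int in = 7, stride = 2, kernel = 3, pad_before = 1, pad_after = 1;
  const int out = (in - 1) * stride + kernel - (pad_before + pad_after);
  // A forward convolution of a 13-wide input with the same parameters yields 7,
  // so the two formulas round-trip.
  assert(out == 13);
  assert((out + pad_before + pad_after - kernel) / stride + 1 == in);
  return 0;
}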
diff --git a/compiler/mir/src/ops/DepthwiseConv2DOp.cpp b/compiler/mir/src/ops/DepthwiseConv2DOp.cpp
index 0154bcd09..521d2eb49 100644
--- a/compiler/mir/src/ops/DepthwiseConv2DOp.cpp
+++ b/compiler/mir/src/ops/DepthwiseConv2DOp.cpp
@@ -50,7 +50,7 @@ void DepthwiseConv2DOp::inferOutputTypes()
// (in_size - kernel_size + 1 + stride - 1) / stride =
// (in_size - kernel_size) / stride + 1
output_shape.dim(spatial_dim_index) =
- (padded_input - kernel_shape.dim(i)) / _attributes.strides[i] + 1;
+ (padded_input - kernel_shape.dim(i)) / _attributes.strides[i] + 1;
}
setOutputType(0, {getInput(0)->getElementType(), output_shape});
diff --git a/compiler/mir/src/ops/MaxPool2DOp.cpp b/compiler/mir/src/ops/MaxPool2DOp.cpp
index 38e72424e..0cb3aa93c 100644
--- a/compiler/mir/src/ops/MaxPool2DOp.cpp
+++ b/compiler/mir/src/ops/MaxPool2DOp.cpp
@@ -50,7 +50,7 @@ void MaxPool2DOp::inferOutputTypes()
// (in_size - window_size + 1 + stride - 1) / stride =
// (in_size - window_size) / stride + 1
output_shape.dim(spatial_dim_index) =
- (padded_input - _attributes.window[i]) / _attributes.strides[i] + 1;
+ (padded_input - _attributes.window[i]) / _attributes.strides[i] + 1;
}
setOutputType(0, {getInput(0)->getElementType(), output_shape});
diff --git a/compiler/mir/src/ops/PadOp.cpp b/compiler/mir/src/ops/PadOp.cpp
index 465856d92..38feaccdc 100644
--- a/compiler/mir/src/ops/PadOp.cpp
+++ b/compiler/mir/src/ops/PadOp.cpp
@@ -30,7 +30,7 @@ void PadOp::inferOutputTypes()
for (int32_t dim = 0; dim < num_dims; ++dim)
{
out_shape.dim(dim) =
- _attributes.padding_before[dim] + input_shape.dim(dim) + _attributes.padding_after[dim];
+ _attributes.padding_before[dim] + input_shape.dim(dim) + _attributes.padding_after[dim];
}
setOutputType(0, {getInput(0)->getElementType(), out_shape});
diff --git a/compiler/mir/src/ops/TransposeOp.cpp b/compiler/mir/src/ops/TransposeOp.cpp
index 92282e17d..d04cdb4f2 100644
--- a/compiler/mir/src/ops/TransposeOp.cpp
+++ b/compiler/mir/src/ops/TransposeOp.cpp
@@ -22,7 +22,7 @@ namespace ops
{
TransposeOp::TransposeOp(Output *arg, const std::vector<std::size_t> &axis_order)
- : Operation(Type::transpose, {arg}), _axis_order(axis_order)
+ : Operation(Type::transpose, {arg}), _axis_order(axis_order)
{
assert(_axis_order.size() == static_cast<std::size_t>(getInputShape(0).rank()));
inferOutputTypes();
@@ -34,7 +34,7 @@ void TransposeOp::inferOutputTypes()
Shape output_shape(input_shape.rank());
for (std::size_t i = 0; i < _axis_order.size(); ++i)
output_shape.dim(static_cast<std::int64_t>(i)) =
- input_shape.dim(static_cast<int32_t>(_axis_order.at(i)));
+ input_shape.dim(static_cast<int32_t>(_axis_order.at(i)));
setOutputType(0, {getInput(0)->getElementType(), output_shape});
}
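
The loop above implements output_shape[i] = input_shape[axis_order[i]]. With the {3, 0, 1, 2} order that also appears in the Transpose_Test hunk further down, a {2, 7, 9, 5} input becomes {5, 2, 7, 9}; a minimal sketch with std:: containers in place of mir::Shape:

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
  const std::vector<int> input_shape{2, 7, 9, 5};
  const std::vector<std::size_t> axis_order{3, 0, 1, 2};
  std::vector<int> output_shape(input_shape.size());
  for (std::size_t i = 0; i < axis_order.size(); ++i)
    output_shape[i] = input_shape[axis_order[i]]; // same rule as TransposeOp::inferOutputTypes
  assert((output_shape == std::vector<int>{5, 2, 7, 9}));
  return 0;
}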
diff --git a/compiler/mir/unittests/ShapeInference.cpp b/compiler/mir/unittests/ShapeInference.cpp
index bae4ec5e2..c902b1e12 100644
--- a/compiler/mir/unittests/ShapeInference.cpp
+++ b/compiler/mir/unittests/ShapeInference.cpp
@@ -80,8 +80,8 @@ TEST(ShapeInferenceTest, ResizeWithScale)
auto input = g.create<ops::InputOp>(input_type);
auto op =
- g.create<ops::ResizeOp>(input->getOutput(0), ops::ResizeOp::ResizeMethod::nearestNeighbor,
- std::vector<float>{1, 6, 2, 1});
+ g.create<ops::ResizeOp>(input->getOutput(0), ops::ResizeOp::ResizeMethod::nearestNeighbor,
+ std::vector<float>{1, 6, 2, 1});
ASSERT_EQ(result_shape, op->getOutputShape(0));
}
diff --git a/compiler/mir/unittests/ShapeRange.cpp b/compiler/mir/unittests/ShapeRange.cpp
index 3b32d0c61..3797e3ccc 100644
--- a/compiler/mir/unittests/ShapeRange.cpp
+++ b/compiler/mir/unittests/ShapeRange.cpp
@@ -29,7 +29,7 @@ struct ParamType
template <typename... Args>
explicit ParamType(int32_t actual_len, Args &&... args)
- : actual_length(actual_len), shape({static_cast<int32_t>(args)...})
+ : actual_length(actual_len), shape({static_cast<int32_t>(args)...})
{
}
};
diff --git a/compiler/mir2loco/src/mir2loco.test.cpp b/compiler/mir2loco/src/mir2loco.test.cpp
index 3870caeb5..92ab99488 100644
--- a/compiler/mir2loco/src/mir2loco.test.cpp
+++ b/compiler/mir2loco/src/mir2loco.test.cpp
@@ -140,10 +140,10 @@ TEST_F(TestTransformer_mir2loco, Avg_Pool_Test)
loco::Pull *pull_node = dynamic_cast<loco::Pull *>(loco_graph->nodes()->at(0));
loco::FeatureEncode *encode_node =
- dynamic_cast<loco::FeatureEncode *>(loco_graph->nodes()->at(1));
+ dynamic_cast<loco::FeatureEncode *>(loco_graph->nodes()->at(1));
loco::AvgPool2D *pool_node = dynamic_cast<loco::AvgPool2D *>(loco_graph->nodes()->at(2));
loco::FeatureDecode *decode_node =
- dynamic_cast<loco::FeatureDecode *>(loco_graph->nodes()->at(3));
+ dynamic_cast<loco::FeatureDecode *>(loco_graph->nodes()->at(3));
loco::Push *push_node = dynamic_cast<loco::Push *>(loco_graph->nodes()->at(4));
ASSERT_NE(pull_node, nullptr);
@@ -188,10 +188,10 @@ TEST_F(TestTransformer_mir2loco, Max_Pool_Test)
loco::Pull *pull_node = dynamic_cast<loco::Pull *>(loco_graph->nodes()->at(0));
loco::FeatureEncode *encode_node =
- dynamic_cast<loco::FeatureEncode *>(loco_graph->nodes()->at(1));
+ dynamic_cast<loco::FeatureEncode *>(loco_graph->nodes()->at(1));
loco::MaxPool2D *pool_node = dynamic_cast<loco::MaxPool2D *>(loco_graph->nodes()->at(2));
loco::FeatureDecode *decode_node =
- dynamic_cast<loco::FeatureDecode *>(loco_graph->nodes()->at(3));
+ dynamic_cast<loco::FeatureDecode *>(loco_graph->nodes()->at(3));
loco::Push *push_node = dynamic_cast<loco::Push *>(loco_graph->nodes()->at(4));
ASSERT_NE(pull_node, nullptr);
@@ -273,7 +273,7 @@ TEST_F(TestTransformer_mir2loco, Reshape_Test)
loco::Pull *pull_node = dynamic_cast<loco::Pull *>(loco_graph->nodes()->at(0));
loco::Reshape<loco::ReshapeType::Fixed> *reshape_node =
- dynamic_cast<loco::Reshape<loco::ReshapeType::Fixed> *>(loco_graph->nodes()->at(1));
+ dynamic_cast<loco::Reshape<loco::ReshapeType::Fixed> *>(loco_graph->nodes()->at(1));
loco::Push *push_node = dynamic_cast<loco::Push *>(loco_graph->nodes()->at(2));
ASSERT_NE(pull_node, nullptr);
@@ -385,11 +385,11 @@ TEST_F(TestTransformer_mir2loco, Conv2D_Test)
loco::Pull *pull_node = dynamic_cast<loco::Pull *>(loco_graph->nodes()->at(0));
loco::ConstGen *const_node = dynamic_cast<loco::ConstGen *>(loco_graph->nodes()->at(1));
loco::FeatureEncode *encode_node =
- dynamic_cast<loco::FeatureEncode *>(loco_graph->nodes()->at(2));
+ dynamic_cast<loco::FeatureEncode *>(loco_graph->nodes()->at(2));
loco::FilterEncode *filter_node = dynamic_cast<loco::FilterEncode *>(loco_graph->nodes()->at(3));
loco::Conv2D *conv_node = dynamic_cast<loco::Conv2D *>(loco_graph->nodes()->at(4));
loco::FeatureDecode *decode_node =
- dynamic_cast<loco::FeatureDecode *>(loco_graph->nodes()->at(5));
+ dynamic_cast<loco::FeatureDecode *>(loco_graph->nodes()->at(5));
loco::Push *push_node = dynamic_cast<loco::Push *>(loco_graph->nodes()->at(6));
ASSERT_NE(pull_node, nullptr);
@@ -430,7 +430,7 @@ TEST_F(TestTransformer_mir2loco, Softmax_Test)
loco::Pull *pull_node = dynamic_cast<loco::Pull *>(loco_graph->nodes()->at(0));
loco::TensorSoftmax *softmax_node =
- dynamic_cast<loco::TensorSoftmax *>(loco_graph->nodes()->at(1));
+ dynamic_cast<loco::TensorSoftmax *>(loco_graph->nodes()->at(1));
loco::Push *push_node = dynamic_cast<loco::Push *>(loco_graph->nodes()->at(2));
ASSERT_NE(pull_node, nullptr);
@@ -520,7 +520,7 @@ TEST_F(TestTransformer_mir2loco, DepthwiseConv2D_Test)
attributes.padding_after = {7, 4};
auto *conv =
- mir_graph.create<mir::ops::DepthwiseConv2DOp>(input, filter, attributes)->getOutput(0);
+ mir_graph.create<mir::ops::DepthwiseConv2DOp>(input, filter, attributes)->getOutput(0);
mir_graph.create<mir::ops::OutputOp>(conv);
input->setName("x");
@@ -545,7 +545,7 @@ TEST_F(TestTransformer_mir2loco, DepthwiseConv2D_Test)
loco::DepthwiseConv2D *dw_conv_node = dynamic_cast<loco::DepthwiseConv2D *>(*encode_uses.begin());
ASSERT_NE(dw_conv_node, nullptr);
loco::DepthwiseFilterEncode *filter_node =
- dynamic_cast<loco::DepthwiseFilterEncode *>(dw_conv_node->ker());
+ dynamic_cast<loco::DepthwiseFilterEncode *>(dw_conv_node->ker());
ASSERT_NE(filter_node, nullptr);
ASSERT_EQ(dw_conv_node->ifm(), encode_node);
// Check params
@@ -611,7 +611,7 @@ TEST_F(TestTransformer_mir2loco, DeConv2D_Test)
auto encode_uses = loco::succs(encode_node);
ASSERT_EQ(encode_uses.size(), 1);
loco::TransposedConv2D *tr_conv_node =
- dynamic_cast<loco::TransposedConv2D *>(*encode_uses.begin());
+ dynamic_cast<loco::TransposedConv2D *>(*encode_uses.begin());
ASSERT_NE(tr_conv_node, nullptr);
loco::FilterEncode *filter_node = dynamic_cast<loco::FilterEncode *>(tr_conv_node->ker());
ASSERT_NE(filter_node, nullptr);
@@ -703,8 +703,8 @@ TEST_F(TestTransformer_mir2loco, Transpose_Test)
mir::TensorType input_type{mir::DataType::FLOAT32, {2, 7, 9, 5}};
auto *input = mir_graph.create<mir::ops::InputOp>(input_type)->getOutput(0);
auto *transpose =
- mir_graph.create<mir::ops::TransposeOp>(input, std::vector<std::size_t>{3, 0, 1, 2})
- ->getOutput(0);
+ mir_graph.create<mir::ops::TransposeOp>(input, std::vector<std::size_t>{3, 0, 1, 2})
+ ->getOutput(0);
mir_graph.create<mir::ops::OutputOp>(transpose);
input->setName("x");
transpose->setName("y");
diff --git a/compiler/moco-log/CMakeLists.txt b/compiler/moco-log/CMakeLists.txt
index 036b4e74b..af6052d0c 100644
--- a/compiler/moco-log/CMakeLists.txt
+++ b/compiler/moco-log/CMakeLists.txt
@@ -5,5 +5,4 @@ add_library(moco_log SHARED ${SOURCES})
target_include_directories(moco_log PUBLIC include)
target_link_libraries(moco_log PUBLIC hermes)
target_link_libraries(moco_log PRIVATE hermes_std)
-target_link_libraries(moco_log PRIVATE stdex)
install(TARGETS moco_log DESTINATION lib)
diff --git a/compiler/moco-log/src/LoggingContext.cpp b/compiler/moco-log/src/LoggingContext.cpp
index a004e1d3d..c75e5e21f 100644
--- a/compiler/moco-log/src/LoggingContext.cpp
+++ b/compiler/moco-log/src/LoggingContext.cpp
@@ -18,7 +18,8 @@
#include "moco/Log.h"
#include <hermes/ConsoleReporter.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace moco
{
@@ -30,8 +31,8 @@ hermes::Context *LoggingContext::get(void)
if (ctx == nullptr)
{
ctx = new hermes::Context;
- ctx->sinks()->append(stdex::make_unique<hermes::ConsoleReporter>());
- ctx->config(stdex::make_unique<LoggerConfig>());
+ ctx->sinks()->append(std::make_unique<hermes::ConsoleReporter>());
+ ctx->config(std::make_unique<LoggerConfig>());
}
return ctx;
diff --git a/compiler/moco-tf/CMakeLists.txt b/compiler/moco-tf/CMakeLists.txt
index 5516388a4..7c42761ba 100644
--- a/compiler/moco-tf/CMakeLists.txt
+++ b/compiler/moco-tf/CMakeLists.txt
@@ -19,7 +19,6 @@ target_link_libraries(moco_tf_frontend PRIVATE moco_support)
target_link_libraries(moco_tf_frontend PRIVATE bino)
target_link_libraries(moco_tf_frontend PRIVATE fipe)
target_link_libraries(moco_tf_frontend PRIVATE locop)
-target_link_libraries(moco_tf_frontend PRIVATE stdex)
target_link_libraries(moco_tf_frontend PRIVATE moco_log)
target_link_libraries(moco_tf_frontend PRIVATE pepper_str)
target_link_libraries(moco_tf_frontend PRIVATE pepper_strcast)
@@ -44,7 +43,6 @@ target_link_libraries(moco_tf_frontend_test fipe)
target_link_libraries(moco_tf_frontend_test locop)
target_link_libraries(moco_tf_frontend_test moco_log)
target_link_libraries(moco_tf_frontend_test moco_tf_frontend)
-target_link_libraries(moco_tf_frontend_test stdex)
target_link_libraries(moco_tf_frontend_test plier_tf)
target_link_libraries(moco_tf_frontend_test locoex_customop)
target_link_libraries(moco_tf_frontend_test logo)
diff --git a/compiler/moco-tf/requires.cmake b/compiler/moco-tf/requires.cmake
index 3e0fabee9..90590e374 100644
--- a/compiler/moco-tf/requires.cmake
+++ b/compiler/moco-tf/requires.cmake
@@ -2,7 +2,6 @@ require("fipe")
require("loco")
require("moco")
require("locop")
-require("stdex")
require("moco-log")
require("pepper-strcast")
require("locomotiv")
diff --git a/compiler/moco-tf/src/BroadcastHelper.h b/compiler/moco-tf/src/BroadcastHelper.h
index 6238ad269..d4e1bba55 100644
--- a/compiler/moco-tf/src/BroadcastHelper.h
+++ b/compiler/moco-tf/src/BroadcastHelper.h
@@ -65,7 +65,7 @@ private:
* This mimics "tf.broadcast_to" API in TensorFlow.
*/
static inline auto broadcast_to(const loco::TensorShape &shape)
- -> decltype(bino::transform_both(std::declval<BroadcastFunctor>()))
+ -> decltype(bino::transform_both(std::declval<BroadcastFunctor>()))
{
return bino::transform_both(BroadcastFunctor{shape});
}
diff --git a/compiler/moco-tf/src/Canonicalization/ConcatV2Canonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/ConcatV2Canonicalizer.cpp
index b59a3f3d7..71f6230b7 100644
--- a/compiler/moco-tf/src/Canonicalization/ConcatV2Canonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/ConcatV2Canonicalizer.cpp
@@ -24,7 +24,6 @@
#include <loco/Service/ShapeInference.h>
-#include <stdex/Memory.h>
#include <oops/UserExn.h>
namespace
diff --git a/compiler/moco-tf/src/Canonicalization/Conv2DBackpropInputCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/Conv2DBackpropInputCanonicalizer.cpp
index d3cbd4ab3..1d3343933 100644
--- a/compiler/moco-tf/src/Canonicalization/Conv2DBackpropInputCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/Conv2DBackpropInputCanonicalizer.cpp
@@ -32,7 +32,7 @@ using plier::tf::DataLayout;
void set_filter_enc(loco::FilterEncode *filter_enc)
{
- auto enc = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
+ auto enc = std::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
// In TensorFlow, Conv2dBackpropInput's filter is a 4-D tensor of following shape:
// [filter_height, filter_width, out_channels, in_channels] or HWOI or HWNC (in/out in loco sense)
@@ -163,9 +163,9 @@ loco::Padding2D Padding2DInference::operator()(void)
// 'tight fit' output. When output size (set by 'input sizes' node input) is
// larger than tight fit, extra spaces filled with zero.
auto tight_output_vertical = tight_output_for_valid_padding(
- input().vertical.value(), stride().vertical(), window().vertical());
+ input().vertical.value(), stride().vertical(), window().vertical());
auto tight_output_horizontal = tight_output_for_valid_padding(
- input().horizontal.value(), stride().horizontal(), window().horizontal());
+ input().horizontal.value(), stride().horizontal(), window().horizontal());
if (output().vertical.value() < tight_output_vertical or
output().horizontal.value() < tight_output_horizontal)
@@ -191,8 +191,8 @@ loco::Padding2D Padding2DInference::operator()(void)
auto whole_pad_vertical = padding_needed(input().vertical.value(), output().vertical.value(),
stride().vertical(), window().vertical());
auto whole_pad_horizontal =
- padding_needed(input().horizontal.value(), output().horizontal.value(),
- stride().horizontal(), window().horizontal());
+ padding_needed(input().horizontal.value(), output().horizontal.value(), stride().horizontal(),
+ window().horizontal());
loco::Padding2D res;
diff --git a/compiler/moco-tf/src/Canonicalization/Conv2DCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/Conv2DCanonicalizer.cpp
index a955793a8..30f01cdd3 100644
--- a/compiler/moco-tf/src/Canonicalization/Conv2DCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/Conv2DCanonicalizer.cpp
@@ -29,7 +29,7 @@ using plier::tf::DataLayout;
void set_filter_enc(loco::FilterEncode *filter_enc)
{
- auto enc = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
+ auto enc = std::make_unique<loco::PermutingEncoder<loco::Domain::Filter>>();
// In TensorFlow, conv2d filter is a 4-D tensor of following shape:
// [filter_height, filter_width, in_channels, out_channels] -> HWIO (HWCN)
diff --git a/compiler/moco-tf/src/Canonicalization/DepthwiseConv2dNativeCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/DepthwiseConv2dNativeCanonicalizer.cpp
index 50dddf637..dd04c2427 100644
--- a/compiler/moco-tf/src/Canonicalization/DepthwiseConv2dNativeCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/DepthwiseConv2dNativeCanonicalizer.cpp
@@ -30,7 +30,7 @@ using plier::tf::DataLayout;
void set_filter_enc(loco::DepthwiseFilterEncode *filter_enc)
{
- auto enc = stdex::make_unique<loco::PermutingEncoder<loco::Domain::DepthwiseFilter>>();
+ auto enc = std::make_unique<loco::PermutingEncoder<loco::Domain::DepthwiseFilter>>();
// In TensorFlow, depthwiseconv2dnative filter is a 4-D tensor of following shape:
// [filter_height, filter_width, in_channels, channel_multiplier] -> HWCM
@@ -47,28 +47,28 @@ bool canonicalize_depthwiseconv2dnative(loco::Graph *graph, moco::TFDepthwiseCon
LOGGER(l);
/**
- * @note This will replace TFDepthwiseConv2dNative node with Canonical FeatureEncode +
- *       DepthwiseFilterEncode + DepthwiseConv2D + FeatureDecode
- *
- * Before
- *           A -+- TFDepthwiseConv2dNative - C
- *              |
- *           B -+
- *
- * After
- *
- *           A -+ FeatureEncode ----------------+- DepthwiseConv2D - FeatureDecode - C
- *              |                               |
- *              +-(TFDepthwiseConv2dNative)     |
- *              |                               |
- *           B -+ DepthwiseFilterEncode --------+
- *
- * Where
- *           A : ifm of TFDepthwiseConv2dNative
- *           B : ker of TFDepthwiseConv2dNative
- *           C : a node that uses TFDepthwiseConv2dNative as an input
- *           TFDepthwiseConv2dNative is disconnected from other nodes
- */
+ * @note This will replace TFDepthwiseConv2dNative node with Canonical FeatureEncode +
+ *       DepthwiseFilterEncode + DepthwiseConv2D + FeatureDecode
+ *
+ * Before
+ *           A -+- TFDepthwiseConv2dNative - C
+ *              |
+ *           B -+
+ *
+ * After
+ *
+ *           A -+ FeatureEncode ----------------+- DepthwiseConv2D - FeatureDecode - C
+ *              |                               |
+ *              +-(TFDepthwiseConv2dNative)     |
+ *              |                               |
+ *           B -+ DepthwiseFilterEncode --------+
+ *
+ * Where
+ *           A : ifm of TFDepthwiseConv2dNative
+ *           B : ker of TFDepthwiseConv2dNative
+ *           C : a node that uses TFDepthwiseConv2dNative as an input
+ *           TFDepthwiseConv2dNative is disconnected from other nodes
+ */
INFO(l) << "TFNodeCanonicalize TFDepthwiseConv2dNative begin";
diff --git a/compiler/moco-tf/src/Canonicalization/PadCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/PadCanonicalizer.cpp
index 36136aed4..28ecc3fc0 100644
--- a/compiler/moco-tf/src/Canonicalization/PadCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/PadCanonicalizer.cpp
@@ -20,8 +20,6 @@
#include "loco/Service/TypeInference.h"
-#include <stdex/Memory.h>
-
namespace
{
diff --git a/compiler/moco-tf/src/Canonicalization/Relu6Canonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/Relu6Canonicalizer.cpp
index c53a880a8..1179ef7f6 100644
--- a/compiler/moco-tf/src/Canonicalization/Relu6Canonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/Relu6Canonicalizer.cpp
@@ -18,8 +18,6 @@
#include <moco/IR/TFDialect.h>
-#include <stdex/Memory.h>
-
namespace
{
diff --git a/compiler/moco-tf/src/Canonicalization/ReluCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/ReluCanonicalizer.cpp
index 7965dc931..bb2a71bc0 100644
--- a/compiler/moco-tf/src/Canonicalization/ReluCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/ReluCanonicalizer.cpp
@@ -18,8 +18,6 @@
#include <moco/IR/TFDialect.h>
-#include <stdex/Memory.h>
-
namespace
{
diff --git a/compiler/moco-tf/src/Canonicalization/RsqrtCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/RsqrtCanonicalizer.cpp
index c31dbf6d6..25eae6288 100644
--- a/compiler/moco-tf/src/Canonicalization/RsqrtCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/RsqrtCanonicalizer.cpp
@@ -23,7 +23,6 @@
#include <loco/Service/TypeInference.h>
-#include <stdex/Memory.h>
#include <oops/UserExn.h>
namespace
diff --git a/compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.cpp
index 98af7b693..9fcb76c2a 100644
--- a/compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.cpp
@@ -31,16 +31,16 @@ bool canonicalize_softmax(loco::Graph *graph, moco::TFSoftmax *node)
INFO(l) << "TFNodeCanonicalize TFSoftmax begin";
/**
- * This will replace shape inferred TFSoftmax node into canonical TensorSoftmax
- *
- * Before
- *          In ---- TFSoftmax ---- Out(s)
- *
- * After
- *                  ------ TFSoftmax
- *                 /
- *          In ---- TensorSoftmax ----- Out(s)
- */
+ * This will replace shape inferred TFSoftmax node into canonical TensorSoftmax
+ *
+ * Before
+ *          In ---- TFSoftmax ---- Out(s)
+ *
+ * After
+ *                  ------ TFSoftmax
+ *                 /
+ *          In ---- TensorSoftmax ----- Out(s)
+ */
auto nodeshape = moco::node_shape(node);
// Canonicalization into TensorSoftmax is valid when softmax has shape info
diff --git a/compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.h b/compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.h
index ebaf04cfe..0dd31503f 100644
--- a/compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.h
+++ b/compiler/moco-tf/src/Canonicalization/SoftmaxCanonicalizer.h
@@ -30,8 +30,8 @@ namespace tf
{
/**
-* @brief Canonicalize TF-dialect TFSoftmax into canonical Softmax node
-*/
+ * @brief Canonicalize TF-dialect TFSoftmax into canonical Softmax node
+ */
class SoftmaxCanonicalizer : public SimpleNodeTransform<moco::TFSoftmax>
{
public:
diff --git a/compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.cpp
index 574fa3993..47ac40ea8 100644
--- a/compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.cpp
@@ -30,16 +30,16 @@ bool canonicalize_stopgradient(loco::Graph *graph, moco::TFStopGradient *node)
INFO(l) << "TFNodeCanonicalize TFStopGradient begin";
/**
- * This will replace shape inferred TFStopGradient node into canonical Forward
- *
- * Before
- *          In --- TFStopGradient --- Out(s)
- *
- * After
- *                 -- TFStopGradient
- *                /
- *          In --- Forward --- Out(s)
- */
+ * This will replace shape inferred TFStopGradient node into canonical Forward
+ *
+ * Before
+ *          In --- TFStopGradient --- Out(s)
+ *
+ * After
+ *                 -- TFStopGradient
+ *                /
+ *          In --- Forward --- Out(s)
+ */
// Create loco node to replace
auto forward_node = graph->nodes()->create<loco::Forward>();
diff --git a/compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.h b/compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.h
index 6a17728a6..8346914c0 100644
--- a/compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.h
+++ b/compiler/moco-tf/src/Canonicalization/StopGradientCanonicalizer.h
@@ -30,8 +30,8 @@ namespace tf
{
/**
-* @brief Canonicalize TF-dialect TFStopGradient into canonical Forward node
-*/
+ * @brief Canonicalize TF-dialect TFStopGradient into canonical Forward node
+ */
class StopGradientCanonicalizer : public SimpleNodeTransform<moco::TFStopGradient>
{
public:
diff --git a/compiler/moco-tf/src/Canonicalization/TFPushCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/TFPushCanonicalizer.cpp
index 081e0e5f9..3adf1733c 100644
--- a/compiler/moco-tf/src/Canonicalization/TFPushCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/TFPushCanonicalizer.cpp
@@ -18,8 +18,6 @@
#include <moco/IR/TFDialect.h>
-#include <stdex/Memory.h>
-
namespace
{
diff --git a/compiler/moco-tf/src/Canonicalization/TanhCanonicalizer.cpp b/compiler/moco-tf/src/Canonicalization/TanhCanonicalizer.cpp
index 3f48a50fc..3b6e3c90c 100644
--- a/compiler/moco-tf/src/Canonicalization/TanhCanonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalization/TanhCanonicalizer.cpp
@@ -18,8 +18,6 @@
#include <moco/IR/TFDialect.h>
-#include <stdex/Memory.h>
-
namespace
{
diff --git a/compiler/moco-tf/src/Canonicalizer.cpp b/compiler/moco-tf/src/Canonicalizer.cpp
index 04bc7c57a..8e23d91df 100644
--- a/compiler/moco-tf/src/Canonicalizer.cpp
+++ b/compiler/moco-tf/src/Canonicalizer.cpp
@@ -56,8 +56,7 @@
#include <logo/Phase.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
namespace
@@ -92,41 +91,41 @@ void Canonicalizer::canonicalize(loco::Graph *g) const
/* TRANSFORM DECLARATION BEGIN */
// Run shape and type inference at the top
- phase.emplace_back(stdex::make_unique<ShapeInferencePass>());
- phase.emplace_back(stdex::make_unique<TypeInferencePass>());
+ phase.emplace_back(std::make_unique<ShapeInferencePass>());
+ phase.emplace_back(std::make_unique<TypeInferencePass>());
- phase.emplace_back(stdex::make_unique<AddCanonicalizer>());
- phase.emplace_back(stdex::make_unique<AvgPoolCanonicalizer>());
+ phase.emplace_back(std::make_unique<AddCanonicalizer>());
+ phase.emplace_back(std::make_unique<AvgPoolCanonicalizer>());
if (moco::tf::get<moco::tf::Knob::CanonicalizeBiasAdd>())
- phase.emplace_back(stdex::make_unique<BiasAddCanonicalizer>());
- phase.emplace_back(stdex::make_unique<ConcatV2Canonicalizer>());
+ phase.emplace_back(std::make_unique<BiasAddCanonicalizer>());
+ phase.emplace_back(std::make_unique<ConcatV2Canonicalizer>());
if (moco::tf::get<moco::tf::Knob::CanonicalizeConst>())
- phase.emplace_back(stdex::make_unique<ConstCanonicalizer>());
- phase.emplace_back(stdex::make_unique<Conv2DBackpropInputCanonicalizer>());
+ phase.emplace_back(std::make_unique<ConstCanonicalizer>());
+ phase.emplace_back(std::make_unique<Conv2DBackpropInputCanonicalizer>());
if (moco::tf::get<moco::tf::Knob::CanonicalizeConv2D>())
- phase.emplace_back(stdex::make_unique<Conv2DCanonicalizer>());
- phase.emplace_back(stdex::make_unique<DepthwiseConv2dNativeCanonicalizer>());
- phase.emplace_back(stdex::make_unique<IdentityCanonicalizer>());
- phase.emplace_back(stdex::make_unique<MaximumCanonicalizer>());
- phase.emplace_back(stdex::make_unique<MaxPoolCanonicalizer>());
- phase.emplace_back(stdex::make_unique<MeanCanonicalizer>());
- phase.emplace_back(stdex::make_unique<MulCanonicalizer>());
- phase.emplace_back(stdex::make_unique<PadCanonicalizer>());
- phase.emplace_back(stdex::make_unique<PlaceholderCanonicalizer>());
- phase.emplace_back(stdex::make_unique<RealDivCanonicalizer>());
- phase.emplace_back(stdex::make_unique<ReluCanonicalizer>());
- phase.emplace_back(stdex::make_unique<Relu6Canonicalizer>());
- phase.emplace_back(stdex::make_unique<ReshapeCanonicalizer>());
- phase.emplace_back(stdex::make_unique<RsqrtCanonicalizer>());
- phase.emplace_back(stdex::make_unique<SoftmaxCanonicalizer>());
- phase.emplace_back(stdex::make_unique<SqrtCanonicalizer>());
+ phase.emplace_back(std::make_unique<Conv2DCanonicalizer>());
+ phase.emplace_back(std::make_unique<DepthwiseConv2dNativeCanonicalizer>());
+ phase.emplace_back(std::make_unique<IdentityCanonicalizer>());
+ phase.emplace_back(std::make_unique<MaximumCanonicalizer>());
+ phase.emplace_back(std::make_unique<MaxPoolCanonicalizer>());
+ phase.emplace_back(std::make_unique<MeanCanonicalizer>());
+ phase.emplace_back(std::make_unique<MulCanonicalizer>());
+ phase.emplace_back(std::make_unique<PadCanonicalizer>());
+ phase.emplace_back(std::make_unique<PlaceholderCanonicalizer>());
+ phase.emplace_back(std::make_unique<RealDivCanonicalizer>());
+ phase.emplace_back(std::make_unique<ReluCanonicalizer>());
+ phase.emplace_back(std::make_unique<Relu6Canonicalizer>());
+ phase.emplace_back(std::make_unique<ReshapeCanonicalizer>());
+ phase.emplace_back(std::make_unique<RsqrtCanonicalizer>());
+ phase.emplace_back(std::make_unique<SoftmaxCanonicalizer>());
+ phase.emplace_back(std::make_unique<SqrtCanonicalizer>());
// NOTE SquaredDifference is handled in ResolveSquaredDifference
- phase.emplace_back(stdex::make_unique<SqueezeCanonicalizer>());
- phase.emplace_back(stdex::make_unique<StopGradientCanonicalizer>());
- phase.emplace_back(stdex::make_unique<SubCanonicalizer>());
- phase.emplace_back(stdex::make_unique<TanhCanonicalizer>());
+ phase.emplace_back(std::make_unique<SqueezeCanonicalizer>());
+ phase.emplace_back(std::make_unique<StopGradientCanonicalizer>());
+ phase.emplace_back(std::make_unique<SubCanonicalizer>());
+ phase.emplace_back(std::make_unique<TanhCanonicalizer>());
// For virtual nodes
- phase.emplace_back(stdex::make_unique<TFPushCanonicalizer>());
+ phase.emplace_back(std::make_unique<TFPushCanonicalizer>());
/* TRANSFORM DECLARATION END */
ProgressReporter prog(g, logo::PhaseStrategy::Restart);
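
The stdex-to-std migration in this hunk needs nothing beyond C++14: std::make_unique<T>(args...) constructs the object and wraps it in a std::unique_ptr in one call, so the phase list keeps its shape. A minimal sketch (the Pass hierarchy below is a hypothetical stand-in, not the logo API):

#include <memory>
#include <vector>

struct Pass
{
  virtual ~Pass() = default;
};

struct ToyShapeInferencePass final : Pass
{
};

int main()
{
  std::vector<std::unique_ptr<Pass>> phase;
  // Drop-in for the removed stdex::make_unique: same call shape, standard header.
  phase.emplace_back(std::make_unique<ToyShapeInferencePass>());
  return 0;
}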
diff --git a/compiler/moco-tf/src/CodecHelper.h b/compiler/moco-tf/src/CodecHelper.h
index 85e4e2164..a4ca8d5ca 100644
--- a/compiler/moco-tf/src/CodecHelper.h
+++ b/compiler/moco-tf/src/CodecHelper.h
@@ -18,7 +18,8 @@
#define __CODEC_HELPER_H__
#include <plier/tf/Convert.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -27,7 +28,7 @@ using plier::tf::DataLayout;
void set_feature_enc(loco::FeatureEncode *feature_enc, DataLayout data_layout)
{
- auto enc = stdex::make_unique<loco::PermutingEncoder<loco::Domain::Feature>>();
+ auto enc = std::make_unique<loco::PermutingEncoder<loco::Domain::Feature>>();
if (data_layout == DataLayout::NHWC)
{
@@ -49,7 +50,7 @@ void set_feature_enc(loco::FeatureEncode *feature_enc, DataLayout data_layout)
void set_feature_dec(loco::FeatureDecode *feature_dec, DataLayout data_layout)
{
- auto dec = stdex::make_unique<loco::PermutingDecoder<loco::Domain::Feature>>();
+ auto dec = std::make_unique<loco::PermutingDecoder<loco::Domain::Feature>>();
if (data_layout == DataLayout::NHWC)
{
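
set_feature_enc installs a permuting encoder that tells loco which tensor axis plays which feature role under the given data layout. A self-contained sketch of the NHWC mapping idea, with a stand-in axis map rather than loco's Permutation API (which this hunk does not show in full):

#include <cstdint>
#include <map>

enum class FeatureAxis { Count, Depth, Height, Width };

int main()
{
  // NHWC: batch (Count) first, then Height and Width, channel (Depth) last.
  std::map<FeatureAxis, std::uint32_t> nhwc;
  nhwc[FeatureAxis::Count] = 0;
  nhwc[FeatureAxis::Height] = 1;
  nhwc[FeatureAxis::Width] = 2;
  nhwc[FeatureAxis::Depth] = 3;
  return nhwc.at(FeatureAxis::Depth) == 3 ? 0 : 1;
}
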
diff --git a/compiler/moco-tf/src/Frontend.cpp b/compiler/moco-tf/src/Frontend.cpp
index a17d5dd0e..0d5250b17 100644
--- a/compiler/moco-tf/src/Frontend.cpp
+++ b/compiler/moco-tf/src/Frontend.cpp
@@ -31,13 +31,13 @@
#include <loco/Service/ShapeInference.h>
-#include <stdex/Memory.h>
#include <oops/UserExn.h>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/text_format.h>
+#include <memory>
#include <iostream>
#include <sstream>
#include <fstream>
@@ -157,7 +157,7 @@ moco::GraphBuilderRegistry make_graph_builder_registry(const moco::ModelSignatur
for (const auto &custom_op : sig.customops())
{
std::unique_ptr<moco::tf::COpCallGraphBuilder> builder =
- stdex::make_unique<moco::tf::COpCallGraphBuilder>(&sig);
+ std::make_unique<moco::tf::COpCallGraphBuilder>(&sig);
registry.add(custom_op, std::move(builder));
}
@@ -243,7 +243,7 @@ std::unique_ptr<loco::Graph> Frontend::import(const ModelSignature &signature,
auto input = graph->inputs()->at(n);
auto input_node = moco::placeholder_node(graph.get(), n);
assert(input_node != nullptr);
- input->shape(stdex::make_unique<loco::TensorShape>(tensor_shape(input_node)));
+ input->shape(std::make_unique<loco::TensorShape>(tensor_shape(input_node)));
}
for (uint32_t n = 0; n < graph->outputs()->size(); ++n)
@@ -251,7 +251,7 @@ std::unique_ptr<loco::Graph> Frontend::import(const ModelSignature &signature,
auto output = graph->outputs()->at(n);
auto output_node = moco::push_node(graph.get(), n);
assert(output_node != nullptr);
- output->shape(stdex::make_unique<loco::TensorShape>(::tensor_shape(output_node)));
+ output->shape(std::make_unique<loco::TensorShape>(::tensor_shape(output_node)));
}
// Convert graph to hold only Canonical dialect
diff --git a/compiler/moco-tf/src/Knob.cpp b/compiler/moco-tf/src/Knob.cpp
index 0e1c7e0ea..a13895f68 100644
--- a/compiler/moco-tf/src/Knob.cpp
+++ b/compiler/moco-tf/src/Knob.cpp
@@ -109,12 +109,12 @@ namespace moco
namespace tf
{
-#define KNOB_BOOL(NAME, DEFAULT, DESC) \
- template <> typename KnobTrait<Knob::NAME>::ValueType get<Knob::NAME>(void) \
- { \
- static typename KnobTrait<Knob::NAME>::ValueType value = \
- ::knob_load<typename KnobTrait<Knob::NAME>::ValueType>(::knob_loader(), #NAME, DEFAULT); \
- return value; \
+#define KNOB_BOOL(NAME, DEFAULT, DESC) \
+ template <> typename KnobTrait<Knob::NAME>::ValueType get<Knob::NAME>(void) \
+ { \
+ static typename KnobTrait<Knob::NAME>::ValueType value = \
+ ::knob_load<typename KnobTrait<Knob::NAME>::ValueType>(::knob_loader(), #NAME, DEFAULT); \
+ return value; \
}
#include "Knob.lst"
#undef KNOB_BOOL
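
The KNOB_BOOL hunk above only re-indents the macro's line continuations. For reference, the macro generates one get<Knob::NAME>() specialization per entry in Knob.lst, caching the loaded value in a function-local static so the loader is consulted only once. A simplified, self-contained expansion for a hypothetical ConstantFolding knob; knob_load here is a fake that just returns the default:

#include <string>

enum class Knob { ConstantFolding };
template <Knob K> struct KnobTrait;
template <> struct KnobTrait<Knob::ConstantFolding> { using ValueType = bool; };

// Fake loader standing in for ::knob_load(::knob_loader(), #NAME, DEFAULT).
template <typename T> T knob_load(const std::string &, T dflt) { return dflt; }

template <Knob K> typename KnobTrait<K>::ValueType get(void);
template <> typename KnobTrait<Knob::ConstantFolding>::ValueType get<Knob::ConstantFolding>(void)
{
  // Function-local static: loaded on first call, then cached.
  static bool value = knob_load<bool>("ConstantFolding", true);
  return value;
}

int main() { return get<Knob::ConstantFolding>() ? 0 : 1; }
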
diff --git a/compiler/moco-tf/src/LogHelper.cpp b/compiler/moco-tf/src/LogHelper.cpp
index 92ff75569..6b127020a 100644
--- a/compiler/moco-tf/src/LogHelper.cpp
+++ b/compiler/moco-tf/src/LogHelper.cpp
@@ -74,7 +74,7 @@ namespace tf
FormattedGraph fmt(loco::Graph *g)
{
- auto node_summary_builder = stdex::make_unique<TFNodeSummaryBuilderFactory>();
+ auto node_summary_builder = std::make_unique<TFNodeSummaryBuilderFactory>();
return std::move(locop::fmt<locop::LinearV1>(g).with(std::move(node_summary_builder)));
}
diff --git a/compiler/moco-tf/src/Op/COpCall.cpp b/compiler/moco-tf/src/Op/COpCall.cpp
index 801196f0f..af4bc9dc4 100644
--- a/compiler/moco-tf/src/Op/COpCall.cpp
+++ b/compiler/moco-tf/src/Op/COpCall.cpp
@@ -23,9 +23,9 @@
#include <moco/Names.h>
#include <moco/tf/Frontend.h>
#include <loco.h>
-#include <stdex/Memory.h>
#include <oops/UserExn.h>
+#include <memory>
#include <vector>
#include <cassert>
#include <stdexcept>
@@ -37,7 +37,7 @@ class COpCallGraphUpdate final : public moco::GraphUpdate
{
public:
COpCallGraphUpdate(locoex::COpCall *node, const std::vector<moco::TensorName> &input_names)
- : _node(node), _input_names(input_names)
+ : _node(node), _input_names(input_names)
{
}
@@ -94,11 +94,11 @@ void COpCallGraphBuilder::build(const tensorflow::NodeDef &tf_node,
if (val.value_case() == tensorflow::AttrValue::kF)
{
- call_node->attr(name, stdex::make_unique<locoex::COpAttrFloat>(val.f()));
+ call_node->attr(name, std::make_unique<locoex::COpAttrFloat>(val.f()));
}
else if (val.value_case() == tensorflow::AttrValue::kI)
{
- call_node->attr(name, stdex::make_unique<locoex::COpAttrInt>(val.i()));
+ call_node->attr(name, std::make_unique<locoex::COpAttrInt>(val.i()));
}
// TODO define more types
else
@@ -118,7 +118,7 @@ void COpCallGraphBuilder::build(const tensorflow::NodeDef &tf_node,
{
input_names.emplace_back(TensorName(tf_node.input(i)));
}
- auto update = stdex::make_unique<COpCallGraphUpdate>(call_node, input_names);
+ auto update = std::make_unique<COpCallGraphUpdate>(call_node, input_names);
updates->enroll(std::move(update));
}
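
COpCallGraphUpdate follows the importer's two-phase scheme: builders create nodes immediately but defer input wiring by enrolling an update, so a producer need not be imported before its consumers. A sketch of that scheme with stand-in types, not moco's actual GraphUpdate interface:

#include <memory>
#include <vector>

struct GraphUpdate
{
  virtual ~GraphUpdate() = default;
  virtual void input() const = 0; // resolve recorded names to nodes, connect edges
};

struct UpdateQueue
{
  void enroll(std::unique_ptr<GraphUpdate> u) { _us.emplace_back(std::move(u)); }
  void run() const
  {
    for (auto &u : _us)
      u->input();
  }
  std::vector<std::unique_ptr<GraphUpdate>> _us;
};

struct DummyUpdate final : GraphUpdate
{
  void input() const override { /* look up inputs by TensorName here */ }
};

int main()
{
  UpdateQueue updates;
  updates.enroll(std::make_unique<DummyUpdate>()); // phase 1: create node, enroll update
  updates.run();                                   // phase 2: wire inputs once all nodes exist
}
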
diff --git a/compiler/moco-tf/src/Op/COpCall.h b/compiler/moco-tf/src/Op/COpCall.h
index 0bb8a93c9..2f0ee1e36 100644
--- a/compiler/moco-tf/src/Op/COpCall.h
+++ b/compiler/moco-tf/src/Op/COpCall.h
@@ -32,7 +32,9 @@ namespace tf
class COpCallGraphBuilder final : public GraphBuilder
{
public:
- COpCallGraphBuilder(const ModelSignature *signature) : _signature(signature) { /* empty */}
+ COpCallGraphBuilder(const ModelSignature *signature) : _signature(signature)
+ { /* empty */
+ }
bool validate(const tensorflow::NodeDef &) const override;
void build(const tensorflow::NodeDef &, GraphBuilderContext *) const override;
diff --git a/compiler/moco-tf/src/Op/COpCall.test.cpp b/compiler/moco-tf/src/Op/COpCall.test.cpp
index f13118292..7e1ffa954 100644
--- a/compiler/moco-tf/src/Op/COpCall.test.cpp
+++ b/compiler/moco-tf/src/Op/COpCall.test.cpp
@@ -27,10 +27,11 @@
#include <loco.h>
#include <plier/tf/TestHelper.h>
-#include <stdex/Memory.h>
#include <gtest/gtest.h>
+#include <memory>
+
using namespace moco::tf::test;
namespace
@@ -91,7 +92,7 @@ TEST(Call_Test, Call_01)
// import
moco::GraphBuilderRegistry registry{&moco::GraphBuilderRegistry::get()};
- registry.add("new_custom_op", stdex::make_unique<moco::tf::COpCallGraphBuilder>(&signature));
+ registry.add("new_custom_op", std::make_unique<moco::tf::COpCallGraphBuilder>(&signature));
moco::Importer importer(&registry);
std::unique_ptr<loco::Graph> graph = importer.import(signature, graph_def);
diff --git a/compiler/moco-tf/src/Optimizer.cpp b/compiler/moco-tf/src/Optimizer.cpp
index f33b4109b..51e1e1c4f 100644
--- a/compiler/moco-tf/src/Optimizer.cpp
+++ b/compiler/moco-tf/src/Optimizer.cpp
@@ -22,7 +22,7 @@
#include <logo/Phase.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace moco
{
@@ -35,48 +35,48 @@ void Optimizer::optimize(loco::Graph *g) const
/* TRANSFORM DECLARATION BEGIN */
// Shape inference is required for ResolveRedundantReshape
- phase.emplace_back(stdex::make_unique<ShapeInferencePass>());
+ phase.emplace_back(std::make_unique<ShapeInferencePass>());
if (moco::tf::get<moco::tf::Knob::ConstantFolding>())
{
- phase.emplace_back(stdex::make_unique<logo::ConstantFoldingPass>());
+ phase.emplace_back(std::make_unique<logo::ConstantFoldingPass>());
}
if (moco::tf::get<moco::tf::Knob::RemoveDeadNode>())
{
- phase.emplace_back(stdex::make_unique<logo::RemoveDeadNodePass>());
+ phase.emplace_back(std::make_unique<logo::RemoveDeadNodePass>());
}
if (moco::tf::get<moco::tf::Knob::ReorderDecode>() &&
moco::tf::get<moco::tf::Knob::ReorderDecodeTensorBiasAdd>())
{
- phase.emplace_back(stdex::make_unique<logo::ReorderDecodePass<loco::TensorBiasAdd>>());
+ phase.emplace_back(std::make_unique<logo::ReorderDecodePass<loco::TensorBiasAdd>>());
}
if (moco::tf::get<moco::tf::Knob::ReorderDecode>() &&
moco::tf::get<moco::tf::Knob::ReorderDecodeReLU>())
{
- phase.emplace_back(stdex::make_unique<logo::ReorderDecodePass<loco::ReLU>>());
+ phase.emplace_back(std::make_unique<logo::ReorderDecodePass<loco::ReLU>>());
}
if (moco::tf::get<moco::tf::Knob::SimplifyDomainConversion>())
{
- phase.emplace_back(stdex::make_unique<logo::SimplifyDomainConversionPass>());
+ phase.emplace_back(std::make_unique<logo::SimplifyDomainConversionPass>());
}
if (moco::tf::get<moco::tf::Knob::RemoveForwardNode>())
{
- phase.emplace_back(stdex::make_unique<logo::RemoveForwardNodePass>());
+ phase.emplace_back(std::make_unique<logo::RemoveForwardNodePass>());
}
if (moco::tf::get<moco::tf::Knob::ResolveDuplicateReshape>())
{
- phase.emplace_back(stdex::make_unique<logo::ResolveDuplicateReshapePass>());
+ phase.emplace_back(std::make_unique<logo::ResolveDuplicateReshapePass>());
}
if (moco::tf::get<moco::tf::Knob::ResolveRedundantReshape>())
{
- phase.emplace_back(stdex::make_unique<logo::ResolveRedundantReshapePass>());
+ phase.emplace_back(std::make_unique<logo::ResolveRedundantReshapePass>());
}
/* TRANSFORM DECLARATION END */
diff --git a/compiler/moco-tf/src/ProgressReporter.h b/compiler/moco-tf/src/ProgressReporter.h
index 190d972c5..440d29221 100644
--- a/compiler/moco-tf/src/ProgressReporter.h
+++ b/compiler/moco-tf/src/ProgressReporter.h
@@ -30,7 +30,7 @@ class ProgressReporter : public logo::PhaseEventListener
{
public:
ProgressReporter(loco::Graph *graph, logo::PhaseStrategy strategy)
- : _graph{graph}, _strategy{strategy}
+ : _graph{graph}, _strategy{strategy}
{
// DO NOTHING
}
diff --git a/compiler/moco-tf/src/TFFormattedGraph.h b/compiler/moco-tf/src/TFFormattedGraph.h
index f79208536..81978954f 100644
--- a/compiler/moco-tf/src/TFFormattedGraph.h
+++ b/compiler/moco-tf/src/TFFormattedGraph.h
@@ -19,7 +19,7 @@
#include <locop/FormattedGraph.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace moco
{
@@ -49,7 +49,7 @@ public:
public:
std::unique_ptr<locop::NodeSummaryBuilder> create(const locop::SymbolTable *tlb) const final
{
- return stdex::make_unique<MocoNodeSummaryBuilder>(tlb);
+ return std::make_unique<MocoNodeSummaryBuilder>(tlb);
}
};
diff --git a/compiler/moco-tf/src/TFOptimizer.cpp b/compiler/moco-tf/src/TFOptimizer.cpp
index 2256b99b8..720cd9d9a 100644
--- a/compiler/moco-tf/src/TFOptimizer.cpp
+++ b/compiler/moco-tf/src/TFOptimizer.cpp
@@ -22,7 +22,7 @@
#include <logo/Phase.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace moco
{
@@ -36,39 +36,39 @@ void TFOptimizer::optimize(loco::Graph *g) const
/* TRANSFORM DECLARATION BEGIN */
if (moco::tf::get<moco::tf::Knob::ResolveFusedBatchNorm>())
{
- phase.emplace_back(stdex::make_unique<moco::ResolveFusedBatchNorm>());
+ phase.emplace_back(std::make_unique<moco::ResolveFusedBatchNorm>());
}
if (moco::tf::get<moco::tf::Knob::FuseBinaryIntoPreceding>())
{
- phase.emplace_back(stdex::make_unique<moco::FuseBinaryIntoPreceding>());
+ phase.emplace_back(std::make_unique<moco::FuseBinaryIntoPreceding>());
}
if (moco::tf::get<moco::tf::Knob::ResolveConstantShape>())
{
- phase.emplace_back(stdex::make_unique<moco::ResolveConstantShape>());
+ phase.emplace_back(std::make_unique<moco::ResolveConstantShape>());
}
if (moco::tf::get<moco::tf::Knob::ResolveReshapeWildcardDim>())
{
- phase.emplace_back(stdex::make_unique<moco::ResolveReshapeWildcardDim>());
+ phase.emplace_back(std::make_unique<moco::ResolveReshapeWildcardDim>());
}
if (moco::tf::get<moco::tf::Knob::ResolveSquaredDifference>())
{
- phase.emplace_back(stdex::make_unique<moco::ResolveSquaredDifference>());
+ phase.emplace_back(std::make_unique<moco::ResolveSquaredDifference>());
}
if (moco::tf::get<moco::tf::Knob::RemoveTFIdentityNode>())
{
- phase.emplace_back(stdex::make_unique<moco::RemoveTFIdentityNode>());
+ phase.emplace_back(std::make_unique<moco::RemoveTFIdentityNode>());
}
if (moco::tf::get<moco::tf::Knob::RemoveDeadNode>())
{
- phase.emplace_back(stdex::make_unique<logo::RemoveDeadNodePass>());
+ phase.emplace_back(std::make_unique<logo::RemoveDeadNodePass>());
}
if (moco::tf::get<moco::tf::Knob::SqueezeReduceNode>())
{
- phase.emplace_back(stdex::make_unique<moco::SqueezeReduceNode>());
+ phase.emplace_back(std::make_unique<moco::SqueezeReduceNode>());
}
// Shape inference is needed for nodes added by the above transformations
- phase.emplace_back(stdex::make_unique<moco::tf::ShapeInferencePass>());
- phase.emplace_back(stdex::make_unique<moco::tf::TypeInferencePass>());
+ phase.emplace_back(std::make_unique<moco::tf::ShapeInferencePass>());
+ phase.emplace_back(std::make_unique<moco::tf::TypeInferencePass>());
/* TRANSFORM DECLARATION END */
ProgressReporter prog(g, logo::PhaseStrategy::Saturate);
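
Note the strategy difference: the canonicalization phase earlier runs with PhaseStrategy::Restart, while TFOptimizer here uses Saturate. As a rough interpretation (a sketch of the idea, not logo's actual runner), Restart re-runs the phase from its first pass whenever any pass changes the graph, while Saturate keeps sweeping the whole list until a full sweep changes nothing:

#include <cstddef>
#include <functional>
#include <vector>

using Pass = std::function<bool()>; // returns true if it changed the graph

void run_saturate(const std::vector<Pass> &phase)
{
  bool changed = true;
  while (changed) // sweep the whole list until a fixpoint is reached
  {
    changed = false;
    for (const auto &p : phase)
      changed = p() || changed;
  }
}

void run_restart(const std::vector<Pass> &phase)
{
  // Jump back to the first pass as soon as any pass reports a change.
  for (std::size_t i = 0; i < phase.size();)
    i = phase[i]() ? 0 : i + 1;
}

int main()
{
  std::vector<Pass> phase{[] { return false; }}; // a pass that never changes anything
  run_saturate(phase);
  run_restart(phase);
}
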
diff --git a/compiler/moco-tf/src/TestHelper.test.cpp b/compiler/moco-tf/src/TestHelper.test.cpp
index 1e8c38e36..36ce1114a 100644
--- a/compiler/moco-tf/src/TestHelper.test.cpp
+++ b/compiler/moco-tf/src/TestHelper.test.cpp
@@ -48,7 +48,7 @@ void setup_output_node(loco::Graph *graph, loco::Node *last_node)
#include <moco/IR/Nodes/TFConst.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <gtest/gtest.h>
@@ -62,7 +62,7 @@ namespace test
TFNodeBuildTester::TFNodeBuildTester()
{
_graph = loco::make_graph();
- _tensor_names = stdex::make_unique<moco::SymbolTable>();
+ _tensor_names = std::make_unique<moco::SymbolTable>();
}
void TFNodeBuildTester::inputs(const std::vector<std::string> &names)
@@ -91,8 +91,8 @@ void TFNodeBuildTester::run(tensorflow::NodeDef &nodedef, moco::GraphBuilder &gr
{
assert(_output != nullptr);
- auto node_defs = stdex::make_unique<moco::NodeDefTable>();
- auto updates = stdex::make_unique<moco::UpdateQueue>();
+ auto node_defs = std::make_unique<moco::NodeDefTable>();
+ auto updates = std::make_unique<moco::UpdateQueue>();
moco::GraphBuilderContext gb_context(_graph.get(), node_defs.get(), _tensor_names.get(),
updates.get());
diff --git a/compiler/moco-tf/src/Transforms/ShapeInferencePass.cpp b/compiler/moco-tf/src/Transforms/ShapeInferencePass.cpp
index 64ba9dfb1..8f46cfbbc 100644
--- a/compiler/moco-tf/src/Transforms/ShapeInferencePass.cpp
+++ b/compiler/moco-tf/src/Transforms/ShapeInferencePass.cpp
@@ -46,8 +46,8 @@ bool ShapeInferencePass::run(loco::Graph *graph)
loco::MultiDialectShapeInferenceRule rules;
rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(TFDialect::get(), &tf_rule)
- .bind(locoex::COpDialect::get(), &cop_rule);
+ .bind(TFDialect::get(), &tf_rule)
+ .bind(locoex::COpDialect::get(), &cop_rule);
return loco::apply(&rules).to(graph);
}
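
The hunk above is a pure re-indentation of the chained bind() calls, which compose per-dialect shape-inference rules into a single dispatcher. A stand-in sketch of the fluent multi-dialect dispatch idea (minimal fakes, not loco's actual types):

#include <map>

struct Dialect {};
struct Rule { virtual ~Rule() = default; virtual bool infer() const = 0; };

struct MultiDialectRule
{
  MultiDialectRule &bind(const Dialect *d, const Rule *r)
  {
    _rules[d] = r;
    return *this; // returning *this is what makes the call chaining work
  }
  bool infer(const Dialect *d) const { return _rules.at(d)->infer(); }
  std::map<const Dialect *, const Rule *> _rules;
};

struct TrivialRule final : Rule { bool infer() const override { return true; } };

int main()
{
  Dialect canonical, tf;
  TrivialRule rule;
  MultiDialectRule rules;
  rules.bind(&canonical, &rule).bind(&tf, &rule); // dispatch keyed by dialect
  return rules.infer(&tf) ? 0 : 1;
}
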
diff --git a/compiler/moco-tf/src/Transforms/TypeInferencePass.cpp b/compiler/moco-tf/src/Transforms/TypeInferencePass.cpp
index db6cf7521..2e2d4a9c1 100644
--- a/compiler/moco-tf/src/Transforms/TypeInferencePass.cpp
+++ b/compiler/moco-tf/src/Transforms/TypeInferencePass.cpp
@@ -42,8 +42,8 @@ bool TypeInferencePass::run(loco::Graph *graph)
loco::MultiDialectTypeInferenceRule rules;
rules.bind(loco::CanonicalDialect::get(), &canonical_rule)
- .bind(TFDialect::get(), &tf_rule)
- .bind(locoex::COpDialect::get(), &cop_rule);
+ .bind(TFDialect::get(), &tf_rule)
+ .bind(locoex::COpDialect::get(), &cop_rule);
loco::apply(&rules).to(graph);
diff --git a/compiler/moco/import/CMakeLists.txt b/compiler/moco/import/CMakeLists.txt
index 43107776e..460c2c98b 100644
--- a/compiler/moco/import/CMakeLists.txt
+++ b/compiler/moco/import/CMakeLists.txt
@@ -7,7 +7,6 @@ target_include_directories(moco_import PRIVATE src)
target_include_directories(moco_import PUBLIC include)
target_link_libraries(moco_import PUBLIC moco_lang)
target_link_libraries(moco_import PUBLIC mio_tf)
-target_link_libraries(moco_import PUBLIC stdex)
target_link_libraries(moco_import PRIVATE nncc_common)
target_link_libraries(moco_import PRIVATE plier_tf)
target_link_libraries(moco_import PRIVATE oops)
diff --git a/compiler/moco/import/include/moco/Import/GraphBuilderContext.h b/compiler/moco/import/include/moco/Import/GraphBuilderContext.h
index ae4f02c2a..76a9644b5 100644
--- a/compiler/moco/import/include/moco/Import/GraphBuilderContext.h
+++ b/compiler/moco/import/include/moco/Import/GraphBuilderContext.h
@@ -118,7 +118,7 @@ class GraphBuilderContext
public:
GraphBuilderContext(loco::Graph *g, NodeDefTable *nodedef, SymbolTable *tensor_names,
UpdateQueue *updates)
- : _g(g), _nodedef(nodedef), _tensor_names(tensor_names), _updates(updates)
+ : _g(g), _nodedef(nodedef), _tensor_names(tensor_names), _updates(updates)
{
// DO NOTHING
}
diff --git a/compiler/moco/import/include/moco/Import/GraphBuilderRegistry.h b/compiler/moco/import/include/moco/Import/GraphBuilderRegistry.h
index da65cffb8..c99dca1cf 100644
--- a/compiler/moco/import/include/moco/Import/GraphBuilderRegistry.h
+++ b/compiler/moco/import/include/moco/Import/GraphBuilderRegistry.h
@@ -82,6 +82,6 @@ private:
std::map<const std::string, std::unique_ptr<GraphBuilder>> _builder_map;
};
-} // namespace mono
+} // namespace moco
#endif // __MOCO_IMPORT_GRAPH_BUILDER_REGISTRY_H__
diff --git a/compiler/moco/import/include/moco/Import/Nodes/Softmax.h b/compiler/moco/import/include/moco/Import/Nodes/Softmax.h
index 43fbb8852..290818958 100644
--- a/compiler/moco/import/include/moco/Import/Nodes/Softmax.h
+++ b/compiler/moco/import/include/moco/Import/Nodes/Softmax.h
@@ -23,8 +23,8 @@ namespace moco
{
/**
-* @brief GraphBuilder for Softmax node
-*/
+ * @brief GraphBuilder for Softmax node
+ */
class SoftmaxGraphBuilder final : public GraphBuilder
{
public:
diff --git a/compiler/moco/import/src/GraphBuilderRegistry.cpp b/compiler/moco/import/src/GraphBuilderRegistry.cpp
index 3a028513f..7e91ca9d0 100644
--- a/compiler/moco/import/src/GraphBuilderRegistry.cpp
+++ b/compiler/moco/import/src/GraphBuilderRegistry.cpp
@@ -17,45 +17,45 @@
#include "moco/Import/GraphBuilderRegistry.h"
#include "moco/Import/Nodes.h"
-#include <stdex/Memory.h>
+#include <memory>
namespace moco
{
GraphBuilderRegistry::GraphBuilderRegistry()
{
- add("Add", stdex::make_unique<AddGraphBuilder>());
- add("AvgPool", stdex::make_unique<AvgPoolGraphBuilder>());
- add("BiasAdd", stdex::make_unique<BiasAddGraphBuilder>());
- add("ConcatV2", stdex::make_unique<ConcatV2GraphBuilder>());
- add("Const", stdex::make_unique<ConstGraphBuilder>());
- add("Conv2D", stdex::make_unique<Conv2DGraphBuilder>());
- add("Conv2DBackpropInput", stdex::make_unique<Conv2DBackpropInputGraphBuilder>());
- add("DepthwiseConv2dNative", stdex::make_unique<DepthwiseConv2dNativeGraphBuilder>());
- add("FakeQuantWithMinMaxVars", stdex::make_unique<FakeQuantWithMinMaxVarsGraphBuilder>());
- add("FusedBatchNorm", stdex::make_unique<FusedBatchNormGraphBuilder>());
- add("Identity", stdex::make_unique<IdentityGraphBuilder>());
- add("Maximum", stdex::make_unique<MaximumGraphBuilder>());
- add("MaxPool", stdex::make_unique<MaxPoolGraphBuilder>());
- add("Mean", stdex::make_unique<MeanGraphBuilder>());
- add("Mul", stdex::make_unique<MulGraphBuilder>());
- add("Pack", stdex::make_unique<PackGraphBuilder>());
- add("Pad", stdex::make_unique<PadGraphBuilder>());
- add("Placeholder", stdex::make_unique<PlaceholderGraphBuilder>());
- add("RealDiv", stdex::make_unique<RealDivGraphBuilder>());
- add("Relu", stdex::make_unique<ReluGraphBuilder>());
- add("Relu6", stdex::make_unique<Relu6GraphBuilder>());
- add("Reshape", stdex::make_unique<ReshapeGraphBuilder>());
- add("Rsqrt", stdex::make_unique<RsqrtGraphBuilder>());
- add("Shape", stdex::make_unique<ShapeGraphBuilder>());
- add("Softmax", stdex::make_unique<SoftmaxGraphBuilder>());
- add("Sqrt", stdex::make_unique<SqrtGraphBuilder>());
- add("SquaredDifference", stdex::make_unique<SquaredDifferenceGraphBuilder>());
- add("Squeeze", stdex::make_unique<SqueezeGraphBuilder>());
- add("StopGradient", stdex::make_unique<StopGradientGraphBuilder>());
- add("StridedSlice", stdex::make_unique<StridedSliceGraphBuilder>());
- add("Sub", stdex::make_unique<SubGraphBuilder>());
- add("Tanh", stdex::make_unique<TanhGraphBuilder>());
+ add("Add", std::make_unique<AddGraphBuilder>());
+ add("AvgPool", std::make_unique<AvgPoolGraphBuilder>());
+ add("BiasAdd", std::make_unique<BiasAddGraphBuilder>());
+ add("ConcatV2", std::make_unique<ConcatV2GraphBuilder>());
+ add("Const", std::make_unique<ConstGraphBuilder>());
+ add("Conv2D", std::make_unique<Conv2DGraphBuilder>());
+ add("Conv2DBackpropInput", std::make_unique<Conv2DBackpropInputGraphBuilder>());
+ add("DepthwiseConv2dNative", std::make_unique<DepthwiseConv2dNativeGraphBuilder>());
+ add("FakeQuantWithMinMaxVars", std::make_unique<FakeQuantWithMinMaxVarsGraphBuilder>());
+ add("FusedBatchNorm", std::make_unique<FusedBatchNormGraphBuilder>());
+ add("Identity", std::make_unique<IdentityGraphBuilder>());
+ add("Maximum", std::make_unique<MaximumGraphBuilder>());
+ add("MaxPool", std::make_unique<MaxPoolGraphBuilder>());
+ add("Mean", std::make_unique<MeanGraphBuilder>());
+ add("Mul", std::make_unique<MulGraphBuilder>());
+ add("Pack", std::make_unique<PackGraphBuilder>());
+ add("Pad", std::make_unique<PadGraphBuilder>());
+ add("Placeholder", std::make_unique<PlaceholderGraphBuilder>());
+ add("RealDiv", std::make_unique<RealDivGraphBuilder>());
+ add("Relu", std::make_unique<ReluGraphBuilder>());
+ add("Relu6", std::make_unique<Relu6GraphBuilder>());
+ add("Reshape", std::make_unique<ReshapeGraphBuilder>());
+ add("Rsqrt", std::make_unique<RsqrtGraphBuilder>());
+ add("Shape", std::make_unique<ShapeGraphBuilder>());
+ add("Softmax", std::make_unique<SoftmaxGraphBuilder>());
+ add("Sqrt", std::make_unique<SqrtGraphBuilder>());
+ add("SquaredDifference", std::make_unique<SquaredDifferenceGraphBuilder>());
+ add("Squeeze", std::make_unique<SqueezeGraphBuilder>());
+ add("StopGradient", std::make_unique<StopGradientGraphBuilder>());
+ add("StridedSlice", std::make_unique<StridedSliceGraphBuilder>());
+ add("Sub", std::make_unique<SubGraphBuilder>());
+ add("Tanh", std::make_unique<TanhGraphBuilder>());
// Virtual nodes like `TFPush` need not be added here
}
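
GraphBuilderRegistry maps a TensorFlow op name to the builder that imports it, via the _builder_map shown in the header hunk earlier. A minimal sketch with stand-in types; the lookup method is illustrative, not moco's exact API:

#include <map>
#include <memory>
#include <string>

struct GraphBuilder { virtual ~GraphBuilder() = default; };
struct AddGraphBuilder final : GraphBuilder {};

struct Registry
{
  void add(const std::string &name, std::unique_ptr<GraphBuilder> b)
  {
    _builder_map[name] = std::move(b);
  }
  const GraphBuilder *lookup(const std::string &name) const
  {
    auto it = _builder_map.find(name);
    return it == _builder_map.end() ? nullptr : it->second.get();
  }
  std::map<std::string, std::unique_ptr<GraphBuilder>> _builder_map;
};

int main()
{
  Registry registry;
  registry.add("Add", std::make_unique<AddGraphBuilder>());
  return registry.lookup("Add") != nullptr ? 0 : 1;
}
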
diff --git a/compiler/moco/import/src/Importer.cpp b/compiler/moco/import/src/Importer.cpp
index 3813affce..333f0f6a9 100644
--- a/compiler/moco/import/src/Importer.cpp
+++ b/compiler/moco/import/src/Importer.cpp
@@ -23,9 +23,9 @@
#include <moco/IR/Nodes/TFPlaceholder.h>
#include <moco/IR/TFNode.h>
-#include <stdex/Memory.h>
#include <oops/UserExn.h>
+#include <memory>
#include <cassert>
#include <sstream>
#include <stdexcept>
@@ -36,9 +36,9 @@ namespace
void convert_graph(const moco::GraphBuilderSource &source, const moco::ModelSignature &signature,
tensorflow::GraphDef &tf_graph_def, loco::Graph *graph)
{
- auto nodedef = stdex::make_unique<moco::NodeDefTable>();
- auto tensor_names = stdex::make_unique<moco::SymbolTable>();
- auto updates = stdex::make_unique<moco::UpdateQueue>();
+ auto nodedef = std::make_unique<moco::NodeDefTable>();
+ auto tensor_names = std::make_unique<moco::SymbolTable>();
+ auto updates = std::make_unique<moco::UpdateQueue>();
moco::GraphBuilderContext gb_context(graph, nodedef.get(), tensor_names.get(), updates.get());
diff --git a/compiler/moco/import/src/Nodes/Add.cpp b/compiler/moco/import/src/Nodes/Add.cpp
index 6981a55e1..af743316b 100644
--- a/compiler/moco/import/src/Nodes/Add.cpp
+++ b/compiler/moco/import/src/Nodes/Add.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFAdd.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -78,7 +79,7 @@ void AddGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContext
add_input_names.push_back(TensorName(node.input(0))); // x
add_input_names.push_back(TensorName(node.input(1))); // y
- auto tf_add_update = stdex::make_unique<TFAddGraphUpdate>(tf_add, add_input_names);
+ auto tf_add_update = std::make_unique<TFAddGraphUpdate>(tf_add, add_input_names);
updates->enroll(std::move(tf_add_update));
}
diff --git a/compiler/moco/import/src/Nodes/AvgPool.cpp b/compiler/moco/import/src/Nodes/AvgPool.cpp
index 6d7fd36bb..95232b977 100644
--- a/compiler/moco/import/src/Nodes/AvgPool.cpp
+++ b/compiler/moco/import/src/Nodes/AvgPool.cpp
@@ -22,10 +22,10 @@
#include "Convert.h"
#include <loco/IR/PermutingCodec.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
#include <oops/UserExn.h>
+#include <memory>
#include <cassert>
#include <stdexcept>
@@ -40,7 +40,7 @@ class TFAvgPoolGraphUpdate final : public GraphUpdate
{
public:
TFAvgPoolGraphUpdate(TFAvgPool *node, const TensorName &name)
- : _avgpool_node(node), _value_name(name)
+ : _avgpool_node(node), _value_name(name)
{
}
@@ -127,7 +127,7 @@ void AvgPoolGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCon
tensor_names->enroll(output_name, avgPool_node);
// Record ifm inputs to featureEncode_node
- auto update = stdex::make_unique<TFAvgPoolGraphUpdate>(avgPool_node, TensorName(node.input(0)));
+ auto update = std::make_unique<TFAvgPoolGraphUpdate>(avgPool_node, TensorName(node.input(0)));
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/BiasAdd.cpp b/compiler/moco/import/src/Nodes/BiasAdd.cpp
index a3eb91116..d4bc161d5 100644
--- a/compiler/moco/import/src/Nodes/BiasAdd.cpp
+++ b/compiler/moco/import/src/Nodes/BiasAdd.cpp
@@ -22,10 +22,10 @@
#include <loco.h>
#include <loco/IR/PermutingCodec.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
#include <oops/UserExn.h>
+#include <memory>
#include <cassert>
#include <vector>
@@ -37,7 +37,7 @@ class TFBiasAddGraphUpdate final : public GraphUpdate
{
public:
TFBiasAddGraphUpdate(TFBiasAdd *biasadd, std::vector<TensorName> &names)
- : _biasadd(biasadd), _names(names)
+ : _biasadd(biasadd), _names(names)
{
}
@@ -115,7 +115,7 @@ void BiasAddGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCon
input_names.push_back(TensorName(node.input(0)));
input_names.push_back(TensorName(node.input(1)));
- auto update = stdex::make_unique<TFBiasAddGraphUpdate>(tf_bias_add, input_names);
+ auto update = std::make_unique<TFBiasAddGraphUpdate>(tf_bias_add, input_names);
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Concat.cpp b/compiler/moco/import/src/Nodes/Concat.cpp
index 8bf8a84b5..dea60a737 100644
--- a/compiler/moco/import/src/Nodes/Concat.cpp
+++ b/compiler/moco/import/src/Nodes/Concat.cpp
@@ -21,9 +21,9 @@
#include <moco/Names.h>
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
+#include <memory>
#include <cassert>
namespace
@@ -35,7 +35,7 @@ class TFConcatV2GraphUpdate final : public GraphUpdate
{
public:
TFConcatV2GraphUpdate(TFConcatV2 *node, std::vector<TensorName> names)
- : _node(node), _names(names)
+ : _node(node), _names(names)
{
}
@@ -102,7 +102,7 @@ void ConcatV2GraphBuilder::build(const tensorflow::NodeDef &node,
TensorName output_name(node.name(), 0);
tensor_names->enroll(output_name, concat_node);
- auto update = stdex::make_unique<TFConcatV2GraphUpdate>(concat_node, input_names);
+ auto update = std::make_unique<TFConcatV2GraphUpdate>(concat_node, input_names);
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Const.cpp b/compiler/moco/import/src/Nodes/Const.cpp
index 15ea717db..7744cf889 100644
--- a/compiler/moco/import/src/Nodes/Const.cpp
+++ b/compiler/moco/import/src/Nodes/Const.cpp
@@ -228,7 +228,7 @@ void ConstGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderConte
read_value_float32(const_node, num_elements, input_tensor);
break;
- // TODO support other types
+ // TODO support other types
default:
assert(false);
diff --git a/compiler/moco/import/src/Nodes/Conv2D.cpp b/compiler/moco/import/src/Nodes/Conv2D.cpp
index e6b98dcd1..acb9f76c6 100644
--- a/compiler/moco/import/src/Nodes/Conv2D.cpp
+++ b/compiler/moco/import/src/Nodes/Conv2D.cpp
@@ -24,10 +24,10 @@
#include <loco.h>
#include <loco/IR/PermutingCodec.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
#include <oops/UserExn.h>
+#include <memory>
#include <cassert>
#include <stdexcept>
#include <algorithm>
@@ -131,7 +131,7 @@ void Conv2DGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCont
input_names.push_back(TensorName(node.input(1))); // kernel
// Record ifm inputs to featureEncode_node
- auto tfconv2d_update = stdex::make_unique<TFConv2DGraphUpdate>(conv2d, input_names);
+ auto tfconv2d_update = std::make_unique<TFConv2DGraphUpdate>(conv2d, input_names);
updates->enroll(std::move(tfconv2d_update));
}
diff --git a/compiler/moco/import/src/Nodes/Conv2DBackpropInput.cpp b/compiler/moco/import/src/Nodes/Conv2DBackpropInput.cpp
index 74c6605ab..10fee9a8e 100644
--- a/compiler/moco/import/src/Nodes/Conv2DBackpropInput.cpp
+++ b/compiler/moco/import/src/Nodes/Conv2DBackpropInput.cpp
@@ -21,10 +21,11 @@
#include "Convert.h"
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
#include <oops/UserExn.h>
+#include <memory>
+
namespace
{
using namespace moco;
@@ -34,7 +35,7 @@ class Conv2DBackpropInputGraphUpdate final : public GraphUpdate
{
public:
Conv2DBackpropInputGraphUpdate(TFConv2DBackpropInput *node, std::vector<TensorName> names)
- : _node(node), _input_names(names)
+ : _node(node), _input_names(names)
{
// DO NOTHING
}
@@ -132,7 +133,7 @@ void Conv2DBackpropInputGraphBuilder::build(const tensorflow::NodeDef &node,
// update
auto conv2d_backprop_update =
- stdex::make_unique<Conv2DBackpropInputGraphUpdate>(conv2d_backprop, input_names);
+ std::make_unique<Conv2DBackpropInputGraphUpdate>(conv2d_backprop, input_names);
updates->enroll(std::move(conv2d_backprop_update));
}
diff --git a/compiler/moco/import/src/Nodes/DepthwiseConv2dNative.cpp b/compiler/moco/import/src/Nodes/DepthwiseConv2dNative.cpp
index 3991a4d51..62e57207d 100644
--- a/compiler/moco/import/src/Nodes/DepthwiseConv2dNative.cpp
+++ b/compiler/moco/import/src/Nodes/DepthwiseConv2dNative.cpp
@@ -24,9 +24,9 @@
#include <plier/tf/Convert.h>
#include <loco/IR/PermutingCodec.h>
-#include <stdex/Memory.h>
#include <oops/UserExn.h>
+#include <memory>
#include <cassert>
using namespace plier::tf;
@@ -39,7 +39,7 @@ class TFDepthwiseConv2dNativeGraphUpdate final : public GraphUpdate
{
public:
TFDepthwiseConv2dNativeGraphUpdate(TFDepthwiseConv2dNative *node, std::vector<TensorName> names)
- : _node(node), _names(names)
+ : _node(node), _names(names)
{
}
@@ -139,8 +139,8 @@ void DepthwiseConv2dNativeGraphBuilder::build(const tensorflow::NodeDef &node,
input_names.push_back(TensorName(node.input(1))); // kernel
// Record ifm inputs to featureEncode_node
- auto tfdepthwiseconv2dnative_update = stdex::make_unique<TFDepthwiseConv2dNativeGraphUpdate>(
- depthwiseconv2d_native_node, input_names);
+ auto tfdepthwiseconv2dnative_update =
+ std::make_unique<TFDepthwiseConv2dNativeGraphUpdate>(depthwiseconv2d_native_node, input_names);
updates->enroll(std::move(tfdepthwiseconv2dnative_update));
}
diff --git a/compiler/moco/import/src/Nodes/FakeQuantWithMinMaxVars.cpp b/compiler/moco/import/src/Nodes/FakeQuantWithMinMaxVars.cpp
index d2fa3d1eb..0bd354dc5 100644
--- a/compiler/moco/import/src/Nodes/FakeQuantWithMinMaxVars.cpp
+++ b/compiler/moco/import/src/Nodes/FakeQuantWithMinMaxVars.cpp
@@ -24,8 +24,8 @@
#include <plier/tf/Convert.h>
#include <loco/IR/PermutingCodec.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <cassert>
using namespace plier::tf;
@@ -39,7 +39,7 @@ class TFFakeQuantWithMinMaxVarsGraphUpdate final : public GraphUpdate
public:
TFFakeQuantWithMinMaxVarsGraphUpdate(TFFakeQuantWithMinMaxVars *node,
std::vector<TensorName> names)
- : _node(node), _names(names)
+ : _node(node), _names(names)
{
}
@@ -115,7 +115,7 @@ void FakeQuantWithMinMaxVarsGraphBuilder::build(const tensorflow::NodeDef &node,
// Record ifm inputs to featureEncode_node
auto tffakequant_update =
- stdex::make_unique<TFFakeQuantWithMinMaxVarsGraphUpdate>(fakequant_node, input_names);
+ std::make_unique<TFFakeQuantWithMinMaxVarsGraphUpdate>(fakequant_node, input_names);
updates->enroll(std::move(tffakequant_update));
}
diff --git a/compiler/moco/import/src/Nodes/FusedBatchNorm.cpp b/compiler/moco/import/src/Nodes/FusedBatchNorm.cpp
index 59f98017c..8fc439ae3 100644
--- a/compiler/moco/import/src/Nodes/FusedBatchNorm.cpp
+++ b/compiler/moco/import/src/Nodes/FusedBatchNorm.cpp
@@ -19,9 +19,10 @@
#include <moco/IR/Nodes/TFFusedBatchNorm.h>
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
+#include <memory>
+
namespace
{
@@ -34,7 +35,7 @@ class FusedBatchNormGraphUpdate final : public GraphUpdate
{
public:
FusedBatchNormGraphUpdate(TFFusedBatchNorm *node, std::vector<TensorName> names)
- : _node(node), _names(names)
+ : _node(node), _names(names)
{
}
@@ -95,7 +96,7 @@ void FusedBatchNormGraphBuilder::build(const tensorflow::NodeDef &node,
fbn_input_names.push_back(TensorName(node.input(3))); // mean
fbn_input_names.push_back(TensorName(node.input(4))); // variance
- auto tf_fbn_update = stdex::make_unique<FusedBatchNormGraphUpdate>(tf_fbn, fbn_input_names);
+ auto tf_fbn_update = std::make_unique<FusedBatchNormGraphUpdate>(tf_fbn, fbn_input_names);
updates->enroll(std::move(tf_fbn_update));
}
diff --git a/compiler/moco/import/src/Nodes/Identity.cpp b/compiler/moco/import/src/Nodes/Identity.cpp
index 8ca0e2d01..c3b912b48 100644
--- a/compiler/moco/import/src/Nodes/Identity.cpp
+++ b/compiler/moco/import/src/Nodes/Identity.cpp
@@ -20,8 +20,8 @@
#include <moco/Names.h>
#include <loco.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <vector>
namespace
@@ -33,7 +33,7 @@ class TFIdentityGraphUpdate final : public GraphUpdate
{
public:
TFIdentityGraphUpdate(TFIdentity *node, const std::vector<TensorName> &names)
- : _node(node), _names(names)
+ : _node(node), _names(names)
{
}
@@ -88,7 +88,7 @@ void IdentityGraphBuilder::build(const tensorflow::NodeDef &node,
{
names.emplace_back(TensorName(node.input(i)));
}
- auto update = stdex::make_unique<TFIdentityGraphUpdate>(identity_node, names);
+ auto update = std::make_unique<TFIdentityGraphUpdate>(identity_node, names);
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/MaxPool.cpp b/compiler/moco/import/src/Nodes/MaxPool.cpp
index 63275a3b8..cf4b21224 100644
--- a/compiler/moco/import/src/Nodes/MaxPool.cpp
+++ b/compiler/moco/import/src/Nodes/MaxPool.cpp
@@ -24,10 +24,10 @@
#include <loco.h>
#include <loco/IR/PermutingCodec.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
#include <oops/UserExn.h>
+#include <memory>
#include <cassert>
#include <stdexcept>
@@ -40,7 +40,7 @@ class TFMaxPoolGraphUpdate final : public GraphUpdate
{
public:
TFMaxPoolGraphUpdate(TFMaxPool *node, const TensorName &name)
- : _maxpool_node(node), _input_name(name)
+ : _maxpool_node(node), _input_name(name)
{
}
@@ -132,7 +132,7 @@ void MaxPoolGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCon
tensor_names->enroll(output_name, maxPool_node);
// Record ifm inputs to featureEncode_node
- auto update = stdex::make_unique<TFMaxPoolGraphUpdate>(maxPool_node, TensorName(node.input(0)));
+ auto update = std::make_unique<TFMaxPoolGraphUpdate>(maxPool_node, TensorName(node.input(0)));
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Maximum.cpp b/compiler/moco/import/src/Nodes/Maximum.cpp
index 43bbbabe6..d2d039f27 100644
--- a/compiler/moco/import/src/Nodes/Maximum.cpp
+++ b/compiler/moco/import/src/Nodes/Maximum.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFMaximum.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -80,7 +81,7 @@ void MaximumGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCon
add_input_names.push_back(TensorName(node.input(0))); // x
add_input_names.push_back(TensorName(node.input(1))); // y
- auto tf_maximum_update = stdex::make_unique<TFMaximumGraphUpdate>(tf_maximum, add_input_names);
+ auto tf_maximum_update = std::make_unique<TFMaximumGraphUpdate>(tf_maximum, add_input_names);
updates->enroll(std::move(tf_maximum_update));
}
diff --git a/compiler/moco/import/src/Nodes/Mean.cpp b/compiler/moco/import/src/Nodes/Mean.cpp
index 30fb0f1f7..3f559bc41 100644
--- a/compiler/moco/import/src/Nodes/Mean.cpp
+++ b/compiler/moco/import/src/Nodes/Mean.cpp
@@ -19,9 +19,10 @@
#include <moco/IR/Nodes/TFMean.h>
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
+#include <memory>
+
namespace
{
using namespace moco;
@@ -34,7 +35,7 @@ class MeanGraphUpdate final : public GraphUpdate
public:
MeanGraphUpdate(TFMean *node, const TensorName &&input_name,
const TensorName &&reduction_indices_name)
- : _node(node), _input_name(input_name), _reduction_indices_name(reduction_indices_name)
+ : _node(node), _input_name(input_name), _reduction_indices_name(reduction_indices_name)
{
// DO NOTHING
}
@@ -91,8 +92,8 @@ void MeanGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContex
TensorName output_name(node.name(), 0);
tensor_names->enroll(output_name, tf_mean);
- auto update = stdex::make_unique<MeanGraphUpdate>(tf_mean, TensorName(node.input(0)),
- TensorName(node.input(1)));
+ auto update = std::make_unique<MeanGraphUpdate>(tf_mean, TensorName(node.input(0)),
+ TensorName(node.input(1)));
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Mul.cpp b/compiler/moco/import/src/Nodes/Mul.cpp
index ab926b59e..91c5a60e5 100644
--- a/compiler/moco/import/src/Nodes/Mul.cpp
+++ b/compiler/moco/import/src/Nodes/Mul.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFMul.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -78,7 +79,7 @@ void MulGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContext
add_input_names.push_back(TensorName(node.input(0))); // x
add_input_names.push_back(TensorName(node.input(1))); // y
- auto tf_mul_update = stdex::make_unique<TFMulGraphUpdate>(tf_mul, add_input_names);
+ auto tf_mul_update = std::make_unique<TFMulGraphUpdate>(tf_mul, add_input_names);
updates->enroll(std::move(tf_mul_update));
}
diff --git a/compiler/moco/import/src/Nodes/Pack.cpp b/compiler/moco/import/src/Nodes/Pack.cpp
index 45815a30e..153ee44ef 100644
--- a/compiler/moco/import/src/Nodes/Pack.cpp
+++ b/compiler/moco/import/src/Nodes/Pack.cpp
@@ -23,9 +23,9 @@
#include <loco.h>
#include <loco/IR/NodeShape.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
+#include <memory>
#include <cassert>
namespace
@@ -95,7 +95,7 @@ void PackGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContex
TensorName output_name(node.name(), 0);
tensor_names->enroll(output_name, pack_node);
- auto update = stdex::make_unique<TFPackGraphUpdate>(pack_node, input_names);
+ auto update = std::make_unique<TFPackGraphUpdate>(pack_node, input_names);
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Pad.cpp b/compiler/moco/import/src/Nodes/Pad.cpp
index 262a68fa0..c1f466b44 100644
--- a/compiler/moco/import/src/Nodes/Pad.cpp
+++ b/compiler/moco/import/src/Nodes/Pad.cpp
@@ -19,9 +19,10 @@
#include <moco/IR/Nodes/TFPad.h>
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
+#include <memory>
+
namespace
{
@@ -84,7 +85,7 @@ void PadGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContext
add_input_names.push_back(TensorName(node.input(1))); // paddings
// Queue node input update
- auto tf_pad_update = stdex::make_unique<TFPadGraphUpdate>(tf_pad, add_input_names);
+ auto tf_pad_update = std::make_unique<TFPadGraphUpdate>(tf_pad, add_input_names);
updates->enroll(std::move(tf_pad_update));
}
diff --git a/compiler/moco/import/src/Nodes/RealDiv.cpp b/compiler/moco/import/src/Nodes/RealDiv.cpp
index de3d57673..c747a2fb3 100644
--- a/compiler/moco/import/src/Nodes/RealDiv.cpp
+++ b/compiler/moco/import/src/Nodes/RealDiv.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFRealDiv.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -79,7 +80,7 @@ void RealDivGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCon
div_input_names.push_back(TensorName(node.input(0))); // x
div_input_names.push_back(TensorName(node.input(1))); // y
- auto tf_div_update = stdex::make_unique<TFRealDivGraphUpdate>(tf_div, div_input_names);
+ auto tf_div_update = std::make_unique<TFRealDivGraphUpdate>(tf_div, div_input_names);
updates->enroll(std::move(tf_div_update));
}
diff --git a/compiler/moco/import/src/Nodes/Relu.cpp b/compiler/moco/import/src/Nodes/Relu.cpp
index eedc8155d..c99e484e2 100644
--- a/compiler/moco/import/src/Nodes/Relu.cpp
+++ b/compiler/moco/import/src/Nodes/Relu.cpp
@@ -20,8 +20,8 @@
#include <moco/Names.h>
#include <loco.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <cassert>
#include <stdexcept>
@@ -79,7 +79,7 @@ void ReluGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContex
tensor_names->enroll(output_name, relu_node);
// Queue node input update
- auto update = stdex::make_unique<TFReluGraphUpdate>(relu_node, TensorName(node.input(0)));
+ auto update = std::make_unique<TFReluGraphUpdate>(relu_node, TensorName(node.input(0)));
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Relu6.cpp b/compiler/moco/import/src/Nodes/Relu6.cpp
index 4700ba408..b7bbac5ce 100644
--- a/compiler/moco/import/src/Nodes/Relu6.cpp
+++ b/compiler/moco/import/src/Nodes/Relu6.cpp
@@ -18,7 +18,7 @@
#include <moco/IR/Nodes/TFRelu6.h>
-#include <stdex/Memory.h>
+#include <memory>
namespace
{
@@ -73,7 +73,7 @@ void Relu6GraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderConte
tensor_names->enroll(output_name, relu_node);
// Queue node input update
- auto update = stdex::make_unique<TFRelu6GraphUpdate>(relu_node, TensorName(node.input(0)));
+ auto update = std::make_unique<TFRelu6GraphUpdate>(relu_node, TensorName(node.input(0)));
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Reshape.cpp b/compiler/moco/import/src/Nodes/Reshape.cpp
index 26e22513f..bdcafbf70 100644
--- a/compiler/moco/import/src/Nodes/Reshape.cpp
+++ b/compiler/moco/import/src/Nodes/Reshape.cpp
@@ -21,8 +21,8 @@
#include <moco/Names.h>
#include <plier/tf/Convert.h>
#include <loco.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <cassert>
#include <stdexcept>
@@ -94,7 +94,7 @@ void ReshapeGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCon
input_names.push_back(TensorName(node.input(1))); // shape
// Queue node input update
- auto update = stdex::make_unique<ReshapeGraphUpdate>(reshape, input_names);
+ auto update = std::make_unique<ReshapeGraphUpdate>(reshape, input_names);
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Rsqrt.cpp b/compiler/moco/import/src/Nodes/Rsqrt.cpp
index 979ac90c9..f96d99b68 100644
--- a/compiler/moco/import/src/Nodes/Rsqrt.cpp
+++ b/compiler/moco/import/src/Nodes/Rsqrt.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFRsqrt.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -74,8 +75,7 @@ void RsqrtGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderConte
tensor_names->enroll(output_name, tf_rsqrt);
// Queue node input update
- auto tf_rsqrt_update =
- stdex::make_unique<TFRsqrtGraphUpdate>(tf_rsqrt, TensorName(node.input(0)));
+ auto tf_rsqrt_update = std::make_unique<TFRsqrtGraphUpdate>(tf_rsqrt, TensorName(node.input(0)));
updates->enroll(std::move(tf_rsqrt_update));
}
diff --git a/compiler/moco/import/src/Nodes/Shape.cpp b/compiler/moco/import/src/Nodes/Shape.cpp
index 1e112ebb0..b7eb339ef 100644
--- a/compiler/moco/import/src/Nodes/Shape.cpp
+++ b/compiler/moco/import/src/Nodes/Shape.cpp
@@ -19,9 +19,10 @@
#include <moco/IR/Nodes/TFShape.h>
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
+#include <memory>
+
namespace
{
using namespace moco;
@@ -33,7 +34,7 @@ class ShapeGraphUpdate final : public GraphUpdate
{
public:
ShapeGraphUpdate(TFShape *node, const TensorName &&input_name)
- : _node(node), _input_name(input_name)
+ : _node(node), _input_name(input_name)
{
// DO NOTHING
}
@@ -93,7 +94,7 @@ void ShapeGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderConte
TensorName output_name(node.name(), 0);
tensor_names->enroll(output_name, tf_shape);
- auto update = stdex::make_unique<ShapeGraphUpdate>(tf_shape, TensorName(node.input(0)));
+ auto update = std::make_unique<ShapeGraphUpdate>(tf_shape, TensorName(node.input(0)));
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Softmax.cpp b/compiler/moco/import/src/Nodes/Softmax.cpp
index 6f2c609ff..4fa962750 100644
--- a/compiler/moco/import/src/Nodes/Softmax.cpp
+++ b/compiler/moco/import/src/Nodes/Softmax.cpp
@@ -19,21 +19,22 @@
#include <moco/IR/Nodes/TFSoftmax.h>
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
+#include <memory>
+
namespace
{
using namespace moco;
/**
-* @brief GraphUpdate for Softmax node
-*/
+ * @brief GraphUpdate for Softmax node
+ */
class SoftmaxGraphUpdate final : public GraphUpdate
{
public:
SoftmaxGraphUpdate(TFSoftmax *node, const TensorName &&input_name)
- : _node(node), _input_name(input_name)
+ : _node(node), _input_name(input_name)
{
// DO NOTHING
}
@@ -79,7 +80,7 @@ void SoftmaxGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCon
TensorName output_name(node.name(), 0);
tensor_names->enroll(output_name, tf_softmax);
- auto update = stdex::make_unique<SoftmaxGraphUpdate>(tf_softmax, TensorName(node.input(0)));
+ auto update = std::make_unique<SoftmaxGraphUpdate>(tf_softmax, TensorName(node.input(0)));
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/Sqrt.cpp b/compiler/moco/import/src/Nodes/Sqrt.cpp
index f891e48f6..0dbe15ede 100644
--- a/compiler/moco/import/src/Nodes/Sqrt.cpp
+++ b/compiler/moco/import/src/Nodes/Sqrt.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFSqrt.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -74,7 +75,7 @@ void SqrtGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContex
tensor_names->enroll(output_name, tf_sqrt);
// Queue node input update
- auto tf_sqrt_update = stdex::make_unique<TFSqrtGraphUpdate>(tf_sqrt, TensorName(node.input(0)));
+ auto tf_sqrt_update = std::make_unique<TFSqrtGraphUpdate>(tf_sqrt, TensorName(node.input(0)));
updates->enroll(std::move(tf_sqrt_update));
}
diff --git a/compiler/moco/import/src/Nodes/SquaredDifference.cpp b/compiler/moco/import/src/Nodes/SquaredDifference.cpp
index 17a1fe93d..441f02a19 100644
--- a/compiler/moco/import/src/Nodes/SquaredDifference.cpp
+++ b/compiler/moco/import/src/Nodes/SquaredDifference.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFSquaredDifference.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -33,7 +34,7 @@ class TFSquaredDifferenceGraphUpdate final : public GraphUpdate
{
public:
TFSquaredDifferenceGraphUpdate(TFSquaredDifference *node, std::vector<TensorName> names)
- : _node(node), _names(names)
+ : _node(node), _names(names)
{
}
@@ -85,7 +86,7 @@ void SquaredDifferenceGraphBuilder::build(const tensorflow::NodeDef &node,
// Queue node input update
auto tf_sqrt_update =
- stdex::make_unique<TFSquaredDifferenceGraphUpdate>(tf_sqdiff, add_input_names);
+ std::make_unique<TFSquaredDifferenceGraphUpdate>(tf_sqdiff, add_input_names);
updates->enroll(std::move(tf_sqrt_update));
}
diff --git a/compiler/moco/import/src/Nodes/Squeeze.cpp b/compiler/moco/import/src/Nodes/Squeeze.cpp
index 1b4ebae6f..b013b840f 100644
--- a/compiler/moco/import/src/Nodes/Squeeze.cpp
+++ b/compiler/moco/import/src/Nodes/Squeeze.cpp
@@ -21,10 +21,11 @@
#include <moco/Names.h>
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
#include <oops/UserExn.h>
+#include <memory>
+
namespace
{
using namespace moco;
@@ -36,7 +37,7 @@ class SqueezeGraphUpdate final : public GraphUpdate
{
public:
SqueezeGraphUpdate(TFSqueeze *node, const TensorName &&input_name)
- : _node(node), _input_name(input_name)
+ : _node(node), _input_name(input_name)
{
// DO NOTHING
}
@@ -105,7 +106,7 @@ void SqueezeGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderCon
TensorName output_name(node.name(), 0);
tensor_names->enroll(output_name, tf_squeeze);
- auto update = stdex::make_unique<SqueezeGraphUpdate>(tf_squeeze, TensorName(node.input(0)));
+ auto update = std::make_unique<SqueezeGraphUpdate>(tf_squeeze, TensorName(node.input(0)));
updates->enroll(std::move(update));
}
diff --git a/compiler/moco/import/src/Nodes/StopGradient.cpp b/compiler/moco/import/src/Nodes/StopGradient.cpp
index 9caec6943..82f49dc4a 100644
--- a/compiler/moco/import/src/Nodes/StopGradient.cpp
+++ b/compiler/moco/import/src/Nodes/StopGradient.cpp
@@ -20,7 +20,8 @@
#include <loco.h>
#include <plier/tf/Convert.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -80,7 +81,7 @@ void StopGradientGraphBuilder::build(const tensorflow::NodeDef &node,
// Queue node input update
auto tf_stopgradient_update =
- stdex::make_unique<TFStopGradientGraphUpdate>(tf_stopgradient, TensorName(node.input(0)));
+ std::make_unique<TFStopGradientGraphUpdate>(tf_stopgradient, TensorName(node.input(0)));
updates->enroll(std::move(tf_stopgradient_update));
}
diff --git a/compiler/moco/import/src/Nodes/StridedSlice.cpp b/compiler/moco/import/src/Nodes/StridedSlice.cpp
index 06d388be0..b0744a7e2 100644
--- a/compiler/moco/import/src/Nodes/StridedSlice.cpp
+++ b/compiler/moco/import/src/Nodes/StridedSlice.cpp
@@ -24,10 +24,11 @@
#include "Convert.h"
#include <loco.h>
-#include <stdex/Memory.h>
#include <plier/tf/Convert.h>
#include <oops/UserExn.h>
+#include <memory>
+
namespace
{
using namespace moco;
@@ -36,7 +37,7 @@ class TFStridedSliceGraphUpdate final : public GraphUpdate
{
public:
TFStridedSliceGraphUpdate(TFStridedSlice *node, std::vector<TensorName> names)
- : _node(node), _names(names)
+ : _node(node), _names(names)
{
}
@@ -179,7 +180,7 @@ void StridedSliceGraphBuilder::build(const tensorflow::NodeDef &node,
input_names.push_back(TensorName(node.input(2))); // end
input_names.push_back(TensorName(node.input(3))); // strides
- auto tfconv2d_update = stdex::make_unique<TFStridedSliceGraphUpdate>(stridedslice, input_names);
+ auto tfconv2d_update = std::make_unique<TFStridedSliceGraphUpdate>(stridedslice, input_names);
updates->enroll(std::move(tfconv2d_update));
}
diff --git a/compiler/moco/import/src/Nodes/Sub.cpp b/compiler/moco/import/src/Nodes/Sub.cpp
index bdad81d67..4a657663e 100644
--- a/compiler/moco/import/src/Nodes/Sub.cpp
+++ b/compiler/moco/import/src/Nodes/Sub.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFSub.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -78,7 +79,7 @@ void SubGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContext
sub_input_names.push_back(TensorName(node.input(0))); // x
sub_input_names.push_back(TensorName(node.input(1))); // y
- auto tf_sub_update = stdex::make_unique<TFSubGraphUpdate>(tf_sub, sub_input_names);
+ auto tf_sub_update = std::make_unique<TFSubGraphUpdate>(tf_sub, sub_input_names);
updates->enroll(std::move(tf_sub_update));
}
diff --git a/compiler/moco/import/src/Nodes/Tanh.cpp b/compiler/moco/import/src/Nodes/Tanh.cpp
index c89fa862a..3a0b0a334 100644
--- a/compiler/moco/import/src/Nodes/Tanh.cpp
+++ b/compiler/moco/import/src/Nodes/Tanh.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/Nodes/TFTanh.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
namespace
{
@@ -74,7 +75,7 @@ void TanhGraphBuilder::build(const tensorflow::NodeDef &node, GraphBuilderContex
tensor_names->enroll(output_name, tf_tanh);
// Queue node input update
- auto tf_tanh_update = stdex::make_unique<TFTanhGraphUpdate>(tf_tanh, TensorName(node.input(0)));
+ auto tf_tanh_update = std::make_unique<TFTanhGraphUpdate>(tf_tanh, TensorName(node.input(0)));
updates->enroll(std::move(tf_tanh_update));
}
diff --git a/compiler/moco/import/src/TestHelper.test.cpp b/compiler/moco/import/src/TestHelper.test.cpp
index 06c3dd372..d0390ad32 100644
--- a/compiler/moco/import/src/TestHelper.test.cpp
+++ b/compiler/moco/import/src/TestHelper.test.cpp
@@ -17,7 +17,8 @@
#include "TestHelper.h"
#include <moco/IR/Nodes/TFConst.h>
-#include <stdex/Memory.h>
+
+#include <memory>
#include <gtest/gtest.h>
@@ -29,7 +30,7 @@ namespace test
TFNodeBuildTester::TFNodeBuildTester()
{
_graph = loco::make_graph();
- _tensor_names = stdex::make_unique<moco::SymbolTable>();
+ _tensor_names = std::make_unique<moco::SymbolTable>();
}
void TFNodeBuildTester::inputs(const std::vector<std::string> &names)
@@ -71,8 +72,8 @@ void TFNodeBuildTester::run(tensorflow::NodeDef &nodedef, moco::GraphBuilder &gr
{
assert(_output != nullptr);
- auto node_defs = stdex::make_unique<moco::NodeDefTable>();
- auto updates = stdex::make_unique<moco::UpdateQueue>();
+ auto node_defs = std::make_unique<moco::NodeDefTable>();
+ auto updates = std::make_unique<moco::UpdateQueue>();
moco::GraphBuilderContext gb_context(_graph.get(), node_defs.get(), _tensor_names.get(),
updates.get());
diff --git a/compiler/moco/lang/CMakeLists.txt b/compiler/moco/lang/CMakeLists.txt
index a64fdf92a..2543f2563 100644
--- a/compiler/moco/lang/CMakeLists.txt
+++ b/compiler/moco/lang/CMakeLists.txt
@@ -7,7 +7,6 @@ target_include_directories(moco_lang PRIVATE src)
target_include_directories(moco_lang PUBLIC include)
target_link_libraries(moco_lang PUBLIC loco)
target_link_libraries(moco_lang PRIVATE nncc_common)
-target_link_libraries(moco_lang PRIVATE stdex)
install(TARGETS moco_lang DESTINATION lib) # moco_tf_frontend requires moco_lang
if(NOT ENABLE_TEST)
diff --git a/compiler/moco/lang/include/moco/IR/Nodes/TFConv2DBackpropInput.h b/compiler/moco/lang/include/moco/IR/Nodes/TFConv2DBackpropInput.h
index 43e620d24..69d867436 100644
--- a/compiler/moco/lang/include/moco/IR/Nodes/TFConv2DBackpropInput.h
+++ b/compiler/moco/lang/include/moco/IR/Nodes/TFConv2DBackpropInput.h
@@ -68,7 +68,7 @@ node {
* Note that this convention differs from loco's canonical convention.
*/
class TFConv2DBackpropInput final
- : public FixedArityNode<3, TFNodeImpl<TFOpcode::Conv2DBackpropInput>>
+ : public FixedArityNode<3, TFNodeImpl<TFOpcode::Conv2DBackpropInput>>
{
public:
loco::Node *input_sizes(void) const { return at(0)->node(); }
diff --git a/compiler/moco/lang/include/moco/IR/Nodes/TFDepthwiseConv2dNative.h b/compiler/moco/lang/include/moco/IR/Nodes/TFDepthwiseConv2dNative.h
index aefc0b5d9..2d7fa0c10 100644
--- a/compiler/moco/lang/include/moco/IR/Nodes/TFDepthwiseConv2dNative.h
+++ b/compiler/moco/lang/include/moco/IR/Nodes/TFDepthwiseConv2dNative.h
@@ -25,7 +25,7 @@ namespace moco
{
class TFDepthwiseConv2dNative final
- : public FixedArityNode<2, TFNodeImpl<TFOpcode::DepthwiseConv2dNative>>
+ : public FixedArityNode<2, TFNodeImpl<TFOpcode::DepthwiseConv2dNative>>
{
public:
loco::Node *input(void) const { return at(0)->node(); }
diff --git a/compiler/moco/lang/include/moco/IR/Nodes/TFFakeQuantWithMinMaxVars.h b/compiler/moco/lang/include/moco/IR/Nodes/TFFakeQuantWithMinMaxVars.h
index ec54da596..55baac7de 100644
--- a/compiler/moco/lang/include/moco/IR/Nodes/TFFakeQuantWithMinMaxVars.h
+++ b/compiler/moco/lang/include/moco/IR/Nodes/TFFakeQuantWithMinMaxVars.h
@@ -25,7 +25,7 @@ namespace moco
{
class TFFakeQuantWithMinMaxVars final
- : public FixedArityNode<3, TFNodeImpl<TFOpcode::FakeQuantWithMinMaxVars>>
+ : public FixedArityNode<3, TFNodeImpl<TFOpcode::FakeQuantWithMinMaxVars>>
{
public:
loco::Node *inputs(void) const { return at(0)->node(); }
diff --git a/compiler/moco/lang/src/IR/TFDialect.cpp b/compiler/moco/lang/src/IR/TFDialect.cpp
index 35bbcc2c9..959ef98f5 100644
--- a/compiler/moco/lang/src/IR/TFDialect.cpp
+++ b/compiler/moco/lang/src/IR/TFDialect.cpp
@@ -21,8 +21,7 @@
#include <loco/IR/GraphInputIndex.h>
#include <loco/IR/GraphOutputIndex.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
#include <stdexcept>
@@ -78,8 +77,8 @@ namespace moco
TFDialect::TFDialect()
{
- service<loco::GraphInputIndexQueryService>(stdex::make_unique<GiiQueryServiceImpl>());
- service<loco::GraphOutputIndexQueryService>(stdex::make_unique<GoiQueryServiceImpl>());
+ service<loco::GraphInputIndexQueryService>(std::make_unique<GiiQueryServiceImpl>());
+ service<loco::GraphOutputIndexQueryService>(std::make_unique<GoiQueryServiceImpl>());
}
loco::Dialect *TFDialect::get(void)
diff --git a/compiler/moco/lang/src/IR/TFNode.cpp b/compiler/moco/lang/src/IR/TFNode.cpp
index ab9356196..55c0e0c64 100644
--- a/compiler/moco/lang/src/IR/TFNode.cpp
+++ b/compiler/moco/lang/src/IR/TFNode.cpp
@@ -17,6 +17,7 @@
#include "moco/IR/TFNode.h"
#include "moco/IR/TFDialect.h"
+#include <memory>
#include <cassert>
namespace moco
@@ -26,9 +27,6 @@ const loco::Dialect *TFNode::dialect(void) const { return TFDialect::get(); }
} // namespace moco
-// TODO move this to appropriate place
-#include <stdex/Memory.h>
-
namespace moco
{
@@ -60,7 +58,7 @@ loco::GraphInputIndex index(const TFPlaceholder *node)
void index(TFPlaceholder *node, const loco::GraphInputIndex index)
{
- node->annot(stdex::make_unique<GraphInputIndexAnnotation>(index));
+ node->annot(std::make_unique<GraphInputIndexAnnotation>(index));
}
loco::TensorShape tensor_shape(const TFPlaceholder *node)
diff --git a/compiler/moco/pass/CMakeLists.txt b/compiler/moco/pass/CMakeLists.txt
index 1eba86283..40c3d5a49 100644
--- a/compiler/moco/pass/CMakeLists.txt
+++ b/compiler/moco/pass/CMakeLists.txt
@@ -9,7 +9,6 @@ target_link_libraries(moco_pass PUBLIC loco)
target_link_libraries(moco_pass PUBLIC logo_core)
target_link_libraries(moco_pass PUBLIC moco_lang)
target_link_libraries(moco_pass PRIVATE moco_support)
-target_link_libraries(moco_pass PRIVATE stdex)
target_link_libraries(moco_pass PRIVATE oops)
install(TARGETS moco_pass DESTINATION lib)
@@ -23,4 +22,3 @@ GTest_AddTest(moco_pass_test ${TESTS})
target_include_directories(moco_pass_test PRIVATE src)
target_link_libraries(moco_pass_test moco_pass)
target_link_libraries(moco_pass_test moco_support)
-target_link_libraries(moco_pass_test stdex)
diff --git a/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldMul.h b/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldMul.h
index 5528b8612..a5e25a0ce 100644
--- a/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldMul.h
+++ b/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldMul.h
@@ -26,7 +26,7 @@ namespace moco
/**
* @brief Constant folder for Const + Mul -> Const
-*/
+ */
class ConstantFoldMul : public logo::Pass
{
public:
diff --git a/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldPack.h b/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldPack.h
index fc6bc0ace..f99c633ac 100644
--- a/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldPack.h
+++ b/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldPack.h
@@ -28,7 +28,7 @@ namespace moco
/**
* @brief Constant folder for Const + Pack -> Const
-*/
+ */
class ConstantFoldPack : public logo::Pass
{
public:
diff --git a/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldStridedSlice.h b/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldStridedSlice.h
index 1e3492c2c..f57bdc05e 100644
--- a/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldStridedSlice.h
+++ b/compiler/moco/pass/include/moco/Pass/Passes/ConstantFoldStridedSlice.h
@@ -26,7 +26,7 @@ namespace moco
/**
* @brief Constant folder for Const + StridedSlice -> Const
-*/
+ */
class ConstantFoldStridedSlice : public logo::Pass
{
public:
diff --git a/compiler/moco/pass/include/moco/Pass/Passes/FuseBinaryIntoPreceding.h b/compiler/moco/pass/include/moco/Pass/Passes/FuseBinaryIntoPreceding.h
index 24e3567c0..4d5318c35 100644
--- a/compiler/moco/pass/include/moco/Pass/Passes/FuseBinaryIntoPreceding.h
+++ b/compiler/moco/pass/include/moco/Pass/Passes/FuseBinaryIntoPreceding.h
@@ -26,7 +26,7 @@ namespace moco
/**
* @brief Fuse TFAdd, TFMul to preceding TFConv2D or TFDepthWiseConv2D
-*/
+ */
class FuseBinaryIntoPreceding : public logo::Pass
{
public:
diff --git a/compiler/moco/pass/include/moco/Pass/Passes/ResolveFusedBatchNorm.h b/compiler/moco/pass/include/moco/Pass/Passes/ResolveFusedBatchNorm.h
index ce5ea0bb0..1910a9ac7 100644
--- a/compiler/moco/pass/include/moco/Pass/Passes/ResolveFusedBatchNorm.h
+++ b/compiler/moco/pass/include/moco/Pass/Passes/ResolveFusedBatchNorm.h
@@ -26,7 +26,7 @@ namespace moco
/**
 * @brief Transform TFFusedBatchNorm into TFAdd + TFRsqrt + TFMul + TFBatchNorm
-*/
+ */
class ResolveFusedBatchNorm : public logo::Pass
{
public:
diff --git a/compiler/moco/pass/src/ConstantFoldAdd.test.cpp b/compiler/moco/pass/src/ConstantFoldAdd.test.cpp
index bc9489fbd..fdfbfb8d3 100644
--- a/compiler/moco/pass/src/ConstantFoldAdd.test.cpp
+++ b/compiler/moco/pass/src/ConstantFoldAdd.test.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/TFNodes.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
#include <gtest/gtest.h>
@@ -60,7 +61,7 @@ TEST(ConstantFoldAdd, basic_vector)
}
setup_output_node(&graph, add_node);
- auto pass = stdex::make_unique<moco::ConstantFoldAdd>();
+ auto pass = std::make_unique<moco::ConstantFoldAdd>();
bool cont = true;
while (cont)
{
@@ -92,7 +93,7 @@ TEST(ConstantFoldAdd, basic_refinedet_1)
}
setup_output_node(&graph, add_node);
- auto pass = stdex::make_unique<moco::ConstantFoldAdd>();
+ auto pass = std::make_unique<moco::ConstantFoldAdd>();
bool cont = true;
while (cont)
{
diff --git a/compiler/moco/pass/src/ConstantFoldHelper.cpp b/compiler/moco/pass/src/ConstantFoldHelper.cpp
index 79b04863c..9dd5e00cd 100644
--- a/compiler/moco/pass/src/ConstantFoldHelper.cpp
+++ b/compiler/moco/pass/src/ConstantFoldHelper.cpp
@@ -164,7 +164,7 @@ void apply_binary_s32(const moco::TFConst *lhs, const moco::TFConst *rhs, moco::
for (uint32_t e = 0; e < nume; e++)
{
output->at<loco::DataType::S32>(e) =
- f.apply(lhs->at<loco::DataType::S32>(e), rhs->at<loco::DataType::S32>(e));
+ f.apply(lhs->at<loco::DataType::S32>(e), rhs->at<loco::DataType::S32>(e));
}
}
@@ -180,7 +180,7 @@ void apply_binary_f32(const moco::TFConst *lhs, const moco::TFConst *rhs, moco::
for (uint32_t e = 0; e < nume; e++)
{
output->at<loco::DataType::FLOAT32>(e) =
- f.apply(lhs->at<loco::DataType::FLOAT32>(e), rhs->at<loco::DataType::FLOAT32>(e));
+ f.apply(lhs->at<loco::DataType::FLOAT32>(e), rhs->at<loco::DataType::FLOAT32>(e));
}
}
diff --git a/compiler/moco/pass/src/ConstantFoldMul.test.cpp b/compiler/moco/pass/src/ConstantFoldMul.test.cpp
index 4e9b78fd4..c7e7d9e65 100644
--- a/compiler/moco/pass/src/ConstantFoldMul.test.cpp
+++ b/compiler/moco/pass/src/ConstantFoldMul.test.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/TFNodes.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
#include <gtest/gtest.h>
@@ -60,7 +61,7 @@ TEST(ConstantFoldMul, basic_vector)
}
setup_output_node(&graph, mul_node);
- auto pass = stdex::make_unique<moco::ConstantFoldMul>();
+ auto pass = std::make_unique<moco::ConstantFoldMul>();
bool cont = true;
while (cont)
{
@@ -92,7 +93,7 @@ TEST(ConstantFoldMul, basic_refinedet_1)
}
setup_output_node(&graph, mul_node);
- auto pass = stdex::make_unique<moco::ConstantFoldMul>();
+ auto pass = std::make_unique<moco::ConstantFoldMul>();
bool cont = true;
while (cont)
{
diff --git a/compiler/moco/pass/src/ConstantFoldPack.test.cpp b/compiler/moco/pass/src/ConstantFoldPack.test.cpp
index cb6eff0c8..c0fa48c7b 100644
--- a/compiler/moco/pass/src/ConstantFoldPack.test.cpp
+++ b/compiler/moco/pass/src/ConstantFoldPack.test.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/TFNodes.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
#include <gtest/gtest.h>
@@ -69,7 +70,7 @@ TEST(ConstantFoldPack, basic_scalar4_vector)
identity->input(pack_node);
setup_output_node(&graph, identity);
- auto pass = stdex::make_unique<moco::ConstantFoldPack>();
+ auto pass = std::make_unique<moco::ConstantFoldPack>();
bool cont = true;
while (cont)
{
diff --git a/compiler/moco/pass/src/ConstantFoldStridedSlice.test.cpp b/compiler/moco/pass/src/ConstantFoldStridedSlice.test.cpp
index b5bada221..3e8449977 100644
--- a/compiler/moco/pass/src/ConstantFoldStridedSlice.test.cpp
+++ b/compiler/moco/pass/src/ConstantFoldStridedSlice.test.cpp
@@ -19,7 +19,8 @@
#include <moco/IR/TFNodes.h>
#include <loco.h>
-#include <stdex/Memory.h>
+
+#include <memory>
#include <gtest/gtest.h>
@@ -83,7 +84,7 @@ TEST(ConstantFoldStridedSlice, basic_matrix55_11)
}
setup_output_node(&graph, sslice_node);
- auto pass = stdex::make_unique<moco::ConstantFoldStridedSlice>();
+ auto pass = std::make_unique<moco::ConstantFoldStridedSlice>();
bool cont = true;
while (cont)
{
@@ -121,7 +122,7 @@ TEST(ConstantFoldStridedSlice, basic_vector4_0)
}
setup_output_node(&graph, sslice_node);
- auto pass = stdex::make_unique<moco::ConstantFoldStridedSlice>();
+ auto pass = std::make_unique<moco::ConstantFoldStridedSlice>();
bool cont = true;
while (cont)
{
@@ -157,7 +158,7 @@ TEST(ConstantFoldStridedSlice, basic_vector4_1)
}
setup_output_node(&graph, sslice_node);
- auto pass = stdex::make_unique<moco::ConstantFoldStridedSlice>();
+ auto pass = std::make_unique<moco::ConstantFoldStridedSlice>();
bool cont = true;
while (cont)
{
@@ -193,7 +194,7 @@ TEST(ConstantFoldStridedSlice, basic_vector4_2)
}
setup_output_node(&graph, sslice_node);
- auto pass = stdex::make_unique<moco::ConstantFoldStridedSlice>();
+ auto pass = std::make_unique<moco::ConstantFoldStridedSlice>();
bool cont = true;
while (cont)
{
diff --git a/compiler/moco/pass/src/Passes/FuseBinaryIntoPreceding.cpp b/compiler/moco/pass/src/Passes/FuseBinaryIntoPreceding.cpp
index f97546a80..9374dd5f9 100644
--- a/compiler/moco/pass/src/Passes/FuseBinaryIntoPreceding.cpp
+++ b/compiler/moco/pass/src/Passes/FuseBinaryIntoPreceding.cpp
@@ -318,7 +318,7 @@ bool fuse_to_preceding(loco::Graph *graph, moco::TFMul *node)
fused_node = fused_conv_node<FuseType::Conv2D, moco::TFConv2D>(graph, mulparam, conv2d);
else if (auto dw_conv2d = dynamic_cast<moco::TFDepthwiseConv2dNative *>(precedingOp))
fused_node = fused_conv_node<FuseType::DepthwiseConv2D, moco::TFDepthwiseConv2dNative>(
- graph, mulparam, dw_conv2d);
+ graph, mulparam, dw_conv2d);
// Not ready yet
if (fused_node == nullptr)
@@ -515,7 +515,7 @@ bool FuseBinaryIntoPreceding::run(loco::Graph *graph)
}
}
{
- // TODO support Div
+ // TODO support Div
}
{
diff --git a/compiler/moco/pass/src/Passes/ResolveSquaredDifference.cpp b/compiler/moco/pass/src/Passes/ResolveSquaredDifference.cpp
index b66add1ae..44e92e9a7 100644
--- a/compiler/moco/pass/src/Passes/ResolveSquaredDifference.cpp
+++ b/compiler/moco/pass/src/Passes/ResolveSquaredDifference.cpp
@@ -24,8 +24,6 @@
#include <loco/IR/NodeShape.h>
#include <loco/Service/ShapeInference.h>
-#include <stdex/Memory.h>
-
namespace
{
diff --git a/compiler/moco/requires.cmake b/compiler/moco/requires.cmake
index 1a7d36454..18b3a76aa 100644
--- a/compiler/moco/requires.cmake
+++ b/compiler/moco/requires.cmake
@@ -1,6 +1,5 @@
require("loco")
require("locop")
-require("stdex")
require("moco-log")
require("plier-tf")
require("mio-tf")
diff --git a/compiler/moco/service/CMakeLists.txt b/compiler/moco/service/CMakeLists.txt
index dff0233b1..5213f718e 100644
--- a/compiler/moco/service/CMakeLists.txt
+++ b/compiler/moco/service/CMakeLists.txt
@@ -9,7 +9,6 @@ target_link_libraries(moco_service PUBLIC loco)
target_link_libraries(moco_service PUBLIC moco_lang)
target_link_libraries(moco_service PRIVATE moco_support)
target_link_libraries(moco_service PRIVATE nncc_common)
-target_link_libraries(moco_service PRIVATE stdex)
target_link_libraries(moco_service PRIVATE oops)
install(TARGETS moco_service DESTINATION lib)
diff --git a/compiler/moco/service/src/Service/TFShapeInferenceRule.cpp b/compiler/moco/service/src/Service/TFShapeInferenceRule.cpp
index 98434155e..6a9864dc5 100644
--- a/compiler/moco/service/src/Service/TFShapeInferenceRule.cpp
+++ b/compiler/moco/service/src/Service/TFShapeInferenceRule.cpp
@@ -302,7 +302,7 @@ public:
// output count is from input count, depth is from kernel 'CM' which is dim(2) * dim(3)
auto output_feature_shape = input_feature_shape;
output_feature_shape.depth() =
- loco::Dimension(ker_tensor_shape.dim(2).value() * ker_tensor_shape.dim(3).value());
+ loco::Dimension(ker_tensor_shape.dim(2).value() * ker_tensor_shape.dim(3).value());
auto output_plane_shape = infer_plane_shape(input_plane_shape);
diff --git a/compiler/moco/support/include/moco/Support/TFShapeInferenceHelper.h b/compiler/moco/support/include/moco/Support/TFShapeInferenceHelper.h
index 52324700a..c8a547681 100644
--- a/compiler/moco/support/include/moco/Support/TFShapeInferenceHelper.h
+++ b/compiler/moco/support/include/moco/Support/TFShapeInferenceHelper.h
@@ -136,11 +136,11 @@ protected:
if (_padding == "VALID")
{
res.height =
- (p.input.height.value() + p.stride.height.value() - p.effective_window.height.value()) /
- p.stride.height.value();
+ (p.input.height.value() + p.stride.height.value() - p.effective_window.height.value()) /
+ p.stride.height.value();
res.width =
- (p.input.width.value() + p.stride.width.value() - p.effective_window.width.value()) /
- p.stride.width.value();
+ (p.input.width.value() + p.stride.width.value() - p.effective_window.width.value()) /
+ p.stride.width.value();
}
else if (_padding == "SAME")
{
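The reflowed arithmetic above is the usual output-extent formula for VALID padding: out = (in + stride - window) / stride in integer division, which equals floor((in - window) / stride) + 1, i.e. the number of window positions that fit without padding. A small self-contained check with illustrative numbers (not values from the source):

    #include <cassert>
    #include <cstdint>

    // Output extent for VALID padding, written as the single integer division
    // used in the hunk above; `window` is the effective (dilated) window size.
    static uint32_t valid_out(uint32_t in, uint32_t window, uint32_t stride)
    {
      return (in + stride - window) / stride;
    }

    int main()
    {
      assert(valid_out(10, 3, 2) == 4); // window starts at 0, 2, 4, 6
      assert(valid_out(7, 3, 1) == 5);  // 7 - 3 + 1 positions
      return 0;
    }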
diff --git a/compiler/nest/core/include/nest/expr/AddNode.h b/compiler/nest/core/include/nest/expr/AddNode.h
index b9b5afb22..bb95692b6 100644
--- a/compiler/nest/core/include/nest/expr/AddNode.h
+++ b/compiler/nest/core/include/nest/expr/AddNode.h
@@ -30,7 +30,7 @@ class AddNode final : public Node
{
public:
AddNode(const std::shared_ptr<expr::Node> &lhs, const std::shared_ptr<expr::Node> &rhs)
- : _lhs{lhs}, _rhs{rhs}
+ : _lhs{lhs}, _rhs{rhs}
{
// DO NOTHING
}
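Several hunks in this area, like the AddNode constructor above, change only the leading whitespace of the member-initializer continuation line. As a sketch of the style the files converge on, where the exact indent width is an assumption rather than something the diff records:

    #include <memory>
    #include <utility>

    struct Node
    {
    };

    class AddNode
    {
    public:
      AddNode(std::shared_ptr<Node> lhs, std::shared_ptr<Node> rhs)
        : _lhs{std::move(lhs)}, _rhs{std::move(rhs)} // continuation indented two spaces
      {
        // DO NOTHING
      }

    private:
      std::shared_ptr<Node> _lhs;
      std::shared_ptr<Node> _rhs;
    };

    int main() { return 0; }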
diff --git a/compiler/nest/core/include/nest/expr/DerefNode.h b/compiler/nest/core/include/nest/expr/DerefNode.h
index 19adfe3b3..8e3cc5690 100644
--- a/compiler/nest/core/include/nest/expr/DerefNode.h
+++ b/compiler/nest/core/include/nest/expr/DerefNode.h
@@ -31,7 +31,7 @@ class DerefNode final : public Node
public:
template <typename... Args>
DerefNode(const DomainID &id, Args &&... indicies)
- : _id{id}, _sub{std::forward<Args>(indicies)...}
+ : _id{id}, _sub{std::forward<Args>(indicies)...}
{
// DO NOTHING
}
diff --git a/compiler/nest/core/include/nest/expr/MulNode.h b/compiler/nest/core/include/nest/expr/MulNode.h
index f388b33a3..bbf64d9bc 100644
--- a/compiler/nest/core/include/nest/expr/MulNode.h
+++ b/compiler/nest/core/include/nest/expr/MulNode.h
@@ -30,7 +30,7 @@ class MulNode final : public Node
{
public:
MulNode(const std::shared_ptr<expr::Node> &lhs, const std::shared_ptr<expr::Node> &rhs)
- : _lhs{lhs}, _rhs{rhs}
+ : _lhs{lhs}, _rhs{rhs}
{
// DO NOTHING
}
diff --git a/compiler/nest/core/src/Block.test.cpp b/compiler/nest/core/src/Block.test.cpp
index d8faa0bdb..c48fcfa35 100644
--- a/compiler/nest/core/src/Block.test.cpp
+++ b/compiler/nest/core/src/Block.test.cpp
@@ -24,7 +24,7 @@ struct DummyNode final : public nest::stmt::Node
{
// Dummy Node for testing
};
-}
+} // namespace
TEST(BLOCK, use_case_1)
{
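The `}` to `} // namespace` hunks across the nest tests add the closing-brace comment that clang-format's FixNamespaceComments option maintains (an assumption about which option drives it; the diff only shows the result). The comment has no semantic effect; it labels which brace closes the anonymous namespace:

    namespace
    {

    struct DummyNode
    {
      // Dummy node for testing
    };

    // The trailing comment on the brace below marks where the namespace ends.
    } // namespace

    int main()
    {
      DummyNode node;
      (void)node;
      return 0;
    }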
diff --git a/compiler/nest/core/src/Closure.test.cpp b/compiler/nest/core/src/Closure.test.cpp
index 495e2186a..458179fb8 100644
--- a/compiler/nest/core/src/Closure.test.cpp
+++ b/compiler/nest/core/src/Closure.test.cpp
@@ -23,7 +23,7 @@ namespace
struct DummyNode final : public nest::expr::Node
{
};
-}
+} // namespace
TEST(Closure, ctor)
{
diff --git a/compiler/nest/core/src/Expr.test.cpp b/compiler/nest/core/src/Expr.test.cpp
index 2e26c234a..1b2e7135a 100644
--- a/compiler/nest/core/src/Expr.test.cpp
+++ b/compiler/nest/core/src/Expr.test.cpp
@@ -25,7 +25,7 @@ namespace
struct DummyNode final : public nest::expr::Node
{
};
-}
+} // namespace
TEST(EXPR, operator_sum)
{
diff --git a/compiler/nest/core/src/Ret.test.cpp b/compiler/nest/core/src/Ret.test.cpp
index a85223578..98f47d897 100644
--- a/compiler/nest/core/src/Ret.test.cpp
+++ b/compiler/nest/core/src/Ret.test.cpp
@@ -23,7 +23,7 @@ namespace
struct DummyNode final : public nest::expr::Node
{
};
-}
+} // namespace
TEST(RET, ctor)
{
diff --git a/compiler/nest/core/src/expr/AddNode.test.cpp b/compiler/nest/core/src/expr/AddNode.test.cpp
index dba6cc826..d8ef1d08b 100644
--- a/compiler/nest/core/src/expr/AddNode.test.cpp
+++ b/compiler/nest/core/src/expr/AddNode.test.cpp
@@ -25,7 +25,7 @@ namespace
struct DummyNode final : public nest::expr::Node
{
};
-}
+} // namespace
TEST(ADD_NODE, cast)
{
diff --git a/compiler/nest/core/src/expr/DerefNode.test.cpp b/compiler/nest/core/src/expr/DerefNode.test.cpp
index 125d8bf1e..d0badd509 100644
--- a/compiler/nest/core/src/expr/DerefNode.test.cpp
+++ b/compiler/nest/core/src/expr/DerefNode.test.cpp
@@ -25,7 +25,7 @@ namespace
struct DummyNode final : public nest::expr::Node
{
};
-}
+} // namespace
TEST(DEREF_NODE, cast)
{
diff --git a/compiler/nest/core/src/expr/MulNode.test.cpp b/compiler/nest/core/src/expr/MulNode.test.cpp
index 85cb5a56e..bccbcb3b5 100644
--- a/compiler/nest/core/src/expr/MulNode.test.cpp
+++ b/compiler/nest/core/src/expr/MulNode.test.cpp
@@ -25,7 +25,7 @@ namespace
struct DummyNode final : public nest::expr::Node
{
};
-}
+} // namespace
TEST(MUL_NODE, cast)
{
diff --git a/compiler/nest/core/src/stmt/PushNode.test.cpp b/compiler/nest/core/src/stmt/PushNode.test.cpp
index c02c69220..fb58a125e 100644
--- a/compiler/nest/core/src/stmt/PushNode.test.cpp
+++ b/compiler/nest/core/src/stmt/PushNode.test.cpp
@@ -25,7 +25,7 @@ namespace
struct DummyExprNode final : public nest::expr::Node
{
};
-}
+} // namespace
TEST(STMT_PUSH_NODE, cast)
{
diff --git a/compiler/nnc/backends/acl_soft_backend/AclCppGenerator.cpp b/compiler/nnc/backends/acl_soft_backend/AclCppGenerator.cpp
index 3a5b9ecaf..cad05cc1d 100644
--- a/compiler/nnc/backends/acl_soft_backend/AclCppGenerator.cpp
+++ b/compiler/nnc/backends/acl_soft_backend/AclCppGenerator.cpp
@@ -30,7 +30,7 @@ using namespace std;
namespace fs = boost::filesystem;
AclCppCodeGenerator::AclCppCodeGenerator(string output_dir, string artifact_name)
- : _output_dir(std::move(output_dir)), _artifact_name(std::move(artifact_name))
+ : _output_dir(std::move(output_dir)), _artifact_name(std::move(artifact_name))
{
}
diff --git a/compiler/nnc/backends/acl_soft_backend/AclCppOpGenerator.cpp b/compiler/nnc/backends/acl_soft_backend/AclCppOpGenerator.cpp
index b5e3734ae..0abe3ec72 100644
--- a/compiler/nnc/backends/acl_soft_backend/AclCppOpGenerator.cpp
+++ b/compiler/nnc/backends/acl_soft_backend/AclCppOpGenerator.cpp
@@ -33,8 +33,8 @@ using namespace std;
using namespace mir;
AclCppOpGenerator::AclCppOpGenerator(const string &name, ostream &par_out)
- : _parOut(par_out), _module(name), _constrBlock(nullptr), _infBlock(nullptr),
- _clScheduler(AF::id("arm_compute::CLScheduler"))
+ : _parOut(par_out), _module(name), _constrBlock(nullptr), _infBlock(nullptr),
+ _clScheduler(AF::id("arm_compute::CLScheduler"))
{
}
@@ -60,13 +60,14 @@ const ArtifactModule &AclCppOpGenerator::generate(mir::Graph *g)
_parInVar = _artifactClass->var(false, "std::ifstream", "_parIn");
_parIn = _parInVar->use();
string par_file_name = _module.name() + ".par";
- _constrBlock->call("open", {AF::lit("\"" + par_file_name + "\""),
- AF::lit("std::ios_base::in | std::ios_base::binary")},
- _parIn);
+ _constrBlock->call(
+ "open",
+ {AF::lit("\"" + par_file_name + "\""), AF::lit("std::ios_base::in | std::ios_base::binary")},
+ _parIn);
auto file_fail = _constrBlock->ifCond(AF::call("fail", {}, _parIn));
auto file_fail_block = file_fail->getBlock();
file_fail_block->addStatement(
- AF::lit("throw std::string(\"Failed to open file: " + par_file_name + " for reading\")"));
+ AF::lit("throw std::string(\"Failed to open file: " + par_file_name + " for reading\")"));
// Traverse the computational graph.
g->accept(this);
@@ -89,8 +90,8 @@ void AclCppOpGenerator::visit(ops::ConcatOp &op)
const auto *ir_output = op.getOutput(0);
static const char *axis_names[] = {
- "arm_compute::DataLayoutDimension::BATCHES", "arm_compute::DataLayoutDimension::CHANNEL",
- "arm_compute::DataLayoutDimension::HEIGHT", "arm_compute::DataLayoutDimension::WIDTH"};
+ "arm_compute::DataLayoutDimension::BATCHES", "arm_compute::DataLayoutDimension::CHANNEL",
+ "arm_compute::DataLayoutDimension::HEIGHT", "arm_compute::DataLayoutDimension::WIDTH"};
int axis = op.getAxis();
assert(axis >= 0 && axis < static_cast<int>(sizeof(axis_names) / sizeof(axis_names[0])) &&
@@ -105,8 +106,8 @@ void AclCppOpGenerator::visit(ops::ConcatOp &op)
for (const Operation::Output *ir_input : ir_inputs)
_constrBlock->call("push_back", {AF::ref(AF::id(tensorName(ir_input)))}, inputs);
- auto layer = genLayer("arm_compute::CLConcatenateLayer", prefix,
- {inputs, AF::ref(out), AF::lit(axis_name)});
+ auto layer =
+ genLayer("arm_compute::CLConcatenateLayer", prefix, {inputs, AF::ref(out), AF::lit(axis_name)});
addToPersistentTensors(out);
genLayerExecution(layer);
@@ -214,13 +215,13 @@ shared_ptr<ArtifactVariable> AclCppOpGenerator::genPadStrideInfo(const Op &op, c
string var_name = prefix + "_pad_stride_info";
list<std::shared_ptr<ArtifactExpr>> var_init_params = {
- AF::lit(to_string(strides.dim(1))),
- AF::lit(to_string(strides.dim(0))),
- AF::lit(to_string(padding_before.at(1))),
- AF::lit(to_string(padding_after.at(1))),
- AF::lit(to_string(padding_before.at(0))),
- AF::lit(to_string(padding_after.at(0))),
- AF::lit("arm_compute::DimensionRoundingType::FLOOR")};
+ AF::lit(to_string(strides.dim(1))),
+ AF::lit(to_string(strides.dim(0))),
+ AF::lit(to_string(padding_before.at(1))),
+ AF::lit(to_string(padding_after.at(1))),
+ AF::lit(to_string(padding_before.at(0))),
+ AF::lit(to_string(padding_after.at(0))),
+ AF::lit("arm_compute::DimensionRoundingType::FLOOR")};
auto pad_stride_info_var = block->var(type_name, var_name, {}, var_init_params);
@@ -316,7 +317,7 @@ static bool shouldSerializeConstant(const ops::ConstantOp &op)
// themselves,
// so we don't serialize them here; we also don't serialize tensors from dangling ConstantOp
static std::map<Operation::Type, std::size_t> self_serializing_ops_to_inputs{
- {Operation::Type::conv2D, 1}, {Operation::Type::fullyConnected, 1}};
+ {Operation::Type::conv2D, 1}, {Operation::Type::fullyConnected, 1}};
for (Operation::Use use : op.getOutput(0)->getUses())
{
@@ -420,8 +421,8 @@ void AclCppOpGenerator::visit(ops::PadOp &op)
for (int i = 0; i < ir_input->getShape().rank(); ++i)
{
auto pad_var = _constrBlock->var(
- "arm_compute::PaddingInfo", prefix + "_pad_" + to_string(i), {},
- {AF::lit(to_string(padding_before[i])), AF::lit(to_string(padding_after[i]))});
+ "arm_compute::PaddingInfo", prefix + "_pad_" + to_string(i), {},
+ {AF::lit(to_string(padding_before[i])), AF::lit(to_string(padding_after[i]))});
auto pad = pad_var->use();
_constrBlock->call("push_back", {pad}, pad_list);
}
@@ -430,7 +431,7 @@ void AclCppOpGenerator::visit(ops::PadOp &op)
// FIXME Set up the `constant_value` parameter.
assert(op.getPaddingValue() == 0.0f);
auto layer =
- genLayer("arm_compute::CLPadLayer", prefix, {AF::ref(input), AF::ref(out), pad_list});
+ genLayer("arm_compute::CLPadLayer", prefix, {AF::ref(input), AF::ref(out), pad_list});
genLayerExecution(layer);
}
@@ -449,7 +450,7 @@ void AclCppOpGenerator::genPooling(Op &op, const std::string &pooling_type, bool
// Transpose data from MIR format to format compatible with ACL
const string transposed_input_name = output_tensor_name + "transposed_input";
shared_ptr<ArtifactId> transposed_input =
- genTransposeMIRtoACL(transposed_input_name, ir_input->getShape(), in_id);
+ genTransposeMIRtoACL(transposed_input_name, ir_input->getShape(), in_id);
const string layer_name = output_tensor_name + "_pooling_layer";
@@ -459,31 +460,31 @@ void AclCppOpGenerator::genPooling(Op &op, const std::string &pooling_type, bool
// Create kernel window info
shared_ptr<ArtifactVariable> kernel_window_var = _constrBlock->var(
- "arm_compute::Size2D", layer_name + "_kernel_window", {},
- {AF::lit(to_string(op.getWindowSize()[1])), AF::lit(to_string(op.getWindowSize()[0]))});
+ "arm_compute::Size2D", layer_name + "_kernel_window", {},
+ {AF::lit(to_string(op.getWindowSize()[1])), AF::lit(to_string(op.getWindowSize()[0]))});
shared_ptr<ArtifactId> kernel_window = kernel_window_var->use();
// Create pooling info: pooling type, kernel info, strides, etc
shared_ptr<ArtifactVariable> pooling_info_var =
- _constrBlock->var("arm_compute::PoolingLayerInfo", layer_name + "_pooling_info", {},
- {AF::lit(pooling_type), kernel_window, pad_stride_info,
- AF::lit(exclude_padding ? "true" : "false")});
+ _constrBlock->var("arm_compute::PoolingLayerInfo", layer_name + "_pooling_info", {},
+ {AF::lit(pooling_type), kernel_window, pad_stride_info,
+ AF::lit(exclude_padding ? "true" : "false")});
shared_ptr<ArtifactId> pooling_info = pooling_info_var->use();
// Generate auxiliary tensor to hold transposed output of pool in NCHW format
Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(ir_output->getShape());
shared_ptr<ArtifactId> transposed_output =
- genTensor(layer_name + "_out_transpose", transposed_output_shape);
+ genTensor(layer_name + "_out_transpose", transposed_output_shape);
// Actual layer creation
shared_ptr<ArtifactId> layer =
- genLayer("arm_compute::CLPoolingLayer", layer_name,
- {AF::ref(transposed_input), AF::ref(transposed_output), pooling_info});
+ genLayer("arm_compute::CLPoolingLayer", layer_name,
+ {AF::ref(transposed_input), AF::ref(transposed_output), pooling_info});
genTensorAllocation(_infBlock, transposed_output);
genLayerExecution(layer);
shared_ptr<ArtifactId> output =
- genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+ genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
genTensorDeallocation(_infBlock, transposed_input);
genTensorDeallocation(_infBlock, transposed_output);
@@ -521,13 +522,13 @@ void AclCppOpGenerator::genConvolution(Op &op, const string &acl_func_name, cons
// Generate auxiliary tensor to hold transposed input of convolution in NCHW format
shared_ptr<ArtifactId> transposed_input =
- genTransposeMIRtoACL(output_tensor_name + "_transposed_input", ir_input->getShape(), input);
+ genTransposeMIRtoACL(output_tensor_name + "_transposed_input", ir_input->getShape(), input);
// Create the transposed output tensor in the DOM.
const string transposed_output_name = output_tensor_name + "_transposed_output";
Shape transposed_output_shape = transposeShape<0, 3, 1, 2>(ir_output->getShape());
shared_ptr<ArtifactId> transposed_output =
- genTensor(transposed_output_name, transposed_output_shape);
+ genTensor(transposed_output_name, transposed_output_shape);
string operation_name = output_tensor_name + suffix;
@@ -564,7 +565,7 @@ void AclCppOpGenerator::genConvolution(Op &op, const string &acl_func_name, cons
// Generate auxiliary tensor to hold transposed output of convolution in NHWC format
shared_ptr<ArtifactId> output =
- genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
+ genTransposeACLtoMIR(output_tensor_name, transposed_output_shape, transposed_output);
genTensorDeallocation(_infBlock, transposed_input);
genTensorDeallocation(_infBlock, transposed_output);
@@ -589,9 +590,9 @@ void AclCppOpGenerator::genActivation(const Operation &op, const std::string &ac
// constructor. This instance provides information about the concrete activation function,
// like ReLU, Tanh, etc., and two optional parameters (alpha and beta) needed by some activations.
auto activation_info_var = _constrBlock->var(
- "arm_compute::ActivationLayerInfo", prefix + "_activation_info", {},
- {AF::lit("arm_compute::ActivationLayerInfo::ActivationFunction::" + activation_name),
- AF::lit(to_string(a)), AF::lit(to_string(b))});
+ "arm_compute::ActivationLayerInfo", prefix + "_activation_info", {},
+ {AF::lit("arm_compute::ActivationLayerInfo::ActivationFunction::" + activation_name),
+ AF::lit(to_string(a)), AF::lit(to_string(b))});
auto activation_info = activation_info_var->use();
// Create an instance of the CLActivationLayer class as a member of the artifact class.
@@ -619,9 +620,10 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genAddition(const string &prefix, size
auto arithmetic_add_layer = arithmetic_add_layer_var->use();
// Generate the call: arithmetic_add_layer.configure(&in1, &in2, &out);
- _constrBlock->call("configure", {AF::ref(in1), AF::ref(in2), AF::ref(out),
- AF::lit("arm_compute::ConvertPolicy::WRAP")},
- arithmetic_add_layer);
+ _constrBlock->call(
+ "configure",
+ {AF::ref(in1), AF::ref(in2), AF::ref(out), AF::lit("arm_compute::ConvertPolicy::WRAP")},
+ arithmetic_add_layer);
// Generate the call: arithmetic_add_layer.run();
_infBlock->call("run", {}, arithmetic_add_layer);
@@ -696,8 +698,8 @@ string AclCppOpGenerator::tensorName(const Operation::Output *ir_tensor) const
if (!tensor_name.empty())
{
tensor_name = "_" + tensor_name;
- replace_if(tensor_name.begin(), tensor_name.end(), [](char c) { return std::isalnum(c) == 0; },
- '_');
+ replace_if(
+ tensor_name.begin(), tensor_name.end(), [](char c) { return std::isalnum(c) == 0; }, '_');
}
else
{
@@ -740,7 +742,7 @@ shared_ptr<ArtifactId> AclCppOpGenerator::genTensor(const string &name, const Sh
const char *type_name = "arm_compute::TensorShape";
shared_ptr<ArtifactId> shape =
- genVectorInitializedVar(_constrBlock, type_name, name + "_shape", shape_vectorized);
+ genVectorInitializedVar(_constrBlock, type_name, name + "_shape", shape_vectorized);
_constrBlock->call("initializeTensor", {id, shape});
if (gen_accessor)
@@ -903,7 +905,7 @@ void AclCppOpGenerator::genTranspose(const std::shared_ptr<nnc::ArtifactId> &inp
// Create operation parameter containing permutation vector
shared_ptr<ArtifactId> perm_vector = genVectorInitializedVar(
- _constrBlock, "arm_compute::PermutationVector", out_name + "_perm_param", acl_perm);
+ _constrBlock, "arm_compute::PermutationVector", out_name + "_perm_param", acl_perm);
// Instantiate the CLPermute object.
string layer_name = out_name + "_transpose_layer";
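The _constrBlock->call reflow above shows the other recurring formatting rule in this commit: when a call no longer fits on one line, the break comes straight after the opening parenthesis and every argument gets a uniform indent instead of hanging under the first argument. A neutral illustration (call_like is a stand-in with a similar shape, not an API from this repository):

    #include <initializer_list>
    #include <string>

    // Stand-in mimicking the shape of ArtifactBlock::call(name, args, receiver).
    static void call_like(const std::string &name, std::initializer_list<std::string> args,
                          const std::string &receiver)
    {
      (void)name;
      (void)args;
      (void)receiver;
    }

    int main()
    {
      // Break after '(' with a fixed indent, rather than aligning to "open":
      call_like(
        "open",
        {"\"model.par\"", "std::ios_base::in | std::ios_base::binary"},
        "_parIn");
      return 0;
    }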
diff --git a/compiler/nnc/backends/acl_soft_backend/ArtifactModel.cpp b/compiler/nnc/backends/acl_soft_backend/ArtifactModel.cpp
index 8888697e7..bbaa1f523 100644
--- a/compiler/nnc/backends/acl_soft_backend/ArtifactModel.cpp
+++ b/compiler/nnc/backends/acl_soft_backend/ArtifactModel.cpp
@@ -25,8 +25,8 @@ using namespace std;
ArtifactFunctionCall::ArtifactFunctionCall(string func_name,
list<shared_ptr<ArtifactExpr>> param_list,
shared_ptr<ArtifactExpr> on, ArtifactCallType call_type)
- : _funcName(std::move(func_name)), _callType(call_type), _on(std::move(on)),
- _paramList(std::move(param_list))
+ : _funcName(std::move(func_name)), _callType(call_type), _on(std::move(on)),
+ _paramList(std::move(param_list))
{
}
diff --git a/compiler/nnc/backends/acl_soft_backend/ArtifactModel.h b/compiler/nnc/backends/acl_soft_backend/ArtifactModel.h
index 106c9bec3..89d803021 100644
--- a/compiler/nnc/backends/acl_soft_backend/ArtifactModel.h
+++ b/compiler/nnc/backends/acl_soft_backend/ArtifactModel.h
@@ -204,7 +204,7 @@ class ArtifactUnaryExpr : public ArtifactExpr
{
public:
ArtifactUnaryExpr(ArtifactUnOp op, std::shared_ptr<ArtifactExpr> expr)
- : _op(op), _expr(std::move(expr))
+ : _op(op), _expr(std::move(expr))
{
}
@@ -248,7 +248,7 @@ class ArtifactBinaryExpr : public ArtifactExpr
public:
ArtifactBinaryExpr(ArtifactBinOp op, std::shared_ptr<ArtifactExpr> left,
std::shared_ptr<ArtifactExpr> right)
- : _op(op), _left(std::move(left)), _right(std::move(right))
+ : _op(op), _left(std::move(left)), _right(std::move(right))
{
}
@@ -271,7 +271,7 @@ class ArtifactIndex : public ArtifactExpr
{
public:
ArtifactIndex(std::shared_ptr<ArtifactExpr> expr, std::shared_ptr<ArtifactExpr> ind)
- : _expr(std::move(expr)), _ind(std::move(ind))
+ : _expr(std::move(expr)), _ind(std::move(ind))
{
}
@@ -328,8 +328,8 @@ public:
ArtifactVariable(std::string type_name, std::string var_name,
std::list<std::shared_ptr<ArtifactExpr>> dimensions = {},
std::list<std::shared_ptr<ArtifactExpr>> initializers = {})
- : _typeName(std::move(type_name)), _dimensions(std::move(dimensions)),
- _initializers(std::move(initializers)), ArtifactNamed(std::move(var_name))
+ : _typeName(std::move(type_name)), _dimensions(std::move(dimensions)),
+ _initializers(std::move(initializers)), ArtifactNamed(std::move(var_name))
{
}
@@ -469,7 +469,7 @@ public:
explicit ArtifactForLoop(std::shared_ptr<ArtifactVariable> init = nullptr,
std::shared_ptr<ArtifactExpr> cond = nullptr,
std::shared_ptr<ArtifactExpr> iter = nullptr)
- : _init(std::move(init)), _cond(std::move(cond)), _iter(std::move(iter))
+ : _init(std::move(init)), _cond(std::move(cond)), _iter(std::move(iter))
{
}
@@ -527,7 +527,7 @@ public:
*/
ArtifactFunction(std::string ret_type_name, const std::string &func_name,
std::list<std::shared_ptr<ArtifactVariable>> params = {})
- : ArtifactNamed(func_name), _params(std::move(params)), _retTypeName(std::move(ret_type_name))
+ : ArtifactNamed(func_name), _params(std::move(params)), _retTypeName(std::move(ret_type_name))
{
}
@@ -568,7 +568,7 @@ public:
const std::string &var_name,
const std::list<std::shared_ptr<ArtifactExpr>> &dimensions = {},
const std::list<std::shared_ptr<ArtifactExpr>> &initializers = {})
- : ArtifactClassMember(owner), ArtifactVariable(type_name, var_name, dimensions, initializers)
+ : ArtifactClassMember(owner), ArtifactVariable(type_name, var_name, dimensions, initializers)
{
}
@@ -584,7 +584,7 @@ public:
ArtifactClassFunction(const ArtifactClass *owner, const std::string &ret_type_name,
const std::string &func_name,
const std::list<std::shared_ptr<ArtifactVariable>> &params = {})
- : ArtifactClassMember(owner), ArtifactFunction(ret_type_name, func_name, params)
+ : ArtifactClassMember(owner), ArtifactFunction(ret_type_name, func_name, params)
{
}
diff --git a/compiler/nnc/backends/interpreter/InterpreterBackend.cpp b/compiler/nnc/backends/interpreter/InterpreterBackend.cpp
index 923a7cfc7..895daa115 100644
--- a/compiler/nnc/backends/interpreter/InterpreterBackend.cpp
+++ b/compiler/nnc/backends/interpreter/InterpreterBackend.cpp
@@ -104,7 +104,7 @@ static void writeTensorToHDF5File(const TensorVariant &tensor, std::string tenso
static TensorVariant readTensorFromFile(const std::string &filename, const TensorType &type)
{
const std::size_t input_data_size =
- type.getShape().numElements() * getDataTypeSize(type.getElementType());
+ type.getShape().numElements() * getDataTypeSize(type.getElementType());
std::ifstream stream(filename, std::ios::in | std::ios::binary);
if (stream.fail())
@@ -117,9 +117,9 @@ static TensorVariant readTensorFromFile(const std::string &filename, const Tenso
int64_t file_size = end - begin;
if (static_cast<std::size_t>(file_size) != input_data_size)
- throw std::runtime_error("File \"" + filename + "\" has incorrect size: " +
- std::to_string(file_size) + "(expected: " +
- std::to_string(input_data_size) + ").");
+ throw std::runtime_error("File \"" + filename +
+ "\" has incorrect size: " + std::to_string(file_size) +
+ "(expected: " + std::to_string(input_data_size) + ").");
std::unique_ptr<char[]> data(new char[input_data_size]);
stream.read(data.get(), input_data_size);
@@ -130,7 +130,7 @@ static TensorVariant readTensorFromFile(const std::string &filename, const Tenso
}
InterpreterBackend::InterpreterBackend(std::string input_dir, std::string output_dir)
- : _input_dir(std::move(input_dir)), _output_dir(std::move(output_dir))
+ : _input_dir(std::move(input_dir)), _output_dir(std::move(output_dir))
{
}
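The reshuffled throw statement above only moves line breaks: operator+ on std::string is left-associative, so regrouping the wrapped sub-expressions builds the identical message. A quick check with illustrative values (note the message really does lack a space before "(expected:", as in the source):

    #include <cassert>
    #include <cstddef>
    #include <string>

    int main()
    {
      const std::string filename = "input.bin"; // illustrative, not from the source
      const long file_size = 12;
      const std::size_t input_data_size = 16;

      // Same tokens, wrapped at different points; the result is identical.
      const std::string before = "File \"" + filename + "\" has incorrect size: " +
                                 std::to_string(file_size) + "(expected: " +
                                 std::to_string(input_data_size) + ").";
      const std::string after = "File \"" + filename +
                                "\" has incorrect size: " + std::to_string(file_size) +
                                "(expected: " + std::to_string(input_data_size) + ").";
      assert(before == after);
      return 0;
    }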
diff --git a/compiler/nnc/backends/soft_backend/CPPGenerator.cpp b/compiler/nnc/backends/soft_backend/CPPGenerator.cpp
index 236881b80..097122882 100644
--- a/compiler/nnc/backends/soft_backend/CPPGenerator.cpp
+++ b/compiler/nnc/backends/soft_backend/CPPGenerator.cpp
@@ -80,7 +80,7 @@ static unique_ptr<ofstream> getStream(const string &path)
}
CPPCodeGenerator::CPPCodeGenerator(std::string output_dir, std::string artifact_name)
- : _output_dir(std::move(output_dir)), _artifact_name(std::move(artifact_name))
+ : _output_dir(std::move(output_dir)), _artifact_name(std::move(artifact_name))
{
}
@@ -187,12 +187,14 @@ void CPPCodeGenerator::materializeHeader(ostream &out, const ModelAnalyzer &ma)
string class_name = ma.getModelName() + "Model";
out.write(cpp_header_types, sizeof(cpp_header_types));
- out << "class " << class_name << "\n"
- "{\n"
- "public:\n"
- " "
- << class_name << "(const std::string& parametersPath);\n"
- " ~"
+ out << "class " << class_name
+ << "\n"
+ "{\n"
+ "public:\n"
+ " "
+ << class_name
+ << "(const std::string& parametersPath);\n"
+ " ~"
<< class_name << "();\n";
// generate input setters
if (ma.getInputs().size() == 1)
@@ -215,10 +217,12 @@ void CPPCodeGenerator::materializeHeader(ostream &out, const ModelAnalyzer &ma)
out << " void doInference();\n\n"
"private:\n"
" "
- << class_name << "() = delete;\n"
- " "
- << class_name << "(const " << class_name << "& orig) = delete;\n"
- " "
+ << class_name
+ << "() = delete;\n"
+ " "
+ << class_name << "(const " << class_name
+ << "& orig) = delete;\n"
+ " "
<< class_name << "& operator=(const " << class_name << "& orig) = delete;\n";
// generate input/output tensors
for (const size_t in_tensor_id : ma.getInputs())
@@ -273,8 +277,9 @@ void CPPCodeGenerator::printSetter(ostream &out, const string &class_name,
{
const string &var_name = _formattedTensors[td.id];
- out << "bool " << class_name << "::set" << setter_name << "(const Tensor& t)\n"
- "{\n";
+ out << "bool " << class_name << "::set" << setter_name
+ << "(const Tensor& t)\n"
+ "{\n";
// need to insert input correctness check
const mir::Shape expected = td.shape;
int rank = expected.rank();
@@ -286,9 +291,10 @@ void CPPCodeGenerator::printSetter(ostream &out, const string &class_name,
out << " "
<< "if (t.getShape()[" << i << "] != " << expected.dim(i) << ") return false;\n";
}
- out << " " << var_name << " = t;\n"
- " return true;\n"
- "}\n\n";
+ out << " " << var_name
+ << " = t;\n"
+ " return true;\n"
+ "}\n\n";
}
void CPPCodeGenerator::printGetter(ostream &out, const string &class_name,
@@ -296,11 +302,13 @@ void CPPCodeGenerator::printGetter(ostream &out, const string &class_name,
{
const string &var_name = _formattedTensors[td.id];
- out << "shared_ptr<Tensor> " << class_name << "::get" << getter_name << "()\n"
- "{\n"
- " return "
- << var_name << ";\n"
- "}\n\n";
+ out << "shared_ptr<Tensor> " << class_name << "::get" << getter_name
+ << "()\n"
+ "{\n"
+ " return "
+ << var_name
+ << ";\n"
+ "}\n\n";
}
void CPPCodeGenerator::materializeCall(ostream &out, const ModelAnalyzer &ma,
@@ -435,13 +443,15 @@ void CPPCodeGenerator::materializeCode(ostream &out, const ModelAnalyzer &ma, co
<< "(const string& parametersPath)\n"
"{\n"
" readParameters(_parameters, _paramSize, parametersPath, "
- << s.getFormatVersion() << ", " << s.getModelHash() << ");\n"
- "}\n\n";
+ << s.getFormatVersion() << ", " << s.getModelHash()
+ << ");\n"
+ "}\n\n";
// gen NN destructor
- out << class_name << "::~" << class_name << "()\n"
- "{\n"
- " releaseParameters(_parameters, _paramSize);\n"
- "}\n\n";
+ out << class_name << "::~" << class_name
+ << "()\n"
+ "{\n"
+ " releaseParameters(_parameters, _paramSize);\n"
+ "}\n\n";
// generate input setters
// generate main setter if network has only one
const auto &inputs = ma.getInputs();
@@ -473,8 +483,9 @@ void CPPCodeGenerator::materializeCode(ostream &out, const ModelAnalyzer &ma, co
const TensorDescriptor &td = tensors[output_tensor_id];
printGetter(out, class_name, output_tensor_name, td);
}
- out << "void " << class_name << "::doInference()\n"
- "{\n";
+ out << "void " << class_name
+ << "::doInference()\n"
+ "{\n";
for (size_t output_tensor_id : ma.getPersistentTensors())
{
const string &output_tensor_name = _formattedTensors[output_tensor_id];
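The CPPGenerator hunks move `<<` breaks around multi-line string literals. Because adjacent string literals are concatenated during translation, a literal split across source lines after one `<<` streams exactly the same characters as before; only the wrapping changed. A minimal demonstration:

    #include <cassert>
    #include <sstream>
    #include <string>

    int main()
    {
      const std::string class_name = "NetModel"; // illustrative name
      std::ostringstream old_layout, new_layout;

      old_layout << "void " << class_name << "::doInference()\n"
                    "{\n";
      new_layout << "void " << class_name
                 << "::doInference()\n"
                    "{\n";
      assert(old_layout.str() == new_layout.str());
      return 0;
    }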
diff --git a/compiler/nnc/backends/soft_backend/ModelAnalyzer.cpp b/compiler/nnc/backends/soft_backend/ModelAnalyzer.cpp
index 82e62b531..2d555d0a9 100644
--- a/compiler/nnc/backends/soft_backend/ModelAnalyzer.cpp
+++ b/compiler/nnc/backends/soft_backend/ModelAnalyzer.cpp
@@ -62,7 +62,7 @@ void ModelAnalyzer::appendOperationToInference(Operation *op, const string &func
{
const auto &tensor_name = output.getName();
const auto tensor_id =
- tensor_name.empty() ? declareTemporaryTensor() : declarePersistentTensor(tensor_name);
+ tensor_name.empty() ? declareTemporaryTensor() : declarePersistentTensor(tensor_name);
node_output_tensors.push_back(tensor_id);
}
}
@@ -82,7 +82,7 @@ void ModelAnalyzer::appendOperationToInference(Operation *op, const string &func
std::copy(aux_args.begin(), aux_args.end(), std::back_inserter(node_input_tensors));
unique_ptr<Action> operation_call(new CallFunction(
- op, function_name, std::move(node_input_tensors), std::move(node_output_tensors)));
+ op, function_name, std::move(node_input_tensors), std::move(node_output_tensors)));
_inferenceSequence.push_back(std::move(operation_call));
_opToDescr[op] = _inferenceSequence.back().get();
}
diff --git a/compiler/nnc/backends/soft_backend/ModelAnalyzer.h b/compiler/nnc/backends/soft_backend/ModelAnalyzer.h
index 471c31011..6522bc655 100644
--- a/compiler/nnc/backends/soft_backend/ModelAnalyzer.h
+++ b/compiler/nnc/backends/soft_backend/ModelAnalyzer.h
@@ -42,9 +42,9 @@ class ModelAnalyzer : public mir::Visitor
{
public:
/**
- * @brief constructs inference sequence
- * @param g pointer to graph to linearize
- */
+ * @brief constructs inference sequence
+ * @param g pointer to graph to linearize
+ */
void analyze(const mir::Graph *g);
void visit(mir::ops::AbsOp &) override;
diff --git a/compiler/nnc/backends/soft_backend/SequencedIR.h b/compiler/nnc/backends/soft_backend/SequencedIR.h
index 9a761243e..ff062e043 100644
--- a/compiler/nnc/backends/soft_backend/SequencedIR.h
+++ b/compiler/nnc/backends/soft_backend/SequencedIR.h
@@ -91,7 +91,7 @@ struct TransposeTensor : public Action
{
TransposeTensor(size_t input, size_t output, std::vector<int32_t> &&perm)
- : Action(Type::transposeTensor), perm(std::move(perm)), input(input), output(output)
+ : Action(Type::transposeTensor), perm(std::move(perm)), input(input), output(output)
{
}
@@ -121,8 +121,8 @@ struct CallFunction : public Action
CallFunction(mir::Operation *op, std::string func_name, std::vector<size_t> &&inputs,
std::vector<size_t> &&outputs)
- : Action(Type::callFunction), mirOp(op), funcName(std::move(func_name)), inputs(inputs),
- outputs(outputs), paramStartOffset(0)
+ : Action(Type::callFunction), mirOp(op), funcName(std::move(func_name)), inputs(inputs),
+ outputs(outputs), paramStartOffset(0)
{
}
diff --git a/compiler/nnc/driver/Options.cpp b/compiler/nnc/driver/Options.cpp
index e22d01847..c1997fe6a 100644
--- a/compiler/nnc/driver/Options.cpp
+++ b/compiler/nnc/driver/Options.cpp
@@ -35,7 +35,7 @@ Option<bool> caffeFrontend(optname("--caffe"), overview("treat input file as Caf
#else
showopt(false)
#endif // NNC_FRONTEND_CAFFE_ENABLED
- );
+);
Option<bool> onnxFrontend(optname("--onnx"), overview("treat input file as ONNX model"), false,
optional(true), optvalues(""), nullptr, separators(""),
#ifdef NNC_FRONTEND_ONNX_ENABLED
@@ -43,7 +43,7 @@ Option<bool> onnxFrontend(optname("--onnx"), overview("treat input file as ONNX
#else
showopt(false)
#endif // NNC_FRONTEND_ONNX_ENABLED
- );
+);
Option<bool> caffe2Frontend(optname("--caffe2"),
overview("treat input file as Caffe2 model (predict_net.pb)"), false,
@@ -83,16 +83,16 @@ Option<bool> tflFrontend(optname("--tflite"),
#else
showopt(false)
#endif // NNC_FRONTEND_TFLITE_ENABLED
- );
+);
Option<std::string>
- target(optname("--target"),
- overview("select target language to emit for given architecture."
- "Valid values are '" NNC_TARGET_ARM_CPP "', '" NNC_TARGET_X86_CPP
- "', '" NNC_TARGET_ARM_GPU_CPP "', '" NNC_TARGET_INTERPRETER "'"),
- std::string(), optional(false),
- optvalues(NNC_TARGET_ARM_CPP "," NNC_TARGET_X86_CPP "," NNC_TARGET_ARM_GPU_CPP
- "," NNC_TARGET_INTERPRETER),
- nullptr, separators("="));
+ target(optname("--target"),
+ overview("select target language to emit for given architecture."
+ "Valid values are '" NNC_TARGET_ARM_CPP "', '" NNC_TARGET_X86_CPP
+ "', '" NNC_TARGET_ARM_GPU_CPP "', '" NNC_TARGET_INTERPRETER "'"),
+ std::string(), optional(false),
+ optvalues(NNC_TARGET_ARM_CPP "," NNC_TARGET_X86_CPP "," NNC_TARGET_ARM_GPU_CPP
+ "," NNC_TARGET_INTERPRETER),
+ nullptr, separators("="));
/**
* Options for *frontend*
diff --git a/compiler/nnc/include/pass/PassData.h b/compiler/nnc/include/pass/PassData.h
index e2c0b8129..1ff8af927 100644
--- a/compiler/nnc/include/pass/PassData.h
+++ b/compiler/nnc/include/pass/PassData.h
@@ -30,9 +30,8 @@ class PassData
{
public:
/* implicit */ PassData(std::nullptr_t data)
- : // NOLINT(google-explicit-constructor, hicpp-explicit-conversions)
- _dataContainer{.unknown = data},
- _dataType(PDT::UNKNOWN)
+ : // NOLINT(google-explicit-constructor, hicpp-explicit-conversions)
+ _dataContainer{.unknown = data}, _dataType(PDT::UNKNOWN)
{
}
@@ -40,9 +39,8 @@ public:
* @brief Implicit conversion from Graph* to PassData
*/
/* implicit */ PassData(mir::Graph *graph)
- : // NOLINT(google-explicit-constructor, hicpp-explicit-conversions)
- _dataContainer{.graph = graph},
- _dataType(PDT::GRAPH)
+ : // NOLINT(google-explicit-constructor, hicpp-explicit-conversions)
+ _dataContainer{.graph = graph}, _dataType(PDT::GRAPH)
{
}
@@ -60,9 +58,8 @@ public:
* @brief Implicit conversion from Graph* to PassData
*/
/* implicit */ PassData(mir::TensorVariant *tv)
- : // NOLINT(google-explicit-constructor, hicpp-explicit-conversions)
- _dataContainer{.tensorVariant = tv},
- _dataType(PDT::TENSOR_VARIANT)
+ : // NOLINT(google-explicit-constructor, hicpp-explicit-conversions)
+ _dataContainer{.tensorVariant = tv}, _dataType(PDT::TENSOR_VARIANT)
{
}
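The PassData constructors above initialize one member of a union with a designator (`_dataContainer{.unknown = data}`). Designated initializers are standard only from C++20; in earlier modes GCC and Clang accept them as an extension, which is presumably what this codebase relies on (an assumption; the diff does not state the language level). A reduced sketch of the pattern with stand-in member types:

    #include <cstddef>

    // Stand-in for the real pointer types held by PassData.
    struct GraphStub
    {
    };

    union DataContainer
    {
      std::nullptr_t unknown;
      GraphStub *graph;
    };

    enum class DataType
    {
      UNKNOWN,
      GRAPH
    };

    class PassDataSketch
    {
    public:
      /* implicit */ PassDataSketch(std::nullptr_t data)
        : _container{.unknown = data}, _type(DataType::UNKNOWN) // designated initializer
      {
      }

      /* implicit */ PassDataSketch(GraphStub *graph)
        : _container{.graph = graph}, _type(DataType::GRAPH)
      {
      }

    private:
      DataContainer _container;
      DataType _type;
    };

    int main()
    {
      PassDataSketch from_null(nullptr);
      GraphStub g;
      PassDataSketch from_graph(&g);
      (void)from_null;
      (void)from_graph;
      return 0;
    }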
diff --git a/compiler/nnc/include/passes/optimizations/CombineTransposes.h b/compiler/nnc/include/passes/optimizations/CombineTransposes.h
index 7d227cd5d..a08676e47 100644
--- a/compiler/nnc/include/passes/optimizations/CombineTransposes.h
+++ b/compiler/nnc/include/passes/optimizations/CombineTransposes.h
@@ -33,6 +33,7 @@ public:
PassData run(PassData data) override;
std::string getName() override { return "opt_combine_transposes"; };
+
private:
};
diff --git a/compiler/nnc/include/passes/optimizations/OptimizationUtils.h b/compiler/nnc/include/passes/optimizations/OptimizationUtils.h
index 9a9212c12..83f455b2d 100644
--- a/compiler/nnc/include/passes/optimizations/OptimizationUtils.h
+++ b/compiler/nnc/include/passes/optimizations/OptimizationUtils.h
@@ -25,11 +25,11 @@ namespace nnc
namespace opt_util
{
/**
-* @brief Swap adjacent nodes in Graph. Creates new nodes and replaces the old ones with new.
-* @param g MIR Graph
-* @param top Node
-* @param bottom Node
-*/
+ * @brief Swap adjacent nodes in Graph. Creates new nodes and replaces the old ones with new.
+ * @param g MIR Graph
+ * @param top Node
+ * @param bottom Node
+ */
void swapAdjacent(mir::Graph *g, mir::Operation *top, mir::Operation *bottom);
// TODO: this function and its usages should be removed once DCE optimization is implemented
diff --git a/compiler/nnc/include/support/CommandLine.h b/compiler/nnc/include/support/CommandLine.h
index 40777ff46..66466276d 100644
--- a/compiler/nnc/include/support/CommandLine.h
+++ b/compiler/nnc/include/support/CommandLine.h
@@ -38,7 +38,7 @@ class BadOption : public std::logic_error
{
public:
explicit BadOption(const std::string &msg, std::string optname = "", std::string value = "")
- : std::logic_error(msg), _option_name(std::move(optname)), _option_value(std::move(value))
+ : std::logic_error(msg), _option_name(std::move(optname)), _option_value(std::move(value))
{
}
@@ -387,7 +387,7 @@ private:
std::map<std::string, IOption *> _options_name; // map of name -> option
std::vector<IOption *> _options; // options
std::map<IOption::Group, std::vector<IOption *>>
- _grouped_options; // map of groups: group -> vector of options
+ _grouped_options; // map of groups: group -> vector of options
std::string _prog_name; // name of program
int _args_num = 0; // number of command line arguments
};
@@ -530,7 +530,7 @@ Option<T>::Option(const std::vector<std::string> &optnames, const std::string &d
_group = group;
_can_have_several_vals =
- std::is_same<T, std::vector<std::string>>::value || std::is_same<T, std::vector<int>>::value;
+ std::is_same<T, std::vector<std::string>>::value || std::is_same<T, std::vector<int>>::value;
assert(!(_can_have_several_vals && !_seps.empty()) &&
"option with several values can't have separators");
diff --git a/compiler/nnc/passes/optimizations/CombineTransposes.cpp b/compiler/nnc/passes/optimizations/CombineTransposes.cpp
index e381a9cae..8a584d2d5 100644
--- a/compiler/nnc/passes/optimizations/CombineTransposes.cpp
+++ b/compiler/nnc/passes/optimizations/CombineTransposes.cpp
@@ -72,12 +72,12 @@ nnc::PassData nnc::CombineTransposes::run(nnc::PassData data)
};
auto *bottom_transpose = dynamic_cast<mir::ops::TransposeOp *>(match.second);
auto combined_axis_order =
- combineAxisOrders(top_transpose->getAxisOrder(), bottom_transpose->getAxisOrder());
+ combineAxisOrders(top_transpose->getAxisOrder(), bottom_transpose->getAxisOrder());
if (!isIdentityTranspose(combined_axis_order))
{
auto new_tr_op =
- g->create<mir::ops::TransposeOp>(top_transpose->getInput(0), combined_axis_order);
+ g->create<mir::ops::TransposeOp>(top_transpose->getInput(0), combined_axis_order);
g->replaceNode(bottom_transpose, new_tr_op);
}
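combineAxisOrders above folds two back-to-back transposes into one; when the composition is the identity, the pass drops both instead of emitting a combined node. A sketch of the composition under the common convention that output axis i takes input axis order[i] (the convention is an assumption here, not spelled out in the hunk):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    using AxisOrder = std::vector<std::size_t>;

    // Applying `top` and then `bottom` equals one transpose whose order
    // indexes through both.
    static AxisOrder combine(const AxisOrder &top, const AxisOrder &bottom)
    {
      AxisOrder combined(top.size());
      for (std::size_t i = 0; i < top.size(); ++i)
        combined[i] = top[bottom[i]];
      return combined;
    }

    static bool is_identity(const AxisOrder &order)
    {
      for (std::size_t i = 0; i < order.size(); ++i)
        if (order[i] != i)
          return false;
      return true;
    }

    int main()
    {
      const AxisOrder to_nhwc{0, 2, 3, 1}, to_nchw{0, 3, 1, 2};
      // A transpose followed by its inverse cancels: both nodes can be removed.
      assert(is_identity(combine(to_nhwc, to_nchw)));
      // A non-trivial result would be re-emitted as a single TransposeOp.
      assert(!is_identity(combine(to_nhwc, to_nhwc)));
      return 0;
    }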
diff --git a/compiler/nnc/passes/optimizations/DeadCodeElimination.cpp b/compiler/nnc/passes/optimizations/DeadCodeElimination.cpp
index b89dca1b7..371d9703f 100644
--- a/compiler/nnc/passes/optimizations/DeadCodeElimination.cpp
+++ b/compiler/nnc/passes/optimizations/DeadCodeElimination.cpp
@@ -33,8 +33,8 @@ nnc::PassData nnc::DeadCodeElimination::run(PassData data)
return;
bool has_no_uses =
- std::all_of(op->getOutputs().cbegin(), op->getOutputs().cend(),
- [](const Operation::Output &output) { return output.getUses().empty(); });
+ std::all_of(op->getOutputs().cbegin(), op->getOutputs().cend(),
+ [](const Operation::Output &output) { return output.getUses().empty(); });
if (has_no_uses)
{
diff --git a/compiler/nnc/passes/optimizations/FuseArithmeticOps.cpp b/compiler/nnc/passes/optimizations/FuseArithmeticOps.cpp
index 91686ef74..d69439fc3 100644
--- a/compiler/nnc/passes/optimizations/FuseArithmeticOps.cpp
+++ b/compiler/nnc/passes/optimizations/FuseArithmeticOps.cpp
@@ -215,10 +215,10 @@ bool sinkAddThroughMul(Graph *g)
// Create new operations
auto old_add_input = old_add_op->getInput(0);
auto new_mul_op =
- g->copyOpWithInputs(old_mul_op, {old_add_input, ols_mul_const_op->getOutput(0)});
+ g->copyOpWithInputs(old_mul_op, {old_add_input, ols_mul_const_op->getOutput(0)});
auto new_add_const_op = mergeConstantOps(g, old_add_const_op, ols_mul_const_op, OpType::mul);
auto new_add_op =
- g->copyOpWithInputs(old_add_op, {new_mul_op->getOutput(0), new_add_const_op->getOutput(0)});
+ g->copyOpWithInputs(old_add_op, {new_mul_op->getOutput(0), new_add_const_op->getOutput(0)});
// Replace old mul with new add and remove old nodes
g->replaceNode(old_mul_op, new_add_op);
diff --git a/compiler/nnc/passes/transformations/DataFormatSwitcher.cpp b/compiler/nnc/passes/transformations/DataFormatSwitcher.cpp
index 8ff842660..fcdbba878 100644
--- a/compiler/nnc/passes/transformations/DataFormatSwitcher.cpp
+++ b/compiler/nnc/passes/transformations/DataFormatSwitcher.cpp
@@ -27,7 +27,7 @@
namespace nnc
{
DataFormatSwitcher::DataFormatSwitcher(const mir::DataFormat target_format)
- : _target_format(target_format)
+ : _target_format(target_format)
{
}
@@ -89,10 +89,10 @@ mir::Operation::Output *DataFormatSwitcher::insertTransposeBefore(mir::Operation
mir::Operation::Output *new_out;
if (_target_format == mir::DataFormat::NHWC)
new_out = _graph->create<mir::ops::TransposeOp>(out, std::vector<std::size_t>{0, 2, 3, 1})
- ->getOutput(0); // NCHW -> NHWC
+ ->getOutput(0); // NCHW -> NHWC
else
new_out = _graph->create<mir::ops::TransposeOp>(out, std::vector<std::size_t>{0, 3, 1, 2})
- ->getOutput(0); // NHWC -> NCHW
+ ->getOutput(0); // NHWC -> NCHW
if (out->getType().isQuantized())
new_out->setQuantization(out->getType().getQuantization());
return new_out;
@@ -103,10 +103,10 @@ mir::Operation::Output *DataFormatSwitcher::insertTransposeAfter(mir::Operation:
mir::Operation::Output *new_out;
if (_target_format == mir::DataFormat::NHWC)
new_out = _graph->create<mir::ops::TransposeOp>(out, std::vector<std::size_t>{0, 3, 1, 2})
- ->getOutput(0); // NHWC -> NCHW
+ ->getOutput(0); // NHWC -> NCHW
else
new_out = _graph->create<mir::ops::TransposeOp>(out, std::vector<std::size_t>{0, 2, 3, 1})
- ->getOutput(0); // NCHW -> NHWC
+ ->getOutput(0); // NCHW -> NHWC
if (out->getType().isQuantized())
new_out->setQuantization(out->getType().getQuantization());
return new_out;
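The permutations hard-coded above are the standard layout conversions: {0, 2, 3, 1} reorders N,C,H,W into N,H,W,C, and {0, 3, 1, 2} is its inverse. Under the same output-axis-i-takes-input-axis-perm[i] convention as in the pass, a shape-level check (the sizes are illustrative):

    #include <cassert>
    #include <cstddef>
    #include <vector>

    // Output dimension i of a transpose is input dimension perm[i].
    static std::vector<int> permute(const std::vector<int> &shape,
                                    const std::vector<std::size_t> &perm)
    {
      std::vector<int> out(shape.size());
      for (std::size_t i = 0; i < perm.size(); ++i)
        out[i] = shape[perm[i]];
      return out;
    }

    int main()
    {
      const std::vector<int> nchw{1, 16, 32, 32}; // N, C, H, W
      const std::vector<std::size_t> to_nhwc{0, 2, 3, 1}, to_nchw{0, 3, 1, 2};
      const std::vector<int> nhwc{1, 32, 32, 16}; // N, H, W, C

      assert(permute(nchw, to_nhwc) == nhwc); // NCHW -> NHWC
      assert(permute(nhwc, to_nchw) == nchw); // and back again
      return 0;
    }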
diff --git a/compiler/nnc/passes/transformations/LowerConv2D.cpp b/compiler/nnc/passes/transformations/LowerConv2D.cpp
index 9e32978bc..9ae20527d 100644
--- a/compiler/nnc/passes/transformations/LowerConv2D.cpp
+++ b/compiler/nnc/passes/transformations/LowerConv2D.cpp
@@ -36,11 +36,11 @@ static void lowerConv2D(mir::Graph *graph, mir::ops::Conv2DOp *op)
// [O, H, W, I / M] == [M, H, W, 1] -> [H, W, M, 1]
std::vector<std::size_t> perm{1, 2, 0, 3};
mir::Operation::Output *new_kernel =
- graph->create<mir::ops::TransposeOp>(kernel, perm)->getOutput(0);
+ graph->create<mir::ops::TransposeOp>(kernel, perm)->getOutput(0);
mir::Conv2DOpAttributes attributes = op->getAttributes();
attributes.num_groups = 1;
mir::Operation::Output *new_result =
- graph->create<mir::ops::DepthwiseConv2DOp>(input, new_kernel, attributes)->getOutput(0);
+ graph->create<mir::ops::DepthwiseConv2DOp>(input, new_kernel, attributes)->getOutput(0);
graph->replaceNode(op, new_result->getNode());
}
}
diff --git a/compiler/nnc/tests/acl_soft_backend/AclCppOperations.cpp b/compiler/nnc/tests/acl_soft_backend/AclCppOperations.cpp
index 4ae020355..d39c9dcb5 100644
--- a/compiler/nnc/tests/acl_soft_backend/AclCppOperations.cpp
+++ b/compiler/nnc/tests/acl_soft_backend/AclCppOperations.cpp
@@ -157,7 +157,7 @@ static void runAclSystemTest(const string &name)
// Copy the model input HDF5 file to the remote device.
ASSERT_TRUE(
- copyToOdroid(binDir + "/" + name + "/in_" + name + "_caffe.hdf5", dir_name + "/in.hdf5"));
+ copyToOdroid(binDir + "/" + name + "/in_" + name + "_caffe.hdf5", dir_name + "/in.hdf5"));
// Switch to the artifact directory on the remote device and run the artifact.
ASSERT_TRUE(runOnOdroid("cd " + dir_name + "; ./nnc_test"));
diff --git a/compiler/nnc/tests/acl_soft_backend/artifact_cmake/main.cpp b/compiler/nnc/tests/acl_soft_backend/artifact_cmake/main.cpp
index c326b390b..ea4bddac8 100644
--- a/compiler/nnc/tests/acl_soft_backend/artifact_cmake/main.cpp
+++ b/compiler/nnc/tests/acl_soft_backend/artifact_cmake/main.cpp
@@ -31,12 +31,13 @@ static unique_ptr<char[]> getTensorData(CLTensor &tensor)
Iterator i(&tensor, window);
char *ptr = &buf[0];
- execute_window_loop(window,
- [&i, &ptr](const Coordinates &) {
- memcpy(ptr, i.ptr(), sizeof(float));
- ptr += sizeof(float);
- },
- i);
+ execute_window_loop(
+ window,
+ [&i, &ptr](const Coordinates &) {
+ memcpy(ptr, i.ptr(), sizeof(float));
+ ptr += sizeof(float);
+ },
+ i);
tensor.unmap();
return buf;
@@ -52,12 +53,13 @@ static void readTensor(CLTensor &tensor, H5::DataSet &dataset)
Iterator i(&tensor, window);
char *ptr = &buf[0];
- execute_window_loop(window,
- [&i, &ptr](const Coordinates &) {
- memcpy(i.ptr(), ptr, sizeof(float));
- ptr += sizeof(float);
- },
- i);
+ execute_window_loop(
+ window,
+ [&i, &ptr](const Coordinates &) {
+ memcpy(i.ptr(), ptr, sizeof(float));
+ ptr += sizeof(float);
+ },
+ i);
tensor.unmap();
}
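execute_window_loop takes a lambda between the window and the iterator, and the reformat breaks the call after '(' so the multi-line lambda indents as one block. The shape of the call, with for_each_window standing in for the ACL routine (a simplification; the real function drives arm_compute iterators):

    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Stand-in with execute_window_loop's shape: invoke a callback per element.
    template <typename F> static void for_each_window(std::size_t count, F &&fn)
    {
      for (std::size_t i = 0; i < count; ++i)
        fn(i);
    }

    int main()
    {
      std::vector<float> src{1.0f, 2.0f, 3.0f}, dst(src.size());
      char *ptr = reinterpret_cast<char *>(dst.data());

      // Breaking after '(' lets the lambda body indent as a unit instead of
      // hanging under the first argument.
      for_each_window(
        src.size(),
        [&src, &ptr](std::size_t i) {
          std::memcpy(ptr, &src[i], sizeof(float));
          ptr += sizeof(float);
        });

      return dst == src ? 0 : 1;
    }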
diff --git a/compiler/nnc/tests/soft_backend/CompileCPP.cpp b/compiler/nnc/tests/soft_backend/CompileCPP.cpp
index 63aeb4a1b..4ede0cf05 100644
--- a/compiler/nnc/tests/soft_backend/CompileCPP.cpp
+++ b/compiler/nnc/tests/soft_backend/CompileCPP.cpp
@@ -101,7 +101,7 @@ int main()
string target_compiler = "g++ -Wall --std=c++11";
string compiler_command =
- target_compiler + " -I" + output_dir + " " + main_path + " " + code_path;
+ target_compiler + " -I" + output_dir + " " + main_path + " " + code_path;
// call compiler
int res = system(compiler_command.c_str());
diff --git a/compiler/nnc/unittests/acl_backend/DOMToText.cpp b/compiler/nnc/unittests/acl_backend/DOMToText.cpp
index be0e6713c..aaf0c2055 100644
--- a/compiler/nnc/unittests/acl_backend/DOMToText.cpp
+++ b/compiler/nnc/unittests/acl_backend/DOMToText.cpp
@@ -148,9 +148,9 @@ TEST(acl_backend_dom_to_text, ArtifactUnaryExpr)
const char *var_name = "id";
shared_ptr<ArtifactId> var = AF::id(var_name);
pair<ArtifactUnOp, const char *> test_cases[] = {
- {ArtifactUnOp::preIncr, "++id"}, {ArtifactUnOp::preDecr, "--id"},
- {ArtifactUnOp::heapNew, "new id"}, {ArtifactUnOp::heapFree, "delete id"},
- {ArtifactUnOp::postIncr, "id++"}, {ArtifactUnOp::postDecr, "id--"}};
+ {ArtifactUnOp::preIncr, "++id"}, {ArtifactUnOp::preDecr, "--id"},
+ {ArtifactUnOp::heapNew, "new id"}, {ArtifactUnOp::heapFree, "delete id"},
+ {ArtifactUnOp::postIncr, "id++"}, {ArtifactUnOp::postDecr, "id--"}};
for (auto test : test_cases)
{
@@ -181,14 +181,14 @@ TEST(acl_backend_dom_to_text, ArtifactBinaryExpr)
shared_ptr<ArtifactId> op2 = AF::id(op2_name);
pair<ArtifactBinOp, const char *> test_cases[] = {
- {ArtifactBinOp::eq, "a == b"}, {ArtifactBinOp::notEq, "a != b"},
- {ArtifactBinOp::less, "a < b"}, {ArtifactBinOp::lessOrEq, "a <= b"},
- {ArtifactBinOp::great, "a > b"}, {ArtifactBinOp::greatOrEq, "a >= b"},
- {ArtifactBinOp::assign, "a = b"}, {ArtifactBinOp::plus, "a + b"},
- {ArtifactBinOp::minus, "a - b"}, {ArtifactBinOp::mult, "a * b"},
- {ArtifactBinOp::div, "a / b"}, {ArtifactBinOp::plusAssign, "a += b"},
- {ArtifactBinOp::minusAssign, "a -= b"}, {ArtifactBinOp::multAssign, "a *= b"},
- {ArtifactBinOp::divAssign, "a /= b"}};
+ {ArtifactBinOp::eq, "a == b"}, {ArtifactBinOp::notEq, "a != b"},
+ {ArtifactBinOp::less, "a < b"}, {ArtifactBinOp::lessOrEq, "a <= b"},
+ {ArtifactBinOp::great, "a > b"}, {ArtifactBinOp::greatOrEq, "a >= b"},
+ {ArtifactBinOp::assign, "a = b"}, {ArtifactBinOp::plus, "a + b"},
+ {ArtifactBinOp::minus, "a - b"}, {ArtifactBinOp::mult, "a * b"},
+ {ArtifactBinOp::div, "a / b"}, {ArtifactBinOp::plusAssign, "a += b"},
+ {ArtifactBinOp::minusAssign, "a -= b"}, {ArtifactBinOp::multAssign, "a *= b"},
+ {ArtifactBinOp::divAssign, "a /= b"}};
for (auto test : test_cases)
{
@@ -286,12 +286,12 @@ TEST(acl_backend_dom_to_text, ArtifactForLoop)
shared_ptr<ArtifactVariable> iter = AF::var(var_type, var_name, {}, {AF::lit("0")});
shared_ptr<ArtifactExpr> step =
- AF::bin(ArtifactBinOp::plusAssign, AF::id(var_name), AF::lit("1"));
+ AF::bin(ArtifactBinOp::plusAssign, AF::id(var_name), AF::lit("1"));
shared_ptr<ArtifactExpr> cond =
- AF::bin(ArtifactBinOp::lessOrEq, AF::id(var_name), AF::lit("123"));
+ AF::bin(ArtifactBinOp::lessOrEq, AF::id(var_name), AF::lit("123"));
shared_ptr<ArtifactBinaryExpr> expr =
- AF::bin(ArtifactBinOp::plusAssign, AF::id("hello"), AF::id("world"));
+ AF::bin(ArtifactBinOp::plusAssign, AF::id("hello"), AF::id("world"));
ArtifactForLoop loop(iter, cond, step);
@@ -308,10 +308,10 @@ TEST(acl_backend_dom_to_text, ArtifactIf)
const char *var_name = "i";
shared_ptr<ArtifactExpr> cond =
- AF::bin(ArtifactBinOp::lessOrEq, AF::id(var_name), AF::lit("123"));
+ AF::bin(ArtifactBinOp::lessOrEq, AF::id(var_name), AF::lit("123"));
shared_ptr<ArtifactBinaryExpr> expr =
- AF::bin(ArtifactBinOp::plusAssign, AF::id("hello"), AF::id("world"));
+ AF::bin(ArtifactBinOp::plusAssign, AF::id("hello"), AF::id("world"));
ArtifactIf if_stmt(cond);
@@ -415,7 +415,7 @@ static shared_ptr<ArtifactClassVariable> createClsVariable(ArtifactClass &cls, c
list<shared_ptr<ArtifactExpr>> dims{dim1, dim2};
list<shared_ptr<ArtifactExpr>> initializers{AF::lit("123")};
shared_ptr<ArtifactClassVariable> var_decl =
- cls.var(is_public, var_type, var_name, dims, initializers);
+ cls.var(is_public, var_type, var_name, dims, initializers);
return var_decl;
}
@@ -483,8 +483,8 @@ TEST(acl_backend_dom_to_text, ArtifactModule)
const char *code_prefix = "#include \"module.h\"\n\n#include <list>\n\n#include \"bar.h\"\n\n";
const char *code_suffix = "\nClass::Class() {\n}\n\n";
- string ref_data = string(code_prefix) +
- string(AclArtifactUtilities, sizeof(AclArtifactUtilities)) + code_suffix;
+ string ref_data =
+ string(code_prefix) + string(AclArtifactUtilities, sizeof(AclArtifactUtilities)) + code_suffix;
m.accept(&code_gen);
ASSERT_EQ(code_out.str(), ref_data);
diff --git a/compiler/nnc/unittests/acl_backend/MIRToDOM.cpp b/compiler/nnc/unittests/acl_backend/MIRToDOM.cpp
index a9b36a145..f411fde42 100644
--- a/compiler/nnc/unittests/acl_backend/MIRToDOM.cpp
+++ b/compiler/nnc/unittests/acl_backend/MIRToDOM.cpp
@@ -117,12 +117,12 @@ void checkDomIncludes(const ArtifactModule &m)
// check ordinary includes, like '#include "artifact_data.h"'
checkHeadersSetsEqual(
- m.headerIncludes(),
- {"arm_compute/core/Types.h", "arm_compute/runtime/BlobLifetimeManager.h",
- "arm_compute/runtime/CL/CLBufferAllocator.h", "arm_compute/runtime/CL/CLFunctions.h",
- "arm_compute/runtime/CL/CLScheduler.h", "arm_compute/runtime/MemoryManagerOnDemand.h",
- "arm_compute/runtime/PoolManager.h"},
- "system header includes diverged");
+ m.headerIncludes(),
+ {"arm_compute/core/Types.h", "arm_compute/runtime/BlobLifetimeManager.h",
+ "arm_compute/runtime/CL/CLBufferAllocator.h", "arm_compute/runtime/CL/CLFunctions.h",
+ "arm_compute/runtime/CL/CLScheduler.h", "arm_compute/runtime/MemoryManagerOnDemand.h",
+ "arm_compute/runtime/PoolManager.h"},
+ "system header includes diverged");
checkHeadersSetsEqual(m.sourceSysIncludes(), {}, "system source includes diverged");
}
@@ -287,10 +287,10 @@ TEST(acl_backend_mir_to_dom, conv2d)
Graph g;
OpConstructor op_generator =
- [kernel_tensor](mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
- auto kernel = g.create<mir::ops::ConstantOp>(kernel_tensor)->getOutput(0);
- return g.create<mir::ops::Conv2DOp>(inputs[0], kernel, mir::Conv2DOpAttributes());
- };
+ [kernel_tensor](mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
+ auto kernel = g.create<mir::ops::ConstantOp>(kernel_tensor)->getOutput(0);
+ return g.create<mir::ops::Conv2DOp>(inputs[0], kernel, mir::Conv2DOpAttributes());
+ };
vector<Shape> input_shapes{{1, 10, 10, channels}};
@@ -312,11 +312,11 @@ TEST(acl_backend_mir_to_dom, depthwise_conv)
Graph g;
OpConstructor op_generator =
- [kernel_tensor](mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
- Conv2DOpAttributes attributes;
- auto kernel = g.create<mir::ops::ConstantOp>(kernel_tensor)->getOutput(0);
- return g.create<mir::ops::DepthwiseConv2DOp>(inputs[0], kernel, attributes);
- };
+ [kernel_tensor](mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
+ Conv2DOpAttributes attributes;
+ auto kernel = g.create<mir::ops::ConstantOp>(kernel_tensor)->getOutput(0);
+ return g.create<mir::ops::DepthwiseConv2DOp>(inputs[0], kernel, attributes);
+ };
vector<Shape> input_shapes{{1, 10, 10, channels}};
diff --git a/compiler/nnc/unittests/optimizations/SinkTest.cpp b/compiler/nnc/unittests/optimizations/SinkTest.cpp
index 8c5b2767e..be171d1cb 100644
--- a/compiler/nnc/unittests/optimizations/SinkTest.cpp
+++ b/compiler/nnc/unittests/optimizations/SinkTest.cpp
@@ -103,7 +103,7 @@ TEST(OptPass, sinkTrConcat)
Operation *tr1 = g.create<ops::TransposeOp>(in1->getOutput(0), vector<size_t>{0, 3, 1, 2});
Operation *tr2 = g.create<ops::TransposeOp>(in2->getOutput(0), vector<size_t>{0, 3, 1, 2});
Operation *conc =
- g.create<ops::ConcatOp>(vector<Operation::Output *>{tr1->getOutput(0), tr2->getOutput(0)}, 1);
+ g.create<ops::ConcatOp>(vector<Operation::Output *>{tr1->getOutput(0), tr2->getOutput(0)}, 1);
Operation *tanh = g.create<ops::TanhOp>(conc->getOutput(0));
Operation *out = g.create<ops::OutputOp>(tanh->getOutput(0));
(void)out;
@@ -141,7 +141,7 @@ TEST(OptPass, sinkReluConcat)
Operation *relu1 = g.create<ops::ReluOp>(in1->getOutput(0));
Operation *relu2 = g.create<ops::ReluOp>(in2->getOutput(0));
Operation *conc = g.create<ops::ConcatOp>(
- vector<Operation::Output *>{relu1->getOutput(0), relu2->getOutput(0)}, 1);
+ vector<Operation::Output *>{relu1->getOutput(0), relu2->getOutput(0)}, 1);
Operation *tanh = g.create<ops::TanhOp>(conc->getOutput(0));
Operation *out = g.create<ops::OutputOp>(tanh->getOutput(0));
(void)out;
diff --git a/compiler/nnc/unittests/soft_backend/CPPOperations.cpp b/compiler/nnc/unittests/soft_backend/CPPOperations.cpp
index 508ee954d..e593333fa 100644
--- a/compiler/nnc/unittests/soft_backend/CPPOperations.cpp
+++ b/compiler/nnc/unittests/soft_backend/CPPOperations.cpp
@@ -120,11 +120,10 @@ namespace
* @brief Creates graph with one operation generated by opGen function and returns this operation
* node
*/
-mir::Operation *
-fillGraph(mir::Graph &g,
- const function<mir::Operation *(mir::Graph &g, vector<mir::Operation::Output *> &inputs)>
- &op_gen,
- const vector<unique_ptr<mir::TensorVariant>> &input_ntensors)
+mir::Operation *fillGraph(
+ mir::Graph &g,
+ const function<mir::Operation *(mir::Graph &g, vector<mir::Operation::Output *> &inputs)> &op_gen,
+ const vector<unique_ptr<mir::TensorVariant>> &input_ntensors)
{
// Create operation inputs.
vector<mir::Operation::Output *> inputs;
@@ -295,8 +294,8 @@ void compareResults(const mir::TensorVariant &ref_nnc_tensor, const Tensor &test
float ref_data = mir::Tensor<float>(ref_nnc_tensor).at(nnc_idx);
float test_data = test_art_tensor.at(artifact_idx);
ASSERT_TRUE(areFloatsNear(ref_data, test_data, 32, 1e-5))
- << "Tensor element " << nnc_idx << " diverged, reference: " << ref_data
- << " test result: " << test_data;
+ << "Tensor element " << nnc_idx << " diverged, reference: " << ref_data
+ << " test result: " << test_data;
}
}
@@ -306,10 +305,10 @@ void compareResults(const mir::TensorVariant &ref_nnc_tensor, const Tensor &test
*/
template <typename TestFunc, typename... Args>
void createAndRunTestGraph(
- function<mir::Operation *(mir::Graph &, const std::vector<mir::Operation::Output *> &inputs)>
- op_generator,
- TestFunc artifactOperation, const vector<unique_ptr<mir::TensorVariant>> &input_ntensors,
- Args &... input_atensors)
+ function<mir::Operation *(mir::Graph &, const std::vector<mir::Operation::Output *> &inputs)>
+ op_generator,
+ TestFunc artifactOperation, const vector<unique_ptr<mir::TensorVariant>> &input_ntensors,
+ Args &... input_atensors)
{
mir::Graph g;
mir::Operation *actual_operation = fillGraph(g, op_generator, input_ntensors);
@@ -657,7 +656,7 @@ TEST(cpp_operations_test, resize_NN_test)
auto op_generator = [&res_shape](mir::Graph &g,
const std::vector<mir::Operation::Output *> &inputs) {
return g.create<mir::ops::ResizeOp>(
- inputs[0], mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, res_shape);
+ inputs[0], mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, res_shape);
};
createAndRunTestGraph(op_generator, resize, input_ntensors, input_atensor);
@@ -668,7 +667,7 @@ TEST(cpp_operations_test, resize_NN_test_scales)
{
cout << "\n";
std::vector<float> test_scales[] = {
- {1, 2, 2, 1}, {1, 2, 3, 1}, {1, 3, 2, 1}, {1, 2.5, 2, 1}, {1, 3, 9, 1}};
+ {1, 2, 2, 1}, {1, 2, 3, 1}, {1, 3, 2, 1}, {1, 2.5, 2, 1}, {1, 3, 9, 1}};
for (const std::vector<float> &scales : test_scales)
{
vector<int> input_shape_data{1, 4, 4, 1};
@@ -678,7 +677,7 @@ TEST(cpp_operations_test, resize_NN_test_scales)
auto op_generator = [&scales](mir::Graph &g,
const std::vector<mir::Operation::Output *> &inputs) {
return g.create<mir::ops::ResizeOp>(
- inputs[0], mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, scales);
+ inputs[0], mir::ops::ResizeOp::ResizeMethod::nearestNeighbor, scales);
};
createAndRunTestGraph(op_generator, resize, input_ntensors, input_atensor);
}
@@ -711,10 +710,10 @@ TEST(cpp_operations_test, avgpool)
for (const auto include_pad : {false, true})
{
attributes.include_pad = include_pad;
- auto op_generator = [&attributes](
- mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
- return g.create<mir::ops::AvgPool2DOp>(inputs[0], attributes);
- };
+ auto op_generator =
+ [&attributes](mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
+ return g.create<mir::ops::AvgPool2DOp>(inputs[0], attributes);
+ };
createAndRunTestGraph(op_generator, avgPool, input_ntensors, input_atensor);
}
@@ -742,8 +741,9 @@ TEST(cpp_operations_test, maxpool)
vector<unique_ptr<mir::TensorVariant>> input_ntensors(1);
fillTensors(input_ntensors[0], input_atensor, shape_data, 1.0f);
- auto op_generator = [&window_size, &strides](
- mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
+ auto op_generator = [&window_size,
+ &strides](mir::Graph &g,
+ const std::vector<mir::Operation::Output *> &inputs) {
mir::MaxPool2DOpAttributes attributes;
attributes.window = window_size;
attributes.strides = strides;
@@ -838,7 +838,7 @@ TEST(cpp_operations_test, reduceMeanTst)
vector<unique_ptr<mir::TensorVariant>> input_ntensors(1);
fillTensors(input_ntensors[0], input_atensor, input_shape_data, 1.0f);
auto op_generator = [&axis_list, keep_dims](
- mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
+ mir::Graph &g, const std::vector<mir::Operation::Output *> &inputs) {
auto op = g.create<mir::ops::ReduceMeanOp>(inputs[0], axis_list, keep_dims);
return op;
};
@@ -873,7 +873,8 @@ TEST(cpp_operations_test, slice4d)
vector<int> shape_data{5, 30, 40, 12};
vector<int> starts[] = {{0, 0, 0, 0}, {1, 1, 1, 1}, {1, 0, 1, 0}, {0, 1, 1, 0}};
vector<int> sizes[] = {
- {-1, -1, -1, -1}, {4, -1, 10, -1},
+ {-1, -1, -1, -1},
+ {4, -1, 10, -1},
};
for (auto st : starts)
{
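
Every op_generator reformatted in this file has the same shape: take the graph plus its input outputs, create a single mir operation, and return it. A minimal sketch of the pattern (TanhOp is chosen only for brevity; the hunks above use ResizeOp, AvgPool2DOp, MaxPool2DOp, and ReduceMeanOp the same way):

#include <mir/Graph.h>
#include <mir/ops/TanhOp.h>

#include <vector>

// Build a one-operation graph: consume the first input, emit one Tanh node.
auto op_generator = [](mir::Graph &g,
                       const std::vector<mir::Operation::Output *> &inputs) {
  return g.create<mir::ops::TanhOp>(inputs[0]);
};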
diff --git a/compiler/nnc/unittests/support/CommandLineTest.cpp b/compiler/nnc/unittests/support/CommandLineTest.cpp
index 73f77aa20..993c4086f 100644
--- a/compiler/nnc/unittests/support/CommandLineTest.cpp
+++ b/compiler/nnc/unittests/support/CommandLineTest.cpp
@@ -69,8 +69,8 @@ Option<int32_t> NNegOpt(optname("-neg_val"),
// test option with default negative value
Option<int32_t>
- NDefaultNegOpt(optname("-default_neg_val"),
- overview("description of integer option with default negative value"), -33);
+ NDefaultNegOpt(optname("-default_neg_val"),
+ overview("description of integer option with default negative value"), -33);
// test option with positive values
Option<uint32_t> NPosOpt(optname("-pos_val"),
overview("description of integer option with positive value"), 1,
@@ -124,28 +124,28 @@ TEST(SUPPORT_NNC, verify_cl_options)
{
// create command line
const char *argv[] = {
- "CLTest", // program name
- // string options
- "-m", "multiopt_value", // second name for option with several names
- "--single", "single_value", // option with single name
- "-several_separators:SOME_VALUE1,SOME_VALUE2", // test option with several separators
- "--one_separarot=AAA_VALUE", // test option with one separator
- "-default_val_opt", // test option with default value
- "--optional_opt", "/home/guest/tmp", // test optional option
- "-valid_opt", "value2", // test options with defined values
- // integer options
- "-neg_val", "-42", // test negative value for integer option
- "-default_neg_val", // test integer option with default value
- "-pos_val", "33", // test positive value for integer option
- // char options
- "-char-opt", "b", "-dash_opt", "-",
- // bool options
- "-bool_opt=false", "-bool-opt2",
- // vector of strings options
- "-vec_opt1", "1", "c", "222", "ABC", "857", "-vec_opt2", "--vec_opt_with_vals", "abc", "123",
- "xxx", "abc", "xxx",
- // grouped options
- "-group_opt1", "-group_opt2", "abc", "-group_opt3", "11", nullptr};
+ "CLTest", // program name
+ // string options
+ "-m", "multiopt_value", // second name for option with several names
+ "--single", "single_value", // option with single name
+ "-several_separators:SOME_VALUE1,SOME_VALUE2", // test option with several separators
+ "--one_separarot=AAA_VALUE", // test option with one separator
+ "-default_val_opt", // test option with default value
+ "--optional_opt", "/home/guest/tmp", // test optional option
+ "-valid_opt", "value2", // test options with defined values
+ // integer options
+ "-neg_val", "-42", // test negative value for integer option
+ "-default_neg_val", // test integer option with default value
+ "-pos_val", "33", // test positive value for integer option
+ // char options
+ "-char-opt", "b", "-dash_opt", "-",
+ // bool options
+ "-bool_opt=false", "-bool-opt2",
+ // vector of strings options
+ "-vec_opt1", "1", "c", "222", "ABC", "857", "-vec_opt2", "--vec_opt_with_vals", "abc", "123",
+ "xxx", "abc", "xxx",
+ // grouped options
+ "-group_opt1", "-group_opt2", "abc", "-group_opt3", "11", nullptr};
int argc = (sizeof(argv) / sizeof(argv[0])) - 1;
// Parsing must fail if an option is not passed while other options in its group are
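
Two details of the argv table above are easy to miss: the trailing nullptr mimics the null terminator the C runtime appends to the real argv, and the '- 1' in the argc computation keeps that terminator out of the argument count.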
diff --git a/compiler/nnc/unittests/transformations/Switcher.cpp b/compiler/nnc/unittests/transformations/Switcher.cpp
index 049ac44cd..2f4793369 100644
--- a/compiler/nnc/unittests/transformations/Switcher.cpp
+++ b/compiler/nnc/unittests/transformations/Switcher.cpp
@@ -88,7 +88,7 @@ TEST(TRANSFORMATIONS, Switcher_DWConv2D_NHWC2NCHW)
attributes.padding_before = {67, 123};
attributes.padding_after = {32, 356};
auto *dw_conv =
- g.create<mir::ops::DepthwiseConv2DOp>(input->getOutput(0), kernel->getOutput(0), attributes);
+ g.create<mir::ops::DepthwiseConv2DOp>(input->getOutput(0), kernel->getOutput(0), attributes);
auto *output = g.create<mir::ops::OutputOp>(dw_conv->getOutput(0));
@@ -138,7 +138,7 @@ TEST(TRANSFORMATIONS, Switcher_DeConv2D_NHWC2NCHW)
attributes.padding_before = {31, 72};
attributes.padding_after = {32, 71};
auto *deconv =
- g.create<mir::ops::DeConv2DOp>(input->getOutput(0), kernel->getOutput(0), attributes);
+ g.create<mir::ops::DeConv2DOp>(input->getOutput(0), kernel->getOutput(0), attributes);
auto *output = g.create<mir::ops::OutputOp>(deconv->getOutput(0));
diff --git a/compiler/nnkit-caffe/backend/CMakeLists.txt b/compiler/nnkit-caffe/backend/CMakeLists.txt
index b18aa4f11..567d95438 100644
--- a/compiler/nnkit-caffe/backend/CMakeLists.txt
+++ b/compiler/nnkit-caffe/backend/CMakeLists.txt
@@ -1,3 +1,2 @@
add_library(nnkit_caffe_backend SHARED Module.cpp)
target_link_libraries(nnkit_caffe_backend nnkit_support_caffe)
-target_link_libraries(nnkit_caffe_backend stdex)
diff --git a/compiler/nnkit-caffe/backend/Module.cpp b/compiler/nnkit-caffe/backend/Module.cpp
index cb24a4e60..0bd39125f 100644
--- a/compiler/nnkit-caffe/backend/Module.cpp
+++ b/compiler/nnkit-caffe/backend/Module.cpp
@@ -17,11 +17,12 @@
#include "nnkit/support/caffe/Backend.h"
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+
+#include <memory>
extern "C" std::unique_ptr<nnkit::Backend> make_backend(const nnkit::CmdlineArguments &args)
{
- using stdex::make_unique;
+ using std::make_unique;
auto net = make_unique<::caffe::Net<float>>(args.at(0), caffe::TEST);
diff --git a/compiler/nnkit-intf/tensor/include/nnkit/TensorContext.h b/compiler/nnkit-intf/tensor/include/nnkit/TensorContext.h
index 07d8d154c..87056dd64 100644
--- a/compiler/nnkit-intf/tensor/include/nnkit/TensorContext.h
+++ b/compiler/nnkit-intf/tensor/include/nnkit/TensorContext.h
@@ -37,8 +37,8 @@ struct TensorContext
const nncc::core::ADT::tensor::Reader<T> &)>;
template <typename T>
- using TypedAccessor = std::function<void(const TensorContext &, uint32_t n,
- nncc::core::ADT::tensor::Accessor<T> &)>;
+ using TypedAccessor =
+ std::function<void(const TensorContext &, uint32_t n, nncc::core::ADT::tensor::Accessor<T> &)>;
virtual ~TensorContext() = default;
diff --git a/compiler/nnkit-misc/backend/CMakeLists.txt b/compiler/nnkit-misc/backend/CMakeLists.txt
index d351d5ce5..327fbab3c 100644
--- a/compiler/nnkit-misc/backend/CMakeLists.txt
+++ b/compiler/nnkit-misc/backend/CMakeLists.txt
@@ -4,7 +4,6 @@ add_library(nnkit_support_backend STATIC ${SOURCES})
target_include_directories(nnkit_support_backend PUBLIC include)
target_link_libraries(nnkit_support_backend PUBLIC nnkit_intf_backend)
target_link_libraries(nnkit_support_backend PUBLIC dl)
-target_link_libraries(nnkit_support_backend PUBLIC stdex)
find_package(Threads QUIET)
diff --git a/compiler/nnkit-misc/backend/src/BackendPlugin.cpp b/compiler/nnkit-misc/backend/src/BackendPlugin.cpp
index 54b1fdc83..75e0763c4 100644
--- a/compiler/nnkit-misc/backend/src/BackendPlugin.cpp
+++ b/compiler/nnkit-misc/backend/src/BackendPlugin.cpp
@@ -17,7 +17,7 @@
#include "nnkit/BackendPlugin.h"
#include <cassert>
-#include <stdex/Memory.h>
+#include <memory>
#include <iostream>
// NOTE dlfcn.h is not a standard library
@@ -82,7 +82,7 @@ std::unique_ptr<BackendPlugin> make_backend_plugin(const std::string &path)
exit(1);
}
- return stdex::make_unique<BackendPlugin>(handle, entry);
+ return std::make_unique<BackendPlugin>(handle, entry);
}
} // namespace nnkit
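
The change above is the pattern for the rest of this commit's nnkit hunks: the project-local stdex::make_unique from <stdex/Memory.h> gives way to C++14's std::make_unique from <memory>, and the stdex link dependency drops out of the CMake files. A minimal sketch of the substitution, using an illustrative Plugin type rather than nnkit's own classes:

#include <memory>

struct Plugin
{
  explicit Plugin(int handle) : _handle(handle) {}
  int _handle;
};

int main()
{
  // Previously: stdex::make_unique<Plugin>(42), which needed <stdex/Memory.h>
  // plus a link against stdex. The standard helper below is a drop-in
  // replacement with no extra dependency.
  auto p = std::make_unique<Plugin>(42);
  return p->_handle == 42 ? 0 : 1;
}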
diff --git a/compiler/nnkit-mocotf/backend/Backend.cpp b/compiler/nnkit-mocotf/backend/Backend.cpp
index 4900684eb..598370635 100644
--- a/compiler/nnkit-mocotf/backend/Backend.cpp
+++ b/compiler/nnkit-mocotf/backend/Backend.cpp
@@ -17,13 +17,13 @@
#include "nnkit/support/moco/tf/Backend.h"
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <cassert>
extern "C" std::unique_ptr<nnkit::Backend> make_backend(const nnkit::CmdlineArguments &args)
{
- using stdex::make_unique;
+ using std::make_unique;
assert(args.size() == 2); // args.at[0] : *.pb path, args.at[1]: *.info path
diff --git a/compiler/nnkit-mocotf/backend/CMakeLists.txt b/compiler/nnkit-mocotf/backend/CMakeLists.txt
index 72e16c75a..3dcd7e564 100644
--- a/compiler/nnkit-mocotf/backend/CMakeLists.txt
+++ b/compiler/nnkit-mocotf/backend/CMakeLists.txt
@@ -1,3 +1,2 @@
add_library(nnkit_moco_tf_backend SHARED Backend.cpp)
target_link_libraries(nnkit_moco_tf_backend nnkit_support_moco_tf)
-target_link_libraries(nnkit_moco_tf_backend stdex)
diff --git a/compiler/nnkit-mocotf/requires.cmake b/compiler/nnkit-mocotf/requires.cmake
index 6949ec808..1461e8443 100644
--- a/compiler/nnkit-mocotf/requires.cmake
+++ b/compiler/nnkit-mocotf/requires.cmake
@@ -1,4 +1,3 @@
-require("stdex")
# To use "nnkit_support_tftestinfo"
require("tfinfo")
require("loco")
diff --git a/compiler/nnkit-mocotf/support/CMakeLists.txt b/compiler/nnkit-mocotf/support/CMakeLists.txt
index 76c7c04b1..1b20d946b 100644
--- a/compiler/nnkit-mocotf/support/CMakeLists.txt
+++ b/compiler/nnkit-mocotf/support/CMakeLists.txt
@@ -10,4 +10,3 @@ target_link_libraries(nnkit_support_moco_tf nnkit_support_tftestinfo)
target_link_libraries(nnkit_support_moco_tf locomotiv)
target_link_libraries(nnkit_support_moco_tf moco_tf_frontend)
target_link_libraries(nnkit_support_moco_tf loco)
-target_link_libraries(nnkit_support_moco_tf stdex)
diff --git a/compiler/nnkit-mocotf/support/src/Backend.cpp b/compiler/nnkit-mocotf/support/src/Backend.cpp
index 2d9e21fd7..89dd73271 100644
--- a/compiler/nnkit-mocotf/support/src/Backend.cpp
+++ b/compiler/nnkit-mocotf/support/src/Backend.cpp
@@ -25,11 +25,11 @@
#include <moco/tf/Frontend.h>
#include <moco/Names.h>
-#include <stdex/Memory.h>
#include <nncc/core/ADT/tensor/Buffer.h>
#include <nncc/core/ADT/tensor/LexicalLayout.h>
+#include <memory>
#include <utility> // std::move
#include <stdexcept>
@@ -116,7 +116,7 @@ Backend::Backend(const char *pb_path, const char *info_path)
// set member vars
_loco_graph = std::move(loco_graph);
- _sess = stdex::make_unique<locomotiv::Session>(_loco_graph.get());
+ _sess = std::make_unique<locomotiv::Session>(_loco_graph.get());
}
void Backend::prepare(const std::function<void(nnkit::TensorContext &)> &f)
@@ -131,7 +131,7 @@ void Backend::prepare(const std::function<void(nnkit::TensorContext &)> &f)
for (int n = 0; n < _inputs.size(); n++)
{
auto buf = make_buffer<float, LexicalLayout>(_inputs.at(n)->shape());
- buf_list.emplace_back(stdex::make_unique<nncc::core::ADT::tensor::Buffer<float>>(buf));
+ buf_list.emplace_back(std::make_unique<nncc::core::ADT::tensor::Buffer<float>>(buf));
}
// fill test input values
diff --git a/compiler/nnkit-mocotf/support/src/InputTensorContext.cpp b/compiler/nnkit-mocotf/support/src/InputTensorContext.cpp
index 98f500730..25ddc0982 100644
--- a/compiler/nnkit-mocotf/support/src/InputTensorContext.cpp
+++ b/compiler/nnkit-mocotf/support/src/InputTensorContext.cpp
@@ -37,7 +37,7 @@ void InputTensorContext::getMutableFloatTensor(uint32_t n,
}
void InputTensorContext::getConstFloatTensor(
- uint32_t n, const nnkit::TensorContext::TypedReader<float> &f) const
+ uint32_t n, const nnkit::TensorContext::TypedReader<float> &f) const
{
auto buf = _buffers.at(n).get();
f(*this, n, *buf);
diff --git a/compiler/nnkit-mocotf/support/src/InputTensorContext.h b/compiler/nnkit-mocotf/support/src/InputTensorContext.h
index bbb25adea..4100d229a 100644
--- a/compiler/nnkit-mocotf/support/src/InputTensorContext.h
+++ b/compiler/nnkit-mocotf/support/src/InputTensorContext.h
@@ -45,7 +45,7 @@ class InputTensorContext final : public TensorContext
public:
InputTensorContext(const ParsedTensors &parsed_tensors, const Buffers &buffers)
- : TensorContext(parsed_tensors), _buffers(buffers)
+ : TensorContext(parsed_tensors), _buffers(buffers)
{ /* empty */
}
diff --git a/compiler/nnkit-mocotf/support/src/OutputTensorContext.cpp b/compiler/nnkit-mocotf/support/src/OutputTensorContext.cpp
index 2b36fc67a..6ef1e4598 100644
--- a/compiler/nnkit-mocotf/support/src/OutputTensorContext.cpp
+++ b/compiler/nnkit-mocotf/support/src/OutputTensorContext.cpp
@@ -30,7 +30,7 @@ namespace tf
{
void OutputTensorContext::getConstFloatTensor(
- uint32_t n, const nnkit::TensorContext::TypedReader<float> &f) const
+ uint32_t n, const nnkit::TensorContext::TypedReader<float> &f) const
{ // for output
using nncc::core::ADT::tensor::LexicalLayout;
using nncc::core::ADT::tensor::make_overlay;
diff --git a/compiler/nnkit-mocotf/support/src/OutputTensorContext.h b/compiler/nnkit-mocotf/support/src/OutputTensorContext.h
index 8cb8d8bf0..f825729e9 100644
--- a/compiler/nnkit-mocotf/support/src/OutputTensorContext.h
+++ b/compiler/nnkit-mocotf/support/src/OutputTensorContext.h
@@ -43,7 +43,7 @@ class OutputTensorContext final : public TensorContext
{
public:
OutputTensorContext(const ParsedTensors &parsed_tensors, locomotiv::Session *sess)
- : TensorContext(parsed_tensors), _sess(sess)
+ : TensorContext(parsed_tensors), _sess(sess)
{ /* empty */
}
diff --git a/compiler/nnkit-onnxrt/backend/Backend.cpp b/compiler/nnkit-onnxrt/backend/Backend.cpp
index 9247fbf34..a6c62b7b3 100644
--- a/compiler/nnkit-onnxrt/backend/Backend.cpp
+++ b/compiler/nnkit-onnxrt/backend/Backend.cpp
@@ -17,13 +17,13 @@
#include "nnkit/support/onnx/Backend.h"
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <cassert>
extern "C" std::unique_ptr<nnkit::Backend> make_backend(const nnkit::CmdlineArguments &args)
{
assert(args.size() == 1); // args.at[0] : onnx file
- return stdex::make_unique<::nnkit::support::onnx::Backend>(args.at(0));
+ return std::make_unique<::nnkit::support::onnx::Backend>(args.at(0));
}
diff --git a/compiler/nnkit-onnxrt/backend/CMakeLists.txt b/compiler/nnkit-onnxrt/backend/CMakeLists.txt
index b00e5593d..ae462de8d 100644
--- a/compiler/nnkit-onnxrt/backend/CMakeLists.txt
+++ b/compiler/nnkit-onnxrt/backend/CMakeLists.txt
@@ -1,3 +1,2 @@
add_library(nnkit_onnx_backend SHARED Backend.cpp)
target_link_libraries(nnkit_onnx_backend nnkit_support_onnx)
-target_link_libraries(nnkit_onnx_backend stdex)
diff --git a/compiler/nnkit-onnxrt/requires.cmake b/compiler/nnkit-onnxrt/requires.cmake
index d370fc17c..be53ae74f 100644
--- a/compiler/nnkit-onnxrt/requires.cmake
+++ b/compiler/nnkit-onnxrt/requires.cmake
@@ -1,2 +1 @@
-require("stdex")
require("nnkit-intf")
diff --git a/compiler/nnkit-onnxrt/support/CMakeLists.txt b/compiler/nnkit-onnxrt/support/CMakeLists.txt
index 1b51d4ed8..3d3bb2671 100644
--- a/compiler/nnkit-onnxrt/support/CMakeLists.txt
+++ b/compiler/nnkit-onnxrt/support/CMakeLists.txt
@@ -5,6 +5,5 @@ set_target_properties(nnkit_support_onnx-1.4 PROPERTIES POSITION_INDEPENDENT_COD
target_include_directories(nnkit_support_onnx-1.4 PUBLIC include)
target_link_libraries(nnkit_support_onnx-1.4 nnkit_intf_backend)
target_link_libraries(nnkit_support_onnx-1.4 onnxruntime)
-target_link_libraries(nnkit_support_onnx-1.4 stdex)
add_library(nnkit_support_onnx ALIAS nnkit_support_onnx-1.4)
diff --git a/compiler/nnkit-onnxrt/support/include/nnkit/support/onnx/TensorSet.h b/compiler/nnkit-onnxrt/support/include/nnkit/support/onnx/TensorSet.h
index b38fc9bb0..26753fed7 100644
--- a/compiler/nnkit-onnxrt/support/include/nnkit/support/onnx/TensorSet.h
+++ b/compiler/nnkit-onnxrt/support/include/nnkit/support/onnx/TensorSet.h
@@ -37,7 +37,7 @@ class TensorSet final
{
public:
TensorSet(Allocator *allocator, size_t nums)
- : _allocator(allocator), _names(nums), _types(nums), _dims(nums), _tensors(nums, nullptr)
+ : _allocator(allocator), _names(nums), _types(nums), _dims(nums), _tensors(nums, nullptr)
{
// DO NOTHING
}
@@ -60,7 +60,7 @@ public:
Status status;
status =
- OrtCreateTensorAsOrtValue(_allocator, dims.data(), dims.size(), type, &_tensors[index]);
+ OrtCreateTensorAsOrtValue(_allocator, dims.data(), dims.size(), type, &_tensors[index]);
status.throwOnError();
assert(OrtIsTensor(_tensors[index]));
diff --git a/compiler/nnkit-onnxrt/support/src/Runner.cpp b/compiler/nnkit-onnxrt/support/src/Runner.cpp
index bc6a81a5c..8159ed7c2 100644
--- a/compiler/nnkit-onnxrt/support/src/Runner.cpp
+++ b/compiler/nnkit-onnxrt/support/src/Runner.cpp
@@ -17,7 +17,7 @@
#include "nnkit/support/onnx/Runner.h"
#include "nnkit/support/onnx/Status.h"
-#include <stdex/Memory.h>
+#include <memory>
#include <cassert>
namespace nnkit
@@ -27,7 +27,7 @@ namespace support
namespace onnx
{
-Runner::Runner(const std::string &path) : _allocator(stdex::make_unique<Allocator>())
+Runner::Runner(const std::string &path) : _allocator(std::make_unique<Allocator>())
{
Status status;
@@ -61,7 +61,7 @@ void Runner::prepareInputs(void)
status = OrtSessionGetInputCount(_session, &num_input_nodes);
status.throwOnError();
- _inputs = stdex::make_unique<TensorSet>(_allocator.get(), num_input_nodes);
+ _inputs = std::make_unique<TensorSet>(_allocator.get(), num_input_nodes);
for (size_t i = 0; i < num_input_nodes; ++i)
{
@@ -113,7 +113,7 @@ void Runner::prepareOutputs(void)
status = OrtSessionGetOutputCount(_session, &num_output_nodes);
status.throwOnError();
- _outputs = stdex::make_unique<TensorSet>(_allocator.get(), num_output_nodes);
+ _outputs = std::make_unique<TensorSet>(_allocator.get(), num_output_nodes);
for (size_t i = 0; i < num_output_nodes; ++i)
{
diff --git a/compiler/nnkit-tf/backend/Backend.cpp b/compiler/nnkit-tf/backend/Backend.cpp
index ee0476469..99c857e46 100644
--- a/compiler/nnkit-tf/backend/Backend.cpp
+++ b/compiler/nnkit-tf/backend/Backend.cpp
@@ -17,13 +17,13 @@
#include "nnkit/support/tf/Backend.h"
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+#include <memory>
#include <cassert>
extern "C" std::unique_ptr<nnkit::Backend> make_backend(const nnkit::CmdlineArguments &args)
{
- using stdex::make_unique;
+ using std::make_unique;
assert(args.size() == 2); // args.at[0] : test.pb path, args.at[1]: test.info path
diff --git a/compiler/nnkit-tf/backend/CMakeLists.txt b/compiler/nnkit-tf/backend/CMakeLists.txt
index dd2e469e8..d0078453e 100644
--- a/compiler/nnkit-tf/backend/CMakeLists.txt
+++ b/compiler/nnkit-tf/backend/CMakeLists.txt
@@ -1,3 +1,2 @@
add_library(nnkit_tf_backend SHARED Backend.cpp)
target_link_libraries(nnkit_tf_backend nnkit_support_tf)
-target_link_libraries(nnkit_tf_backend stdex)
diff --git a/compiler/nnkit-tf/requires.cmake b/compiler/nnkit-tf/requires.cmake
index 4b9fd68b2..a757bdda4 100644
--- a/compiler/nnkit-tf/requires.cmake
+++ b/compiler/nnkit-tf/requires.cmake
@@ -1,3 +1,2 @@
-require("stdex")
require("tfinfo")
require("nnkit-intf")
diff --git a/compiler/nnkit-tf/support/CMakeLists.txt b/compiler/nnkit-tf/support/CMakeLists.txt
index 0f5c0a6dd..d064131ea 100644
--- a/compiler/nnkit-tf/support/CMakeLists.txt
+++ b/compiler/nnkit-tf/support/CMakeLists.txt
@@ -3,7 +3,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
add_library(nnkit_support_tf-1.13 STATIC ${SOURCES})
set_target_properties(nnkit_support_tf-1.13 PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(nnkit_support_tf-1.13 PUBLIC include)
-target_link_libraries(nnkit_support_tf-1.13 nnkit_intf_backend stdex nnkit_support_tftestinfo)
+target_link_libraries(nnkit_support_tf-1.13 nnkit_intf_backend nnkit_support_tftestinfo)
target_link_libraries(nnkit_support_tf-1.13 tensorflow-1.13)
add_library(nnkit_support_tf ALIAS nnkit_support_tf-1.13)
diff --git a/compiler/nnkit-tf/support/include/nnkit/support/tf/TensorContext.h b/compiler/nnkit-tf/support/include/nnkit/support/tf/TensorContext.h
index f1ecd6c9c..fec614733 100644
--- a/compiler/nnkit-tf/support/include/nnkit/support/tf/TensorContext.h
+++ b/compiler/nnkit-tf/support/include/nnkit/support/tf/TensorContext.h
@@ -36,7 +36,7 @@ class TensorContext final : public nnkit::TensorContext
{
public:
TensorContext(const std::vector<std::unique_ptr<ParsedTensor>> &tensors, TensorDataMap &data_map)
- : _tensors(tensors), _data_map(data_map)
+ : _tensors(tensors), _data_map(data_map)
{
// empty
}
diff --git a/compiler/nnkit-tf/support/include/nnkit/support/tf/TensorDataMap.h b/compiler/nnkit-tf/support/include/nnkit/support/tf/TensorDataMap.h
index daa1a95b3..5b12aa9a7 100644
--- a/compiler/nnkit-tf/support/include/nnkit/support/tf/TensorDataMap.h
+++ b/compiler/nnkit-tf/support/include/nnkit/support/tf/TensorDataMap.h
@@ -41,7 +41,9 @@ using nnkit::support::tftestinfo::ParsedTensor;
class TensorDataMap
{
public:
- TensorDataMap() { /* empty */}
+ TensorDataMap()
+ { /* empty */
+ }
uint8_t *allocate(const ParsedTensor *parsed_tensor)
{
diff --git a/compiler/nnkit-tf/support/src/Backend.cpp b/compiler/nnkit-tf/support/src/Backend.cpp
index f28e05f74..54bc4984d 100644
--- a/compiler/nnkit-tf/support/src/Backend.cpp
+++ b/compiler/nnkit-tf/support/src/Backend.cpp
@@ -50,7 +50,7 @@ Backend::Backend(const char *pb_path, const char *info_path) : _tf_runner(pb_pat
angkor::TensorShape shape;
if (!_tf_runner.getTensorShapeFromGraphDef(parsed_tensor, shape))
throw oops::UserExn(
- "Info you provided may be wrong or not enough. Please check the info file.");
+ "Info you provided may be wrong or not enough. Please check the info file.");
parsed_tensor->mutable_shape().resize(shape.rank());
for (int r = 0; r < shape.rank(); r++)
diff --git a/compiler/nnkit-tf/support/src/Runner.cpp b/compiler/nnkit-tf/support/src/Runner.cpp
index 0d36ee2f4..d2c37cd29 100644
--- a/compiler/nnkit-tf/support/src/Runner.cpp
+++ b/compiler/nnkit-tf/support/src/Runner.cpp
@@ -263,8 +263,8 @@ void Runner::prepareInputs(const std::vector<std::unique_ptr<ParsedTensor>> &inp
throw std::runtime_error("Not supported tensor type");
TF_Tensor *input_tensor =
- create_tensor(TF_FLOAT, shape.data(), shape.size(), data_map.data(tensor.get()),
- num_elements(tensor->shape()) * size);
+ create_tensor(TF_FLOAT, shape.data(), shape.size(), data_map.data(tensor.get()),
+ num_elements(tensor->shape()) * size);
_input_ops.emplace_back(input_op);
_input_tensors.emplace_back(input_tensor);
@@ -308,7 +308,7 @@ void Runner::run()
0, // Target operations, number of targets.
nullptr, // Run metadata.
_status // Output status.
- );
+ );
if (TF_GetCode(_status) != TF_OK)
throw std::runtime_error(TF_Message(_status));
diff --git a/compiler/nnkit-tflite/backend/Backend.cpp b/compiler/nnkit-tflite/backend/Backend.cpp
index 08ba338e8..b84c5076e 100644
--- a/compiler/nnkit-tflite/backend/Backend.cpp
+++ b/compiler/nnkit-tflite/backend/Backend.cpp
@@ -51,12 +51,13 @@ private:
std::unique_ptr<::tflite::FlatBufferModel> _model;
std::unique_ptr<::tflite::Interpreter> _interp;
};
-}
+} // namespace
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+
+#include <memory>
extern "C" std::unique_ptr<nnkit::Backend> make_backend(const nnkit::CmdlineArguments &args)
{
- return stdex::make_unique<GenericBackend>(args.at(0));
+ return std::make_unique<GenericBackend>(args.at(0));
}
diff --git a/compiler/nnkit-tflite/backend/CMakeLists.txt b/compiler/nnkit-tflite/backend/CMakeLists.txt
index 3f4a8ca53..31606b15e 100644
--- a/compiler/nnkit-tflite/backend/CMakeLists.txt
+++ b/compiler/nnkit-tflite/backend/CMakeLists.txt
@@ -4,4 +4,3 @@ endif(NOT TARGET nnkit_support_tflite)
add_library(nnkit_tflite_backend SHARED Backend.cpp)
target_link_libraries(nnkit_tflite_backend nnkit_support_tflite)
-target_link_libraries(nnkit_tflite_backend stdex)
diff --git a/compiler/nnkit-tflite/requires.cmake b/compiler/nnkit-tflite/requires.cmake
index d370fc17c..be53ae74f 100644
--- a/compiler/nnkit-tflite/requires.cmake
+++ b/compiler/nnkit-tflite/requires.cmake
@@ -1,2 +1 @@
-require("stdex")
require("nnkit-intf")
diff --git a/compiler/nnkit/actions/HDF5/CMakeLists.txt b/compiler/nnkit/actions/HDF5/CMakeLists.txt
index 63d3320c5..0b1e2e516 100644
--- a/compiler/nnkit/actions/HDF5/CMakeLists.txt
+++ b/compiler/nnkit/actions/HDF5/CMakeLists.txt
@@ -12,10 +12,8 @@ add_library(nnkit_HDF5_export_action SHARED Export.cpp)
target_include_directories(nnkit_HDF5_export_action PRIVATE ${HDF5_INCLUDE_DIRS})
target_link_libraries(nnkit_HDF5_export_action nnkit_intf_action)
target_link_libraries(nnkit_HDF5_export_action nnkit_HDF5_common)
-target_link_libraries(nnkit_HDF5_export_action stdex)
add_library(nnkit_HDF5_import_action SHARED Import.cpp)
target_include_directories(nnkit_HDF5_import_action PRIVATE ${HDF5_INCLUDE_DIRS})
target_link_libraries(nnkit_HDF5_import_action nnkit_intf_action)
target_link_libraries(nnkit_HDF5_import_action nnkit_HDF5_common)
-target_link_libraries(nnkit_HDF5_import_action stdex)
diff --git a/compiler/nnkit/actions/HDF5/Export.cpp b/compiler/nnkit/actions/HDF5/Export.cpp
index 389f5c050..f21a7ff4e 100644
--- a/compiler/nnkit/actions/HDF5/Export.cpp
+++ b/compiler/nnkit/actions/HDF5/Export.cpp
@@ -58,7 +58,7 @@ public:
H5::DataSpace dataspace(rank, dims);
auto dataset =
- _value_grp.createDataSet(value_filename(n), H5::PredType::IEEE_F32BE, dataspace);
+ _value_grp.createDataSet(value_filename(n), H5::PredType::IEEE_F32BE, dataspace);
float *data = new float[nncc::core::ADT::tensor::num_elements(shape)];
@@ -84,7 +84,7 @@ public:
H5::StrType name_datatype(H5::PredType::C_S1, name.size());
auto name_attr =
- _name_grp.createAttribute(value_filename(n), name_datatype, name_dataspace);
+ _name_grp.createAttribute(value_filename(n), name_datatype, name_dataspace);
name_attr.write(name_datatype, name);
}
@@ -101,9 +101,10 @@ private:
};
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+
+#include <memory>
extern "C" std::unique_ptr<nnkit::Action> make_action(const nnkit::CmdlineArguments &args)
{
- return stdex::make_unique<HD5ExportAction>(args.at(0));
+ return std::make_unique<HD5ExportAction>(args.at(0));
}
diff --git a/compiler/nnkit/actions/HDF5/Import.cpp b/compiler/nnkit/actions/HDF5/Import.cpp
index bba5ab701..069f42f56 100644
--- a/compiler/nnkit/actions/HDF5/Import.cpp
+++ b/compiler/nnkit/actions/HDF5/Import.cpp
@@ -92,9 +92,10 @@ private:
};
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+
+#include <memory>
extern "C" std::unique_ptr<nnkit::Action> make_action(const nnkit::CmdlineArguments &args)
{
- return stdex::make_unique<HD5ImportAction>(args.at(0));
+ return std::make_unique<HD5ImportAction>(args.at(0));
}
diff --git a/compiler/nnkit/actions/builtin/CMakeLists.txt b/compiler/nnkit/actions/builtin/CMakeLists.txt
index 910e12ea9..4de70dfc3 100644
--- a/compiler/nnkit/actions/builtin/CMakeLists.txt
+++ b/compiler/nnkit/actions/builtin/CMakeLists.txt
@@ -1,7 +1,5 @@
add_library(nnkit_show_action SHARED Show.cpp)
target_link_libraries(nnkit_show_action nnkit_intf_action)
-target_link_libraries(nnkit_show_action stdex)
add_library(nnkit_randomize_action SHARED Randomize.cpp)
target_link_libraries(nnkit_randomize_action nnkit_intf_action)
-target_link_libraries(nnkit_randomize_action stdex)
diff --git a/compiler/nnkit/actions/builtin/Randomize.cpp b/compiler/nnkit/actions/builtin/Randomize.cpp
index 9b023ef3b..b6e17c7c3 100644
--- a/compiler/nnkit/actions/builtin/Randomize.cpp
+++ b/compiler/nnkit/actions/builtin/Randomize.cpp
@@ -52,9 +52,10 @@ struct RandomizeAction final : public nnkit::Action
};
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+
+#include <memory>
extern "C" std::unique_ptr<nnkit::Action> make_action(const nnkit::CmdlineArguments &args)
{
- return stdex::make_unique<RandomizeAction>();
+ return std::make_unique<RandomizeAction>();
}
diff --git a/compiler/nnkit/actions/builtin/Show.cpp b/compiler/nnkit/actions/builtin/Show.cpp
index 2630177ef..0be15a8cd 100644
--- a/compiler/nnkit/actions/builtin/Show.cpp
+++ b/compiler/nnkit/actions/builtin/Show.cpp
@@ -63,9 +63,10 @@ void ShowAction::run(nnkit::TensorContext &ctx)
}
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
+
+#include <memory>
extern "C" std::unique_ptr<nnkit::Action> make_action(const nnkit::CmdlineArguments &args)
{
- return stdex::make_unique<ShowAction>();
+ return std::make_unique<ShowAction>();
}
diff --git a/compiler/nnkit/tools/benchmark/CMakeLists.txt b/compiler/nnkit/tools/benchmark/CMakeLists.txt
index c2cde00f4..7f01f8bd1 100644
--- a/compiler/nnkit/tools/benchmark/CMakeLists.txt
+++ b/compiler/nnkit/tools/benchmark/CMakeLists.txt
@@ -11,4 +11,3 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
add_executable(nnkit-benchmark ${SOURCES})
target_link_libraries(nnkit-benchmark nnkit_support_cmdline)
target_link_libraries(nnkit-benchmark nnkit_support_backend)
-target_link_libraries(nnkit-benchmark stdex)
diff --git a/compiler/nnkit/tools/benchmark/src/Benchmark.cpp b/compiler/nnkit/tools/benchmark/src/Benchmark.cpp
index 6c3ebc90b..632c989bd 100644
--- a/compiler/nnkit/tools/benchmark/src/Benchmark.cpp
+++ b/compiler/nnkit/tools/benchmark/src/Benchmark.cpp
@@ -18,8 +18,7 @@
#include <nnkit/VectorArguments.h>
#include <nnkit/BackendPlugin.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <map>
#include <string>
@@ -28,7 +27,7 @@
#include <iostream>
#include <iomanip>
-using stdex::make_unique;
+using std::make_unique;
using std::chrono::milliseconds;
using std::chrono::microseconds;
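
With stdex gone, Benchmark.cpp takes its aliases straight from std::chrono. For reference, the measurement idiom such a benchmark typically builds on looks like this (an illustrative sketch, not nnkit's actual timing code):

#include <chrono>
#include <iostream>

int main()
{
  using std::chrono::duration_cast;
  using std::chrono::milliseconds;
  using std::chrono::steady_clock;

  auto begin = steady_clock::now();
  // ... run the workload under measurement here ...
  auto end = steady_clock::now();

  std::cout << duration_cast<milliseconds>(end - begin).count() << " ms" << std::endl;
  return 0;
}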
diff --git a/compiler/nnkit/tools/run/CMakeLists.txt b/compiler/nnkit/tools/run/CMakeLists.txt
index 5f42ed941..d1b716090 100644
--- a/compiler/nnkit/tools/run/CMakeLists.txt
+++ b/compiler/nnkit/tools/run/CMakeLists.txt
@@ -19,4 +19,3 @@ target_link_libraries(nnkit-run nnkit_intf_action)
target_link_libraries(nnkit-run nnkit_intf_backend)
target_link_libraries(nnkit-run nnkit_support_cmdline)
target_link_libraries(nnkit-run nnkit_support_backend)
-target_link_libraries(nnkit-run stdex)
diff --git a/compiler/nnkit/tools/run/nnkit-run.cpp b/compiler/nnkit/tools/run/nnkit-run.cpp
index e60e5797a..cc5a337bd 100644
--- a/compiler/nnkit/tools/run/nnkit-run.cpp
+++ b/compiler/nnkit/tools/run/nnkit-run.cpp
@@ -35,7 +35,7 @@ public:
private:
nnkit::VectorArguments _args;
};
-}
+} // namespace
namespace
{
@@ -59,7 +59,7 @@ private:
std::string _path;
std::unique_ptr<nnkit::BackendPlugin> _plugin;
};
-}
+} // namespace
// TODO Extract Action-related helpers
#include <nnkit/Action.h>
@@ -120,7 +120,7 @@ private:
void *_handle;
Entry _entry;
};
-}
+} // namespace
namespace
{
@@ -139,10 +139,9 @@ public:
private:
ActionBinder _binder;
};
-}
-
-#include <stdex/Memory.h>
+} // namespace
+#include <memory>
#include <map>
#include <iostream>
@@ -170,7 +169,7 @@ int main(int argc, char **argv)
std::map<std::string, std::function<void(const std::string &arg)>> argparse;
argparse["--backend"] = [&sections](const std::string &tag) {
- sections.backend = stdex::make_unique<BackendSection>(tag);
+ sections.backend = std::make_unique<BackendSection>(tag);
};
argparse["--backend-arg"] = [&sections](const std::string &arg) {
diff --git a/compiler/nnop/include/nnop/PadInfo.h b/compiler/nnop/include/nnop/PadInfo.h
index 228f08514..d17a33abf 100644
--- a/compiler/nnop/include/nnop/PadInfo.h
+++ b/compiler/nnop/include/nnop/PadInfo.h
@@ -26,7 +26,7 @@ class PadInfo
{
public:
PadInfo(uint32_t top, uint32_t bottom, uint32_t left, uint32_t right)
- : _top{top}, _bottom{bottom}, _left{left}, _right{right}
+ : _top{top}, _bottom{bottom}, _left{left}, _right{right}
{
// DO NOTHING
}
diff --git a/compiler/nnop/include/nnop/StrideInfo.h b/compiler/nnop/include/nnop/StrideInfo.h
index e47489fa7..653603d6c 100644
--- a/compiler/nnop/include/nnop/StrideInfo.h
+++ b/compiler/nnop/include/nnop/StrideInfo.h
@@ -39,6 +39,6 @@ private:
uint32_t _vertical;
};
-} // namespace nncc
+} // namespace nnop
#endif // __NNOP_STRIDE_INFO_H__
diff --git a/compiler/nnsuite/conv/model/src/RandomModel.cpp b/compiler/nnsuite/conv/model/src/RandomModel.cpp
index 7b15d4c96..6d4a6147d 100644
--- a/compiler/nnsuite/conv/model/src/RandomModel.cpp
+++ b/compiler/nnsuite/conv/model/src/RandomModel.cpp
@@ -28,8 +28,8 @@ namespace conv
{
RandomModel::RandomModel(int32_t seed)
- : _ifm_shape{1, 8, 8}, _ifm_name{"ifm"}, _ofm_name{"ofm"}, _ofm_shape{2, 6, 6},
- _ker_buffer{kernel::Shape{2, 1, 3, 3}, kernel::NCHWLayout{}}
+ : _ifm_shape{1, 8, 8}, _ifm_name{"ifm"}, _ofm_name{"ofm"}, _ofm_shape{2, 6, 6},
+ _ker_buffer{kernel::Shape{2, 1, 3, 3}, kernel::NCHWLayout{}}
{
std::default_random_engine gen{static_cast<uint32_t>(seed)};
std::normal_distribution<float> dist{0.0f, 1.0f};
diff --git a/compiler/nnsuite/conv/nnkit-caffe/CMakeLists.txt b/compiler/nnsuite/conv/nnkit-caffe/CMakeLists.txt
index 6445cc6fb..7e860f874 100644
--- a/compiler/nnsuite/conv/nnkit-caffe/CMakeLists.txt
+++ b/compiler/nnsuite/conv/nnkit-caffe/CMakeLists.txt
@@ -9,7 +9,6 @@ list(REMOVE_ITEM SOURCES ${TESTS})
add_library(nnsuite_conv_caffe SHARED ${SOURCES})
target_link_libraries(nnsuite_conv_caffe nnsuite_conv)
target_link_libraries(nnsuite_conv_caffe nnkit_support_caffe)
-target_link_libraries(nnsuite_conv_caffe stdex)
nnas_find_package(GTest QUIET)
diff --git a/compiler/nnsuite/conv/nnkit-caffe/ConvBackend.cpp b/compiler/nnsuite/conv/nnkit-caffe/ConvBackend.cpp
index 31d2b33fc..664ca94f3 100644
--- a/compiler/nnsuite/conv/nnkit-caffe/ConvBackend.cpp
+++ b/compiler/nnsuite/conv/nnkit-caffe/ConvBackend.cpp
@@ -23,9 +23,9 @@
#include <nncc/core/ADT/kernel/Overlay.h>
#include <nncc/core/ADT/kernel/NCHWLayout.h>
-#include <stdex/Memory.h>
+#include <memory>
-using stdex::make_unique;
+using std::make_unique;
std::unique_ptr<nnkit::Backend> ConvBackend::create(const nnsuite::conv::Model &model)
{
diff --git a/compiler/nnsuite/conv/nnkit-caffe/ConvBackend.test.cpp b/compiler/nnsuite/conv/nnkit-caffe/ConvBackend.test.cpp
index 776bf186b..20c42385a 100644
--- a/compiler/nnsuite/conv/nnkit-caffe/ConvBackend.test.cpp
+++ b/compiler/nnsuite/conv/nnkit-caffe/ConvBackend.test.cpp
@@ -35,8 +35,8 @@ public:
TestModel(const std::string &ifm_name, const feature::Shape &ifm_shape,
const std::string &ofm_name, const feature::Shape &ofm_shape,
const kernel::Shape &ker_shape, const kernel::Layout &ker_layout, float *ker_data)
- : _ifm_name(ifm_name), _ifm_shape(ifm_shape), _ofm_name(ofm_name), _ofm_shape(ofm_shape),
- _ker{ker_shape, ker_layout, ker_data}
+ : _ifm_name(ifm_name), _ifm_shape(ifm_shape), _ofm_name(ofm_name),
+ _ofm_shape(ofm_shape), _ker{ker_shape, ker_layout, ker_data}
{
// DO NOTHING
}
diff --git a/compiler/nnsuite/conv/nnkit-tflite/CMakeLists.txt b/compiler/nnsuite/conv/nnkit-tflite/CMakeLists.txt
index c1cf88812..8e870490e 100644
--- a/compiler/nnsuite/conv/nnkit-tflite/CMakeLists.txt
+++ b/compiler/nnsuite/conv/nnkit-tflite/CMakeLists.txt
@@ -9,7 +9,6 @@ list(REMOVE_ITEM SOURCES ${TESTS})
add_library(nnsuite_conv_tflite SHARED ${SOURCES})
target_link_libraries(nnsuite_conv_tflite nnsuite_conv)
target_link_libraries(nnsuite_conv_tflite nnkit_support_tflite-1.7)
-target_link_libraries(nnsuite_conv_tflite stdex)
nnas_find_package(GTest QUIET)
diff --git a/compiler/nnsuite/conv/nnkit-tflite/ConvBackend.cpp b/compiler/nnsuite/conv/nnkit-tflite/ConvBackend.cpp
index 8ec9ce491..ea189ff6e 100644
--- a/compiler/nnsuite/conv/nnkit-tflite/ConvBackend.cpp
+++ b/compiler/nnsuite/conv/nnkit-tflite/ConvBackend.cpp
@@ -74,7 +74,7 @@ static inline std::vector<int> as_dims(const nncc::core::ADT::kernel::Shape &sha
}
ConvBackend::ConvBackend(const nnsuite::conv::Model &model)
- : _ifm_name{model.ifm_name()}, _ofm_name{model.ofm_name()}
+ : _ifm_name{model.ifm_name()}, _ofm_name{model.ofm_name()}
{
using nncc::core::ADT::kernel::Overlay;
using nncc::core::ADT::kernel::NHWCLayout;
@@ -123,12 +123,12 @@ ConvBackend::ConvBackend(const nnsuite::conv::Model &model)
as_dims(model.ifm_shape()), quantization);
_interp.SetTensorParametersReadOnly(
- 2, kTfLiteFloat32 /* type */, "kernel" /* name */, as_dims(model.ker_shape()), quantization,
- reinterpret_cast<const char *>(_kernel.data()), _kernel.size() * sizeof(float));
+ 2, kTfLiteFloat32 /* type */, "kernel" /* name */, as_dims(model.ker_shape()), quantization,
+ reinterpret_cast<const char *>(_kernel.data()), _kernel.size() * sizeof(float));
_interp.SetTensorParametersReadOnly(
- 3, kTfLiteFloat32 /* type */, "bias" /* name */, {static_cast<int>(_bias.size())},
- quantization, reinterpret_cast<const char *>(_bias.data()), _bias.size() * sizeof(float));
+ 3, kTfLiteFloat32 /* type */, "bias" /* name */, {static_cast<int>(_bias.size())}, quantization,
+ reinterpret_cast<const char *>(_bias.data()), _bias.size() * sizeof(float));
auto param = typed_malloc<TfLiteConvParams>();
diff --git a/compiler/nnsuite/conv/nnkit-tflite/ConvBackend.test.cpp b/compiler/nnsuite/conv/nnkit-tflite/ConvBackend.test.cpp
index db82f0cf9..98ac78fc2 100644
--- a/compiler/nnsuite/conv/nnkit-tflite/ConvBackend.test.cpp
+++ b/compiler/nnsuite/conv/nnkit-tflite/ConvBackend.test.cpp
@@ -38,8 +38,8 @@ public:
TestModel(const std::string &ifm_name, const feature::Shape &ifm_shape,
const std::string &ofm_name, const feature::Shape &ofm_shape,
const kernel::Shape &ker_shape, const kernel::Layout &ker_layout, float *ker_data)
- : _ifm_name(ifm_name), _ifm_shape(ifm_shape), _ofm_name(ofm_name), _ofm_shape(ofm_shape),
- _ker{ker_shape, ker_layout, ker_data}
+ : _ifm_name(ifm_name), _ifm_shape(ifm_shape), _ofm_name(ofm_name),
+ _ofm_shape(ofm_shape), _ker{ker_shape, ker_layout, ker_data}
{
// DO NOTHING
}
diff --git a/compiler/nnsuite/conv/nnkit-tflite/Entry.cpp b/compiler/nnsuite/conv/nnkit-tflite/Entry.cpp
index 2c84f72e6..c1e013767 100644
--- a/compiler/nnsuite/conv/nnkit-tflite/Entry.cpp
+++ b/compiler/nnsuite/conv/nnkit-tflite/Entry.cpp
@@ -21,8 +21,7 @@
#include <nnkit/Backend.h>
#include <nnkit/CmdlineArguments.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <chrono>
#include <iostream>
@@ -40,5 +39,5 @@ extern "C" std::unique_ptr<nnkit::Backend> make_backend(const nnkit::CmdlineArgu
const nnsuite::conv::RandomModel model{seed};
- return stdex::make_unique<ConvBackend>(model);
+ return std::make_unique<ConvBackend>(model);
}
diff --git a/compiler/one-cmds/CMakeLists.txt b/compiler/one-cmds/CMakeLists.txt
index 1472295c3..7d7a28fe1 100644
--- a/compiler/one-cmds/CMakeLists.txt
+++ b/compiler/one-cmds/CMakeLists.txt
@@ -4,6 +4,7 @@ set(ONE_COMMAND_FILES
one-import-bcq
one-import-tf
one-import-tflite
+ one-import-onnx
one-optimize
one-quantize
one-pack
@@ -78,4 +79,6 @@ if(NOT ENABLE_TEST)
return()
endif(NOT ENABLE_TEST)
+add_subdirectory(dummy-driver)
add_subdirectory(tests)
+add_subdirectory(validate-onnx2circle)
diff --git a/compiler/one-cmds/dummy-driver/CMakeLists.txt b/compiler/one-cmds/dummy-driver/CMakeLists.txt
new file mode 100644
index 000000000..8e122ccf3
--- /dev/null
+++ b/compiler/one-cmds/dummy-driver/CMakeLists.txt
@@ -0,0 +1,21 @@
+# dummy driver for interface test
+set(DUMMY_DRIVER_SRC src/dummy-compile.cpp)
+set(HELP_DRIVER_SRC src/help-compile.cpp)
+
+add_executable(dummy-compile ${DUMMY_DRIVER_SRC})
+add_executable(help-compile ${HELP_DRIVER_SRC})
+
+set(DUMMY_DRIVER "${CMAKE_CURRENT_BINARY_DIR}/dummy-compile")
+set(HELP_DRIVER "${CMAKE_CURRENT_BINARY_DIR}/help-compile")
+
+install(FILES ${DUMMY_DRIVER}
+ PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+ GROUP_READ GROUP_EXECUTE
+ WORLD_READ WORLD_EXECUTE
+ DESTINATION test)
+
+install(FILES ${HELP_DRIVER}
+ PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+ GROUP_READ GROUP_EXECUTE
+ WORLD_READ WORLD_EXECUTE
+ DESTINATION test)
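
One design note on this new CMake file: install(FILES) would normally install with non-executable default permissions, so the PERMISSIONS clauses above explicitly restore the executable bits the test drivers need under the install's test directory.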
diff --git a/compiler/one-cmds/dummy-driver/src/dummy-compile.cpp b/compiler/one-cmds/dummy-driver/src/dummy-compile.cpp
new file mode 100644
index 000000000..2ad09a3dd
--- /dev/null
+++ b/compiler/one-cmds/dummy-driver/src/dummy-compile.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * dummy-compile tests only its interface, not its actual functionality.
+ *
+ * ./dummy-compile -o ${OUTPUT_NAME} ${INPUT_NAME}
+ *
+ * NOTE argv[3] (INPUT_NAME) is not used here.
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cstdlib> // for EXIT_SUCCESS / EXIT_FAILURE
+
+int main(int argc, char **argv)
+{
+ if (argc != 4)
+ return EXIT_FAILURE;
+
+ std::string opt_o{"-o"};
+ std::string argv_1{argv[1]};
+
+ if (opt_o != argv_1)
+ return EXIT_FAILURE;
+
+ std::string output_name{argv[2]};
+ std::ofstream outfile(output_name);
+
+ outfile << "dummy-compile dummy output!!" << std::endl;
+
+ outfile.close();
+
+ return EXIT_SUCCESS;
+}
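
Given the checks above, an interface test is expected to run something like 'dummy-compile -o out.circle in.circle' (file names illustrative) and then verify only that the output file appears with the fixed marker line; the input argument itself is never read.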
diff --git a/compiler/one-cmds/dummy-driver/src/help-compile.cpp b/compiler/one-cmds/dummy-driver/src/help-compile.cpp
new file mode 100644
index 000000000..9be9018f3
--- /dev/null
+++ b/compiler/one-cmds/dummy-driver/src/help-compile.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * help-compile prints a dummy help message.
+ *
+ * $ ./help-compile -h
+ * HELP MESSAGE!!
+ */
+
+#include <iostream>
+#include <fstream>
+#include <string>
+#include <cstdlib> // for EXIT_SUCCESS / EXIT_FAILURE
+
+int main(int argc, char **argv)
+{
+ if (argc != 2)
+ return EXIT_FAILURE;
+
+ std::string opt_h{"-h"};
+ std::string argv_1{argv[1]};
+
+ if (opt_h != argv_1)
+ return EXIT_FAILURE;
+
+ std::cout << "HELP MESSAGE!!" << std::endl;
+
+ return EXIT_SUCCESS;
+}
diff --git a/compiler/one-cmds/how-to-prepare-virtualenv.txt b/compiler/one-cmds/how-to-prepare-virtualenv.txt
index f3dcf704b..6d846c081 100644
--- a/compiler/one-cmds/how-to-prepare-virtualenv.txt
+++ b/compiler/one-cmds/how-to-prepare-virtualenv.txt
@@ -9,6 +9,9 @@ This document explains about 'one-prepare-venv' command.
version 2.3.0, the recommended 2.x version as of now, so that 'one-import-tf'
command can execute properly.
+'one-prepare-venv' will also prepare onnx and onnx-tensorflow version 1.7.0 so
+that 'one-import-onnx' command can execute properly.
+
Prerequisite
------------
diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index d4e3269e8..d034fa9a2 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -149,8 +149,16 @@ one-optimize
one-optimize provides network or operator transformation shown below.
Current transformation options are
+- disable_validation : This will turn off operator validations.
+- fold_add_v2 : This removes AddV2 operation which can be folded
+- fold_cast : This removes Cast operation which can be folded
- fold_dequantize : This removes Dequantize operation which can be folded
+- fold_sparse_to_dense : This removes SparseToDense operation which can be folded
+- forward_reshape_to_unaryop: This will move Reshape after UnaryOp under certain conditions
- fuse_add_with_tconv: This fuses Add operator with the preceding TConv operator if possible
+- fuse_batchnorm_with_conv : This fuses BatchNorm operator to convolution operator
+- fuse_batchnorm_with_dwconv : This fuses BatchNorm operator to depthwise convolution operator
+- fuse_batchnorm_with_tconv : This fuses BatchNorm operator to transpose convolution operator
- fuse_bcq: This enables Binary-Coding-based Quantized DNNs
- read https://arxiv.org/abs/2005.09904 for detailed information
- fuse_instnorm: This will convert instance normalization related operators to
@@ -161,12 +169,30 @@ Current transformation options are
- make_batchnorm_gamma_positive: This makes negative gamma of batch normalization into a small positive value (1e-10).
Note that this pass can change the execution result of the model.
So, use it only when the impact is known to be acceptable.
+- mute_warnings : This will turn off warning messages.
+- generate_profile_data : This will turn on profiling data generation.
+- remove_redundant_reshape : This fuses or removes redundant reshape operators.
+- remove_redundant_transpose : This fuses or removes redundant transpose operators.
+- remove_unnecessary_reshape : This removes unnecessary reshape operators.
+- remove_unnecessary_slice : This removes unnecessary slice operators.
+- remove_unnecessary_strided_slice : This removes unnecessary strided slice operators.
+- remove_unnecessary_split : This removes unnecessary split operators.
- replace_cw_mul_add_with_depthwise_conv: This will replace channel-wise Mul/Add with DepthwiseConv2D.
- resolve_customop_add: This will convert Custom(Add) to normal Add operator
- resolve_customop_batchmatmul: This will convert Custom(BatchMatMul) to
normal BatchMatMul operator
- resolve_customop_matmul: This will convert Custom(MatMul) to normal MatMul
operator
+- shuffle_weight_to_16x1float32 : This will convert weight format of FullyConnected to SHUFFLED16x1FLOAT32.
+ Note that it only converts weights whose number of rows is a multiple of 16.
+- substitute_pack_to_reshape : This will convert single input Pack to Reshape.
+- substitute_squeeze_to_reshape : This will convert Squeeze to Reshape under certain conditions.
+- substitute_transpose_to_reshape : This will convert Transpose to Reshape under certain conditions.
+- transform_min_max_to_relu6: This will transform Minimum-Maximum pattern to Relu6 operator.
+
+There are also grouped options that enable multiple optimizations at once for convenience.
+- O1: fuse_bcq, fuse_instnorm, resolve_customop_add, resolve_customop_batchmatmul,
+ resolve_customop_matmul, remove_redundant_transpose, substitute_pack_to_reshape
+ e.g. one-optimize --O1 --input_path input.circle --output_path output.circle
one-quantize
diff --git a/compiler/one-cmds/one-build b/compiler/one-cmds/one-build
index 82b193f9e..34ce42fca 100644
--- a/compiler/one-cmds/one-build
+++ b/compiler/one-cmds/one-build
@@ -3,6 +3,7 @@
''''export PY_PATH=${SCRIPT_PATH}/venv/bin/python # '''
''''test -f ${PY_PATH} && exec ${PY_PATH} "$0" "$@" # '''
''''echo "Error: Virtual environment not found. Please run 'one-prepare-venv' command." # '''
+''''exit 255 # '''
# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
#
@@ -57,7 +58,9 @@ def _get_driver_name(driver_name):
'one-import-bcq': 'one-import-bcq',
'one-import-tf': 'one-import-tf',
'one-import-tflite': 'one-import-tflite',
+ 'one-import-onnx': 'one-import-onnx',
'one-optimize': 'one-optimize',
+ 'one-quantize': 'one-quantize',
'one-pack': 'one-pack',
'one-codegen': 'one-codegen'
}[driver_name]
@@ -78,7 +81,7 @@ def _is_available_driver(config, driver_name):
def _verify_cfg(driver_list, config):
if not config.has_section('one-build'):
- raise ImportError('\'one-build\' section is required in configuraion file')
+ raise ImportError('[one-build] section is required in configuration file')
import_driver_cnt = 0
if _is_available_driver(config, 'one-import-tf'):
@@ -87,6 +90,8 @@ def _verify_cfg(driver_list, config):
import_driver_cnt += 1
if _is_available_driver(config, 'one-import-bcq'):
import_driver_cnt += 1
+ if _is_available_driver(config, 'one-import-onnx'):
+ import_driver_cnt += 1
if import_driver_cnt > 1:
raise AssertionError('Only one import-* driver can be executed')
@@ -106,8 +111,8 @@ def main():
# verify configuration file
drivers = [
- 'one-import-tf', 'one-import-tflite', 'one-import-bcq', 'one-optimize',
- 'one-quantize', 'one-pack', 'one-codegen'
+ 'one-import-tf', 'one-import-tflite', 'one-import-bcq', 'one-import-onnx',
+ 'one-optimize', 'one-quantize', 'one-pack', 'one-codegen'
]
_verify_cfg(drivers, config)
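The importer-count check in `_verify_cfg` above can be exercised in isolation. Below is a toy stand-in, not part of the patch, under the assumption that `_is_available_driver` simply reads the boolean for each driver from the `[one-build]` section:

```python
import configparser

# a config that (incorrectly) enables two one-import-* drivers at once
cfg_text = """
[one-build]
one-import-tf=True
one-import-onnx=True
"""
config = configparser.ConfigParser()
config.read_string(cfg_text)

importers = ['one-import-tf', 'one-import-tflite', 'one-import-bcq', 'one-import-onnx']
enabled = [d for d in importers
           if config['one-build'].getboolean(d, fallback=False)]
if len(enabled) > 1:
    # mirrors one-build's error for conflicting importers
    raise AssertionError('Only one import-* driver can be executed')
```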
diff --git a/compiler/one-cmds/one-build.template.cfg b/compiler/one-cmds/one-build.template.cfg
index ab6ac81d7..52d860813 100644
--- a/compiler/one-cmds/one-build.template.cfg
+++ b/compiler/one-cmds/one-build.template.cfg
@@ -2,6 +2,7 @@
one-import-tf=True
one-import-tflite=False
one-import-bcq=False
+one-import-onnx=False
one-optimize=True
one-quantize=False
one-pack=True
@@ -18,6 +19,7 @@ converter_version=v1
[one-optimize]
input_path=inception_v3.circle
output_path=inception_v3.opt.circle
+generate_profile_data=False
[one-pack]
input_path=inception_v3.opt.circle
diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen
index fbe3d52d2..ebd8ad7e5 100644
--- a/compiler/one-cmds/one-codegen
+++ b/compiler/one-cmds/one-codegen
@@ -20,6 +20,8 @@
# limitations under the License.
import argparse
+import copy
+import itertools
import os
import subprocess
import sys
@@ -40,7 +42,9 @@ def _get_backends_list():
def _get_parser():
- parser = argparse.ArgumentParser(description='command line tool for code generation')
+ codegen_usage = 'one-codegen [-h] [-v] [-C CONFIG] [-b BACKEND] [--] [COMMANDS FOR BACKEND]'
+ parser = argparse.ArgumentParser(
+ description='command line tool for code generation', usage=codegen_usage)
_utils._add_default_arg(parser)
@@ -68,18 +72,35 @@ def _verify_arg(parser, args):
def _parse_arg(parser):
- args, unknown_args = parser.parse_known_args()
+ codegen_args = []
+ backend_args = []
+ unknown_args = []
+ argv = copy.deepcopy(sys.argv)
+ # delete file name
+ del argv[0]
+ # split by '--'
+ args = [list(y) for x, y in itertools.groupby(argv, lambda z: z == '--') if not x]
+ # one-codegen has two interfaces
+ # 1. one-codegen [-h] [-v] [-C CONFIG] [-b BACKEND] [COMMANDS FOR BACKEND]
+ if len(args) == 1:
+ codegen_args = args[0]
+ codegen_args, unknown_args = parser.parse_known_args(codegen_args)
+ # 2. one-codegen [-h] [-v] [-C CONFIG] [-b BACKEND] -- [COMMANDS FOR BACKEND]
+ if len(args) == 2:
+ codegen_args = args[0]
+ backend_args = args[1]
+ codegen_args = parser.parse_args(codegen_args)
# print version
- if args.version:
+ if len(args) and codegen_args.version:
_utils._print_version_and_exit(__file__)
- return args, unknown_args
+ return codegen_args, backend_args, unknown_args
def main():
# parse arguments
parser = _get_parser()
- args, unknown_args = _parse_arg(parser)
+ args, backend_args, unknown_args = _parse_arg(parser)
# parse configuration file
_utils._parse_cfg(args, 'one-codegen')
@@ -90,7 +111,7 @@ def main():
# make a command to run given backend driver
dir_path = os.path.dirname(os.path.realpath(__file__))
codegen_path = os.path.join(dir_path, getattr(args, 'backend') + '-compile')
- codegen_cmd = [codegen_path] + unknown_args
+ codegen_cmd = [codegen_path] + backend_args + unknown_args
if _utils._is_valid_attr(args, 'command'):
codegen_cmd += getattr(args, 'command').split()
@@ -100,6 +121,8 @@ def main():
bufsize=1) as p:
for line in p.stdout:
sys.stdout.buffer.write(line)
+ if p.returncode != 0:
+ sys.exit(p.returncode)
if __name__ == '__main__':
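For reference, the `'--'` split introduced in `_parse_arg` above can be reproduced standalone; a minimal sketch with a hypothetical argument vector:

```python
import itertools

# hypothetical argv (after dropping the program name); '--' separates
# one-codegen's own options from the backend's options
argv = ['-C', 'one-build.cfg', '-b', 'dummy', '--', '-o', 'sample.tvn', 'in.circle']

# groupby yields runs keyed by "is this element '--'?"; keeping only the
# non-separator runs splits argv at every '--'
args = [list(y) for x, y in itertools.groupby(argv, lambda z: z == '--') if not x]
print(args)
# [['-C', 'one-build.cfg', '-b', 'dummy'], ['-o', 'sample.tvn', 'in.circle']]
```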
diff --git a/compiler/one-cmds/one-import-onnx b/compiler/one-cmds/one-import-onnx
new file mode 100644
index 000000000..1bcf2e838
--- /dev/null
+++ b/compiler/one-cmds/one-import-onnx
@@ -0,0 +1,161 @@
+#!/usr/bin/env bash
+''''export SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # '''
+''''export PY_PATH=${SCRIPT_PATH}/venv/bin/python # '''
+''''test -f ${PY_PATH} && exec ${PY_PATH} "$0" "$@" # '''
+''''echo "Error: Virtual environment not found. Please run 'one-prepare-venv' command." # '''
+''''exit 255 # '''
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import os
+import subprocess
+import sys
+import tempfile
+import onnx
+import onnx_tf
+
+import utils as _utils
+
+
+def _get_parser():
+ parser = argparse.ArgumentParser(
+ description='command line tool to convert ONNX to circle')
+
+ _utils._add_default_arg(parser)
+
+ ## tf2tfliteV2 arguments
+ tf2tfliteV2_group = parser.add_argument_group('converter arguments')
+
+ # input and output path.
+ tf2tfliteV2_group.add_argument(
+ '-i', '--input_path', type=str, help='full filepath of the input file')
+ tf2tfliteV2_group.add_argument(
+ '-o', '--output_path', type=str, help='full filepath of the output file')
+
+ # input and output arrays.
+ tf2tfliteV2_group.add_argument(
+ '-I',
+ '--input_arrays',
+ type=str,
+ help='names of the input arrays, comma-separated')
+ tf2tfliteV2_group.add_argument(
+ '-O',
+ '--output_arrays',
+ type=str,
+ help='names of the output arrays, comma-separated')
+
+ # fixed options
+ tf2tfliteV2_group.add_argument('--model_format', default='saved_model')
+ tf2tfliteV2_group.add_argument('--converter_version', default='v2')
+
+ return parser
+
+
+def _verify_arg(parser, args):
+ """verify given arguments"""
+ # check if required arguments is given
+ missing = []
+ if not _utils._is_valid_attr(args, 'input_path'):
+ missing.append('-i/--input_path')
+ if not _utils._is_valid_attr(args, 'output_path'):
+ missing.append('-o/--output_path')
+ if len(missing):
+ parser.error('the following arguments are required: ' + ' '.join(missing))
+
+
+def _parse_arg(parser):
+ args = parser.parse_args()
+ # print version
+ if args.version:
+ _utils._print_version_and_exit(__file__)
+
+ return args
+
+
+def _convert(args):
+ # get file path to log
+ dir_path = os.path.dirname(os.path.realpath(__file__))
+ logfile_path = os.path.realpath(args.output_path) + '.log'
+
+ with open(logfile_path, 'wb') as f, tempfile.TemporaryDirectory() as tmpdir:
+ # convert onnx to tf saved model
+ onnx_model = onnx.load(getattr(args, 'input_path'))
+ tf_savedmodel = onnx_tf.backend.prepare(onnx_model)
+
+ savedmodel_name = os.path.splitext(os.path.basename(
+ args.output_path))[0] + '.savedmodel'
+ savedmodel_output_path = os.path.join(tmpdir, savedmodel_name)
+ tf_savedmodel.export_graph(savedmodel_output_path)
+
+ # make a command to convert from tf to tflite
+ tf2tfliteV2_path = os.path.join(dir_path, 'tf2tfliteV2.py')
+ tf2tfliteV2_output_name = os.path.splitext(os.path.basename(
+ args.output_path))[0] + '.tflite'
+ tf2tfliteV2_output_path = os.path.join(tmpdir, tf2tfliteV2_output_name)
+
+ tf2tfliteV2_cmd = _utils._make_tf2tfliteV2_cmd(
+ args, tf2tfliteV2_path, savedmodel_output_path, tf2tfliteV2_output_path)
+
+ f.write((' '.join(tf2tfliteV2_cmd) + '\n').encode())
+
+ # convert tf to tflite
+ with subprocess.Popen(
+ tf2tfliteV2_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+ bufsize=1) as p:
+ for line in p.stdout:
+ sys.stdout.buffer.write(line)
+ f.write(line)
+ if p.returncode != 0:
+ sys.exit(p.returncode)
+
+ # make a command to convert from tflite to circle
+ tflite2circle_path = os.path.join(dir_path, 'tflite2circle')
+ tflite2circle_cmd = _utils._make_tflite2circle_cmd(tflite2circle_path,
+ tf2tfliteV2_output_path,
+ getattr(args, 'output_path'))
+
+ f.write((' '.join(tflite2circle_cmd) + '\n').encode())
+
+ # convert tflite to circle
+ with subprocess.Popen(
+ tflite2circle_cmd,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ bufsize=1) as p:
+ for line in p.stdout:
+ sys.stdout.buffer.write(line)
+ f.write(line)
+ if p.returncode != 0:
+ sys.exit(p.returncode)
+
+
+def main():
+ # parse arguments
+ parser = _get_parser()
+ args = _parse_arg(parser)
+
+ # parse configuration file
+ _utils._parse_cfg(args, 'one-import-onnx')
+
+ # verify arguments
+ _verify_arg(parser, args)
+
+ # convert
+ _convert(args)
+
+
+if __name__ == '__main__':
+ main()
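The first hop of the conversion chain above (ONNX to TensorFlow SavedModel) can be tried on its own. A minimal sketch, assuming the packages pinned by 'one-prepare-venv' and a hypothetical model path:

```python
import onnx
import onnx_tf

# load the ONNX model and convert it to a TensorFlow representation,
# as _convert() does before handing off to tf2tfliteV2
onnx_model = onnx.load('test_onnx_model.onnx')     # hypothetical input path
tf_rep = onnx_tf.backend.prepare(onnx_model)
tf_rep.export_graph('test_onnx_model.savedmodel')  # writes a SavedModel directory
```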
diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite
index fba697f24..9b80f304b 100644
--- a/compiler/one-cmds/one-import-tflite
+++ b/compiler/one-cmds/one-import-tflite
@@ -90,6 +90,8 @@ def _convert(args):
for line in p.stdout:
sys.stdout.buffer.write(line)
f.write(line)
+ if p.returncode != 0:
+ sys.exit(p.returncode)
def main():
diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize
index f03bb8dcc..8ce79d432 100644
--- a/compiler/one-cmds/one-optimize
+++ b/compiler/one-cmds/one-optimize
@@ -34,6 +34,15 @@ def _get_parser():
_utils._add_default_arg(parser)
+ ## utility arguments
+ utility_group = parser.add_argument_group('arguments for utility')
+
+ utility_group.add_argument(
+ '-p',
+ '--generate_profile_data',
+ action='store_true',
+ help='generate profiling data')
+
## circle2circle arguments
circle2circle_group = parser.add_argument_group('arguments for optimization')
@@ -44,50 +53,9 @@ def _get_parser():
'-o', '--output_path', type=str, help='full filepath of the output file')
# optimization pass
- circle2circle_group.add_argument(
- '--all', action='store_true', help='enable all optimization pass')
- circle2circle_group.add_argument(
- '--fold_dequantize', action='store_true', help='fold Dequantize op')
- circle2circle_group.add_argument(
- '--fuse_add_with_tconv', action='store_true', help='fuse Add op to Transposed')
- circle2circle_group.add_argument(
- '--fuse_batchnorm_with_tconv',
- action='store_true',
- help='fuse BatchNorm op to Transposed Convolution op')
- circle2circle_group.add_argument(
- '--fuse_bcq', action='store_true', help='apply Binary Coded Quantization')
- circle2circle_group.add_argument(
- '--fuse_preactivation_batchnorm',
- action='store_true',
- help='fuse BatchNorm operators of pre-activations to Convolution op')
- circle2circle_group.add_argument(
- '--make_batchnorm_gamma_positive',
- action='store_true',
- help="""make negative gamma of BatchNorm to a small positive value (1e-10).
- Note that this pass can change the execution result of the model.
- So, use it only when the impact is known to be acceptable.""")
- circle2circle_group.add_argument(
- '--fuse_activation_function',
- action='store_true',
- help='fuse Activation function to a preceding operator')
- circle2circle_group.add_argument(
- '--fuse_instnorm', action='store_true', help='fuse ops to InstanceNorm operator')
- circle2circle_group.add_argument(
- '--replace_cw_mul_add_with_depthwise_conv',
- action='store_true',
- help='replace channel-wise Mul/Add with DepthwiseConv2D')
- circle2circle_group.add_argument(
- '--resolve_customop_add',
- action='store_true',
- help='convert Custom(Add) op to Add op')
- circle2circle_group.add_argument(
- '--resolve_customop_batchmatmul',
- action='store_true',
- help='convert Custom(BatchMatmul) op to BatchMatmul op')
- circle2circle_group.add_argument(
- '--resolve_customop_matmul',
- action='store_true',
- help='convert Custom(Matmul) op to Matmul op')
+ for opt in _utils._CONSTANT.OPTIMIZATION_OPTS:
+ # opt = (option_name, help_message)
+ circle2circle_group.add_argument('--' + opt[0], action='store_true', help=opt[1])
return parser
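The table-driven flag registration above replaces dozens of hand-written `add_argument` calls; a minimal sketch of the pattern with a toy two-entry table:

```python
import argparse

# (OPTION_NAME, HELP_MESSAGE) pairs, in the style of _utils._CONSTANT.OPTIMIZATION_OPTS
OPTS = (('fold_dequantize', 'fold Dequantize op'),
        ('fuse_instnorm', 'fuse ops to InstanceNorm operator'))

parser = argparse.ArgumentParser()
for name, help_msg in OPTS:
    parser.add_argument('--' + name, action='store_true', help=help_msg)

args = parser.parse_args(['--fold_dequantize'])
print(args.fold_dequantize, args.fuse_instnorm)  # True False
```

Keeping the option table in utils.py means one-optimize, one-build, and the documentation can stay in sync from a single source.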
diff --git a/compiler/one-cmds/one-prepare-venv b/compiler/one-cmds/one-prepare-venv
index e5c88411f..bb3616574 100644
--- a/compiler/one-cmds/one-prepare-venv
+++ b/compiler/one-cmds/one-prepare-venv
@@ -51,6 +51,21 @@ python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host file
python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
install Pillow==6.2.2
+# Install PyTorch and ONNX related
+python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhosted.org \
+ --trusted-host download.pytorch.org \
+ install torch==1.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+
+# NOTE The latest onnx 1.8.1 has a compatibility issue with onnx-tf 1.7.0,
+# so we MUST install onnx==1.8.0
+# Allow installing a custom onnx-tf wheel via EXT_ONNX_TF_WHL
+if [ -n "${EXT_ONNX_TF_WHL}" ]; then
+ python -m pip --default-timeout=1000 install onnx==1.8.0 ${EXT_ONNX_TF_WHL}
+else
+ python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhosted.org \
+ install onnx==1.8.0 onnx-tf==1.7.0
+fi
+
# Create python symbolic link
rm -f ${DRIVER_PATH}/python
ln -s venv/bin/python ${DRIVER_PATH}/python
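Once the venv is prepared, the pins can be sanity-checked from inside it. A small sketch, not part of the script, using only the standard library:

```python
from importlib import metadata  # Python 3.8+

# verify the versions 'one-prepare-venv' is expected to have installed
for pkg, want in (('onnx', '1.8.0'), ('onnx-tf', '1.7.0')):
    got = metadata.version(pkg)
    print(pkg, got, 'OK' if got == want else 'expected ' + want)
```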
diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize
index 9bdfea8b8..475f44a49 100644
--- a/compiler/one-cmds/one-quantize
+++ b/compiler/one-cmds/one-quantize
@@ -38,10 +38,22 @@ def _get_parser():
parser.add_argument(
'-i', '--input_path', type=str, help='full filepath of the input file')
parser.add_argument(
- '-d', '--input_data', type=str, help='full filepath of the input data file')
+ '-d',
+ '--input_data',
+ type=str,
+ help=
+ 'full filepath of the input data file. If not specified, it runs with random input data.'
+ )
parser.add_argument(
'-o', '--output_path', type=str, help='full filepath of the output file')
+ # argument for profiling
+ parser.add_argument(
+ '-p',
+ '--generate_profile_data',
+ action='store_true',
+ help='generate profiling data')
+
## arguments for quantization
quantization_group = parser.add_argument_group('arguments for quantization')
@@ -66,26 +78,30 @@ def _get_parser():
type=str,
help='record mode (supported: percentile/moving_average, default=percentile)')
- # set default values
- quantization_group.set_defaults(
- input_dtype='float32',
- quantized_dtype='uint8',
- granularity='layer',
- min_percentile='1.0',
- max_percentile='99.0',
- mode='percentile')
-
return parser
+def _set_default_values(args):
+ if not _utils._is_valid_attr(args, 'input_dtype'):
+ setattr(args, 'input_dtype', 'float32')
+ if not _utils._is_valid_attr(args, 'quantized_dtype'):
+ setattr(args, 'quantized_dtype', 'uint8')
+ if not _utils._is_valid_attr(args, 'granularity'):
+ setattr(args, 'granularity', 'layer')
+ if not _utils._is_valid_attr(args, 'mode'):
+ setattr(args, 'mode', 'percentile')
+ if not _utils._is_valid_attr(args, 'min_percentile'):
+ setattr(args, 'min_percentile', '1.0')
+ if not _utils._is_valid_attr(args, 'max_percentile'):
+ setattr(args, 'max_percentile', '99.0')
+
+
def _verify_arg(parser, args):
"""verify given arguments"""
# check if required arguments is given
missing = []
if not _utils._is_valid_attr(args, 'input_path'):
missing.append('-i/--input_path')
- if not _utils._is_valid_attr(args, 'input_data'):
- missing.append('-d/--input_data')
if not _utils._is_valid_attr(args, 'output_path'):
missing.append('-o/--output_path')
if len(missing):
@@ -128,6 +144,9 @@ def _quantize(args):
tmpdir,
os.path.splitext(os.path.basename(args.input_path))[0]) + '1.circle'
circle_quantizer_cmd.append(tmp_output_path_1)
+ # profiling
+ if _utils._is_valid_attr(args, 'generate_profile_data'):
+ circle_quantizer_cmd.append('--generate_profile_data')
f.write((' '.join(circle_quantizer_cmd) + '\n').encode())
@@ -168,6 +187,9 @@ def _quantize(args):
if _utils._is_valid_attr(args, 'mode'):
circle_record_minmax_cmd.append('--mode')
circle_record_minmax_cmd.append(getattr(args, 'mode'))
+ # profiling
+ if _utils._is_valid_attr(args, 'generate_profile_data'):
+ circle_record_minmax_cmd.append('--generate_profile_data')
f.write((' '.join(circle_record_minmax_cmd) + '\n').encode())
@@ -197,6 +219,9 @@ def _quantize(args):
circle_quantizer_cmd.append(tmp_output_path_2)
if _utils._is_valid_attr(args, 'output_path'):
circle_quantizer_cmd.append(getattr(args, 'output_path'))
+ # profiling
+ if _utils._is_valid_attr(args, 'generate_profile_data'):
+ circle_quantizer_cmd.append('--generate_profile_data')
f.write((' '.join(circle_quantizer_cmd) + '\n').encode())
@@ -221,6 +246,9 @@ def main():
# parse configuration file
_utils._parse_cfg(args, 'one-quantize')
+ # set default values
+ _set_default_values(args)
+
# verify arguments
_verify_arg(parser, args)
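The reason `set_defaults` was replaced by `_set_default_values` is ordering: defaults must not clobber values that `_parse_cfg` already placed on `args`. A minimal sketch of the idiom, with a simplified stand-in for `_utils._is_valid_attr`:

```python
import argparse

def _is_valid_attr(args, attr):
    # simplified stand-in: the attribute exists and is truthy
    return hasattr(args, attr) and getattr(args, attr)

args = argparse.Namespace(input_dtype=None)  # not given on CLI or in cfg
setattr(args, 'granularity', 'channel')      # as if set by the cfg file

if not _is_valid_attr(args, 'input_dtype'):
    setattr(args, 'input_dtype', 'float32')  # default fills the gap
if not _is_valid_attr(args, 'granularity'):
    setattr(args, 'granularity', 'layer')    # skipped: the cfg value wins

print(args.input_dtype, args.granularity)    # float32 channel
```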
diff --git a/compiler/one-cmds/tests/CMakeLists.txt b/compiler/one-cmds/tests/CMakeLists.txt
index 412787a64..6f9f2847e 100644
--- a/compiler/one-cmds/tests/CMakeLists.txt
+++ b/compiler/one-cmds/tests/CMakeLists.txt
@@ -17,6 +17,10 @@ file(APPEND "${DRIVER_SCRIPT}" " USER_PATH=$1\n")
file(APPEND "${DRIVER_SCRIPT}" " export PATH=$USER_PATH:$PATH\n")
file(APPEND "${DRIVER_SCRIPT}" "fi\n")
file(APPEND "${DRIVER_SCRIPT}" "\n")
+file(APPEND "${DRIVER_SCRIPT}" "# refer https://github.com/Samsung/ONE/issues/6286\n")
+file(APPEND "${DRIVER_SCRIPT}" "set -o pipefail\n\n")
+file(APPEND "${DRIVER_SCRIPT}" "fail_count=0\n")
+file(APPEND "${DRIVER_SCRIPT}" "trap \"(( fail_count++ ))\" ERR\n\n")
foreach(TESTITEM IN ITEMS ${TESTITEMS})
get_filename_component(ITEM_PREFIX ${TESTITEM} NAME_WE)
@@ -35,7 +39,16 @@ foreach(CONFIGITEM IN ITEMS ${CONFIGITEMS})
install(FILES ${CONFIGITEM} DESTINATION test)
endforeach(CONFIGITEM)
-file(APPEND "${DRIVER_SCRIPT}" "popd> /dev/null")
+file(APPEND "${DRIVER_SCRIPT}" "popd > /dev/null\n\n")
+
+file(APPEND "${DRIVER_SCRIPT}"
+"if [[ $fail_count != 0 ]]; then
+ echo \"$fail_count TESTS FAILED\"
+ exit 255
+else
+ echo \"ALL TESTS PASSED!\"
+fi\n
+")
set(PREPARE_TEST_MATERIALS_SH "${CMAKE_CURRENT_SOURCE_DIR}/prepare_test_materials.sh")
set(PREPROCESS_IMAGES_PY "${CMAKE_CURRENT_SOURCE_DIR}/preprocess_images.py")
diff --git a/compiler/one-cmds/tests/one-build_001.test b/compiler/one-cmds/tests/one-build_001.test
index fb4877344..ebbb3235b 100644
--- a/compiler/one-cmds/tests/one-build_001.test
+++ b/compiler/one-cmds/tests/one-build_001.test
@@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# one-import-tf -> one-optimize
+
filename_ext="$(basename -- $0)"
filename="${filename_ext%.*}"
diff --git a/compiler/one-cmds/tests/one-build_002.test b/compiler/one-cmds/tests/one-build_002.test
index fdfd607e2..43fce4e6f 100644
--- a/compiler/one-cmds/tests/one-build_002.test
+++ b/compiler/one-cmds/tests/one-build_002.test
@@ -14,6 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+# one-import-tf -> one-optimize -> one-pack
+
filename_ext="$(basename -- $0)"
filename="${filename_ext%.*}"
diff --git a/compiler/one-cmds/tests/one-build_003.cfg b/compiler/one-cmds/tests/one-build_003.cfg
new file mode 100644
index 000000000..6aec3cab6
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_003.cfg
@@ -0,0 +1,21 @@
+[one-build]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=False
+one-quantize=True
+one-pack=False
+one-codegen=False
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v1
+
+[one-quantize]
+input_path=inception_v3.circle
+output_path=inception_v3.quantized.circle
+input_data=inception_v3_test_data.h5
diff --git a/compiler/one-cmds/tests/one-build_003.test b/compiler/one-cmds/tests/one-build_003.test
new file mode 100644
index 000000000..d835be470
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_003.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-import-tf -> one-quantize
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-build_003.cfg"
+outputfile="inception_v3.quantized.circle"
+
+rm -rf ${outputfile}
+
+# run test
+one-build -C ${configfile} > /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-build_004.cfg b/compiler/one-cmds/tests/one-build_004.cfg
new file mode 100644
index 000000000..c23405bea
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_004.cfg
@@ -0,0 +1,20 @@
+[one-build]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=False
+one-quantize=False
+one-pack=False
+one-codegen=True
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v1
+
+[one-codegen]
+backend=dummy
+command=-o sample.tvn inception_v3.circle
diff --git a/compiler/one-cmds/tests/one-build_004.test b/compiler/one-cmds/tests/one-build_004.test
new file mode 100644
index 000000000..f4174bd73
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_004.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-import-tf -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ rm -rf ../bin/dummy-compile
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-build_004.cfg"
+outputfile="sample.tvn"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+one-build -C ${configfile} > /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-build_005.cfg b/compiler/one-cmds/tests/one-build_005.cfg
new file mode 100644
index 000000000..841b37234
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_005.cfg
@@ -0,0 +1,20 @@
+[one-build]
+one-import-tf=False
+one-import-tflite=True
+one-import-bcq=False
+one-optimize=True
+one-quantize=False
+one-pack=False
+one-codegen=True
+
+[one-import-tflite]
+input_path=inception_v3.tflite
+output_path=inception_v3.circle
+
+[one-optimize]
+input_path=inception_v3.circle
+output_path=inception_v3.opt.circle
+
+[one-codegen]
+backend=dummy
+command=-o sample.tvn inception_v3.opt.circle
diff --git a/compiler/one-cmds/tests/one-build_005.test b/compiler/one-cmds/tests/one-build_005.test
new file mode 100644
index 000000000..772483ddc
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_005.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-import-tflite -> one-optimize -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ rm -rf ../bin/dummy-compile
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-build_005.cfg"
+outputfile="sample.tvn"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+one-build -C ${configfile} > /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-build_006.cfg b/compiler/one-cmds/tests/one-build_006.cfg
new file mode 100644
index 000000000..e754bdeca
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_006.cfg
@@ -0,0 +1,29 @@
+[one-build]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=True
+one-quantize=True
+one-pack=False
+one-codegen=True
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v1
+
+[one-optimize]
+input_path=inception_v3.circle
+output_path=inception_v3.opt.circle
+
+[one-quantize]
+input_path=inception_v3.opt.circle
+output_path=inception_v3.quantized.circle
+input_data=inception_v3_test_data.h5
+
+[one-codegen]
+backend=dummy
+command=-o sample.tvn inception_v3.quantized.circle
diff --git a/compiler/one-cmds/tests/one-build_006.test b/compiler/one-cmds/tests/one-build_006.test
new file mode 100644
index 000000000..caf8897b1
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_006.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-import-tf -> one-optimize -> one-quantize -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ rm -rf ../bin/dummy-compile
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-build_006.cfg"
+outputfile="sample.tvn"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+one-build -C ${configfile} > /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-build_007.cfg b/compiler/one-cmds/tests/one-build_007.cfg
new file mode 100644
index 000000000..52610750d
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_007.cfg
@@ -0,0 +1,29 @@
+[one-build]
+one-import-tf=True
+one-import-tflite=False
+one-import-bcq=False
+one-optimize=False
+one-quantize=True
+one-pack=True
+one-codegen=False
+
+[one-import-tf]
+input_path=inception_v3.pb
+output_path=inception_v3.circle
+input_arrays=input
+input_shapes=1,299,299,3
+output_arrays=InceptionV3/Predictions/Reshape_1
+converter_version=v1
+
+[one-optimize]
+input_path=inception_v3.circle
+output_path=inception_v3.opt.circle
+
+[one-quantize]
+input_path=inception_v3.opt.circle
+output_path=inception_v3.quantized.circle
+input_data=inception_v3_test_data.h5
+
+[one-pack]
+input_path=inception_v3.quantized.circle
+output_path=inception_v3_pkg
diff --git a/compiler/one-cmds/tests/one-build_007.test b/compiler/one-cmds/tests/one-build_007.test
new file mode 100644
index 000000000..086187013
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_007.test
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-import-tf -> one-optimize -> one-quantize -> one-pack
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-build_007.cfg"
+outputfile="inception_v3_pkg"
+
+rm -rf ${outputfile}
+
+# run test
+one-build -C ${configfile} > /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-build_008.cfg b/compiler/one-cmds/tests/one-build_008.cfg
new file mode 100644
index 000000000..615047c86
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_008.cfg
@@ -0,0 +1,23 @@
+[one-build]
+one-import-tf=False
+one-import-tflite=False
+one-import-bcq=False
+one-import-onnx=True
+one-optimize=True
+one-quantize=False
+one-pack=False
+one-codegen=True
+
+[one-import-onnx]
+input_path=test_onnx_model.onnx
+output_path=test_onnx_model.circle
+
+[one-optimize]
+input_path=test_onnx_model.circle
+output_path=test_onnx_model.opt.circle
+O1=True
+remove_redundant_transpose=True
+
+[one-codegen]
+backend=dummy
+command=-o test_onnx_model.bin test_onnx_model.opt.circle
diff --git a/compiler/one-cmds/tests/one-build_008.test b/compiler/one-cmds/tests/one-build_008.test
new file mode 100644
index 000000000..bfb7666db
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_008.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-import-onnx -> one-optimize -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ rm -rf ../bin/dummy-compile
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-build_008.cfg"
+outputfile="test_onnx_model.bin"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+one-build -C ${configfile} > /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-build_009.cfg b/compiler/one-cmds/tests/one-build_009.cfg
new file mode 100644
index 000000000..66bca250d
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_009.cfg
@@ -0,0 +1,24 @@
+[one-build]
+one-import-tf=False
+one-import-tflite=False
+one-import-bcq=False
+one-import-onnx=True
+one-optimize=True
+one-quantize=False
+one-pack=False
+one-codegen=True
+
+[one-import-onnx]
+input_path=onnx_conv2d_conv2d.onnx
+output_path=onnx_conv2d_conv2d.circle
+
+[one-optimize]
+input_path=onnx_conv2d_conv2d.circle
+output_path=onnx_conv2d_conv2d.opt.circle
+O1=True
+remove_redundant_transpose=True
+convert_nchw_to_nhwc=True
+
+[one-codegen]
+backend=dummy
+command=-o onnx_conv2d_conv2d.bin onnx_conv2d_conv2d.opt.circle
diff --git a/compiler/one-cmds/tests/one-build_009.test b/compiler/one-cmds/tests/one-build_009.test
new file mode 100644
index 000000000..0d766261d
--- /dev/null
+++ b/compiler/one-cmds/tests/one-build_009.test
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# one-import-onnx -> one-optimize -> one-codegen
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ rm -rf ../bin/dummy-compile
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-build_009.cfg"
+outputfile="onnx_conv2d_conv2d.bin"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+one-build -C ${configfile} > /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-build_neg_003.test b/compiler/one-cmds/tests/one-build_neg_003.test
index a8ad24049..bcbd2f98a 100644
--- a/compiler/one-cmds/tests/one-build_neg_003.test
+++ b/compiler/one-cmds/tests/one-build_neg_003.test
@@ -21,7 +21,7 @@ filename="${filename_ext%.*}"
trap_err_onexit()
{
- if grep -q "'one-build' section is required in configuraion file" "${filename}.log"; then
+ if grep -q "\[one-build\] section is required in configuration file" "${filename}.log"; then
echo "${filename_ext} SUCCESS"
exit 0
fi
diff --git a/compiler/one-cmds/tests/one-codegen_001.test b/compiler/one-cmds/tests/one-codegen_001.test
new file mode 100644
index 000000000..7c679b38e
--- /dev/null
+++ b/compiler/one-cmds/tests/one-codegen_001.test
@@ -0,0 +1,41 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+# copy help-compile to bin folder
+cp help-compile ../bin/help-compile
+
+# run test
+one-codegen -b help -- -h > ${filename}.log
+
+rm -rf ../bin/help-compile
+
+if grep -q "HELP MESSAGE!!" "${filename}.log"; then
+ echo "${filename_ext} SUCCESS"
+ exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/one-codegen_002.test b/compiler/one-cmds/tests/one-codegen_002.test
new file mode 100644
index 000000000..feb848919
--- /dev/null
+++ b/compiler/one-cmds/tests/one-codegen_002.test
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run one-codegen with dummy-compile driver
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ rm -rf ../bin/dummy-compile
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+outputfile="sample.tvn"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+one-codegen -b dummy -o ${outputfile} "dummy.circle"
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-codegen_003.test b/compiler/one-cmds/tests/one-codegen_003.test
new file mode 100644
index 000000000..47d12a4f1
--- /dev/null
+++ b/compiler/one-cmds/tests/one-codegen_003.test
@@ -0,0 +1,47 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# run one-codegen with dummy-compile driver
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ rm -rf ../bin/dummy-compile
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+outputfile="sample.tvn"
+
+rm -rf ${outputfile}
+
+# copy dummy-compile to bin folder
+cp dummy-compile ../bin/dummy-compile
+
+# run test
+one-codegen -b dummy -- -o ${outputfile} "dummy.circle"
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+rm -rf ../bin/dummy-compile
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-codegen_004.test b/compiler/one-cmds/tests/one-codegen_004.test
new file mode 100644
index 000000000..88f42338d
--- /dev/null
+++ b/compiler/one-cmds/tests/one-codegen_004.test
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# print one-codegen's help message
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+# run test
+one-codegen -h > ${filename}.log
+
+if grep -q "command line tool for code generation" "${filename}.log"; then
+ echo "${filename_ext} SUCCESS"
+ exit 0
+fi
+
+trap_err_onexit
diff --git a/compiler/one-cmds/tests/one-codegen_neg_001.test b/compiler/one-cmds/tests/one-codegen_neg_001.test
new file mode 100644
index 000000000..fd5d0cb30
--- /dev/null
+++ b/compiler/one-cmds/tests/one-codegen_neg_001.test
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# negative usage with no input
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ if grep -q "error: the following arguments are required" "${filename}.log"; then
+ echo "${filename_ext} SUCCESS"
+ exit 0
+ fi
+
+ echo "${filename_ext} FAILED"
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+# run test
+one-codegen > ${filename}.log 2>&1
+
+echo "${filename_ext} FAILED"
+exit 255
diff --git a/compiler/one-cmds/tests/one-import-onnx_001.test b/compiler/one-cmds/tests/one-import-onnx_001.test
new file mode 100644
index 000000000..6119b6882
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import-onnx_001.test
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./test_onnx_model.onnx"
+outputfile="./test_onnx_model.circle"
+
+rm -rf ${outputfile}
+rm -rf ${outputfile}.log
+
+# run test
+one-import-onnx \
+--input_path ${inputfile} \
+--output_path ${outputfile} > ${outputfile}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-import_005.cfg b/compiler/one-cmds/tests/one-import_005.cfg
new file mode 100644
index 000000000..abe4c7d77
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_005.cfg
@@ -0,0 +1,13 @@
+[one-build]
+one-import-tf=False
+one-import-tflite=False
+one-import-bcq=False
+one-import-onnx=True
+one-optimize=False
+one-quantize=False
+one-pack=False
+one-codegen=False
+
+[one-import-onnx]
+input_path=test_onnx_model.onnx
+output_path=test_onnx_model.circle
diff --git a/compiler/one-cmds/tests/one-import_005.test b/compiler/one-cmds/tests/one-import_005.test
new file mode 100644
index 000000000..ca49db189
--- /dev/null
+++ b/compiler/one-cmds/tests/one-import_005.test
@@ -0,0 +1,40 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+configfile="one-import_005.cfg"
+outputfile="test_onnx_model.circle"
+
+rm -f ${outputfile}
+
+# run test
+one-build -C ${configfile} > ${filename}.log 2>&1
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/one-optimize_001.test b/compiler/one-cmds/tests/one-optimize_001.test
index 240a62506..0d58a6a9e 100644
--- a/compiler/one-cmds/tests/one-optimize_001.test
+++ b/compiler/one-cmds/tests/one-optimize_001.test
@@ -40,7 +40,7 @@ if [[ ! -s ${inputfile} ]]; then
fi
# run test
-one-optimize --all \
+one-optimize --O1 \
--input_path ${inputfile} \
--output_path ${outputfile} >> /dev/null
diff --git a/compiler/one-cmds/tests/one-optimize_neg_001.test b/compiler/one-cmds/tests/one-optimize_neg_001.test
index 4ee509697..a30b4164d 100644
--- a/compiler/one-cmds/tests/one-optimize_neg_001.test
+++ b/compiler/one-cmds/tests/one-optimize_neg_001.test
@@ -39,7 +39,7 @@ rm -rf ${outputfile}
rm -rf ${outputfile}.log
# run test
-one-optimize --all \
+one-optimize --O1 \
--input_path ${inputfile} \
--output_path ${outputfile} > ${filename}.log
diff --git a/compiler/one-cmds/tests/one-optimize_neg_002.test b/compiler/one-cmds/tests/one-optimize_neg_002.test
index 2c2a29a87..7ccf4a89c 100644
--- a/compiler/one-cmds/tests/one-optimize_neg_002.test
+++ b/compiler/one-cmds/tests/one-optimize_neg_002.test
@@ -39,7 +39,7 @@ rm -rf ${outputfile}
rm -rf ${outputfile}.log
# run test
-one-optimize --all \
+one-optimize --O1 \
--input_path ${inputfile} \
--output_path ${outputfile} > ${filename}.log
diff --git a/compiler/one-cmds/tests/one-optimize_neg_003.test b/compiler/one-cmds/tests/one-optimize_neg_003.test
index 95f08fd95..3fe7d330e 100644
--- a/compiler/one-cmds/tests/one-optimize_neg_003.test
+++ b/compiler/one-cmds/tests/one-optimize_neg_003.test
@@ -44,7 +44,7 @@ if [[ ! -s ${inputfile} ]]; then
fi
# run test
-one-optimize --all \
+one-optimize --O1 \
--input_path "${inputfile}" > "${filename}.log" 2>&1
echo "${filename_ext} FAILED"
diff --git a/compiler/one-cmds/tests/one-quantize_002.test b/compiler/one-cmds/tests/one-quantize_002.test
new file mode 100644
index 000000000..3704425d4
--- /dev/null
+++ b/compiler/one-cmds/tests/one-quantize_002.test
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+filename_ext="$(basename -- $0)"
+filename="${filename_ext%.*}"
+
+trap_err_onexit()
+{
+ echo "${filename_ext} FAILED"
+ exit 255
+}
+
+trap trap_err_onexit ERR
+
+inputfile="./inception_v3.circle"
+outputfile="./inception_v3.random.quantized.circle"
+
+rm -rf ${outputfile}
+
+# to create inception_v3.circle
+if [[ ! -s ${inputfile} ]]; then
+ /bin/bash one-import_001.test >> /dev/null
+ return_code=$?
+ if [[ ${return_code} != 0 ]]; then
+ trap_err_onexit
+ fi
+fi
+
+# run test without input data
+one-quantize \
+--input_dtype float32 \
+--quantized_dtype uint8 \
+--input_path ./inception_v3.circle \
+--output_path ./inception_v3.random.quantized.circle >> /dev/null
+
+if [[ ! -s "${outputfile}" ]]; then
+ trap_err_onexit
+fi
+
+echo "${filename_ext} SUCCESS"
diff --git a/compiler/one-cmds/tests/prepare_test_materials.sh b/compiler/one-cmds/tests/prepare_test_materials.sh
index bc3d65d92..694651d74 100644
--- a/compiler/one-cmds/tests/prepare_test_materials.sh
+++ b/compiler/one-cmds/tests/prepare_test_materials.sh
@@ -77,6 +77,20 @@ if [[ ! -s "test_keras_model.h5" ]]; then
# https://github.com/Samsung/ONE/issues/4268#issuecomment-725025805
fi
+if [[ ! -s "test_onnx_model.onnx" ]]; then
+ rm -rf test_onnx_model.zip
+ wget https://github.com/Samsung/ONE/files/5768243/test_onnx_model.zip
+ unzip test_onnx_model.zip
+ # https://github.com/Samsung/ONE/issues/5548#issuecomment-754373360
+fi
+
+if [[ ! -s "onnx_conv2d_conv2d.onnx" ]]; then
+ rm -rf onnx_conv2d_conv2d.zip
+ wget https://github.com/Samsung/ONE/files/5774648/onnx_conv2d_conv2d.zip
+ unzip onnx_conv2d_conv2d.zip
+ # https://github.com/Samsung/ONE/issues/5577#issuecomment-755078444
+fi
+
# prepare 'inception_v3.circle' file used for quantization test
inputfile="./inception_v3.pb"
outputfile="./inception_v3.circle"
diff --git a/compiler/one-cmds/utils.py b/compiler/one-cmds/utils.py
index 6eff9d772..1d5c4d4fd 100644
--- a/compiler/one-cmds/utils.py
+++ b/compiler/one-cmds/utils.py
@@ -21,6 +21,61 @@ import subprocess
import sys
+class _CONSTANT:
+ __slots__ = () # This prevents access via __dict__.
+ OPTIMIZATION_OPTS = (
+ # (OPTION_NAME, HELP_MESSAGE)
+ ('O1', 'enable O1 optimization pass'),
+ ('convert_nchw_to_nhwc',
+ 'Experimental: This will convert NCHW operators to NHWC under the assumption that the input model is NCHW.'
+ ),
+ ('nchw_to_nhwc_preserve_input_shape',
+ 'preserve the input shape of the model (argument for convert_nchw_to_nhwc)'),
+ ('nchw_to_nhwc_preserve_output_shape',
+ 'preserve the output shape of the model (argument for convert_nchw_to_nhwc)'),
+ ('fold_add_v2', 'fold AddV2 op with constant inputs'),
+ ('fold_cast', 'fold Cast op with constant input'),
+ ('fold_dequantize', 'fold Dequantize op'),
+ ('fold_sparse_to_dense', 'fold SparseToDense op'),
+ ('forward_reshape_to_unaryop', 'move Reshape op after UnaryOp under certain conditions'),
+ ('fuse_add_with_tconv', 'fuse Add op to Transposed Convolution op'),
+ ('fuse_batchnorm_with_conv', 'fuse BatchNorm op to Convolution op'),
+ ('fuse_batchnorm_with_dwconv', 'fuse BatchNorm op to Depthwise Convolution op'),
+ ('fuse_batchnorm_with_tconv', 'fuse BatchNorm op to Transposed Convolution op'),
+ ('fuse_bcq', 'apply Binary Coded Quantization'),
+ ('fuse_preactivation_batchnorm',
+ 'fuse BatchNorm operators of pre-activations to Convolution op'),
+ ('make_batchnorm_gamma_positive',
+ 'make negative gamma of BatchNorm to a small positive value (1e-10).'
+ ' Note that this pass can change the execution result of the model.'
+ ' So, use it only when the impact is known to be acceptable.'),
+ ('fuse_activation_function', 'fuse Activation function to a preceding operator'),
+ ('fuse_instnorm', 'fuse ops to InstanceNorm operator'),
+ ('replace_cw_mul_add_with_depthwise_conv',
+ 'replace channel-wise Mul/Add with DepthwiseConv2D'),
+ ('remove_redundant_reshape', 'fuse or remove subsequent Reshape ops'),
+ ('remove_redundant_transpose', 'fuse or remove subsequent Transpose ops'),
+ ('remove_unnecessary_reshape', 'remove unnecessary reshape ops'),
+ ('remove_unnecessary_slice', 'remove unnecessary slice ops'),
+ ('remove_unnecessary_strided_slice', 'remove unnecessary strided slice ops'),
+ ('remove_unnecessary_split', 'remove unnecessary split ops'),
+ ('resolve_customop_add', 'convert Custom(Add) op to Add op'),
+ ('resolve_customop_batchmatmul',
+ 'convert Custom(BatchMatmul) op to BatchMatmul op'),
+ ('resolve_customop_matmul', 'convert Custom(Matmul) op to Matmul op'),
+ ('shuffle_weight_to_16x1float32',
+ 'convert weight format of FullyConnected op to SHUFFLED16x1FLOAT32.'
+ ' Note that it only converts weights whose row is a multiple of 16'),
+ ('substitute_pack_to_reshape', 'convert single input Pack op to Reshape op'),
+ ('substitute_squeeze_to_reshape', 'convert Squeeze to Reshape under certain conditions'),
+ ('substitute_transpose_to_reshape',
+ 'convert Transpose to Reshape under certain conditions'),
+ ('transform_min_max_to_relu6', 'transform Minimum-Maximum pattern to Relu6 op'))
+
+
+_CONSTANT = _CONSTANT()
+
+
def _add_default_arg(parser):
# version
parser.add_argument(
@@ -114,25 +169,13 @@ def _make_tflite2circle_cmd(driver_path, input_path, output_path):
def _make_circle2circle_cmd(args, driver_path, input_path, output_path):
"""make a command for running circle2circle"""
cmd = [os.path.expanduser(c) for c in [driver_path, input_path, output_path]]
+ # profiling
+ if _is_valid_attr(args, 'generate_profile_data'):
+ cmd.append('--generate_profile_data')
# optimization pass
- if _is_valid_attr(args, 'all'):
- cmd.append('--all')
- if _is_valid_attr(args, 'fold_dequantize'):
- cmd.append('--fold_dequantize')
- if _is_valid_attr(args, 'fuse_add_with_tconv'):
- cmd.append('--fuse_add_with_tconv')
- if _is_valid_attr(args, 'fuse_batchnorm_with_tconv'):
- cmd.append('--fuse_batchnorm_with_tconv')
- if _is_valid_attr(args, 'fuse_bcq'):
- cmd.append('--fuse_bcq')
- if _is_valid_attr(args, 'fuse_instnorm'):
- cmd.append('--fuse_instnorm')
- if _is_valid_attr(args, 'resolve_customop_add'):
- cmd.append('--resolve_customop_add')
- if _is_valid_attr(args, 'resolve_customop_batchmatmul'):
- cmd.append('--resolve_customop_batchmatmul')
- if _is_valid_attr(args, 'resolve_customop_matmul'):
- cmd.append('--resolve_customop_matmul')
+ for opt in _CONSTANT.OPTIMIZATION_OPTS:
+ if _is_valid_attr(args, opt[0]):
+ cmd.append('--' + opt[0])
return cmd
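The same option table then drives command construction; a toy sketch of the loop above, assuming a namespace where only O1 is set and hypothetical file paths:

```python
import argparse

OPTS = (('O1', 'enable O1 optimization pass'),
        ('fold_dequantize', 'fold Dequantize op'))

args = argparse.Namespace(O1=True, fold_dequantize=False)

cmd = ['circle2circle', 'in.circle', 'out.circle']  # hypothetical paths
for name, _help in OPTS:
    if getattr(args, name, False):                  # simplified _is_valid_attr
        cmd.append('--' + name)

print(cmd)  # ['circle2circle', 'in.circle', 'out.circle', '--O1']
```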
diff --git a/compiler/one-cmds/validate-onnx2circle/CMakeLists.txt b/compiler/one-cmds/validate-onnx2circle/CMakeLists.txt
new file mode 100644
index 000000000..6727359c9
--- /dev/null
+++ b/compiler/one-cmds/validate-onnx2circle/CMakeLists.txt
@@ -0,0 +1,5 @@
+install(FILES validate_onnx2circle.py
+ PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+ GROUP_READ GROUP_EXECUTE
+ WORLD_READ WORLD_EXECUTE
+ DESTINATION test)
diff --git a/compiler/one-cmds/validate-onnx2circle/README.md b/compiler/one-cmds/validate-onnx2circle/README.md
new file mode 100644
index 000000000..341df3d87
--- /dev/null
+++ b/compiler/one-cmds/validate-onnx2circle/README.md
@@ -0,0 +1,36 @@
+# validate-onnx2circle
+
+_validate-onnx2circle_ validates the ONNX-to-optimized-circle conversion
+by comparing the execution results of the original ONNX model and the optimized circle model.
+
+This is currently in an experimental state.
+
+## How to run the script
+
+Install `onnxruntime` inside the virtual environment
+```
+source install_path/bin/venv/bin/activate
+
+python -m pip --default-timeout=1000 --trusted-host pypi.org \
+ --trusted-host files.pythonhosted.org install onnxruntime==1.6.0
+
+deactivate
+```
+
+Run the script
+```bash
+cd install_path/test
+
+driver='one/build/debug/compiler/luci-eval-driver/luci_eval_driver'
+onnx_filepath='path_to_onnx_model.onnx'
+circle_filepath='path_to_optimized_circle.circle'
+
+./validate_onnx2circle.py --driver ${driver} --onnx ${onnx_filepath} --circle ${circle_filepath}
+```
+
+The output will look something like this:
+```
+Run ONNX...
+Run luci-interpreter...
+Compare 0 True
+```
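For reference, the pass/fail criterion printed above reduces to numpy's `isclose` with the tolerances used by the script (`rtol=1e-03`, `atol=1e-04`); a small sketch with hypothetical stand-in tensors:

```python
import numpy as np

# hypothetical stand-ins for the ONNX and luci-interpreter outputs
onnx_out = np.array([0.1000, 0.2000], dtype=np.float32)
luci_out = np.array([0.1001, 0.2001], dtype=np.float32)

ok = bool(np.all(np.isclose(onnx_out, luci_out, rtol=1e-03, atol=1e-04)))
print("Compare", 0, ok)  # -> Compare 0 True
```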
diff --git a/compiler/one-cmds/validate-onnx2circle/validate_onnx2circle.py b/compiler/one-cmds/validate-onnx2circle/validate_onnx2circle.py
new file mode 100644
index 000000000..eac2f6d35
--- /dev/null
+++ b/compiler/one-cmds/validate-onnx2circle/validate_onnx2circle.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env bash
+''''export SCRIPT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" # '''
+''''export PY_PATH=${SCRIPT_PATH}/../bin/venv/bin/python # '''
+''''test -f ${PY_PATH} && exec ${PY_PATH} "$0" "$@" # '''
+''''echo "Error: Virtual environment not found. Please run 'one-prepare-venv' command." # '''
+''''exit 255 # '''
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE This is an experimental script to evaluate onnx-circle conversion
+# by running onnxruntime and luci-interpreter.
+#      The plan is to run this regularly in CI.
+
+import subprocess
+import argparse
+import numpy as np
+import torch
+import onnx
+import onnxruntime as ort
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--driver', type=str, required=True)
+parser.add_argument('--onnx', type=str, required=True)
+parser.add_argument('--circle', type=str, required=True)
+args = parser.parse_args()
+
+driver = args.driver
+onnx_filepath = args.onnx
+circle_filepath = args.circle
+
+
+def to_numpy(tensor):
+ return tensor.cpu().numpy()
+
+
+def to_nhwc(tensor):
+ if (tensor.ndim == 4):
+ return np.transpose(tensor, (0, 2, 3, 1))
+ return tensor
+
+
+class OnnxRunner:
+ def __init__(self, filepath):
+ self.filepath = filepath
+ self.session = None
+ self.inputs = None
+ self.inputs_size = None
+ self.inputs_data = None
+ self.outputs = None
+ self.outputs_size = None
+
+ def load(self):
+ model = onnx.load(self.filepath)
+ onnx.checker.check_model(model)
+ self.session = ort.InferenceSession(self.filepath)
+
+ def feed_random_inputs(self):
+ self.inputs = self.session.get_inputs()
+ self.inputs_size = len(self.inputs)
+ # reset input dictionary
+ self.inputs_data = {}
+ for in_idx in range(self.inputs_size):
+ input_shape = self.inputs[in_idx].shape
+ input_type = self.inputs[in_idx].type
+ if input_type == 'tensor(float)':
+ torch_type = torch.float32
+ else:
+ # TODO support other dtype
+ raise SystemExit("Unsupported input dtype")
+
+ x = torch.randn(input_shape, dtype=torch_type)
+ input_npa = to_numpy(x)
+ self.inputs_data.update({self.inputs[in_idx].name: input_npa})
+
+ # save NHWC form of input for luci-interpreter
+ input_npa_nhwc = to_nhwc(input_npa)
+ input_npa_nhwc.tofile(circle_filepath + ".input" + str(in_idx))
+
+ def run(self):
+ self.outs = self.session.run(None, self.inputs_data)
+
+ def get_outputs(self):
+ self.outputs = self.session.get_outputs()
+ self.outputs_size = len(self.outputs)
+
+
+# Run ONNX model
+print("Run ONNX...")
+onnx_runner = OnnxRunner(onnx_filepath)
+onnx_runner.load()
+onnx_runner.feed_random_inputs()
+onnx_runner.run()
+onnx_runner.get_outputs()
+
+# Execute luci interpreter
+print("Run luci-interpreter...")
+process = subprocess.run(
+ [
+ driver, circle_filepath,
+ str(onnx_runner.inputs_size), circle_filepath + ".input",
+ circle_filepath + ".output"
+ ],
+ check=True)
+
+# Compare results
+rtolerance = 1e-03
+atolerance = 1e-04
+result_compare = True
+for idx in range(onnx_runner.outputs_size):
+ output_shape = onnx_runner.outputs[idx].shape
+ output_type = onnx_runner.outputs[idx].type
+ if output_type == 'tensor(float)':
+ output_np_type = np.float32
+ else:
+ # TODO support other dtype
+ raise SystemExit("Unsupported output dtype")
+
+ # output of luci-interpreter
+ output_data = np.fromfile(circle_filepath + ".output" + str(idx), output_np_type)
+ shape_file = open(circle_filepath + ".output" + str(idx) + ".shape", 'r')
+ output_shape = [int(i) for i in shape_file.read().split(',')]
+ luci_output_data = np.reshape(output_data, output_shape)
+
+ # output of onnx runtime
+ output_nchw = onnx_runner.outs[idx]
+ output_nhwc = to_nhwc(output_nchw)
+
+    # diff is a boolean tensor marking whether each value is within tolerance
+ diff = np.isclose(output_nhwc, luci_output_data, rtol=rtolerance, atol=atolerance)
+    # reduce to a single boolean: True only if all values are within tolerance
+ result_compare_one = np.all(diff)
+ print("Compare", idx, result_compare_one)
+ if (not result_compare_one):
+ diff_val = np.subtract(output_nhwc, luci_output_data)
+ print("ONNX Result", output_nhwc)
+ print("Diff", diff_val)
+ print("Diff Max", np.ndarray.max(diff_val))
+
+ result_compare = result_compare and result_compare_one
+
+if (not result_compare):
+ exit(-1)
+
+exit(0)
diff --git a/compiler/oneco/CMakeLists.txt b/compiler/oneco/CMakeLists.txt
index 10f466948..418bc27ac 100644
--- a/compiler/oneco/CMakeLists.txt
+++ b/compiler/oneco/CMakeLists.txt
@@ -20,7 +20,6 @@ target_include_directories(moco_onnx_frontend PRIVATE src)
target_include_directories(moco_onnx_frontend PUBLIC include)
target_link_libraries(moco_onnx_frontend PUBLIC moco_onnx_proto)
target_link_libraries(moco_onnx_frontend PUBLIC loco)
-target_link_libraries(moco_onnx_frontend PRIVATE stdex)
target_link_libraries(moco_onnx_frontend PRIVATE cwrap)
nnas_find_package(GTest QUIET)
diff --git a/compiler/oneco/requires.cmake b/compiler/oneco/requires.cmake
index 4e99b0eac..c11a84d9c 100644
--- a/compiler/oneco/requires.cmake
+++ b/compiler/oneco/requires.cmake
@@ -1,3 +1,2 @@
-require("stdex")
require("loco")
require("cwrap")
diff --git a/compiler/oneco/src/Frontend.cpp b/compiler/oneco/src/Frontend.cpp
index d633c1c2e..4b1554ee8 100644
--- a/compiler/oneco/src/Frontend.cpp
+++ b/compiler/oneco/src/Frontend.cpp
@@ -76,8 +76,8 @@ void load_onnx(const std::string &path, moco::onnx::Frontend::FileType type,
// TODO Make comments clear
void convert_graph(::onnx::ModelProto &onnx_model_proto, loco::Graph *graph)
{
- auto nodes = stdex::make_unique<moco::onnx::SymbolTable>();
- auto input_names = stdex::make_unique<moco::onnx::SymbolTable>();
+ auto nodes = std::make_unique<moco::onnx::SymbolTable>();
+ auto input_names = std::make_unique<moco::onnx::SymbolTable>();
moco::onnx::GraphBuilderContext gb_context(graph, nodes.get(), input_names.get());
diff --git a/compiler/oneco/src/GraphBuilder.h b/compiler/oneco/src/GraphBuilder.h
index 7271eb81a..7e463ce9a 100644
--- a/compiler/oneco/src/GraphBuilder.h
+++ b/compiler/oneco/src/GraphBuilder.h
@@ -27,9 +27,9 @@ namespace onnx
{
/**
-* @brief Parent class of onnx operation graph builders
-* @note GraphBuilder call proper build and validate function according to opset version
-*/
+ * @brief Parent class of onnx operation graph builders
+ * @note GraphBuilder call proper build and validate function according to opset version
+ */
class GraphBuilder
{
public:
diff --git a/compiler/oneco/src/GraphBuilderContext.h b/compiler/oneco/src/GraphBuilderContext.h
index f1f394b50..dd368e335 100644
--- a/compiler/oneco/src/GraphBuilderContext.h
+++ b/compiler/oneco/src/GraphBuilderContext.h
@@ -69,13 +69,13 @@ private:
};
/**
-* @brief Class to store context to build IR from onnx
-*/
+ * @brief Class to store context to build IR from onnx
+ */
class GraphBuilderContext
{
public:
GraphBuilderContext(loco::Graph *g, SymbolTable *nodes, SymbolTable *input_names)
- : _g(g), _nodes(nodes), _input_names(input_names)
+ : _g(g), _nodes(nodes), _input_names(input_names)
{
// DO NOTHING
}
diff --git a/compiler/oneco/src/GraphBuilderRegistry.h b/compiler/oneco/src/GraphBuilderRegistry.h
index 1bf4d9514..863a6ee3a 100644
--- a/compiler/oneco/src/GraphBuilderRegistry.h
+++ b/compiler/oneco/src/GraphBuilderRegistry.h
@@ -27,15 +27,15 @@ namespace onnx
{
/**
-* @brief Class to return graph builder for passed onnx Operator
-*/
+ * @brief Class to return graph builder for passed onnx Operator
+ */
class GraphBuilderRegistry
{
public:
/**
- * @brief Returns registered GraphBuilder pointer for operator or
- * nullptr if not registered
- */
+ * @brief Returns registered GraphBuilder pointer for operator or
+ * nullptr if not registered
+ */
const GraphBuilder *lookup(const std::string &op) const
{
if (_builder_map.find(op) == _builder_map.end())
@@ -63,16 +63,16 @@ private:
} // namespace onnx
} // namespace moco
-#include <stdex/Memory.h>
+#include <memory>
-#define REGISTER_OP_BUILDER(NAME, BUILDER) \
- namespace \
- { \
- __attribute__((constructor)) void reg_op(void) \
- { \
- std::unique_ptr<moco::onnx::BUILDER> builder = stdex::make_unique<moco::onnx::BUILDER>(); \
- moco::onnx::GraphBuilderRegistry::get().add(#NAME, std::move(builder)); \
- } \
+#define REGISTER_OP_BUILDER(NAME, BUILDER) \
+ namespace \
+ { \
+ __attribute__((constructor)) void reg_op(void) \
+ { \
+ std::unique_ptr<moco::onnx::BUILDER> builder = std::make_unique<moco::onnx::BUILDER>(); \
+ moco::onnx::GraphBuilderRegistry::get().add(#NAME, std::move(builder)); \
+ } \
}
#endif // __MOCO_FRONTEND_ONNX_GRAPH_BUILDER_REGISTRY_H__
diff --git a/compiler/oneco/src/Op/Constant.h b/compiler/oneco/src/Op/Constant.h
index e25441d58..be74cfcdd 100644
--- a/compiler/oneco/src/Op/Constant.h
+++ b/compiler/oneco/src/Op/Constant.h
@@ -24,8 +24,8 @@ namespace onnx
{
/**
- * @brief GraphBuilder for Constant(since version 1) node
- */
+ * @brief GraphBuilder for Constant(since version 1) node
+ */
class Constant_V1
{
public:
@@ -34,10 +34,10 @@ public:
};
/**
- * @brief GraphBuilder for Constant(since version 9) node
- * @note Until version 1, only FLOAT16, FLOAT, DOUBLE was supported
- * Since version 9, all types are supported
- */
+ * @brief GraphBuilder for Constant(since version 9) node
+ * @note Until version 1, only FLOAT16, FLOAT, DOUBLE was supported
+ * Since version 9, all types are supported
+ */
class Constant_V9
{
public:
@@ -46,8 +46,8 @@ public:
};
/**
- * @brief GraphBuilder for Constant node
- */
+ * @brief GraphBuilder for Constant node
+ */
class ConstantGraphBuilder : public GraphBuilder
{
public:
diff --git a/compiler/oneco/src/Op/Identity.h b/compiler/oneco/src/Op/Identity.h
index 41367bea0..dde614592 100644
--- a/compiler/oneco/src/Op/Identity.h
+++ b/compiler/oneco/src/Op/Identity.h
@@ -24,8 +24,8 @@ namespace onnx
{
/**
- * @brief GraphBuilder for Identity(since version 1) node
- */
+ * @brief GraphBuilder for Identity(since version 1) node
+ */
class Identity_V1
{
public:
@@ -34,8 +34,8 @@ public:
};
/**
- * @brief GraphBuilder for Identity node
- */
+ * @brief GraphBuilder for Identity node
+ */
class IdentityGraphBuilder : public GraphBuilder
{
public:
diff --git a/compiler/onnx2circle/CMakeLists.txt b/compiler/onnx2circle/CMakeLists.txt
index a0d393bd9..1a5a7e093 100644
--- a/compiler/onnx2circle/CMakeLists.txt
+++ b/compiler/onnx2circle/CMakeLists.txt
@@ -20,7 +20,6 @@ target_link_libraries(onnx2circle PRIVATE moco_log)
target_link_libraries(onnx2circle PRIVATE exo)
target_link_libraries(onnx2circle PRIVATE locop)
target_link_libraries(onnx2circle PRIVATE hermes_std)
-target_link_libraries(onnx2circle PRIVATE stdex)
target_link_libraries(onnx2circle PRIVATE angkor cwrap)
target_link_libraries(onnx2circle PRIVATE mir2loco)
target_link_libraries(onnx2circle PRIVATE mir_onnx_importer)
diff --git a/compiler/onnx2circle/requires.cmake b/compiler/onnx2circle/requires.cmake
index f52e40416..b2268ec8b 100644
--- a/compiler/onnx2circle/requires.cmake
+++ b/compiler/onnx2circle/requires.cmake
@@ -1,4 +1,3 @@
-require("stdex")
require("hermes-std")
require("mir2loco")
require("mir")
diff --git a/compiler/onnx2circle/src/onnx2circle.cpp b/compiler/onnx2circle/src/onnx2circle.cpp
index c329ed3d5..1c03fa1fe 100644
--- a/compiler/onnx2circle/src/onnx2circle.cpp
+++ b/compiler/onnx2circle/src/onnx2circle.cpp
@@ -25,10 +25,8 @@
#include "hermes/ConsoleReporter.h"
#include "hermes/EnvConfig.h"
-#include "stdex/Memory.h"
-
#include <cassert>
-
+#include <memory>
#include <iostream>
#include <stdexcept>
#include <string>
@@ -56,8 +54,8 @@ struct LoggingContext
if (ctx == nullptr)
{
ctx = new hermes::Context;
- ctx->sinks()->append(stdex::make_unique<hermes::ConsoleReporter>());
- ctx->config(stdex::make_unique<EnvConfig>("ONNX2CIRCLE_Log"));
+ ctx->sinks()->append(std::make_unique<hermes::ConsoleReporter>());
+ ctx->config(std::make_unique<EnvConfig>("ONNX2CIRCLE_Log"));
}
return ctx;
@@ -81,7 +79,7 @@ int main(int argc, char **argv)
using EnvConfig = hermes::EnvConfig<hermes::EnvFormat::BooleanNumber>;
// This line allows users to control all the exo-circle loggers via ONNX2CIRCLE_Log_Backend
- exo::LoggingContext::get()->config(stdex::make_unique<EnvConfig>("ONNX2CIRCLE_Log_Backend"));
+ exo::LoggingContext::get()->config(std::make_unique<EnvConfig>("ONNX2CIRCLE_Log_Backend"));
LOGGER(l);
diff --git a/compiler/onnxkit/CMakeLists.txt b/compiler/onnxkit/CMakeLists.txt
index 81c3622c9..9ccc779a8 100644
--- a/compiler/onnxkit/CMakeLists.txt
+++ b/compiler/onnxkit/CMakeLists.txt
@@ -24,7 +24,6 @@ target_include_directories(onnxkitproto PUBLIC ${ONNX_PROTO_INCLUDE_DIRS})
target_link_libraries(onnxkitproto PUBLIC libprotobuf)
add_executable(onnxkit ${SOURCES})
-target_link_libraries(onnxkit PRIVATE stdex)
target_link_libraries(onnxkit PRIVATE cli)
target_link_libraries(onnxkit PRIVATE onnxkitproto)
target_link_libraries(onnxkit PRIVATE nncc_common)
diff --git a/compiler/onnxkit/README.md b/compiler/onnxkit/README.md
index d2066cf65..0a863950e 100644
--- a/compiler/onnxkit/README.md
+++ b/compiler/onnxkit/README.md
@@ -58,4 +58,3 @@ nncc$ cat decoded.pbtxt | path_to_onnxkit/onnxkit encode > encoded.pb
- onnx
- Protobuf
- cli
-- stdex
diff --git a/compiler/onnxkit/src/Main.cpp b/compiler/onnxkit/src/Main.cpp
index 3dfd580ec..f97590f7d 100644
--- a/compiler/onnxkit/src/Main.cpp
+++ b/compiler/onnxkit/src/Main.cpp
@@ -18,14 +18,15 @@
#include "DecodeCommand.hpp"
#include <cli/App.h>
-#include <stdex/Memory.h>
+
+#include <memory>
int main(int argc, char **argv)
{
cli::App app{argv[0]};
- app.insert("encode", stdex::make_unique<EncodeCommand>());
- app.insert("decode", stdex::make_unique<DecodeCommand>());
+ app.insert("encode", std::make_unique<EncodeCommand>());
+ app.insert("decode", std::make_unique<DecodeCommand>());
return app.run(argc - 1, argv + 1);
}
diff --git a/compiler/onnxkit/src/Support.cpp b/compiler/onnxkit/src/Support.cpp
index 8c0774175..151290a00 100644
--- a/compiler/onnxkit/src/Support.cpp
+++ b/compiler/onnxkit/src/Support.cpp
@@ -16,8 +16,7 @@
#include "Support.hpp"
-#include <stdex/Memory.h>
-
+#include <memory>
#include <cassert>
#include <fstream>
#include <stdexcept>
@@ -33,7 +32,7 @@ std::unique_ptr<T> open_fstream(const std::string &path, std::ios_base::openmode
return nullptr;
}
- auto stream = stdex::make_unique<T>(path.c_str(), mode);
+ auto stream = std::make_unique<T>(path.c_str(), mode);
if (!stream->is_open())
{
throw std::runtime_error{"ERROR: Failed to open " + path};
@@ -61,7 +60,7 @@ std::string Cmdline::get_or(unsigned int index, const std::string &s) const
std::unique_ptr<UI> make_ui(const Cmdline &cmdargs)
{
- auto iocfg = stdex::make_unique<UI>();
+ auto iocfg = std::make_unique<UI>();
auto in = open_fstream<std::ifstream>(cmdargs.get_or(0, "-"), std::ios::in | std::ios::binary);
iocfg->in(std::move(in));
diff --git a/compiler/oops/CMakeLists.txt b/compiler/oops/CMakeLists.txt
index f12572d54..5cc115598 100644
--- a/compiler/oops/CMakeLists.txt
+++ b/compiler/oops/CMakeLists.txt
@@ -1,6 +1,7 @@
add_library(oops INTERFACE)
target_include_directories(oops INTERFACE include)
target_link_libraries(oops INTERFACE pepper_str)
+target_link_libraries(oops INTERFACE nncc_coverage)
if(NOT ENABLE_TEST)
return()
@@ -8,5 +9,5 @@ endif(NOT ENABLE_TEST)
nnas_find_package(GTest REQUIRED)
-GTest_AddTest(oops_test test.cpp)
+GTest_AddTest(oops_test src/oops.test.cpp)
target_link_libraries(oops_test oops)
diff --git a/compiler/oops/include/oops/InternalExn.h b/compiler/oops/include/oops/InternalExn.h
index e14332bb2..5da3277b7 100644
--- a/compiler/oops/include/oops/InternalExn.h
+++ b/compiler/oops/include/oops/InternalExn.h
@@ -40,20 +40,20 @@ class InternalExn : public std::exception
{
public:
InternalExn(const char *filename, const int line, const std::string &msg)
- : _filename(filename), _line(to_uint32(line)), _msg(msg)
+ : _filename(filename), _line(to_uint32(line)), _msg(msg)
{
construct_full_msg();
}
explicit InternalExn(const char *filename, const int line, const std::string &msg, uint32_t val)
- : _filename(filename), _line(to_uint32(line)), _msg(msg + ": " + std::to_string(val))
+ : _filename(filename), _line(to_uint32(line)), _msg(msg + ": " + std::to_string(val))
{
construct_full_msg();
}
explicit InternalExn(const char *filename, const int line, const std::string &msg,
const std::string &val)
- : _filename(filename), _line(to_uint32(line)), _msg(msg + ": " + val)
+ : _filename(filename), _line(to_uint32(line)), _msg(msg + ": " + val)
{
construct_full_msg();
}
@@ -69,7 +69,7 @@ private:
void construct_full_msg()
{
_full_msg =
- "Internal Exception. " + _msg + " [" + _filename + ":" + std::to_string(_line) + "]";
+ "Internal Exception. " + _msg + " [" + _filename + ":" + std::to_string(_line) + "]";
}
std::string _full_msg;
diff --git a/compiler/oops/include/oops/UserExn.h b/compiler/oops/include/oops/UserExn.h
index d0138322d..84a6b81eb 100644
--- a/compiler/oops/include/oops/UserExn.h
+++ b/compiler/oops/include/oops/UserExn.h
@@ -72,7 +72,9 @@ private:
out << pepper::str(attr, " = ", val);
}
- void build_info(std::stringstream &) { /* empty */}
+ void build_info(std::stringstream &)
+ { /* empty */
+ }
// when only one info of string is provided
void build_info(std::stringstream &out, const std::string &val) { out << val; }
diff --git a/compiler/oops/test.cpp b/compiler/oops/src/oops.test.cpp
index 666f62f54..666f62f54 100644
--- a/compiler/oops/test.cpp
+++ b/compiler/oops/src/oops.test.cpp
diff --git a/compiler/pepper-str/CMakeLists.txt b/compiler/pepper-str/CMakeLists.txt
index cbe01b86a..481073af7 100644
--- a/compiler/pepper-str/CMakeLists.txt
+++ b/compiler/pepper-str/CMakeLists.txt
@@ -1,5 +1,6 @@
add_library(pepper_str INTERFACE)
target_include_directories(pepper_str INTERFACE include)
+target_link_libraries(pepper_str INTERFACE nncc_coverage)
if(NOT ENABLE_TEST)
return()
@@ -8,5 +9,5 @@ endif(NOT ENABLE_TEST)
# Google Test is mandatory for test
nnas_find_package(GTest REQUIRED)
-GTest_AddTest(pepper_str_test test.cpp)
+GTest_AddTest(pepper_str_test src/pepper-str.test.cpp)
target_link_libraries(pepper_str_test pepper_str)
diff --git a/compiler/pepper-str/include/pepper/str.h b/compiler/pepper-str/include/pepper/str.h
index efbc3a9c8..0c74aa85a 100644
--- a/compiler/pepper-str/include/pepper/str.h
+++ b/compiler/pepper-str/include/pepper/str.h
@@ -47,7 +47,7 @@ inline void str_impl(std::ostream &os, Arg &&arg, Args &&... args)
str_impl(os, std::forward<Args>(args)...);
}
-} // namesapce details
+} // namespace details
} // namespace pepper
namespace pepper
diff --git a/compiler/pepper-str/test.cpp b/compiler/pepper-str/src/pepper-str.test.cpp
index 222c371c8..222c371c8 100644
--- a/compiler/pepper-str/test.cpp
+++ b/compiler/pepper-str/src/pepper-str.test.cpp
diff --git a/compiler/plier-tf/src/TestHelper.cpp b/compiler/plier-tf/src/TestHelper.cpp
index a551e89f9..c1565b5cc 100644
--- a/compiler/plier-tf/src/TestHelper.cpp
+++ b/compiler/plier-tf/src/TestHelper.cpp
@@ -40,7 +40,7 @@ struct membuf : std::streambuf
struct imemstream : virtual membuf, std::istream
{
imemstream(char const *base, size_t size)
- : membuf(base, size), std::istream(static_cast<std::streambuf *>(this))
+ : membuf(base, size), std::istream(static_cast<std::streambuf *>(this))
{
}
};
diff --git a/compiler/pota-quantization-value-test/compare_tensors.py b/compiler/pota-quantization-value-test/compare_tensors.py
index 9c9b639bd..20e92c68b 100755
--- a/compiler/pota-quantization-value-test/compare_tensors.py
+++ b/compiler/pota-quantization-value-test/compare_tensors.py
@@ -68,7 +68,7 @@ def compare_quantization(tensor, tensor_name, expect_dir):
for key in json_load:
if key == "weights":
expected_weights = np.array(json_load["weights"])
- input_weights = tensor["weights"][:]
+ input_weights = tensor["weights"][()]
abs_tolerance = 1
# We use higher tolerance for int64 data (bias of int16-quantized model)
if tensor["weights"].dtype == 'int64':
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ifm.json
new file mode 100644
index 000000000..2fb0c68d8
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.00014983004075475037,
+ "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ofm1.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ofm1.json
new file mode 100644
index 000000000..239a3a46d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ofm1.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.00014586378529202193,
+ "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ofm2.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ofm2.json
new file mode 100644
index 000000000..b4422f49e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/ofm2.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.00014956798986531794,
+ "zero_point": 0.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/split_dim.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/split_dim.json
new file mode 100644
index 000000000..ac7cde187
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/quantization/split_dim.json
@@ -0,0 +1,5 @@
+{
+ "weights": [
+ 0
+ ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ifm.json
new file mode 100644
index 000000000..5e333acde
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+ "min": -4.909480743408203,
+ "max": 4.779518718719482
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ofm1.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ofm1.json
new file mode 100644
index 000000000..1d23f8d9a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ofm1.json
@@ -0,0 +1,4 @@
+{
+ "min": -4.073143873214722,
+ "max": 4.779518718719482
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ofm2.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ofm2.json
new file mode 100644
index 000000000..ffd7d841d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/int16/record_minmax/ofm2.json
@@ -0,0 +1,4 @@
+{
+ "min": -4.9008944129943846,
+ "max": 4.620573101043701
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ifm.json
new file mode 100644
index 000000000..aaba6131c
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.038689617067575455,
+ "zero_point": 128.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ofm1.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ofm1.json
new file mode 100644
index 000000000..3c0134839
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ofm1.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.035256847739219666,
+ "zero_point": 123.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ofm2.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ofm2.json
new file mode 100644
index 000000000..20ebde60e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/ofm2.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.0385618582367897,
+ "zero_point": 129.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/split_dim.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/split_dim.json
new file mode 100644
index 000000000..ac7cde187
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/quantization/split_dim.json
@@ -0,0 +1,5 @@
+{
+ "weights": [
+ 0
+ ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ifm.json
new file mode 100644
index 000000000..c6dd19469
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+ "min": -4.959668273925781,
+ "max": 4.906183891296386
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ofm1.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ofm1.json
new file mode 100644
index 000000000..4f890dddb
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ofm1.json
@@ -0,0 +1,4 @@
+{
+ "min": -4.3535110282897955,
+ "max": 4.636985759735107
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ofm2.json b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ofm2.json
new file mode 100644
index 000000000..78f9a648f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/Split_000/channel/uint8/record_minmax/ofm2.json
@@ -0,0 +1,4 @@
+{
+ "min": -4.959668273925781,
+ "max": 4.8736056804656975
+}
diff --git a/compiler/pota-quantization-value-test/test.lst b/compiler/pota-quantization-value-test/test.lst
index dd1640428..4beec8c0e 100644
--- a/compiler/pota-quantization-value-test/test.lst
+++ b/compiler/pota-quantization-value-test/test.lst
@@ -26,6 +26,8 @@ addTest(PRelu_001 channel uint8)
addTest(PRelu_001 channel int16)
addTest(ReLU_000 layer uint8)
addTest(ReLU_000 channel int16)
+addTest(Split_000 channel uint8)
+addTest(Split_000 channel int16)
addTest(TransposeConv_001 channel uint8)
addTest(TransposeConv_001 channel int16)
addTest(TransposeConv_001 layer uint8)
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/0.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/0.txt
new file mode 100644
index 000000000..4b999a028
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/0.txt
@@ -0,0 +1 @@
+ 3.241328 , 2.7033713 ,-2.5329788 ,-4.078369 ,-3.6711028 , 2.8912613 , 0.6188993 , 3.3729403 , 2.9906578 , 0.69040877, 0.6443222 , 1.1676162
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/1.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/1.txt
new file mode 100644
index 000000000..7061063b9
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/1.txt
@@ -0,0 +1 @@
+ 1.572614 , 3.6147017 , 1.4378501 ,-0.81497866, 1.5987366 , 3.7698908 ,-3.8637109 , 4.5728784 ,-0.8706349 , 0.7389268 , 4.64117 ,-0.96047217
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/2.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/2.txt
new file mode 100644
index 000000000..c048a8a9f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/2.txt
@@ -0,0 +1 @@
+ 0.00864919,-3.1653113 ,-2.125551 , 2.9225516 ,-1.1439148 , 4.6509814 ,-2.097259 , 2.5843353 ,-2.067207 ,-2.5034845 ,-4.9441104 ,-3.9062042
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/3.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/3.txt
new file mode 100644
index 000000000..55be3b464
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/3.txt
@@ -0,0 +1 @@
+ 1.0920542 , 0.5510192 , 1.3465579 ,-2.3510268 , 4.016736 , 4.7848744 ,-0.42403316, 0.00571597, 1.6412207 , 1.7787368 , 2.4728034 ,-3.5900247
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/4.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/4.txt
new file mode 100644
index 000000000..04c7a1a8a
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/int16/4.txt
@@ -0,0 +1 @@
+-2.9799085,-3.9477375, 0.6402844, 3.304766 , 3.8880465,-3.5069442,-2.3702915, 4.126247 ,-3.1614416, 2.9909244,-2.8755414, 0.2627986
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/0.txt
new file mode 100644
index 000000000..0e8d687b1
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/0.txt
@@ -0,0 +1 @@
+-2.327701 , 1.9312059 ,-2.0069487 ,-1.2584914 ,-0.08435626, 0.47685367,-2.7456024 , 2.1275337 ,-4.9685698 , 1.8143541 , 0.52829266,-2.770121
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/1.txt
new file mode 100644
index 000000000..67732e8f5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/1.txt
@@ -0,0 +1 @@
+ 0.01133719,-3.3741624 , 3.556686 ,-4.21059 , 0.49977505, 1.768375 , 3.867543 , 2.270572 ,-3.9507272 ,-4.595618 ,-4.7460327 , 0.5856542
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/2.txt
new file mode 100644
index 000000000..7bc7124d6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/2.txt
@@ -0,0 +1 @@
+-2.7181 , 4.6819983 , 2.9022477 ,-0.10716935, 3.6687856 ,-2.5403244 ,-4.477037 , 2.5499978 ,-3.9294813 , 0.08725335,-2.243345 ,-1.4018577
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/3.txt
new file mode 100644
index 000000000..0fac9fb70
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/3.txt
@@ -0,0 +1 @@
+-3.920553 , 0.87464577,-1.0319884 , 2.1885726 , 2.755115 ,-1.6436632 ,-4.4507327 , 4.915525 , 2.9331517 , 4.7712016 , 4.676084 ,-1.7715888
diff --git a/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/4.txt
new file mode 100644
index 000000000..df79104c2
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Split_000/channel/uint8/4.txt
@@ -0,0 +1 @@
+-2.181168 ,-1.6011912 ,-4.359466 ,-1.3662407 ,-0.06876431,-2.9213328 ,-0.5463467 ,-3.7916536 ,-3.751455 ,-2.822578 , 0.8914152 ,-3.0267959
diff --git a/compiler/pp/CMakeLists.txt b/compiler/pp/CMakeLists.txt
index 2c25c6406..6d58458ca 100644
--- a/compiler/pp/CMakeLists.txt
+++ b/compiler/pp/CMakeLists.txt
@@ -6,6 +6,7 @@ add_library(pp STATIC ${SOURCES})
set_target_properties(pp PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(pp PUBLIC include)
target_link_libraries(pp PRIVATE nncc_common)
+target_link_libraries(pp PUBLIC nncc_coverage)
if(NOT ENABLE_TEST)
return()
diff --git a/compiler/record-minmax-conversion-test/gen_h5_random_inputs.py b/compiler/record-minmax-conversion-test/gen_h5_random_inputs.py
index bdf86fe29..d57289abf 100755
--- a/compiler/record-minmax-conversion-test/gen_h5_random_inputs.py
+++ b/compiler/record-minmax-conversion-test/gen_h5_random_inputs.py
@@ -39,9 +39,16 @@ for i in range(num_data):
for j in range(len(input_details)):
input_detail = input_details[j]
- # Generate random input [-5, 5)
- input_data = np.array(10 * np.random.random_sample(input_detail["shape"]) - 5,
- input_detail["dtype"])
+ print(input_detail["dtype"])
+ if input_detail["dtype"] == np.bool_:
+ # Generate random bool [0, 1]
+ input_data = np.array(
+ np.random.random_integers(0, 1, input_detail["shape"]),
+ input_detail["dtype"])
+        elif input_detail["dtype"] == np.float32:
+            # Generate random input [-5, 5)
+            input_data = np.array(10 * np.random.random_sample(input_detail["shape"]) - 5,
+                                  input_detail["dtype"])
+        else:
+            # Fail fast: otherwise create_dataset below would receive an undefined input_data
+            raise SystemExit("Unsupported input dtype")
sample.create_dataset(str(j), data=input_data)
h5_file.close()
diff --git a/compiler/record-minmax-conversion-test/testall.sh b/compiler/record-minmax-conversion-test/testall.sh
index 29c9ed3d1..d7fc1de53 100755
--- a/compiler/record-minmax-conversion-test/testall.sh
+++ b/compiler/record-minmax-conversion-test/testall.sh
@@ -55,6 +55,16 @@ for TESTCASE in "$@"; do
--input_data "${BIN_PATH}/${TESTCASE}.tflite.input.h5" \
--output_model "${BIN_PATH}/${TESTCASE}.out.circle"
+ if [[ $? -ne 0 ]]; then
+ echo "FAILED TO GENERATE CIRCLE OUTPUT"
+ continue
+ fi
+
+ # Run record-minmax with auto generated random input
+ "${RECORD_MINMAX_PATH}" \
+ --input_model "${TESTCASE_FILE}.circle" \
+ --output_model "${BIN_PATH}/${TESTCASE}.outr.circle"
+
if [[ $? -eq 0 ]]; then
touch "${PASSED_TAG}"
fi
diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt
index f8a165bd3..da63bbf5f 100644
--- a/compiler/record-minmax/CMakeLists.txt
+++ b/compiler/record-minmax/CMakeLists.txt
@@ -17,9 +17,11 @@ target_link_libraries(record-minmax ${HDF5_CXX_LIBRARIES})
target_link_libraries(record-minmax arser)
target_link_libraries(record-minmax safemain)
target_link_libraries(record-minmax luci_import)
+target_link_libraries(record-minmax luci_env)
target_link_libraries(record-minmax luci_export)
target_link_libraries(record-minmax luci_interpreter)
target_link_libraries(record-minmax vconone)
+target_link_libraries(record-minmax nncc_coverage)
install(TARGETS record-minmax DESTINATION bin)
@@ -27,6 +29,9 @@ if(NOT ENABLE_TEST)
return()
endif(NOT ENABLE_TEST)
+file(GLOB_RECURSE TESTS "tests/*.test.cpp")
+
nnas_find_package(GTest REQUIRED)
-GTest_AddTest(record_minmax_function_test "${CMAKE_CURRENT_SOURCE_DIR}/tests/RecordFunction.test.cpp")
+GTest_AddTest(record_minmax_function_test "${TESTS}")
target_include_directories(record_minmax_function_test PRIVATE include)
+target_link_libraries(record_minmax_function_test nncc_coverage)
diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp
index 8b09498c3..6dbb693b2 100644
--- a/compiler/record-minmax/driver/Driver.cpp
+++ b/compiler/record-minmax/driver/Driver.cpp
@@ -19,6 +19,8 @@
#include <arser/arser.h>
#include <vconone/vconone.h>
+#include <luci/UserSettings.h>
+
void print_version(void)
{
std::cout << "record-minmax version " << vconone::get_string() << std::endl;
@@ -30,47 +32,55 @@ int entry(const int argc, char **argv)
using namespace record_minmax;
arser::Arser arser(
- "Embedding min/max values of activations to the circle model for post-training quantization");
+ "Embedding min/max values of activations to the circle model for post-training quantization");
arser.add_argument("--version")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("Show version information and exit")
- .exit_with(print_version);
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
arser.add_argument("--input_model")
- .nargs(1)
- .type(arser::DataType::STR)
- .required(true)
- .help("Input model filepath");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(true)
+ .help("Input model filepath");
arser.add_argument("--input_data")
- .nargs(1)
- .type(arser::DataType::STR)
- .required(true)
- .help("Input data filepath");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(false)
+ .help("Input data filepath. If not given, record-minmax will run with randomly generated data. "
+         "Note that a random dataset does not represent the real inference workload, which "
+         "may lead to poor model accuracy.");
arser.add_argument("--output_model")
- .nargs(1)
- .type(arser::DataType::STR)
- .required(true)
- .help("Output model filepath");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .required(true)
+ .help("Output model filepath");
arser.add_argument("--min_percentile")
- .nargs(1)
- .type(arser::DataType::FLOAT)
- .help("Record n'th percentile of min");
+ .nargs(1)
+ .type(arser::DataType::FLOAT)
+ .help("Record n'th percentile of min");
arser.add_argument("--max_percentile")
- .nargs(1)
- .type(arser::DataType::FLOAT)
- .help("Record n'th percentile of max");
+ .nargs(1)
+ .type(arser::DataType::FLOAT)
+ .help("Record n'th percentile of max");
arser.add_argument("--mode")
- .nargs(1)
- .type(arser::DataType::STR)
- .help("Record mode. percentile (default) or moving_average");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("Record mode. percentile (default) or moving_average");
+
+ arser.add_argument("--generate_profile_data")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("This will turn on profiling data generation.");
try
{
@@ -83,8 +93,9 @@ int entry(const int argc, char **argv)
return 255;
}
+ auto settings = luci::UserSettings::settings();
+
auto input_model_path = arser.get<std::string>("--input_model");
- auto input_data_path = arser.get<std::string>("--input_data");
auto output_model_path = arser.get<std::string>("--output_model");
// Default values
@@ -104,13 +115,26 @@ int entry(const int argc, char **argv)
if (mode != "percentile" && mode != "moving_average")
throw std::runtime_error("Unsupported mode");
+ if (arser["--generate_profile_data"])
+ settings->set(luci::UserSettings::Key::ProfilingDataGen, true);
+
RecordMinMax rmm;
// Initialize interpreter and observer
rmm.initialize(input_model_path);
- // Profile min/max while executing the given input data
- rmm.profileData(mode, input_data_path, min_percentile, max_percentile);
+ if (arser["--input_data"])
+ {
+ auto input_data_path = arser.get<std::string>("--input_data");
+
+ // Profile min/max while executing the given input data
+ rmm.profileData(mode, input_data_path, min_percentile, max_percentile);
+ }
+ else
+ {
+ // Profile min/max while executing random input data
+ rmm.profileDataWithRandomInputs(mode, min_percentile, max_percentile);
+ }
// Save profiled values to the model
rmm.saveModel(output_model_path);
diff --git a/compiler/record-minmax/include/RecordFunction.h b/compiler/record-minmax/include/RecordFunction.h
index b570c6a0a..c34aee0e1 100644
--- a/compiler/record-minmax/include/RecordFunction.h
+++ b/compiler/record-minmax/include/RecordFunction.h
@@ -53,7 +53,7 @@ float getNthPercentile(std::vector<float> &vector, float percentile)
float percent_i = static_cast<float>(index) / static_cast<float>(copy.size() - 1);
float fraction =
- (percentile / 100.0 - percent_i) / ((index + 1.0) / (copy.size() - 1.0) - percent_i);
+ (percentile / 100.0 - percent_i) / ((index + 1.0) / (copy.size() - 1.0) - percent_i);
float res = copy[index] + fraction * (copy[index + 1] - copy[index]);
return res;
}
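The `fraction` above is plain linear interpolation between adjacent order statistics, which appears to match numpy's default percentile rule; if so, numpy can serve as a reference when checking results:

```python
import numpy as np

# assumed numpy equivalent of getNthPercentile(vector, percentile)
v = [3.0, 1.0, 4.0, 1.0, 5.0, 9.0, 2.0, 6.0]
print(np.percentile(v, 99.0))  # linear interpolation between order statistics
print(np.percentile(v, 1.0))
```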
diff --git a/compiler/record-minmax/include/RecordMinMax.h b/compiler/record-minmax/include/RecordMinMax.h
index ffdb17aec..85ae4cdc7 100644
--- a/compiler/record-minmax/include/RecordMinMax.h
+++ b/compiler/record-minmax/include/RecordMinMax.h
@@ -39,6 +39,9 @@ public:
void profileData(const std::string &mode, const std::string &input_data_path,
float min_percentile, float max_percentile);
+ void profileDataWithRandomInputs(const std::string &mode, float min_percentile,
+ float max_percentile);
+
void saveModel(const std::string &output_model_path);
private:
diff --git a/compiler/record-minmax/requires.cmake b/compiler/record-minmax/requires.cmake
index f6804cef1..9cf12591e 100644
--- a/compiler/record-minmax/requires.cmake
+++ b/compiler/record-minmax/requires.cmake
@@ -1,4 +1,5 @@
require("luci")
+require("luci-interpreter")
require("safemain")
require("arser")
require("vconone")
diff --git a/compiler/record-minmax/src/HDF5Importer.cpp b/compiler/record-minmax/src/HDF5Importer.cpp
index a0e65eeb7..cfb270ce0 100644
--- a/compiler/record-minmax/src/HDF5Importer.cpp
+++ b/compiler/record-minmax/src/HDF5Importer.cpp
@@ -59,7 +59,30 @@ DataType toInternalDtype(const H5::DataType &h5_type)
{
return DataType::S64;
}
- // Only support three datatypes for now
+ if (h5_type.getClass() == H5T_class_t::H5T_ENUM)
+ {
+ // We follow the numpy format
+ // In numpy 1.19.0, np.bool_ is saved as H5T_ENUM
+ // - (name, value) -> (FALSE, 0) and (TRUE, 1)
+ // - value dtype is H5T_STD_I8LE
+ // TODO Find a general way to recognize BOOL type
+ char name[10];
+ int8_t value[2] = {0, 1};
+ if (H5Tenum_nameof(h5_type.getId(), value, name, 10) < 0)
+ return DataType::Unknown;
+
+ if (std::string(name) != "FALSE")
+ return DataType::Unknown;
+
+ if (H5Tenum_nameof(h5_type.getId(), value + 1, name, 10) < 0)
+ return DataType::Unknown;
+
+ if (std::string(name) != "TRUE")
+ return DataType::Unknown;
+
+ return DataType::BOOL;
+ }
+ // TODO Support more datatypes
return DataType::Unknown;
}
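The enum probing above mirrors how numpy/h5py serializes booleans. A sketch to reproduce such a file, assuming h5py follows the convention described in the comment:

```python
import h5py
import numpy as np

with h5py.File("bool_sample.h5", "w") as f:
    f.create_dataset("mask", data=np.array([True, False, True]))
# Inspecting bool_sample.h5 (e.g. with h5dump) should show an H5T_ENUM over
# H5T_STD_I8LE with members FALSE = 0 and TRUE = 1, which is exactly what
# toInternalDtype() checks for with H5Tenum_nameof.
```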
@@ -125,6 +148,9 @@ void HDF5Importer::readTensor(int32_t record_idx, int32_t input_idx, DataType *d
case DataType::S64:
readTensorData(tensor, static_cast<int64_t *>(buffer));
break;
+ case DataType::BOOL:
+ readTensorData(tensor, static_cast<uint8_t *>(buffer));
+ break;
default:
throw std::runtime_error{"Unsupported data type for input data (.h5)"};
}
diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp
index c22cb4132..40c9b730d 100644
--- a/compiler/record-minmax/src/MinMaxObserver.cpp
+++ b/compiler/record-minmax/src/MinMaxObserver.cpp
@@ -18,6 +18,8 @@
#include <luci/IR/CircleOpcode.h>
+#include <math.h>
+
using DataType = luci_interpreter::DataType;
namespace record_minmax
@@ -51,6 +53,12 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node,
return;
}
+ if (node->dtype() == DataType::BOOL)
+ {
+ // Bool type tensor is not quantized
+ return;
+ }
+
// Only support recording of float32 values
if (tensor->element_type() != DataType::FLOAT32)
throw std::runtime_error("Tensor's data type is not float");
@@ -59,9 +67,27 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node,
const auto num_elements = tensor->shape().num_elements();
std::vector<float> buf(data, data + num_elements);
- auto minmax = std::minmax_element(buf.begin(), buf.end());
- float min = *minmax.first;
- float max = *minmax.second;
+
+ float max = std::numeric_limits<float>::lowest();
+ float min = std::numeric_limits<float>::max();
+
+ bool all_nan = true;
+ for (auto number : buf)
+ {
+ if (isnan(number))
+ continue;
+
+ all_nan = false;
+
+ if (number > max)
+ max = number;
+
+ if (number < min)
+ min = number;
+ }
+
+ if (all_nan)
+ throw std::runtime_error("All values are NaN(Not a Number)");
_minmax_data.recordMinMax(node, min, max);
}
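The replacement loop is a NaN-skipping min/max; in numpy terms it behaves like `nanmin`/`nanmax` plus an all-NaN guard, which is a convenient way to sanity-check it:

```python
import numpy as np

buf = np.array([1.0, np.nan, -3.0, 2.5], dtype=np.float32)
if np.all(np.isnan(buf)):
    raise RuntimeError("All values are NaN(Not a Number)")
print(np.nanmin(buf))  # -3.0: NaN entries are skipped, like the loop above
print(np.nanmax(buf))  # 2.5
```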
diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp
index cd5f29352..333ff5e3b 100644
--- a/compiler/record-minmax/src/RecordMinMax.cpp
+++ b/compiler/record-minmax/src/RecordMinMax.cpp
@@ -30,6 +30,7 @@
#include <numeric>
#include <stdexcept>
#include <iostream>
+#include <random>
using Shape = luci_interpreter::Shape;
using DataType = luci_interpreter::DataType;
@@ -37,6 +38,18 @@ using DataType = luci_interpreter::DataType;
namespace
{
+std::vector<uint8_t> genRandomBoolData(std::mt19937 &gen, uint32_t num_elements)
+{
+ std::uniform_int_distribution<> dist(0, 1);
+ std::vector<uint8_t> input_data(num_elements);
+
+ // Write random data
+ for (auto &iter : input_data)
+ iter = static_cast<uint8_t>(dist(gen));
+
+ return input_data;
+}
+
/**
* @brief getTensorSize will return size in bytes
*/
@@ -68,6 +81,38 @@ void verifyTypeShape(const luci::CircleInput *input_node, const DataType &dtype,
}
}
+void update_quantparam(record_minmax::MinMaxObserver *observer, const std::string &mode,
+ float min_percentile, float max_percentile)
+{
+ auto minmax_map = observer->minMaxData()->getMap();
+ for (auto iter = minmax_map->begin(); iter != minmax_map->end(); ++iter)
+ {
+ auto node = iter->first;
+ auto minmax = iter->second;
+
+ float min{0.0f}, max{0.0f};
+ if (mode == "percentile")
+ {
+ min = record_minmax::getNthPercentile(minmax.min_vector, min_percentile);
+ max = record_minmax::getNthPercentile(minmax.max_vector, max_percentile);
+ }
+ else if (mode == "moving_average")
+ {
+ min = record_minmax::getMovingAverage(minmax.min_vector, 0.9, 16, true);
+ max = record_minmax::getMovingAverage(minmax.max_vector, 0.9, 16, false);
+ }
+ assert(mode == "percentile" || mode == "moving_average");
+ auto quantparam = std::make_unique<luci::CircleQuantParam>();
+ quantparam->min.push_back(min);
+ quantparam->max.push_back(max);
+
+ assert(node->quantparam() == nullptr);
+
+ auto mutable_node = const_cast<luci::CircleNode *>(node);
+ mutable_node->quantparam(std::move(quantparam));
+ }
+}
+
} // namespace
namespace record_minmax
@@ -169,33 +214,75 @@ void RecordMinMax::profileData(const std::string &mode, const std::string &input
throw std::runtime_error("HDF5 error occurred.");
}
- auto minmax_map = _observer->minMaxData()->getMap();
- for (auto iter = minmax_map->begin(); iter != minmax_map->end(); ++iter)
+ update_quantparam(_observer.get(), mode, min_percentile, max_percentile);
+}
+
+void RecordMinMax::profileDataWithRandomInputs(const std::string &mode, float min_percentile,
+ float max_percentile)
+{
+ // We use three randomly-generated records
+ const uint32_t num_records = 3;
+
+ const auto input_nodes = loco::input_nodes(_module->graph());
+ const auto num_inputs = input_nodes.size();
+
+ std::random_device rd;
+ std::mt19937 gen(rd());
+ std::uniform_real_distribution<> dist(-5, 5);
+
+ for (int32_t record_idx = 0; record_idx < num_records; record_idx++)
{
- auto node = iter->first;
- auto minmax = iter->second;
+ std::cout << "Recording " << record_idx << "'th data" << std::endl;
- float min{0.0f}, max{0.0f};
- if (mode == "percentile")
+ for (int32_t input_idx = 0; input_idx < num_inputs; input_idx++)
{
- min = getNthPercentile(minmax.min_vector, min_percentile);
- max = getNthPercentile(minmax.max_vector, max_percentile);
- }
- else if (mode == "moving_average")
- {
- min = getMovingAverage(minmax.min_vector, 0.9, 16, true);
- max = getMovingAverage(minmax.max_vector, 0.9, 16, false);
- }
- assert(mode == "percentile" || mode == "moving_average");
- auto quantparam = std::make_unique<luci::CircleQuantParam>();
- quantparam->min.push_back(min);
- quantparam->max.push_back(max);
+ const auto *input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[input_idx]);
+ assert(input_node->index() == input_idx);
+ uint32_t num_elements = 1;
+ for (uint32_t i = 0; i < input_node->rank(); i++)
+ {
+ if (!input_node->dim(i).known())
+ throw std::runtime_error("Input dimension must be known");
- assert(node->quantparam() == nullptr);
+ num_elements *= input_node->dim(i).value();
+ }
- auto mutable_node = const_cast<luci::CircleNode *>(node);
- mutable_node->quantparam(std::move(quantparam));
+ if (num_elements == 0)
+ throw std::runtime_error("Only support non-zero sized inputs");
+
+ // TODO Support more input data types
+ assert(input_node->dtype() == loco::DataType::FLOAT32 ||
+ input_node->dtype() == loco::DataType::BOOL);
+
+ if (input_node->dtype() == DataType::FLOAT32)
+ // clang-format off
+ {
+ std::vector<float> input_data(num_elements);
+
+ // Write random data
+ for (auto &iter : input_data)
+ iter = static_cast<float>(dist(gen));
+
+      // TODO: Input data is copied twice (generator -> buffer (input_data) -> interpreter inputs)
+      //       We can reduce the copy by writing the random data directly to the interpreter inputs
+ _interpreter->writeInputTensor(input_node, input_data.data(),
+ input_data.size() * sizeof(float));
+ }
+ // clang-format on
+ else if (input_node->dtype() == DataType::BOOL)
+ {
+ auto input_data = genRandomBoolData(gen, num_elements);
+ _interpreter->writeInputTensor(input_node, input_data.data(),
+ input_data.size() * sizeof(uint8_t));
+ }
+ }
+
+ _interpreter->interpret();
}
+
+ std::cout << "Recording finished. Number of recorded data: " << num_records << std::endl;
+
+ update_quantparam(_observer.get(), mode, min_percentile, max_percentile);
}
void RecordMinMax::saveModel(const std::string &output_model_path)
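For clarity, a rough Python analogue of the random-record generation above (the helper name and input shape are hypothetical; the C++ uses `std::uniform_real_distribution<>(-5, 5)` for float inputs and `genRandomBoolData` for bool inputs):

```python
import numpy as np

rng = np.random.default_rng()

def random_input(shape, dtype):
    if dtype == np.float32:
        return rng.uniform(-5, 5, size=shape).astype(np.float32)  # same range as the C++ dist
    if dtype == np.bool_:
        return rng.integers(0, 2, size=shape).astype(np.bool_)    # {0, 1}, like genRandomBoolData
    raise SystemExit("Unsupported input dtype")

for record_idx in range(3):  # num_records = 3, as above
    data = random_input((1, 4, 4, 3), np.float32)
```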
diff --git a/compiler/record-minmax/tests/RecordFunction.test.cpp b/compiler/record-minmax/tests/RecordFunction.test.cpp
index e2f135a4e..0d8632254 100644
--- a/compiler/record-minmax/tests/RecordFunction.test.cpp
+++ b/compiler/record-minmax/tests/RecordFunction.test.cpp
@@ -115,4 +115,12 @@ TEST(GetNthPercentileTest, EmptyVector_NEG)
SUCCEED();
}
+TEST(GetMovingAverageTest, Simple)
+{
+ std::vector<float> input{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+ EXPECT_NE(0, getMovingAverage(input, 0.5, 4, true));
+ EXPECT_NE(0, getMovingAverage(input, 0.5, 4, false));
+}
+
} // namespace record_minmax
diff --git a/compiler/souschef/include/souschef/Data/Gaussian.h b/compiler/souschef/include/souschef/Data/Gaussian.h
index 75570e0b8..8093b4c41 100644
--- a/compiler/souschef/include/souschef/Data/Gaussian.h
+++ b/compiler/souschef/include/souschef/Data/Gaussian.h
@@ -57,6 +57,22 @@ private:
float _stddev;
};
+class GaussianInt16DataChef final : public DataChef
+{
+public:
+ GaussianInt16DataChef(float mean, float stddev) : _mean{mean}, _stddev{stddev}
+ {
+ // DO NOTHING
+ }
+
+public:
+ std::vector<uint8_t> generate(int32_t count) const override;
+
+private:
+ float _mean;
+ float _stddev;
+};
+
class GaussianUint8DataChef final : public DataChef
{
public:
@@ -83,6 +99,11 @@ struct GaussianInt32DataChefFactory : public DataChefFactory
std::unique_ptr<DataChef> create(const Arguments &args) const;
};
+struct GaussianInt16DataChefFactory : public DataChefFactory
+{
+ std::unique_ptr<DataChef> create(const Arguments &args) const;
+};
+
struct GaussianUint8DataChefFactory : public DataChefFactory
{
std::unique_ptr<DataChef> create(const Arguments &args) const;
diff --git a/compiler/souschef/include/souschef/DataChef.def b/compiler/souschef/include/souschef/DataChef.def
index 28901db18..d724d0390 100644
--- a/compiler/souschef/include/souschef/DataChef.def
+++ b/compiler/souschef/include/souschef/DataChef.def
@@ -7,13 +7,16 @@
DATA_CHEF(FLOAT32, constant, ConstantDataChefFactory<float>)
DATA_CHEF(BOOL, constant, ConstantDataChefFactory<bool>)
DATA_CHEF(UINT8, constant, ConstantDataChefFactory<uint8_t>)
+DATA_CHEF(INT16, constant, ConstantDataChefFactory<int16_t>)
DATA_CHEF(INT32, constant, ConstantDataChefFactory<int32_t>)
DATA_CHEF(INT64, constant, ConstantDataChefFactory<int64_t>)
DATA_CHEF(INT64, explicit, ExplicitDataChefFactory<int64_t>)
DATA_CHEF(INT32, explicit, ExplicitDataChefFactory<int32_t>)
+DATA_CHEF(INT16, explicit, ExplicitDataChefFactory<int16_t>)
DATA_CHEF(UINT8, explicit, ExplicitDataChefFactory<uint8_t>)
DATA_CHEF(BOOL, explicit, ExplicitDataChefFactory<bool>)
DATA_CHEF(FLOAT32, explicit, ExplicitDataChefFactory<float>)
DATA_CHEF(FLOAT32, gaussian, GaussianFloat32DataChefFactory)
DATA_CHEF(INT32, gaussian, GaussianInt32DataChefFactory)
+DATA_CHEF(INT16, gaussian, GaussianInt16DataChefFactory)
DATA_CHEF(UINT8, gaussian, GaussianUint8DataChefFactory)
diff --git a/compiler/souschef/src/Gaussian.cpp b/compiler/souschef/src/Gaussian.cpp
index 4a5083d8e..32cbcff4d 100644
--- a/compiler/souschef/src/Gaussian.cpp
+++ b/compiler/souschef/src/Gaussian.cpp
@@ -26,22 +26,25 @@
namespace souschef
{
-std::vector<uint8_t> GaussianFloat32DataChef::generate(int32_t count) const
+template <typename T>
+static std::vector<uint8_t> generate_gaussian(int32_t count, float mean, float stddev,
+ std::minstd_rand::result_type seed)
{
- // TODO Support seed value override
- auto seed = std::chrono::system_clock::now().time_since_epoch().count();
-
std::minstd_rand rand{static_cast<std::minstd_rand::result_type>(seed)};
- std::normal_distribution<float> dist{_mean, _stddev};
+ std::normal_distribution<float> dist{mean, stddev};
std::vector<uint8_t> res;
+ constexpr float max_cap = std::numeric_limits<T>::max();
+  constexpr float min_cap = std::numeric_limits<T>::lowest(); // not min(): for floating-point T, min() is the smallest positive value
for (uint32_t n = 0; n < count; ++n)
{
- auto const value = dist(rand);
+ float raw_value = dist(rand);
+ const float capped_value = std::max(min_cap, std::min(max_cap, raw_value));
+ auto const value = static_cast<T>(capped_value);
auto const arr = reinterpret_cast<const uint8_t *>(&value);
- for (uint32_t b = 0; b < sizeof(float); ++b)
+ for (uint32_t b = 0; b < sizeof(T); ++b)
{
res.emplace_back(arr[b]);
}
@@ -50,52 +53,35 @@ std::vector<uint8_t> GaussianFloat32DataChef::generate(int32_t count) const
return res;
}
-std::vector<uint8_t> GaussianInt32DataChef::generate(int32_t count) const
+template <typename T>
+static std::vector<uint8_t> generate_gaussian(int32_t count, float mean, float stddev)
{
- // TODO Support seed value override
- auto seed = std::chrono::system_clock::now().time_since_epoch().count();
+ auto time_stamp = std::chrono::system_clock::now().time_since_epoch().count();
- std::minstd_rand rand{static_cast<std::minstd_rand::result_type>(seed)};
- std::normal_distribution<float> dist{_mean, _stddev};
+  // Note this conversion is implementation-defined; change if needed.
+ auto seed = static_cast<std::minstd_rand::result_type>(time_stamp);
- std::vector<uint8_t> res;
+ return generate_gaussian<T>(count, mean, stddev, seed);
+}
- for (uint32_t n = 0; n < count; ++n)
- {
- auto const value = static_cast<int32_t>(dist(rand));
- auto const arr = reinterpret_cast<const uint8_t *>(&value);
+std::vector<uint8_t> GaussianFloat32DataChef::generate(int32_t count) const
+{
+ return generate_gaussian<float>(count, _mean, _stddev);
+}
- for (uint32_t b = 0; b < sizeof(int32_t); ++b)
- {
- res.emplace_back(arr[b]);
- }
- }
+std::vector<uint8_t> GaussianInt32DataChef::generate(int32_t count) const
+{
+ return generate_gaussian<int32_t>(count, _mean, _stddev);
+}
- return res;
+std::vector<uint8_t> GaussianInt16DataChef::generate(int32_t count) const
+{
+ return generate_gaussian<int16_t>(count, _mean, _stddev);
}
std::vector<uint8_t> GaussianUint8DataChef::generate(int32_t count) const
{
- // TODO Support seed value override
- auto seed = std::chrono::system_clock::now().time_since_epoch().count();
-
- std::minstd_rand rand{static_cast<std::minstd_rand::result_type>(seed)};
- std::normal_distribution<float> dist{_mean, _stddev};
-
- std::vector<uint8_t> res;
-
- for (uint32_t n = 0; n < count; ++n)
- {
- auto const value = static_cast<uint8_t>(dist(rand)); // uint8_t for data type
- auto const arr = reinterpret_cast<const uint8_t *>(&value); // uint8_t for byte streaming
-
- for (uint32_t b = 0; b < sizeof(uint8_t); ++b)
- {
- res.emplace_back(arr[b]);
- }
- }
-
- return res;
+ return generate_gaussian<uint8_t>(count, _mean, _stddev);
}
std::unique_ptr<DataChef> GaussianFloat32DataChefFactory::create(const Arguments &args) const
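The refactored `generate_gaussian<T>` samples a float normal distribution and caps each value to `T`'s range before the narrowing cast. A numpy sketch of the same idea for integer targets (the helper name is illustrative):

```python
import numpy as np

def generate_gaussian(count, mean, stddev, dtype, seed=None):
    rng = np.random.default_rng(seed)
    raw = rng.normal(mean, stddev, size=count)
    info = np.iinfo(dtype)                     # integer targets only in this sketch
    capped = np.clip(raw, info.min, info.max)  # cap before the narrowing cast
    return capped.astype(dtype).tobytes()      # native-order byte stream, like res

print(len(generate_gaussian(4, 0.0, 1000.0, np.int16)))  # 8 = 4 * sizeof(int16)
```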
@@ -124,6 +110,19 @@ std::unique_ptr<DataChef> GaussianInt32DataChefFactory::create(const Arguments &
return std::unique_ptr<DataChef>{new GaussianInt32DataChef{mean, stddev}};
}
+std::unique_ptr<DataChef> GaussianInt16DataChefFactory::create(const Arguments &args) const
+{
+ if (args.count() != 2)
+ {
+ throw std::runtime_error{"invalid argument count: two arguments (mean/stddev) are expected"};
+ }
+
+ auto const mean = to_number<float>(args.value(0));
+ auto const stddev = to_number<float>(args.value(1));
+
+ return std::unique_ptr<DataChef>{new GaussianInt16DataChef{mean, stddev}};
+}
+
std::unique_ptr<DataChef> GaussianUint8DataChefFactory::create(const Arguments &args) const
{
if (args.count() != 2)
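Taken together, the refactor above funnels every Gaussian chef through one clamp-then-serialize helper. A minimal standalone sketch of that pattern follows; the name `gaussian_bytes` and the fixed seed are illustrative, not part of souschef:

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <limits>
#include <random>
#include <vector>

// Draw floats from N(mean, stddev), clamp each to T's representable range,
// convert to T, and append the value's raw bytes -- mirroring the helper above.
template <typename T>
std::vector<uint8_t> gaussian_bytes(int32_t count, float mean, float stddev)
{
  std::minstd_rand rand{42u}; // fixed seed: this sketch favors reproducibility
  std::normal_distribution<float> dist{mean, stddev};
  std::vector<uint8_t> res;
  res.reserve(static_cast<std::size_t>(count) * sizeof(T));
  for (int32_t n = 0; n < count; ++n)
  {
    const float lo = std::numeric_limits<T>::lowest();
    const float hi = std::numeric_limits<T>::max();
    const T value = static_cast<T>(std::max(lo, std::min(hi, dist(rand))));
    const auto *bytes = reinterpret_cast<const uint8_t *>(&value);
    res.insert(res.end(), bytes, bytes + sizeof(T));
  }
  return res; // count * sizeof(T) bytes
}
```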
diff --git a/compiler/souschef/src/LexicalCast.cpp b/compiler/souschef/src/LexicalCast.cpp
index 8e3d4cbbb..1af6e30f9 100644
--- a/compiler/souschef/src/LexicalCast.cpp
+++ b/compiler/souschef/src/LexicalCast.cpp
@@ -18,12 +18,25 @@
#include <cassert>
#include <limits>
+#include <stdexcept>
namespace souschef
{
template <> float to_number(const std::string &s) { return std::stof(s); }
template <> int to_number(const std::string &s) { return std::stoi(s); }
+template <> int16_t to_number(const std::string &s)
+{
+ // There is no standard function to parse an int16_t (short int).
+ // This function simulates the behavior of stoi, stol and stoll.
+ // Parse into long first so values outside int16_t range are not truncated
+ // before the range check below.
+ long res = std::stol(s);
+ // The standard does not specify the string in the error message; this one is arbitrary.
+ if (res < std::numeric_limits<int16_t>::min() || res > std::numeric_limits<int16_t>::max())
+ {
+ throw std::out_of_range("to_number<int16_t>");
+ }
+ return static_cast<int16_t>(res);
+}
template <> int64_t to_number(const std::string &s) { return std::stoll(s); }
template <> uint8_t to_number(const std::string &s)
{
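Behaviorally, the new specialization parses through a wider type and then range-checks. A small hedged sketch of the same idea; `parse_int16` is an illustrative name, not part of souschef:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

// Standalone sketch of the same range-checked parse: go through long first,
// reject anything outside int16_t's range, then narrow.
int16_t parse_int16(const std::string &s)
{
  const long v = std::stol(s); // may itself throw std::out_of_range for huge inputs
  if (v < -32768 || v > 32767)
    throw std::out_of_range("parse_int16");
  return static_cast<int16_t>(v);
}

int main()
{
  std::cout << parse_int16("12345") << '\n'; // prints 12345
  try
  {
    parse_int16("40000"); // outside [-32768, 32767]
  }
  catch (const std::out_of_range &)
  {
    std::cout << "out of range, as expected\n";
  }
}
```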
diff --git a/compiler/stdex/CMakeLists.txt b/compiler/stdex/CMakeLists.txt
deleted file mode 100644
index 91f07e69f..000000000
--- a/compiler/stdex/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-file(GLOB_RECURSE TESTS "src/*.test.cpp")
-
-add_library(stdex INTERFACE)
-target_include_directories(stdex INTERFACE include)
-
-if(NOT ENABLE_TEST)
- return()
-endif(NOT ENABLE_TEST)
-
-# Google Test is mandatory for test
-nnas_find_package(GTest REQUIRED)
-
-add_executable(stdex_test ${TESTS})
-target_link_libraries(stdex_test stdex)
-target_link_libraries(stdex_test gtest_main)
-add_test(stdex_test stdex_test)
diff --git a/compiler/stdex/README.md b/compiler/stdex/README.md
deleted file mode 100644
index 054d08569..000000000
--- a/compiler/stdex/README.md
+++ /dev/null
@@ -1,22 +0,0 @@
-# stdex
-
-`stdex` is an extension over standard C++ libraries.
-
-# How to use
-
-Please read each header files.
-
-One example of `stdex::make_unique(..)` in `compiler/stdex/Memory.h` is as follows:
-
-```cpp
-#include <stdex/Memory.h>
-
-using stdex::make_unique;
-
-class A { ... };
-
-...
-
-std::unique_ptr<A> a = make_unique<A>(); // Note: std::make_unique is not supported in C++ 11
-
-```
diff --git a/compiler/stdex/include/stdex/Set.h b/compiler/stdex/include/stdex/Set.h
deleted file mode 100644
index 2c61e0d01..000000000
--- a/compiler/stdex/include/stdex/Set.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __STDEX_SET_H__
-#define __STDEX_SET_H__
-
-#include <set>
-
-template <typename T> bool operator==(const std::set<T> &lhs, const std::set<T> &rhs)
-{
- if (rhs.size() != lhs.size())
- {
- return false;
- }
-
- for (const auto &element : lhs)
- {
- if (rhs.find(element) == rhs.end())
- {
- return false;
- }
- }
-
- return true;
-}
-
-template <typename T> std::set<T> operator-(const std::set<T> &lhs, const std::set<T> &rhs)
-{
- std::set<T> res;
-
- for (const auto &element : lhs)
- {
- if (rhs.find(element) == rhs.end())
- {
- res.insert(element);
- }
- }
-
- return res;
-}
-
-#endif // __STDEX_SET_H__
diff --git a/compiler/stdex/src/Memory.test.cpp b/compiler/stdex/src/Memory.test.cpp
deleted file mode 100644
index 433af4534..000000000
--- a/compiler/stdex/src/Memory.test.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "stdex/Memory.h"
-
-#include <gtest/gtest.h>
-
-namespace
-{
-
-struct Stat
-{
- unsigned allocated = 0;
- unsigned freed = 0;
-};
-
-struct Counter
-{
-public:
- Counter(Stat *stat) : _stat{stat} { _stat->allocated += 1; }
-
-public:
- ~Counter() { _stat->freed += 1; }
-
-private:
- Stat *_stat;
-};
-
-} // namespace
-
-TEST(MemoryTest, make_unique)
-{
- Stat stat;
-
- ASSERT_EQ(stat.allocated, 0);
- ASSERT_EQ(stat.freed, 0);
-
- auto o = stdex::make_unique<::Counter>(&stat);
-
- ASSERT_EQ(stat.allocated, 1);
- ASSERT_EQ(stat.freed, 0);
-
- o.reset();
-
- ASSERT_EQ(stat.allocated, 1);
- ASSERT_EQ(stat.freed, 1);
-}
diff --git a/compiler/tf2circle/CMakeLists.txt b/compiler/tf2circle/CMakeLists.txt
index 549f731a4..8678e90b4 100644
--- a/compiler/tf2circle/CMakeLists.txt
+++ b/compiler/tf2circle/CMakeLists.txt
@@ -40,7 +40,6 @@ target_link_libraries(tf2circle PRIVATE tfinfo)
target_link_libraries(tf2circle PRIVATE exo)
target_link_libraries(tf2circle PRIVATE locop)
target_link_libraries(tf2circle PRIVATE hermes_std)
-target_link_libraries(tf2circle PRIVATE stdex)
target_link_libraries(tf2circle PRIVATE angkor cwrap)
target_link_libraries(tf2circle PRIVATE tf2circle_customop_info_proto)
diff --git a/compiler/tf2circle/requires.cmake b/compiler/tf2circle/requires.cmake
index 68d45bf3a..87ea50bf7 100644
--- a/compiler/tf2circle/requires.cmake
+++ b/compiler/tf2circle/requires.cmake
@@ -1,4 +1,3 @@
-require("stdex")
require("hermes-std")
require("moco-tf")
require("exo")
diff --git a/compiler/tf2circle/src/tf2circle.cpp b/compiler/tf2circle/src/tf2circle.cpp
index a1160e968..b4d21133d 100644
--- a/compiler/tf2circle/src/tf2circle.cpp
+++ b/compiler/tf2circle/src/tf2circle.cpp
@@ -28,10 +28,8 @@
#include <hermes/ConsoleReporter.h>
#include <hermes/EnvConfig.h>
-#include <stdex/Memory.h>
-
#include <cassert>
-
+#include <memory>
#include <iostream>
#include <stdexcept>
#include <string>
@@ -70,8 +68,8 @@ struct LoggingContext
if (ctx == nullptr)
{
ctx = new hermes::Context;
- ctx->sinks()->append(stdex::make_unique<hermes::ConsoleReporter>());
- ctx->config(stdex::make_unique<EnvConfig>("TF2CIRCLE_Log"));
+ ctx->sinks()->append(std::make_unique<hermes::ConsoleReporter>());
+ ctx->config(std::make_unique<EnvConfig>("TF2CIRCLE_Log"));
}
return ctx;
@@ -133,9 +131,9 @@ int EntryFunctor::operator()(int argc, char **argv) const
using EnvConfig = hermes::EnvConfig<hermes::EnvFormat::BooleanNumber>;
// This line allows users to control all the moco-tf loggers via TF2CIRCLE_Log_Frontend
- moco::LoggingContext::get()->config(stdex::make_unique<EnvConfig>("TF2CIRCLE_Log_Frontend"));
+ moco::LoggingContext::get()->config(std::make_unique<EnvConfig>("TF2CIRCLE_Log_Frontend"));
// This line allows users to control all the exo-circle loggers via TF2CIRCLE_Log_Backend
- exo::LoggingContext::get()->config(stdex::make_unique<EnvConfig>("TF2CIRCLE_Log_Backend"));
+ exo::LoggingContext::get()->config(std::make_unique<EnvConfig>("TF2CIRCLE_Log_Backend"));
LOGGER(l);
diff --git a/compiler/tf2nnpkg/CMakeLists.txt b/compiler/tf2nnpkg/CMakeLists.txt
index 8e1edf858..b81f40646 100644
--- a/compiler/tf2nnpkg/CMakeLists.txt
+++ b/compiler/tf2nnpkg/CMakeLists.txt
@@ -30,6 +30,5 @@ target_link_libraries(tf2nnpkg PRIVATE tfinfo)
target_link_libraries(tf2nnpkg PRIVATE exo)
target_link_libraries(tf2nnpkg PRIVATE locop)
target_link_libraries(tf2nnpkg PRIVATE hermes_std)
-target_link_libraries(tf2nnpkg PRIVATE stdex)
target_link_libraries(tf2nnpkg PRIVATE angkor cwrap)
install(TARGETS tf2nnpkg DESTINATION bin)
diff --git a/compiler/tf2nnpkg/requires.cmake b/compiler/tf2nnpkg/requires.cmake
index 68d45bf3a..87ea50bf7 100644
--- a/compiler/tf2nnpkg/requires.cmake
+++ b/compiler/tf2nnpkg/requires.cmake
@@ -1,4 +1,3 @@
-require("stdex")
require("hermes-std")
require("moco-tf")
require("exo")
diff --git a/compiler/tf2nnpkg/src/tf2nnpkg.cpp b/compiler/tf2nnpkg/src/tf2nnpkg.cpp
index d9a0d9d2f..548cee61f 100644
--- a/compiler/tf2nnpkg/src/tf2nnpkg.cpp
+++ b/compiler/tf2nnpkg/src/tf2nnpkg.cpp
@@ -28,8 +28,7 @@
#include <hermes/ConsoleReporter.h>
#include <hermes/EnvConfig.h>
-#include <stdex/Memory.h>
-
+#include <memory>
#include <iostream>
#include <fstream>
#include <functional>
@@ -71,8 +70,8 @@ struct LoggingContext
if (ctx == nullptr)
{
ctx = new hermes::Context;
- ctx->sinks()->append(stdex::make_unique<hermes::ConsoleReporter>());
- ctx->config(stdex::make_unique<EnvConfig>("TF2NNPKG_Log"));
+ ctx->sinks()->append(std::make_unique<hermes::ConsoleReporter>());
+ ctx->config(std::make_unique<EnvConfig>("TF2NNPKG_Log"));
}
return ctx;
@@ -148,9 +147,9 @@ int EntryFunctor::operator()(int argc, char **argv) const
using EnvConfig = hermes::EnvConfig<hermes::EnvFormat::BooleanNumber>;
// This line allows users to control all the moco-tf loggers via TF2NNPKG_Log_Frontend
- moco::LoggingContext::get()->config(stdex::make_unique<EnvConfig>("TF2NNPKG_Log_Frontend"));
+ moco::LoggingContext::get()->config(std::make_unique<EnvConfig>("TF2NNPKG_Log_Frontend"));
// This line allows users to control all the exo-circle loggers via TF2NNPKG_Log_Backend
- exo::LoggingContext::get()->config(stdex::make_unique<EnvConfig>("TF2NNPKG_Log_Backend"));
+ exo::LoggingContext::get()->config(std::make_unique<EnvConfig>("TF2NNPKG_Log_Backend"));
LOGGER(l);
diff --git a/compiler/tf2tflite/CMakeLists.txt b/compiler/tf2tflite/CMakeLists.txt
index 663563e00..e4a723305 100644
--- a/compiler/tf2tflite/CMakeLists.txt
+++ b/compiler/tf2tflite/CMakeLists.txt
@@ -38,7 +38,6 @@ target_link_libraries(tf2tflite PRIVATE tfinfo)
target_link_libraries(tf2tflite PRIVATE exo)
target_link_libraries(tf2tflite PRIVATE locop)
target_link_libraries(tf2tflite PRIVATE hermes_std)
-target_link_libraries(tf2tflite PRIVATE stdex)
target_link_libraries(tf2tflite PRIVATE angkor cwrap)
target_link_libraries(tf2tflite PRIVATE tf2tflite_customop_info_proto)
install(TARGETS tf2tflite DESTINATION bin)
diff --git a/compiler/tf2tflite/requires.cmake b/compiler/tf2tflite/requires.cmake
index 68d45bf3a..87ea50bf7 100644
--- a/compiler/tf2tflite/requires.cmake
+++ b/compiler/tf2tflite/requires.cmake
@@ -1,4 +1,3 @@
-require("stdex")
require("hermes-std")
require("moco-tf")
require("exo")
diff --git a/compiler/tf2tflite/src/Driver.cpp b/compiler/tf2tflite/src/Driver.cpp
index e43d30bb2..12fcbd005 100644
--- a/compiler/tf2tflite/src/Driver.cpp
+++ b/compiler/tf2tflite/src/Driver.cpp
@@ -28,10 +28,8 @@
#include <hermes/ConsoleReporter.h>
#include <hermes/EnvConfig.h>
-#include <stdex/Memory.h>
-
#include <cassert>
-
+#include <memory>
#include <iostream>
#include <stdexcept>
#include <string>
@@ -70,8 +68,8 @@ struct LoggingContext
if (ctx == nullptr)
{
ctx = new hermes::Context;
- ctx->sinks()->append(stdex::make_unique<hermes::ConsoleReporter>());
- ctx->config(stdex::make_unique<EnvConfig>("TF2TFLITE_Log"));
+ ctx->sinks()->append(std::make_unique<hermes::ConsoleReporter>());
+ ctx->config(std::make_unique<EnvConfig>("TF2TFLITE_Log"));
}
return ctx;
@@ -96,9 +94,9 @@ int main(int argc, char **argv)
using EnvConfig = hermes::EnvConfig<hermes::EnvFormat::BooleanNumber>;
// This line allows users to control all the moco-tf loggers via TF2TFLITE_Log_Frontend
- moco::LoggingContext::get()->config(stdex::make_unique<EnvConfig>("TF2TFLITE_Log_Frontend"));
+ moco::LoggingContext::get()->config(std::make_unique<EnvConfig>("TF2TFLITE_Log_Frontend"));
// This line allows users to control all the exo-tflite loggers via TF2TFLITE_Log_Backend
- exo::LoggingContext::get()->config(stdex::make_unique<EnvConfig>("TF2TFLITE_Log_Backend"));
+ exo::LoggingContext::get()->config(std::make_unique<EnvConfig>("TF2TFLITE_Log_Backend"));
LOGGER(l);
diff --git a/compiler/tf2tfliteV2/tf2tfliteV2.py b/compiler/tf2tfliteV2/tf2tfliteV2.py
index 3fb988102..c6973ff96 100755
--- a/compiler/tf2tfliteV2/tf2tfliteV2.py
+++ b/compiler/tf2tfliteV2/tf2tfliteV2.py
@@ -180,6 +180,15 @@ def _v2_convert(flags):
raise ValueError("--input_arrays must be provided")
if not flags.output_arrays:
raise ValueError("--output_arrays must be provided")
+ input_shapes = []
+ if flags.input_shapes:
+ input_shapes = [
+ _parse_array(shape, type_fn=int)
+ for shape in flags.input_shapes.split(":")
+ ]
+ if len(input_shapes) != len(_parse_array(flags.input_arrays)):
+ raise ValueError(
+ "--input_shapes and --input_arrays must have the same length")
file_content = open(flags.input_path, 'rb').read()
try:
graph_def = tf.compat.v1.GraphDef()
@@ -200,6 +209,8 @@ def _v2_convert(flags):
_str + ":0" if len(_str.split(":")) == 1 else _str
for _str in _parse_array(flags.output_arrays)
])
+ for i in range(len(input_shapes)):
+ wrap_func.inputs[i].set_shape(input_shapes[i])
converter = tf.lite.TFLiteConverter.from_concrete_functions([wrap_func])
if flags.model_format == "saved_model":
diff --git a/compiler/tfinfo-v2/CMakeLists.txt b/compiler/tfinfo-v2/CMakeLists.txt
index cf438ea29..40df521b9 100644
--- a/compiler/tfinfo-v2/CMakeLists.txt
+++ b/compiler/tfinfo-v2/CMakeLists.txt
@@ -24,7 +24,6 @@ set_target_properties(tfinfo_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(tfinfo_v2 PUBLIC include)
target_link_libraries(tfinfo_v2 PRIVATE tfinfo_v2_proto)
target_link_libraries(tfinfo_v2 PRIVATE oops)
-target_link_libraries(tfinfo_v2 PRIVATE stdex)
if(NOT ENABLE_TEST)
return()
diff --git a/compiler/tfinfo-v2/include/tfinfo-v2/TensorSignature.h b/compiler/tfinfo-v2/include/tfinfo-v2/TensorSignature.h
index f26d0354a..8c014f1fa 100644
--- a/compiler/tfinfo-v2/include/tfinfo-v2/TensorSignature.h
+++ b/compiler/tfinfo-v2/include/tfinfo-v2/TensorSignature.h
@@ -98,7 +98,7 @@ public:
}
TensorSignature(const Kind kind, const std::string &name, const ShapeHint &shape_hint)
- : TensorSignature(kind, name)
+ : TensorSignature(kind, name)
{
_shape_hint = shape_hint;
}
diff --git a/compiler/tfinfo-v2/requires.cmake b/compiler/tfinfo-v2/requires.cmake
index e7efab4fb..a1b974421 100644
--- a/compiler/tfinfo-v2/requires.cmake
+++ b/compiler/tfinfo-v2/requires.cmake
@@ -1,2 +1 @@
require("oops")
-require("stdex")
diff --git a/compiler/tfinfo-v2/src/TFInfo_v2.test.cpp b/compiler/tfinfo-v2/src/TFInfo_v2.test.cpp
index 02a2d9199..bcab4ac7f 100644
--- a/compiler/tfinfo-v2/src/TFInfo_v2.test.cpp
+++ b/compiler/tfinfo-v2/src/TFInfo_v2.test.cpp
@@ -54,7 +54,7 @@ const std::vector<std::string> success_cases =
name : "relu:0"
}
),
- // clang-format on
+ // clang-format on
};
} // namespace
@@ -221,7 +221,7 @@ const std::vector<std::string> fail_cases =
input, a:0, TF_FLOAT, [2, 3 ,4]
output, b:0, TF_FLOAT, [2, 3 ,4]
)",
- // clang-format on
+ // clang-format on
};
} // namespace
diff --git a/compiler/tfinfo-v2/src/TensorInfoLoader.cpp b/compiler/tfinfo-v2/src/TensorInfoLoader.cpp
index 0bf828773..249bf384a 100644
--- a/compiler/tfinfo-v2/src/TensorInfoLoader.cpp
+++ b/compiler/tfinfo-v2/src/TensorInfoLoader.cpp
@@ -19,13 +19,13 @@
#include "tfinfo-v2/TensorSignature.h"
#include <oops/UserExn.h>
-#include <stdex/Memory.h>
#include <tfinfo-v2.pb.h>
#include <google/protobuf/io/zero_copy_stream_impl.h>
#include <google/protobuf/text_format.h>
+#include <memory>
#include <fstream>
#include <fcntl.h>
@@ -107,8 +107,8 @@ void convert(tfinfo_v2_proto::InfoDef &info_def, tfinfo::v2::TensorSignatures &t
auto name = input_def.name();
validate_tensor_name(name, path);
- auto tensor = stdex::make_unique<tfinfo::v2::TensorSignature>(
- tfinfo::v2::TensorSignature::Kind::Input, name);
+ auto tensor = std::make_unique<tfinfo::v2::TensorSignature>(
+ tfinfo::v2::TensorSignature::Kind::Input, name);
// when there is dim attribute for unknown shape
if (input_def.dim_size() > 0)
@@ -136,8 +136,8 @@ void convert(tfinfo_v2_proto::InfoDef &info_def, tfinfo::v2::TensorSignatures &t
auto name = info_def.output().Get(i).name();
validate_tensor_name(name, path);
- auto tensor = stdex::make_unique<tfinfo::v2::TensorSignature>(
- tfinfo::v2::TensorSignature::Kind::Output, name);
+ auto tensor = std::make_unique<tfinfo::v2::TensorSignature>(
+ tfinfo::v2::TensorSignature::Kind::Output, name);
tensors.emplace_back(std::move(tensor));
}
}
diff --git a/compiler/tfinfo/CMakeLists.txt b/compiler/tfinfo/CMakeLists.txt
index 678912e6f..359699e13 100644
--- a/compiler/tfinfo/CMakeLists.txt
+++ b/compiler/tfinfo/CMakeLists.txt
@@ -5,7 +5,7 @@ list(REMOVE_ITEM SOURCES ${TESTS})
add_library(tfinfo STATIC ${SOURCES})
set_target_properties(tfinfo PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(tfinfo PUBLIC include)
-target_link_libraries(tfinfo stdex angkor oops)
+target_link_libraries(tfinfo angkor oops)
# TODO Remove "nnkit_support_tftestinfo" later
add_library(nnkit_support_tftestinfo ALIAS tfinfo)
diff --git a/compiler/tfinfo/include/nnkit/support/tftestinfo/ParsedTensor.h b/compiler/tfinfo/include/nnkit/support/tftestinfo/ParsedTensor.h
index aec8c5e40..eef206207 100644
--- a/compiler/tfinfo/include/nnkit/support/tftestinfo/ParsedTensor.h
+++ b/compiler/tfinfo/include/nnkit/support/tftestinfo/ParsedTensor.h
@@ -57,7 +57,7 @@ public:
ParsedTensor(const Kind kind, const std::string &name, const DataType &dtype,
const std::vector<int32_t> &shape)
- : _kind(kind), _dtype(dtype)
+ : _kind(kind), _dtype(dtype)
{
_tensor_name.assign(name);
@@ -66,7 +66,9 @@ public:
_shape.dim(rank) = shape.at(rank);
}
- ~ParsedTensor() { /* empty */}
+ ~ParsedTensor()
+ { /* empty */
+ }
public:
Kind kind() const { return _kind; }
diff --git a/compiler/tfinfo/requires.cmake b/compiler/tfinfo/requires.cmake
index 3b45c6458..d7ecb2382 100644
--- a/compiler/tfinfo/requires.cmake
+++ b/compiler/tfinfo/requires.cmake
@@ -1,3 +1,2 @@
-require("stdex")
require("angkor")
require("oops")
diff --git a/compiler/tfinfo/src/TensorInfoParser.cpp b/compiler/tfinfo/src/TensorInfoParser.cpp
index 9eb3da296..050da40de 100644
--- a/compiler/tfinfo/src/TensorInfoParser.cpp
+++ b/compiler/tfinfo/src/TensorInfoParser.cpp
@@ -21,7 +21,6 @@
#include "Compat.h"
#include <oops/UserExn.h>
-#include <stdex/Memory.h>
#include <nncc/core/ADT/tensor/Shape.h>
#include <cctype>
@@ -197,7 +196,7 @@ std::unique_ptr<ParsedTensor> parse_line(std::string &line)
shape.emplace_back(std::stoi(dim));
}
- return stdex::make_unique<ParsedTensor>(kind, name, dtype, shape);
+ return std::make_unique<ParsedTensor>(kind, name, dtype, shape);
}
#undef CHECK_NOT_NULL
diff --git a/compiler/tfkit/CMakeLists.txt b/compiler/tfkit/CMakeLists.txt
index b809658b1..2058fbc02 100644
--- a/compiler/tfkit/CMakeLists.txt
+++ b/compiler/tfkit/CMakeLists.txt
@@ -7,7 +7,6 @@ message(STATUS "Build tfkit: TRUE")
file(GLOB_RECURSE SOURCES "src/*.cpp")
add_executable(tfkit ${SOURCES})
-target_link_libraries(tfkit PRIVATE stdex)
target_link_libraries(tfkit PRIVATE cli)
target_link_libraries(tfkit PRIVATE mio_tf)
target_link_libraries(tfkit PRIVATE nncc_common)
diff --git a/compiler/tfkit/src/ConvertCommand.cpp b/compiler/tfkit/src/ConvertCommand.cpp
index 3e417cc78..2b5d077c9 100644
--- a/compiler/tfkit/src/ConvertCommand.cpp
+++ b/compiler/tfkit/src/ConvertCommand.cpp
@@ -17,8 +17,6 @@
#include "ConvertCommand.hpp"
#include "Support.hpp"
-#include <stdex/Memory.h>
-
#include <tensorflow/core/framework/graph.pb.h>
#include <google/protobuf/io/coded_stream.h>
@@ -26,6 +24,7 @@
#include <google/protobuf/text_format.h>
#include <google/protobuf/util/json_util.h>
+#include <memory>
#include <cassert>
#include <map>
#include <string>
@@ -114,12 +113,12 @@ int ConvertCommand::run(int argc, const char *const *argv) const
std::map<std::string, std::unique_ptr<Importer>> importers;
- importers["pb"] = stdex::make_unique<ImporterImpl<DataFormat::PBBIN>>();
- importers["pbtxt"] = stdex::make_unique<ImporterImpl<DataFormat::PBTXT>>();
+ importers["pb"] = std::make_unique<ImporterImpl<DataFormat::PBBIN>>();
+ importers["pbtxt"] = std::make_unique<ImporterImpl<DataFormat::PBTXT>>();
std::map<std::string, std::unique_ptr<Exporter>> exporters;
- exporters["json"] = stdex::make_unique<ExporterImpl<DataFormat::JSON>>();
+ exporters["json"] = std::make_unique<ExporterImpl<DataFormat::JSON>>();
auto importer = importers.at(input_format).get();
auto exporter = exporters.at(output_format).get();
diff --git a/compiler/tfkit/src/Main.cpp b/compiler/tfkit/src/Main.cpp
index 60bd6abfa..a695741dd 100644
--- a/compiler/tfkit/src/Main.cpp
+++ b/compiler/tfkit/src/Main.cpp
@@ -21,17 +21,18 @@
#include "ConvertCommand.hpp"
#include <cli/App.h>
-#include <stdex/Memory.h>
+
+#include <memory>
int main(int argc, char **argv)
{
cli::App app{argv[0]};
- app.insert("encode", stdex::make_unique<tfkit::EncodeCommand>());
- app.insert("decode", stdex::make_unique<tfkit::DecodeCommand>());
- app.insert("unpack", stdex::make_unique<tfkit::UnpackCommand>());
- app.insert("pack", stdex::make_unique<tfkit::PackCommand>());
- app.insert("convert", stdex::make_unique<tfkit::ConvertCommand>());
+ app.insert("encode", std::make_unique<tfkit::EncodeCommand>());
+ app.insert("decode", std::make_unique<tfkit::DecodeCommand>());
+ app.insert("unpack", std::make_unique<tfkit::UnpackCommand>());
+ app.insert("pack", std::make_unique<tfkit::PackCommand>());
+ app.insert("convert", std::make_unique<tfkit::ConvertCommand>());
return app.run(argc - 1, argv + 1);
}
diff --git a/compiler/tfkit/src/PackCommand.cpp b/compiler/tfkit/src/PackCommand.cpp
index a1c4a6fc8..d854e30db 100644
--- a/compiler/tfkit/src/PackCommand.cpp
+++ b/compiler/tfkit/src/PackCommand.cpp
@@ -60,7 +60,7 @@ template <> void pack<float>(tensorflow::TensorProto *input_tensor)
}
input_tensor->set_tensor_content(std::string(
- reinterpret_cast<const char *>(tensor_content.data()), sizeof(float) * input_flat_size));
+ reinterpret_cast<const char *>(tensor_content.data()), sizeof(float) * input_flat_size));
input_tensor->clear_float_val();
}
@@ -99,7 +99,7 @@ template <> void pack<int32_t>(tensorflow::TensorProto *input_tensor)
}
input_tensor->set_tensor_content(std::string(
- reinterpret_cast<const char *>(tensor_content.data()), sizeof(int32_t) * input_flat_size));
+ reinterpret_cast<const char *>(tensor_content.data()), sizeof(int32_t) * input_flat_size));
input_tensor->clear_int_val();
}
diff --git a/compiler/tfkit/src/Support.cpp b/compiler/tfkit/src/Support.cpp
index 40d8705a7..1ce4c4680 100644
--- a/compiler/tfkit/src/Support.cpp
+++ b/compiler/tfkit/src/Support.cpp
@@ -17,10 +17,9 @@
#include "Support.hpp"
-#include <stdex/Memory.h>
-
#include <tensorflow/core/framework/graph.pb.h>
+#include <memory>
#include <cassert>
#include <fstream>
#include <stdexcept>
@@ -36,7 +35,7 @@ std::unique_ptr<T> open_fstream(const std::string &path, std::ios_base::openmode
return nullptr;
}
- auto stream = stdex::make_unique<T>(path.c_str(), mode);
+ auto stream = std::make_unique<T>(path.c_str(), mode);
if (!stream->is_open())
{
throw std::runtime_error{"ERROR: Failed to open " + path};
@@ -111,7 +110,7 @@ std::string CmdArguments::get_or(unsigned int index, const std::string &s) const
std::unique_ptr<IOConfiguration> make_ioconfig(const CmdArguments &cmdargs)
{
- auto iocfg = stdex::make_unique<IOConfiguration>();
+ auto iocfg = std::make_unique<IOConfiguration>();
auto in = open_fstream<std::ifstream>(cmdargs.get_or(0, "-"), std::ios::in | std::ios::binary);
iocfg->in(std::move(in));
diff --git a/compiler/tfkit/src/Support.hpp b/compiler/tfkit/src/Support.hpp
index a5b954d5e..21726ea57 100644
--- a/compiler/tfkit/src/Support.hpp
+++ b/compiler/tfkit/src/Support.hpp
@@ -41,7 +41,7 @@ class CmdArguments
public:
CmdArguments() = delete;
CmdArguments(int argc, const char *const *argv)
- : _argc(static_cast<unsigned int>(argc)), _argv{argv}
+ : _argc(static_cast<unsigned int>(argc)), _argv{argv}
{
}
diff --git a/compiler/tfkit/src/UnpackCommand.cpp b/compiler/tfkit/src/UnpackCommand.cpp
index a6711f131..b5dd78cbb 100644
--- a/compiler/tfkit/src/UnpackCommand.cpp
+++ b/compiler/tfkit/src/UnpackCommand.cpp
@@ -52,7 +52,7 @@ template <> void unpack<float>(tensorflow::TensorProto *input_tensor)
input_tensor->clear_float_val();
const float *tensor_content =
- reinterpret_cast<const float *>(input_tensor->tensor_content().data());
+ reinterpret_cast<const float *>(input_tensor->tensor_content().data());
for (int i = 0; i < input_flat_size; i++)
{
input_tensor->add_float_val(tensor_content[i]);
@@ -87,7 +87,7 @@ template <> void unpack<int32_t>(tensorflow::TensorProto *input_tensor)
input_tensor->clear_int_val();
const int32_t *tensor_content =
- reinterpret_cast<const int32_t *>(input_tensor->tensor_content().data());
+ reinterpret_cast<const int32_t *>(input_tensor->tensor_content().data());
for (int i = 0; i < input_flat_size; i++)
{
input_tensor->add_int_val(tensor_content[i]);
@@ -122,7 +122,7 @@ template <> void unpack<int8_t>(tensorflow::TensorProto *input_tensor)
input_tensor->clear_int_val();
const int8_t *tensor_content =
- reinterpret_cast<const int8_t *>(input_tensor->tensor_content().data());
+ reinterpret_cast<const int8_t *>(input_tensor->tensor_content().data());
for (int i = 0; i < input_flat_size; i++)
{
input_tensor->add_int_val(tensor_content[i]);
@@ -157,7 +157,7 @@ template <> void unpack<bool>(tensorflow::TensorProto *input_tensor)
input_tensor->clear_bool_val();
const bool *tensor_content =
- reinterpret_cast<const bool *>(input_tensor->tensor_content().data());
+ reinterpret_cast<const bool *>(input_tensor->tensor_content().data());
for (int i = 0; i < input_flat_size; i++)
{
input_tensor->add_bool_val(tensor_content[i]);
diff --git a/compiler/tfl-inspect/driver/Driver.cpp b/compiler/tfl-inspect/driver/Driver.cpp
index a48001169..3e62e0ffb 100644
--- a/compiler/tfl-inspect/driver/Driver.cpp
+++ b/compiler/tfl-inspect/driver/Driver.cpp
@@ -32,8 +32,8 @@ int entry(int argc, char **argv)
"Lite model files"};
arser.add_argument("--operators").nargs(0).help("Dump operators in tflite file");
arser.add_argument("--conv2d_weight")
- .nargs(0)
- .help("Dump Conv2D series weight operators in tflite file");
+ .nargs(0)
+ .help("Dump Conv2D series weight operators in tflite file");
arser.add_argument("--op_version").nargs(0).help("Dump versions of the operators in tflite file");
arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file to inspect");
diff --git a/compiler/tflchef/core/src/Convert.cpp b/compiler/tflchef/core/src/Convert.cpp
index 9602faa96..de3ae4ed1 100644
--- a/compiler/tflchef/core/src/Convert.cpp
+++ b/compiler/tflchef/core/src/Convert.cpp
@@ -70,6 +70,8 @@ tflite::TensorType as_tflite_tensortype(const tflchef::TensorType &value)
return tflite::TensorType_INT64;
case tflchef::BOOL:
return tflite::TensorType_BOOL;
+ case tflchef::INT16:
+ return tflite::TensorType_INT16;
default:
break;
}
diff --git a/compiler/tflchef/core/src/CustomOp/BroadcastTo.cpp b/compiler/tflchef/core/src/CustomOp/BroadcastTo.cpp
new file mode 100644
index 000000000..fc429e2f7
--- /dev/null
+++ b/compiler/tflchef/core/src/CustomOp/BroadcastTo.cpp
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BroadcastTo.h"
+
+#include "flatbuffers/flexbuffers.h"
+
+flatbuffers::Offset<void> BroadcastToChef::value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+ return flatbuffers::Offset<void>();
+}
+
+flatbuffers::Offset<flatbuffers::Vector<uint8_t>>
+BroadcastToChef::custom_value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+ auto &operation = (*_operation);
+
+ assert(operation.type() == "BroadcastTo");
+
+ /**
+ * REGISTER_OP("BroadcastTo")
+ .Input("input: T")
+ .Input("shape: Tidx")
+ .Output("output: T")
+ .Attr("T: type")
+ .Attr("Tidx: {int32, int64} = DT_INT32")
+ .SetShapeFn([](InferenceContext* c)
+ */
+
+ auto flex_buffers = std::make_unique<flexbuffers::Builder>();
+ size_t map_start = flex_buffers->StartMap();
+
+ // TODO Support more data types
+ flex_buffers->Int("T", tflite::TensorType_FLOAT32);
+ flex_buffers->Int("Tidx", tflite::TensorType_INT32);
+
+ flex_buffers->EndMap(map_start);
+ flex_buffers->Finish();
+
+ auto circle_custom_options = fbb.CreateVector(flex_buffers->GetBuffer());
+ return circle_custom_options;
+}
+
+std::unique_ptr<OpChef> BroadcastToChefFactory::create(const tflchef::Operation *operation) const
+{
+ return std::unique_ptr<OpChef>{new BroadcastToChef{operation}};
+}
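As a rough illustration (not part of this patch), the flexbuffer map written by `custom_value` above can be read back with the flexbuffers API along these lines; `dump_broadcast_to_options` is a hypothetical helper:

```cpp
#include "flatbuffers/flexbuffers.h"

#include <cstdint>
#include <vector>

// Sketch: decode the {"T": ..., "Tidx": ...} map produced by
// BroadcastToChef::custom_value. 'buffer' would be the custom_options
// vector stored in the generated model.
void dump_broadcast_to_options(const std::vector<uint8_t> &buffer)
{
  auto map = flexbuffers::GetRoot(buffer.data(), buffer.size()).AsMap();
  const int32_t t = map["T"].AsInt32();       // tflite::TensorType_FLOAT32 in this patch
  const int32_t tidx = map["Tidx"].AsInt32(); // tflite::TensorType_INT32 in this patch
  (void)t;
  (void)tidx;
}
```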
diff --git a/compiler/tflchef/core/src/CustomOp/BroadcastTo.h b/compiler/tflchef/core/src/CustomOp/BroadcastTo.h
new file mode 100644
index 000000000..3ed71c511
--- /dev/null
+++ b/compiler/tflchef/core/src/CustomOp/BroadcastTo.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OP_BROADCASTTO_H__
+#define __OP_BROADCASTTO_H__
+
+#include "OpChef.h"
+
+class BroadcastToChef final : public OpChef
+{
+public:
+ explicit BroadcastToChef(const tflchef::Operation *operation) : _operation{operation}
+ {
+ // DO NOTHING
+ }
+
+public:
+ tflite::BuiltinOperator code(void) const override { return tflite::BuiltinOperator_CUSTOM; }
+
+ tflite::BuiltinOptions type(void) const override { return tflite::BuiltinOptions_NONE; }
+
+ flatbuffers::Offset<void> value(flatbuffers::FlatBufferBuilder &fbb) const override;
+
+ flatbuffers::Offset<flatbuffers::Vector<uint8_t>>
+ custom_value(flatbuffers::FlatBufferBuilder &fbb) const override;
+
+private:
+ const tflchef::Operation *_operation;
+};
+
+struct BroadcastToChefFactory final : public OpChefFactory
+{
+ std::unique_ptr<OpChef> create(const tflchef::Operation *operation) const override;
+};
+
+#endif // __OP_BROADCASTTO_H__
diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp
index 2f4d7eeb5..467b0d300 100644
--- a/compiler/tflchef/core/src/ModelChef.cpp
+++ b/compiler/tflchef/core/src/ModelChef.cpp
@@ -51,7 +51,7 @@ class GeneratedModelImpl final : public tflchef::GeneratedModel::Impl
{
public:
GeneratedModelImpl(std::unique_ptr<flatbuffers::FlatBufferBuilder> &&builder)
- : _builder{std::move(builder)}
+ : _builder{std::move(builder)}
{
// DO NOTHING
}
@@ -90,6 +90,7 @@ DataChefRegistry &data_chef_registry(const tflchef::TensorType &type)
static DataChefRegistry fp32;
static DataChefRegistry u8;
static DataChefRegistry boolean;
+ static DataChefRegistry s16;
switch (type)
{
@@ -103,6 +104,8 @@ DataChefRegistry &data_chef_registry(const tflchef::TensorType &type)
return u8;
case tflchef::BOOL:
return boolean;
+ case tflchef::INT16:
+ return s16;
default:
break;
}
@@ -197,6 +200,7 @@ struct CookParams
std::vector<flatbuffers::Offset<::tflite::SubGraph>> &subgraph_vec;
std::unique_ptr<flatbuffers::FlatBufferBuilder> &flatbuffer_builder;
std::map<tflite::BuiltinOperator, int32_t> &builtin_code_map;
+ std::vector<std::string> &custom_code_vec;
std::string noname;
};
@@ -209,6 +213,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
std::vector<flatbuffers::Offset<::tflite::SubGraph>> &subgraph_vec = cp.subgraph_vec;
std::unique_ptr<flatbuffers::FlatBufferBuilder> &flatbuffer_builder = cp.flatbuffer_builder;
std::map<tflite::BuiltinOperator, int32_t> &builtin_code_map = cp.builtin_code_map;
+ std::vector<std::string> &custom_code_vec = cp.custom_code_vec;
// Operand-related
std::vector<flatbuffers::Offset<::tflite::Tensor>> tensor_vec;
@@ -399,21 +404,21 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
{
// Create array segments
auto tflite_array_segments =
- as_tflite_sparse_index_vec(*flatbuffer_builder, dm.array_segments());
+ as_tflite_sparse_index_vec(*flatbuffer_builder, dm.array_segments());
// Create array indices
auto tflite_array_indices =
- as_tflite_sparse_index_vec(*flatbuffer_builder, dm.array_indices());
+ as_tflite_sparse_index_vec(*flatbuffer_builder, dm.array_indices());
auto tflite_dim_metadata_builder = tflite::DimensionMetadataBuilder{*flatbuffer_builder};
tflite_dim_metadata_builder.add_format(as_tflite_dimensiontype(dm.format()));
tflite_dim_metadata_builder.add_dense_size(dm.dense_size());
tflite_dim_metadata_builder.add_array_segments(tflite_array_segments);
tflite_dim_metadata_builder.add_array_segments_type(
- as_tflite_sparse_idx_vec_type(dm.array_segments().type()));
+ as_tflite_sparse_idx_vec_type(dm.array_segments().type()));
tflite_dim_metadata_builder.add_array_indices(tflite_array_indices);
tflite_dim_metadata_builder.add_array_indices_type(
- as_tflite_sparse_idx_vec_type(dm.array_indices().type()));
+ as_tflite_sparse_idx_vec_type(dm.array_indices().type()));
auto tflite_dim_metadata = tflite_dim_metadata_builder.Finish();
dim_metadata_vec.emplace_back(tflite_dim_metadata);
}
@@ -480,11 +485,23 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
// Create Operator
tflite::OperatorBuilder op_builder{*flatbuffer_builder};
- // Get operator code index from builtin_code_set with assumption, order of
- // builtin_code_set is same as that of code_vec
+ // Note that opcode_index is an index into the operator_codes vector.
+ // operator_codes consists of builtin codes followed by custom codes, inserted sequentially.
+ uint32_t opcode_index = 0;
auto op_it = builtin_code_map.find(op_chef->code());
- assert(op_it != builtin_code_map.end());
- uint32_t opcode_index = std::distance(builtin_code_map.begin(), op_it);
+ // builtin operator
+ if (op_it != builtin_code_map.end())
+ {
+ opcode_index = std::distance(builtin_code_map.begin(), op_it);
+ }
+ // custom operator
+ else
+ {
+ auto op_it = std::find(custom_code_vec.begin(), custom_code_vec.end(), operation.type());
+ assert(op_it != custom_code_vec.end());
+ opcode_index = builtin_code_map.size();
+ opcode_index += std::distance(custom_code_vec.begin(), op_it);
+ }
op_builder.add_opcode_index(opcode_index);
op_builder.add_inputs(inputs);
@@ -538,7 +555,7 @@ GeneratedModel cook(const ::tflchef::ModelRecipe &model_recipe)
// Initialize Data Chef Registry
#define DATA_CHEF(TYPE, NAME, FACTORY_CLASS) \
data_chef_registry(::tflchef::TYPE) \
- .add(#NAME, std::unique_ptr<FACTORY_CLASS>(new FACTORY_CLASS()));
+ .add(#NAME, std::unique_ptr<FACTORY_CLASS>(new FACTORY_CLASS()));
#include <souschef/DataChef.def>
#undef DATA_CHEF
@@ -546,7 +563,7 @@ GeneratedModel cook(const ::tflchef::ModelRecipe &model_recipe)
// Create FlatBufferBuilder
//
auto flatbuffer_builder =
- std::unique_ptr<flatbuffers::FlatBufferBuilder>(new flatbuffers::FlatBufferBuilder(1024));
+ std::unique_ptr<flatbuffers::FlatBufferBuilder>(new flatbuffers::FlatBufferBuilder(1024));
// Operand-related
std::vector<flatbuffers::Offset<::tflite::Buffer>> buffer_vec;
@@ -571,11 +588,9 @@ GeneratedModel cook(const ::tflchef::ModelRecipe &model_recipe)
// Create OperatorCode with Custom Operator
std::set<std::string> custom_code_set = gather_customcode_set(model_recipe);
- if (custom_code_set.size() &&
- builtin_code_map.find(tflite::BuiltinOperator_CUSTOM) == builtin_code_map.end())
- builtin_code_map[tflite::BuiltinOperator_CUSTOM] = 1;
+ std::vector<std::string> custom_code_vec{custom_code_set.begin(), custom_code_set.end()};
- for (auto opcode : custom_code_set)
+ for (auto opcode : custom_code_vec)
{
auto custom_code = flatbuffer_builder->CreateString(opcode);
tflite::OperatorCodeBuilder code_builder{*flatbuffer_builder};
@@ -598,7 +613,8 @@ GeneratedModel cook(const ::tflchef::ModelRecipe &model_recipe)
//
// Create Main graph
//
- CookParams cp{buffer_vec, code_vec, subgraph_vec, flatbuffer_builder, builtin_code_map, "main"};
+ CookParams cp{buffer_vec, code_vec, subgraph_vec, flatbuffer_builder,
+ builtin_code_map, custom_code_vec, "main"};
cook_graph<::tflchef::ModelRecipe>(model_recipe, cp);
@@ -612,8 +628,8 @@ GeneratedModel cook(const ::tflchef::ModelRecipe &model_recipe)
std::ostringstream stringStream;
stringStream << "sub_" << (g + 1);
- CookParams cp{buffer_vec, code_vec, subgraph_vec,
- flatbuffer_builder, builtin_code_map, stringStream.str()};
+ CookParams cp{buffer_vec, code_vec, subgraph_vec, flatbuffer_builder,
+ builtin_code_map, custom_code_vec, stringStream.str()};
cook_graph<::tflchef::Graph>(graph, cp);
}
@@ -640,7 +656,7 @@ GeneratedModel cook(const ::tflchef::ModelRecipe &model_recipe)
// Return "GenerateModel"
return GeneratedModel{
- std::unique_ptr<GeneratedModelImpl>(new GeneratedModelImpl(std::move(flatbuffer_builder)))};
+ std::unique_ptr<GeneratedModelImpl>(new GeneratedModelImpl(std::move(flatbuffer_builder)))};
}
} // namespace tflchef
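One way to read the opcode_index change above: operator_codes now holds all builtin codes first and every distinct custom code after them, so a custom operator's index is the builtin count plus its position in custom_code_vec. A standalone sketch under those assumptions (types simplified; `opcode_index_of` is illustrative):

```cpp
#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <map>
#include <string>
#include <vector>

// Builtin codes occupy the front of operator_codes; custom codes follow in
// the order they were appended. Mirror the lookup performed in cook_graph.
uint32_t opcode_index_of(const std::map<int32_t, int32_t> &builtin_code_map,
                         const std::vector<std::string> &custom_code_vec,
                         int32_t builtin_code, const std::string &op_type)
{
  auto bit = builtin_code_map.find(builtin_code);
  if (bit != builtin_code_map.end()) // builtin operator
    return static_cast<uint32_t>(std::distance(builtin_code_map.begin(), bit));

  // custom operator: offset by the number of builtin codes
  auto cit = std::find(custom_code_vec.begin(), custom_code_vec.end(), op_type);
  assert(cit != custom_code_vec.end());
  return static_cast<uint32_t>(builtin_code_map.size() +
                               std::distance(custom_code_vec.begin(), cit));
}
```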
diff --git a/compiler/tflchef/core/src/Op/BidirectionalSequenceLSTM.cpp b/compiler/tflchef/core/src/Op/BidirectionalSequenceLSTM.cpp
new file mode 100644
index 000000000..1bf2264ab
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/BidirectionalSequenceLSTM.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BidirectionalSequenceLSTM.h"
+#include "Convert.h"
+
+#include <cassert>
+
+flatbuffers::Offset<void>
+BidirectionalSequenceLSTMChef::value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+ auto &operation = (*_operation);
+
+ assert(operation.has_bidirectional_sequence_lstm_options());
+
+ tflite::BidirectionalSequenceLSTMOptionsBuilder options_builder(fbb);
+ options_builder.add_fused_activation_function(
+ as_tflite_activation(operation.bidirectional_sequence_lstm_options().activation()));
+ options_builder.add_cell_clip(operation.bidirectional_sequence_lstm_options().cell_clip());
+ options_builder.add_proj_clip(operation.bidirectional_sequence_lstm_options().proj_clip());
+ options_builder.add_time_major(operation.bidirectional_sequence_lstm_options().time_major());
+ options_builder.add_asymmetric_quantize_inputs(
+ operation.bidirectional_sequence_lstm_options().asymmetric_quantize_inputs());
+ options_builder.add_merge_outputs(
+ operation.bidirectional_sequence_lstm_options().merge_outputs());
+
+ return options_builder.Finish().Union();
+}
+
+std::unique_ptr<OpChef>
+BidirectionalSequenceLSTMChefFactory::create(const tflchef::Operation *operation) const
+{
+ return std::unique_ptr<OpChef>{new BidirectionalSequenceLSTMChef{operation}};
+}
diff --git a/compiler/tflchef/core/src/Op/BidirectionalSequenceLSTM.h b/compiler/tflchef/core/src/Op/BidirectionalSequenceLSTM.h
new file mode 100644
index 000000000..e66917b97
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/BidirectionalSequenceLSTM.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OP_BIDIRECTIONALSEQUENCE_LSTM_H__
+#define __OP_BIDIRECTIONALSEQUENCE_LSTM_H__
+
+#include "OpChef.h"
+
+class BidirectionalSequenceLSTMChef final : public OpChef
+{
+public:
+ explicit BidirectionalSequenceLSTMChef(const tflchef::Operation *operation)
+ : _operation{operation}
+ {
+ // DO NOTHING
+ }
+
+public:
+ tflite::BuiltinOperator code(void) const override
+ {
+ return tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM;
+ }
+
+ tflite::BuiltinOptions type(void) const override
+ {
+ return tflite::BuiltinOptions_BidirectionalSequenceLSTMOptions;
+ }
+
+ flatbuffers::Offset<void> value(flatbuffers::FlatBufferBuilder &fbb) const override;
+
+private:
+ const tflchef::Operation *_operation;
+};
+
+struct BidirectionalSequenceLSTMChefFactory final : public OpChefFactory
+{
+ std::unique_ptr<OpChef> create(const tflchef::Operation *operation) const override;
+};
+
+#endif // __OP_BIDIRECTIONALSEQUENCE_LSTM_H__
diff --git a/compiler/tflchef/core/src/Op/FakeQuant.cpp b/compiler/tflchef/core/src/Op/FakeQuant.cpp
new file mode 100644
index 000000000..e4cbbfe44
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/FakeQuant.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "FakeQuant.h"
+#include "Convert.h"
+
+#include <cassert>
+
+flatbuffers::Offset<void> FakeQuantChef::value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+ auto &operation = (*_operation);
+ assert(operation.has_fakequant_options());
+
+ auto options = operation.fakequant_options();
+
+ tflite::FakeQuantOptionsBuilder fq_options_builder{fbb};
+ fq_options_builder.add_min(options.min());
+ fq_options_builder.add_max(options.max());
+ fq_options_builder.add_num_bits(options.num_bits());
+ fq_options_builder.add_narrow_range(options.narrow_range());
+
+ return fq_options_builder.Finish().Union();
+}
+
+std::unique_ptr<OpChef> FakeQuantChefFactory::create(const tflchef::Operation *operation) const
+{
+ return std::unique_ptr<OpChef>{new FakeQuantChef{operation}};
+}
diff --git a/compiler/tflchef/core/src/Op/FakeQuant.h b/compiler/tflchef/core/src/Op/FakeQuant.h
new file mode 100644
index 000000000..0fbfea315
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/FakeQuant.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OP_FAKE_QUANT_H__
+#define __OP_FAKE_QUANT_H__
+
+#include "OpChef.h"
+
+class FakeQuantChef final : public OpChef
+{
+public:
+ explicit FakeQuantChef(const tflchef::Operation *operation) : _operation{operation}
+ {
+ // DO NOTHING
+ }
+
+public:
+ tflite::BuiltinOperator code(void) const override { return tflite::BuiltinOperator_FAKE_QUANT; }
+
+ tflite::BuiltinOptions type(void) const override
+ {
+ return tflite::BuiltinOptions_FakeQuantOptions;
+ }
+
+ flatbuffers::Offset<void> value(flatbuffers::FlatBufferBuilder &fbb) const override;
+
+private:
+ const tflchef::Operation *_operation;
+};
+
+struct FakeQuantChefFactory final : public OpChefFactory
+{
+ std::unique_ptr<OpChef> create(const tflchef::Operation *operation) const override;
+};
+
+#endif // __OP_FAKE_QUANT_H__
diff --git a/compiler/tflchef/core/src/Op/LocalResponseNormalization.h b/compiler/tflchef/core/src/Op/LocalResponseNormalization.h
index 62a2355f2..afc37e6ec 100644
--- a/compiler/tflchef/core/src/Op/LocalResponseNormalization.h
+++ b/compiler/tflchef/core/src/Op/LocalResponseNormalization.h
@@ -23,7 +23,7 @@ class LocalResponseNormalizationChef final : public OpChef
{
public:
explicit LocalResponseNormalizationChef(const tflchef::Operation *operation)
- : _operation{operation}
+ : _operation{operation}
{
// DO NOTHING
}
diff --git a/compiler/tflchef/core/src/Op/Squeeze.cpp b/compiler/tflchef/core/src/Op/Squeeze.cpp
index 8d6ef42d6..1c1d99a01 100644
--- a/compiler/tflchef/core/src/Op/Squeeze.cpp
+++ b/compiler/tflchef/core/src/Op/Squeeze.cpp
@@ -30,7 +30,7 @@ flatbuffers::Offset<void> SqueezeChef::value(flatbuffers::FlatBufferBuilder &fbb
// Note: 'CreateVector' should be placed before 'CreateOptions'
// Read flatbuffers.h 'void NotNested()' for more information
auto fb_squeeze_dims =
- fbb.CreateVector(options.squeeze_dim().data(), options.squeeze_dim().size());
+ fbb.CreateVector(options.squeeze_dim().data(), options.squeeze_dim().size());
return tflite::CreateSqueezeOptions(fbb, fb_squeeze_dims).Union();
}
diff --git a/compiler/tflchef/core/src/Op/StridedSlice.cpp b/compiler/tflchef/core/src/Op/StridedSlice.cpp
index 587a95c66..67fd03140 100644
--- a/compiler/tflchef/core/src/Op/StridedSlice.cpp
+++ b/compiler/tflchef/core/src/Op/StridedSlice.cpp
@@ -29,11 +29,11 @@ flatbuffers::Offset<void> StridedSliceChef::value(flatbuffers::FlatBufferBuilder
strided_slice_options_builder.add_begin_mask(operation.strided_slice_options().begin_mask());
strided_slice_options_builder.add_end_mask(operation.strided_slice_options().end_mask());
strided_slice_options_builder.add_ellipsis_mask(
- operation.strided_slice_options().ellipsis_mask());
+ operation.strided_slice_options().ellipsis_mask());
strided_slice_options_builder.add_new_axis_mask(
- operation.strided_slice_options().new_axis_mask());
+ operation.strided_slice_options().new_axis_mask());
strided_slice_options_builder.add_shrink_axis_mask(
- operation.strided_slice_options().shrink_axis_mask());
+ operation.strided_slice_options().shrink_axis_mask());
return strided_slice_options_builder.Finish().Union();
}
diff --git a/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.cpp b/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.cpp
index ceabfc13c..2d6becdff 100644
--- a/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.cpp
+++ b/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.cpp
@@ -28,12 +28,12 @@ UnidirectionalSequenceLSTMChef::value(flatbuffers::FlatBufferBuilder &fbb) const
tflite::UnidirectionalSequenceLSTMOptionsBuilder options_builder(fbb);
options_builder.add_fused_activation_function(
- as_tflite_activation(operation.unidirectional_sequence_lstm_options().activation()));
+ as_tflite_activation(operation.unidirectional_sequence_lstm_options().activation()));
options_builder.add_cell_clip(operation.unidirectional_sequence_lstm_options().cell_clip());
options_builder.add_proj_clip(operation.unidirectional_sequence_lstm_options().proj_clip());
options_builder.add_time_major(operation.unidirectional_sequence_lstm_options().time_major());
options_builder.add_asymmetric_quantize_inputs(
- operation.unidirectional_sequence_lstm_options().asymmetric_quantize_inputs());
+ operation.unidirectional_sequence_lstm_options().asymmetric_quantize_inputs());
return options_builder.Finish().Union();
}
diff --git a/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.h b/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.h
index 6811ad378..b8a6d8103 100644
--- a/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.h
+++ b/compiler/tflchef/core/src/Op/UnidirectionalSequenceLSTM.h
@@ -23,7 +23,7 @@ class UnidirectionalSequenceLSTMChef final : public OpChef
{
public:
explicit UnidirectionalSequenceLSTMChef(const tflchef::Operation *operation)
- : _operation{operation}
+ : _operation{operation}
{
// DO NOTHING
}
diff --git a/compiler/tflchef/core/src/OpChef.def b/compiler/tflchef/core/src/OpChef.def
index 718fffc78..714e8947b 100644
--- a/compiler/tflchef/core/src/OpChef.def
+++ b/compiler/tflchef/core/src/OpChef.def
@@ -12,6 +12,7 @@ OP_CHEF(ArgMin, ArgMinChefFactory)
OP_CHEF(AveragePool2D, AveragePool2DChefFactory)
OP_CHEF(BatchMatMul, BatchMatMulChefFactory)
OP_CHEF(BatchToSpaceND, BatchToSpaceNDChefFactory)
+OP_CHEF(BidirectionalSequenceLSTM, BidirectionalSequenceLSTMChefFactory)
OP_CHEF(Cast, CastChefFactory)
OP_CHEF(Ceil, CeilChefFactory)
OP_CHEF(Concatenation, ConcatenationChefFactory)
@@ -25,6 +26,7 @@ OP_CHEF(ELU, ELUChefFactory)
OP_CHEF(Equal, EqualChefFactory)
OP_CHEF(Exp, ExpChefFactory)
OP_CHEF(ExpandDims, ExpandDimsChefFactory)
+OP_CHEF(FakeQuant, FakeQuantChefFactory)
OP_CHEF(Fill, FillChefFactory)
OP_CHEF(Floor, FloorChefFactory)
OP_CHEF(FloorDiv, FloorDivChefFactory)
@@ -117,6 +119,7 @@ OP_CHEF(ZerosLike, ZerosLikeChefFactory)
OP_CHEF(AddV2, AddV2ChefFactory)
OP_CHEF(All, AllChefFactory)
OP_CHEF(BatchMatMulV2, BatchMatMulV2ChefFactory)
+OP_CHEF(BroadcastTo, BroadcastToChefFactory)
OP_CHEF(MatMul, MatMulChefFactory)
OP_CHEF(MatrixBandPart, MatrixBandPartChefFactory)
OP_CHEF(MaxPoolWithArgMax, MaxPoolWithArgMaxChefFactory)
diff --git a/compiler/tflchef/core/src/OpChefs.h b/compiler/tflchef/core/src/OpChefs.h
index 3527937a0..99f331e37 100644
--- a/compiler/tflchef/core/src/OpChefs.h
+++ b/compiler/tflchef/core/src/OpChefs.h
@@ -25,6 +25,7 @@
#include "Op/AveragePool2D.h"
#include "Op/BatchMatMul.h"
#include "Op/BatchToSpaceND.h"
+#include "Op/BidirectionalSequenceLSTM.h"
#include "Op/Cast.h"
#include "Op/Ceil.h"
#include "Op/Concatenation.h"
@@ -38,6 +39,7 @@
#include "Op/Equal.h"
#include "Op/Exp.h"
#include "Op/ExpandDims.h"
+#include "Op/FakeQuant.h"
#include "Op/Fill.h"
#include "Op/Floor.h"
#include "Op/FloorDiv.h"
@@ -129,6 +131,7 @@
#include "CustomOp/AddV2.h"
#include "CustomOp/All.h"
#include "CustomOp/BatchMatMulV2.h"
+#include "CustomOp/BroadcastTo.h"
#include "CustomOp/MatMul.h"
#include "CustomOp/MatrixBandPart.h"
#include "CustomOp/MaxPoolWithArgMax.h"
diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto
index 48a682d94..c5e44f68c 100644
--- a/compiler/tflchef/proto/tflchef.proto
+++ b/compiler/tflchef/proto/tflchef.proto
@@ -19,6 +19,7 @@ enum TensorType {
UINT8 = 3;
INT64 = 4;
BOOL = 6;
+ INT16 = 7;
}
enum DimensionType {
@@ -110,6 +111,15 @@ enum MirrorPadMode {
SYMMETRIC = 1;
}
+message BidirectionalSequenceLSTMOptions {
+ optional Activation activation = 1 [default = NONE];
+ optional float cell_clip = 2 [default = 0.0];
+ optional float proj_clip = 3 [default = 0.0];
+ optional bool merge_outputs = 6 [default = false];
+ optional bool time_major = 4 [default = true];
+ optional bool asymmetric_quantize_inputs = 5 [default = false];
+}
+
message Conv2DOptions
{
optional Padding padding = 1 [default = VALID];
@@ -509,6 +519,13 @@ message MaxPoolWithArgMaxOptions {
optional bool include_batch_in_index = 7 [default = false];
}
+message FakeQuantOptions {
+ optional float min = 1 [default = 0.0];
+ optional float max = 2 [default = 0.0];
+ optional int32 num_bits = 3 [default = 0];
+ optional bool narrow_range = 4 [default = false];
+}
+
message Operation {
optional string type = 1;
repeated string input = 2;
@@ -593,8 +610,8 @@ message Operation {
optional SparseToDenseOptions sparse_to_dense_options = 175;
optional PowOptions pow_options = 176;
optional ArgMinOptions argmin_options = 177;
- // FakeQuantOptions 178
- // BidirectionalSequenceLSTMOptions 179
+ optional FakeQuantOptions fakequant_options = 178;
+ optional BidirectionalSequenceLSTMOptions bidirectional_sequence_lstm_options = 179;
// BidirectionalSequenceRNNOptions 180
optional UnidirectionalSequenceLSTMOptions unidirectional_sequence_lstm_options = 181;
optional RangeOptions range_options = 182;
diff --git a/compiler/tflchef/tests/short_int_datatype/test.recipe b/compiler/tflchef/tests/short_int_datatype/test.recipe
new file mode 100644
index 000000000..1e135d912
--- /dev/null
+++ b/compiler/tflchef/tests/short_int_datatype/test.recipe
@@ -0,0 +1,44 @@
+operand {
+ name: "ifm"
+ type: INT16
+ shape { dim: 1 dim: 5 dim: 5 dim: 2 }
+}
+operand {
+ name: "ker"
+ type: INT16
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+ filler {
+ tag: "gaussian"
+ arg: "1.0"
+ arg: "6.0"
+ }
+}
+operand {
+ name: "bias"
+ type: INT16
+ shape { dim: 1 }
+ filler {
+ tag: "constant"
+ arg: "12345"
+ }
+}
+operand {
+ name: "ofm"
+ type: INT16
+ shape { dim: 1 dim: 3 dim: 3 dim: 1 }
+}
+operation {
+ type: "Conv2D"
+ conv2d_options {
+ padding: VALID
+ stride_w: 1
+ stride_h: 1
+ }
+ input: "ifm"
+ input: "ker"
+ input: "bias"
+ output: "ofm"
+}
+input: "ifm"
+input: "ker"
+output: "ofm"
diff --git a/compiler/tflchef/tests/short_int_datatype/test.reverse b/compiler/tflchef/tests/short_int_datatype/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/compiler/tflchef/tests/short_int_datatype/test.reverse
diff --git a/compiler/tflchef/tflite/CMakeLists.txt b/compiler/tflchef/tflite/CMakeLists.txt
index 83127cb3e..ce8b8c463 100644
--- a/compiler/tflchef/tflite/CMakeLists.txt
+++ b/compiler/tflchef/tflite/CMakeLists.txt
@@ -5,6 +5,5 @@ target_include_directories(tflchef_tflite PUBLIC include)
target_include_directories(tflchef_tflite PRIVATE src)
target_link_libraries(tflchef_tflite tflchef_proto)
target_link_libraries(tflchef_tflite mio_tflite)
-target_link_libraries(tflchef_tflite stdex)
target_link_libraries(tflchef_tflite cwrap)
target_link_libraries(tflchef_tflite souschef)
diff --git a/compiler/tflchef/tflite/src/Convert.cpp b/compiler/tflchef/tflite/src/Convert.cpp
index 29276ff94..f47e51d3d 100644
--- a/compiler/tflchef/tflite/src/Convert.cpp
+++ b/compiler/tflchef/tflite/src/Convert.cpp
@@ -33,10 +33,11 @@ tflchef::TensorType as_tflchef_type(const tflite::TensorType type)
return tflchef::UINT8;
case tflite::TensorType_BOOL:
return tflchef::BOOL;
+ case tflite::TensorType_INT16:
+ return tflchef::INT16;
// TODO handle other types
// TensorType_FLOAT16
// TensorType_STRING
- // TensorType_INT16
// TensorType_COMPLEX64
default:
throw std::runtime_error{"unsupported tensor type"};
diff --git a/compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.cpp b/compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.cpp
new file mode 100644
index 000000000..32548247e
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BidirectionalSequenceLSTM.h"
+
+#include "Convert.h"
+#include "FillerHelper.h"
+
+namespace tflchef
+{
+
+void TFliteOpBidirectionalSequenceLSTM::filler(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const
+{
+ const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
+ assert(inputs.size() == 48);
+
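+  // BidirectionalSequenceLSTM carries a fixed set of 48 input tensors in the TFLite schema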
+ for (int32_t i = 0; i < inputs.size(); i++)
+ {
+    // Skip inputs 0, 35, 36, 37 and 38: the Input tensor, the forward and backward
+    // ActivationState tensors, and the forward and backward CellState tensors.
+    // These tensors may be updated from a previous step or supplied by the user,
+    // so they cannot be treated as constant data.
+ if (i == 0 || i == 35 || i == 36 || i == 37 || i == 38)
+ continue;
+ if (inputs[i] != -1)
+ fill_tensor_to_import(inputs[i], import);
+ }
+}
+
+tflchef::Operation *
+TFliteOpBidirectionalSequenceLSTM::build(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const
+{
+ auto op_params = op->builtin_options_as_BidirectionalSequenceLSTMOptions();
+ assert(op_params != nullptr);
+
+ auto operation = model_recipe->add_operation();
+
+ operation->set_type("BidirectionalSequenceLSTM");
+
+ auto op_options = operation->mutable_bidirectional_sequence_lstm_options();
+
+ op_options->set_activation(as_tflchef_activation(op_params->fused_activation_function()));
+ op_options->set_cell_clip(op_params->cell_clip());
+ op_options->set_proj_clip(op_params->proj_clip());
+ op_options->set_time_major(op_params->time_major());
+ op_options->set_asymmetric_quantize_inputs(op_params->asymmetric_quantize_inputs());
+ op_options->set_merge_outputs(op_params->merge_outputs());
+
+ return operation;
+}
+
+} // namespace tflchef
diff --git a/compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.h b/compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.h
new file mode 100644
index 000000000..333f542ac
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/BidirectionalSequenceLSTM.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __TFLITE_OP_BIDIRECTIONALSEQUENCE_LSTM_H__
+#define __TFLITE_OP_BIDIRECTIONALSEQUENCE_LSTM_H__
+
+#include "TFliteOpChef.h"
+
+namespace tflchef
+{
+
+/**
+ * @brief tflchef operator builder for BidirectionalSequenceLSTM
+ */
+class TFliteOpBidirectionalSequenceLSTM : public TFliteOpChef
+{
+public:
+ void filler(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const override;
+ tflchef::Operation *build(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const override;
+};
+
+} // namespace tflchef
+
+#endif // __TFLITE_OP_BIDIRECTIONALSEQUENCE_LSTM_H__
diff --git a/compiler/tflchef/tflite/src/Op/FakeQuant.cpp b/compiler/tflchef/tflite/src/Op/FakeQuant.cpp
new file mode 100644
index 000000000..f44b85465
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/FakeQuant.cpp
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "FakeQuant.h"
+
+#include "Convert.h"
+
+namespace tflchef
+{
+
+void TFliteOpFakeQuant::filler(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const
+{
+ // Nothing to do with filler
+}
+
+tflchef::Operation *TFliteOpFakeQuant::build(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const
+{
+ auto op_params = op->builtin_options_as_FakeQuantOptions();
+ assert(op_params != nullptr);
+
+ auto operation = model_recipe->add_operation();
+
+ operation->set_type("FakeQuant");
+
+ auto op_options = operation->mutable_fakequant_options();
+
+ op_options->set_min(op_params->min());
+ op_options->set_max(op_params->max());
+ op_options->set_num_bits(op_params->num_bits());
+ op_options->set_narrow_range(op_params->narrow_range());
+
+ return operation;
+}
+
+} // namespace tflchef
diff --git a/compiler/tflchef/tflite/src/Op/FakeQuant.h b/compiler/tflchef/tflite/src/Op/FakeQuant.h
new file mode 100644
index 000000000..f36e615df
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/FakeQuant.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __TFLITE_OP_FAKE_QUANT_H__
+#define __TFLITE_OP_FAKE_QUANT_H__
+
+#include "TFliteOpChef.h"
+
+namespace tflchef
+{
+
+/**
+ * @brief tflchef operator builder for FakeQuant
+ */
+class TFliteOpFakeQuant : public TFliteOpChef
+{
+public:
+ void filler(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const override;
+ tflchef::Operation *build(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const override;
+};
+
+} // namespace tflchef
+
+#endif // __TFLITE_OP_FAKE_QUANT_H__
diff --git a/compiler/tflchef/tflite/src/Op/Maximum.cpp b/compiler/tflchef/tflite/src/Op/Maximum.cpp
index fb977b6ed..d52caf0c2 100644
--- a/compiler/tflchef/tflite/src/Op/Maximum.cpp
+++ b/compiler/tflchef/tflite/src/Op/Maximum.cpp
@@ -16,13 +16,21 @@
#include "Maximum.h"
+#include "Convert.h"
+#include "FillerHelper.h"
+
namespace tflchef
{
void TFliteOpMaximum::filler(const tflite::Operator *op, TFliteImport *import,
tflchef::ModelRecipe *model_recipe) const
{
- // Nothing to do with filler
+ const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
+ assert(inputs.size() == 2);
+
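+  // Either input of Maximum may be a constant tensor; register both so any constant data is captured in the recipe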
+ fill_tensor_to_import(inputs[0], import);
+ fill_tensor_to_import(inputs[1], import);
}
tflchef::Operation *TFliteOpMaximum::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/Minimum.cpp b/compiler/tflchef/tflite/src/Op/Minimum.cpp
index 2bb50cb89..6440f1deb 100644
--- a/compiler/tflchef/tflite/src/Op/Minimum.cpp
+++ b/compiler/tflchef/tflite/src/Op/Minimum.cpp
@@ -17,6 +17,7 @@
#include "Minimum.h"
#include "Convert.h"
+#include "FillerHelper.h"
namespace tflchef
{
@@ -24,7 +25,11 @@ namespace tflchef
void TFliteOpMinimum::filler(const tflite::Operator *op, TFliteImport *import,
tflchef::ModelRecipe *model_recipe) const
{
- // Nothing to do with filler
+ const std::vector<int32_t> &inputs = as_index_vector(op->inputs());
+ assert(inputs.size() == 2);
+
+ fill_tensor_to_import(inputs[0], import);
+ fill_tensor_to_import(inputs[1], import);
}
tflchef::Operation *TFliteOpMinimum::build(const tflite::Operator *op, TFliteImport *import,
diff --git a/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.cpp b/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.cpp
index c2c79285b..b2bc1acbd 100644
--- a/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.cpp
+++ b/compiler/tflchef/tflite/src/Op/UnidirectionalSequenceLSTM.cpp
@@ -30,11 +30,11 @@ void TFliteOpUnidirectionalSequenceLSTM::filler(const tflite::Operator *op, TFli
for (int32_t i = 0; i < inputs.size(); i++)
{
- // Except for Input 0, 17 and 18.
- // Each Input mean Input[0](=Input Tensor), Input[17](=OutputState Tensor) and
- // Input[18](=CellState Tensor).
+ // Except for Input 0, 18 and 19.
+    // These are Input[0](=Input Tensor), Input[18](=OutputState Tensor) and
+ // Input[19](=CellState Tensor).
// This could be updated from previous input or User Given data, so This could not be Const
- if (i == 0 || i == 17 || i == 18)
+ if (i == 0 || i == 18 || i == 19)
continue;
if (inputs[i] != -1)
fill_tensor_to_import(inputs[i], import);
diff --git a/compiler/tflchef/tflite/src/TFliteOpChefs.h b/compiler/tflchef/tflite/src/TFliteOpChefs.h
index 2e4d28051..960ff6e36 100644
--- a/compiler/tflchef/tflite/src/TFliteOpChefs.h
+++ b/compiler/tflchef/tflite/src/TFliteOpChefs.h
@@ -26,6 +26,7 @@
#include "Op/AveragePool2D.h"
#include "Op/BatchMatMul.h"
#include "Op/BatchToSpaceND.h"
+#include "Op/BidirectionalSequenceLSTM.h"
#include "Op/Cast.h"
#include "Op/Ceil.h"
#include "Op/Concatenation.h"
@@ -39,6 +40,7 @@
#include "Op/Equal.h"
#include "Op/Exp.h"
#include "Op/ExpandDims.h"
+#include "Op/FakeQuant.h"
#include "Op/Fill.h"
#include "Op/Floor.h"
#include "Op/FloorDiv.h"
diff --git a/compiler/tflchef/tflite/src/TFliteOpRegistry.h b/compiler/tflchef/tflite/src/TFliteOpRegistry.h
index 9cc630a97..c240bcf52 100644
--- a/compiler/tflchef/tflite/src/TFliteOpRegistry.h
+++ b/compiler/tflchef/tflite/src/TFliteOpRegistry.h
@@ -63,6 +63,7 @@ private:
REG_TFL_OP(AVERAGE_POOL_2D, TFliteOpAveragePool2D);
REG_TFL_OP(BATCH_MATMUL, TFliteOpBatchMatMul);
REG_TFL_OP(BATCH_TO_SPACE_ND, TFliteOpBatchToSpaceND);
+ REG_TFL_OP(BIDIRECTIONAL_SEQUENCE_LSTM, TFliteOpBidirectionalSequenceLSTM);
REG_TFL_OP(CAST, TFliteOpCast);
REG_TFL_OP(CEIL, TFliteOpCeil);
REG_TFL_OP(CONCATENATION, TFliteOpConcatenation);
@@ -76,6 +77,7 @@ private:
REG_TFL_OP(EQUAL, TFliteOpEqual);
REG_TFL_OP(EXP, TFliteOpExp);
REG_TFL_OP(EXPAND_DIMS, TFliteOpExpandDims);
+ REG_TFL_OP(FAKE_QUANT, TFliteOpFakeQuant);
REG_TFL_OP(FILL, TFliteOpFill);
REG_TFL_OP(FLOOR, TFliteOpFloor);
REG_TFL_OP(FLOOR_DIV, TFliteOpFloorDiv);
diff --git a/compiler/tflchef/tools/console/CMakeLists.txt b/compiler/tflchef/tools/console/CMakeLists.txt
index d9160c3a2..c57e3fdcb 100644
--- a/compiler/tflchef/tools/console/CMakeLists.txt
+++ b/compiler/tflchef/tools/console/CMakeLists.txt
@@ -1,3 +1,15 @@
add_executable(tflchef Driver.cpp)
target_link_libraries(tflchef tflchef_core)
target_link_libraries(tflchef safemain)
+
+install(TARGETS tflchef DESTINATION bin)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
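+# Driver.cpp is compiled into the test binary so that entry_stream() can be called directly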
+GTest_AddTest(tflchef_test Driver.test.cpp Driver.cpp)
+target_link_libraries(tflchef_test tflchef_core)
diff --git a/compiler/tflchef/tools/console/Driver.cpp b/compiler/tflchef/tools/console/Driver.cpp
index d6f7ba1ae..23f2fff3f 100644
--- a/compiler/tflchef/tools/console/Driver.cpp
+++ b/compiler/tflchef/tools/console/Driver.cpp
@@ -22,7 +22,7 @@
#include <iostream>
-int entry(int argc, char **argv)
+int entry_stream(std::istream &is)
{
int32_t model_version = 1;
@@ -30,7 +30,7 @@ int entry(int argc, char **argv)
// Read a model recipe from standard input
{
- google::protobuf::io::IstreamInputStream iis{&std::cin};
+ google::protobuf::io::IstreamInputStream iis{&is};
if (!google::protobuf::TextFormat::Parse(&iis, &model_recipe))
{
std::cerr << "ERROR: Failed to parse recipe" << std::endl;
@@ -56,3 +56,9 @@ int entry(int argc, char **argv)
return 0;
}
+
+int entry(int, char **)
+{
+  // Production entry point: forward standard input to the stream-based implementation above
+ return entry_stream(std::cin);
+}
diff --git a/compiler/tflchef/tools/console/Driver.test.cpp b/compiler/tflchef/tools/console/Driver.test.cpp
new file mode 100644
index 000000000..b3cf2134d
--- /dev/null
+++ b/compiler/tflchef/tools/console/Driver.test.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+// entry_stream() is defined in Driver.cpp; it is re-declared here so the tests can call it
+int entry_stream(std::istream &is);
+
+TEST(TFlChefDriverTest, entry_empty_NEG)
+{
+ std::istringstream empty_input("");
+
+ ASSERT_EQ(0, entry_stream(empty_input));
+}
+
+TEST(TFlChefDriverTest, entry_invalid_NEG)
+{
+  std::istringstream invalid_input("invalid: input");
+
+  ASSERT_NE(0, entry_stream(invalid_input));
+}
+
+TEST(TFlChefDriverTest, entry_invalid_version_NEG)
+{
+  std::istringstream invalid_version("version: 9999");
+
+  ASSERT_NE(0, entry_stream(invalid_version));
+}
diff --git a/compiler/tflchef/tools/file/CMakeLists.txt b/compiler/tflchef/tools/file/CMakeLists.txt
index f411d60f1..e3b7b2f48 100644
--- a/compiler/tflchef/tools/file/CMakeLists.txt
+++ b/compiler/tflchef/tools/file/CMakeLists.txt
@@ -2,3 +2,5 @@ add_executable(tflchef-file Driver.cpp)
target_link_libraries(tflchef-file arser)
target_link_libraries(tflchef-file tflchef_core)
target_link_libraries(tflchef-file safemain)
+
+install(TARGETS tflchef-file DESTINATION bin)
diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp
index 46e5b5583..d4605ced3 100644
--- a/compiler/tflchef/tools/file/Driver.cpp
+++ b/compiler/tflchef/tools/file/Driver.cpp
@@ -29,8 +29,8 @@ int entry(int argc, char **argv)
{
arser::Arser arser;
arser.add_argument("recipe")
- .type(arser::DataType::STR)
- .help("Source recipe file path to convert");
+ .type(arser::DataType::STR)
+ .help("Source recipe file path to convert");
arser.add_argument("tflite").type(arser::DataType::STR).help("Target tflite file path");
try
@@ -67,8 +67,8 @@ int entry(int argc, char **argv)
if (model_version > 1)
{
- std::cerr << "ERROR: Unsupported recipe version: " << model_version << ", '" << argv[1] << "'"
- << std::endl;
+ std::cerr << "ERROR: Unsupported recipe version: " << model_version << ", '" << recipe_path
+ << "'" << std::endl;
return 255;
}
diff --git a/compiler/tflchef/tools/reverse/CMakeLists.txt b/compiler/tflchef/tools/reverse/CMakeLists.txt
index a5c0f5bca..21700faca 100644
--- a/compiler/tflchef/tools/reverse/CMakeLists.txt
+++ b/compiler/tflchef/tools/reverse/CMakeLists.txt
@@ -3,3 +3,5 @@ target_link_libraries(tflchef-reverse arser)
target_link_libraries(tflchef-reverse tflchef_tflite)
target_link_libraries(tflchef-reverse safemain)
target_link_libraries(tflchef-reverse foder)
+
+install(TARGETS tflchef-reverse DESTINATION bin)
diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp
index 4d795a3d0..1451e8bb8 100644
--- a/compiler/tflchef/tools/reverse/Driver.cpp
+++ b/compiler/tflchef/tools/reverse/Driver.cpp
@@ -26,8 +26,8 @@ int entry(int argc, char **argv)
{
arser::Arser arser;
arser.add_argument("tflite")
- .type(arser::DataType::STR)
- .help("Source tflite file path to convert");
+ .type(arser::DataType::STR)
+ .help("Source tflite file path to convert");
arser.add_argument("recipe").type(arser::DataType::STR).help("Target recipe file path");
try
diff --git a/compiler/tfldump/README.md b/compiler/tfldump/README.md
index 50d003f12..65ad105c2 100644
--- a/compiler/tfldump/README.md
+++ b/compiler/tfldump/README.md
@@ -63,5 +63,4 @@ O T(3) ofm
### Dependency
- safemain
-- stdex
- FlatBuffers
diff --git a/compiler/tfldump/src/OpPrinter.cpp b/compiler/tfldump/src/OpPrinter.cpp
index c35848047..4a417cef1 100644
--- a/compiler/tfldump/src/OpPrinter.cpp
+++ b/compiler/tfldump/src/OpPrinter.cpp
@@ -74,6 +74,26 @@ public:
}
};
+class BidirectionalSequenceLSTMPrinter : public OpPrinter
+{
+public:
+ void options(const tflite::Operator *op, std::ostream &os) const override
+ {
+ if (auto *params = op->builtin_options_as_BidirectionalSequenceLSTMOptions())
+ {
+ os << " ";
+ os << "Activation(" << EnumNameActivationFunctionType(params->fused_activation_function())
+ << ") ";
+ os << "cell_clip(" << params->cell_clip() << ") ";
+ os << "proj_clip(" << params->proj_clip() << ") ";
+ os << "time_major(" << params->time_major() << ") ";
+ os << "asymmetric_quantize_inputs(" << params->asymmetric_quantize_inputs() << ") ";
+ os << "merge_outputs(" << params->merge_outputs() << ") ";
+ os << std::endl;
+ }
+ }
+};
+
class CastPrinter : public OpPrinter
{
public:
@@ -277,7 +297,7 @@ public:
os << "Stride.H(" << conv_params->stride_h() << ") ";
os << "DepthMultiplier(" << conv_params->depth_multiplier() << ") ";
os << "Dilation.W(" << conv_params->dilation_w_factor() << ") ";
- os << "Dilation.H(" << conv_params->dilation_h_factor() << ")";
+ os << "Dilation.H(" << conv_params->dilation_h_factor() << ") ";
os << "Activation("
<< EnumNameActivationFunctionType(conv_params->fused_activation_function()) << ") ";
os << std::endl;
@@ -285,6 +305,25 @@ public:
}
};
+class FakeQuantPrinter : public OpPrinter
+{
+public:
+ void options(const tflite::Operator *op, std::ostream &os) const override
+ {
+ if (auto *params = op->builtin_options_as_FakeQuantOptions())
+ {
+ os << " ";
+ os << "Min(" << params->min() << ") ";
+ os << "Max(" << params->max() << ") ";
+ os << "NumBits(" << params->num_bits() << ") ";
+ os << std::boolalpha;
+ os << "NarrowRange(" << params->narrow_range() << ") ";
+ os << std::noboolalpha;
+ os << std::endl;
+ }
+ }
+};
+
class FullyConnectedPrinter : public OpPrinter
{
public:
@@ -672,6 +711,8 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[tflite::BuiltinOperator_ARG_MAX] = make_unique<ArgMaxPrinter>();
_op_map[tflite::BuiltinOperator_ARG_MIN] = make_unique<ArgMinPrinter>();
_op_map[tflite::BuiltinOperator_AVERAGE_POOL_2D] = make_unique<Pool2DPrinter>();
+ _op_map[tflite::BuiltinOperator_BIDIRECTIONAL_SEQUENCE_LSTM] =
+ make_unique<BidirectionalSequenceLSTMPrinter>();
_op_map[tflite::BuiltinOperator_CAST] = make_unique<CastPrinter>();
// There is no Option for CEIL
_op_map[tflite::BuiltinOperator_CONCATENATION] = make_unique<ConcatenationPrinter>();
@@ -680,6 +721,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[tflite::BuiltinOperator_DEPTHWISE_CONV_2D] = make_unique<DepthwiseConv2DPrinter>();
// There is no Option for DEQUANTIZE
_op_map[tflite::BuiltinOperator_DIV] = make_unique<DivPrinter>();
+ _op_map[tflite::BuiltinOperator_FAKE_QUANT] = make_unique<FakeQuantPrinter>();
// There is no Option for FLOOR
// There is no Option for FLOOR_MOD
_op_map[tflite::BuiltinOperator_FULLY_CONNECTED] = make_unique<FullyConnectedPrinter>();
@@ -689,7 +731,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[tflite::BuiltinOperator_L2_NORMALIZATION] = make_unique<L2NormPrinter>();
_op_map[tflite::BuiltinOperator_LEAKY_RELU] = make_unique<LeakyReluPrinter>();
_op_map[tflite::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION] =
- make_unique<LocalResponseNormalizationPrinter>();
+ make_unique<LocalResponseNormalizationPrinter>();
// There is no Option for LOG
// There is no Option for LOGISTIC
// There is no Option for LOG_SOFTMAX
@@ -714,7 +756,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[tflite::BuiltinOperator_RESHAPE] = make_unique<ReshapePrinter>();
_op_map[tflite::BuiltinOperator_RESIZE_BILINEAR] = make_unique<ResizeBilinearPrinter>();
_op_map[tflite::BuiltinOperator_RESIZE_NEAREST_NEIGHBOR] =
- make_unique<ResizeNearestNeighborPrinter>();
+ make_unique<ResizeNearestNeighborPrinter>();
_op_map[tflite::BuiltinOperator_REVERSE_SEQUENCE] = make_unique<ReverseSequencePrinter>();
// There is no Option for ROUND
// There is no Option for SELECT
@@ -735,7 +777,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[tflite::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
// There is no Option for TOPK_V2
_op_map[tflite::BuiltinOperator_UNIDIRECTIONAL_SEQUENCE_LSTM] =
- make_unique<UnidirectionalSequenceLSTMPrinter>();
+ make_unique<UnidirectionalSequenceLSTMPrinter>();
_op_map[tflite::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
_op_map[tflite::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
_op_map[tflite::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt
index b1d1f6149..3e46dd803 100644
--- a/compiler/tflite2circle/CMakeLists.txt
+++ b/compiler/tflite2circle/CMakeLists.txt
@@ -15,5 +15,6 @@ target_link_libraries(tflite2circle safemain)
target_link_libraries(tflite2circle mio_tflite)
target_link_libraries(tflite2circle mio_circle)
target_link_libraries(tflite2circle vconone)
+target_link_libraries(tflite2circle nncc_coverage)
install(TARGETS tflite2circle DESTINATION bin)
diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp
index 2f11e0a13..ba7892179 100644
--- a/compiler/tflite2circle/driver/Driver.cpp
+++ b/compiler/tflite2circle/driver/Driver.cpp
@@ -37,16 +37,16 @@ int entry(int argc, char **argv)
arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"};
arser.add_argument("--version")
- .nargs(0)
- .required(false)
- .default_value(false)
- .help("Show version information and exit")
- .exit_with(print_version);
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
arser.add_argument("tflite")
- .nargs(1)
- .type(arser::DataType::STR)
- .help("Source tflite file path to convert");
+ .nargs(1)
+ .type(arser::DataType::STR)
+ .help("Source tflite file path to convert");
arser.add_argument("circle").nargs(1).type(arser::DataType::STR).help("Target circle file path");
try
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions.h b/compiler/tflite2circle/src/BuildBuiltinOptions.h
index 56a16d4e0..dc6ff086c 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions.h
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions.h
@@ -26,6 +26,7 @@
#include "BuildBuiltinOptions/ArgMinOptions.h"
#include "BuildBuiltinOptions/BatchMatMulOptions.h"
#include "BuildBuiltinOptions/BatchToSpaceNDOptions.h"
+#include "BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.h"
#include "BuildBuiltinOptions/CastOptions.h"
#include "BuildBuiltinOptions/ConcatenationOptions.h"
#include "BuildBuiltinOptions/Conv2DOptions.h"
@@ -36,6 +37,7 @@
#include "BuildBuiltinOptions/EqualOptions.h"
#include "BuildBuiltinOptions/ExpandDimsOptions.h"
#include "BuildBuiltinOptions/ExpOptions.h"
+#include "BuildBuiltinOptions/FakeQuantOptions.h"
#include "BuildBuiltinOptions/FillOptions.h"
#include "BuildBuiltinOptions/FloorDivOptions.h"
#include "BuildBuiltinOptions/FloorModOptions.h"
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/AddOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/AddOptions.cpp
index f93a0f21f..5bdb1020a 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/AddOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/AddOptions.cpp
@@ -29,7 +29,7 @@ flatbuffers::Offset<circle::AddOptions> build_circle_AddOptions(flatbuffers::Fla
assert(tflite_builtin_options);
circle::AddOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/ArgMaxOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/ArgMaxOptions.cpp
index 0ccdde4cb..ac0044a8f 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/ArgMaxOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/ArgMaxOptions.cpp
@@ -29,7 +29,7 @@ build_circle_ArgMaxOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Ope
assert(tflite_builtin_options);
circle::ArgMaxOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_output_type(
- get_circle_tensortype(tflite_builtin_options->output_type()));
+ get_circle_tensortype(tflite_builtin_options->output_type()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/ArgMinOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/ArgMinOptions.cpp
index 204558df8..3011c8b65 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/ArgMinOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/ArgMinOptions.cpp
@@ -29,7 +29,7 @@ build_circle_ArgMinOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Ope
assert(tflite_builtin_options);
circle::ArgMinOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_output_type(
- get_circle_tensortype(tflite_builtin_options->output_type()));
+ get_circle_tensortype(tflite_builtin_options->output_type()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.cpp
new file mode 100644
index 000000000..2a6cf171b
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BidirectionalSequenceLSTMOptions.h"
+#include "DataLookup.h"
+
+#include <cassert>
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::BidirectionalSequenceLSTMOptions>
+build_circle_BidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &fb,
+ const tflite::Operator *op)
+{
+ auto tflite_builtin_options = op->builtin_options_as_BidirectionalSequenceLSTMOptions();
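+  assert(tflite_builtin_options); // null-check to match the other option builders; <cassert> is included above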
+ circle::BidirectionalSequenceLSTMOptionsBuilder builtin_options_builder{fb};
+ builtin_options_builder.add_fused_activation_function(
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ builtin_options_builder.add_cell_clip(tflite_builtin_options->cell_clip());
+ builtin_options_builder.add_proj_clip(tflite_builtin_options->proj_clip());
+ builtin_options_builder.add_time_major(tflite_builtin_options->time_major());
+ builtin_options_builder.add_merge_outputs(tflite_builtin_options->merge_outputs());
+ builtin_options_builder.add_asymmetric_quantize_inputs(
+ tflite_builtin_options->asymmetric_quantize_inputs());
+ return builtin_options_builder.Finish();
+}
+
+} // namespace tflite2circle
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.h b/compiler/tflite2circle/src/BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.h
new file mode 100644
index 000000000..7b77b1cea
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/BidirectionalSequenceLSTMOptions.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BBO_BIDIRECTIONALSEQUENCE_LSTM_OPTIONS_H__
+#define __BBO_BIDIRECTIONALSEQUENCE_LSTM_OPTIONS_H__
+
+#include <mio/tflite/schema_generated.h>
+#include <mio/circle/schema_generated.h>
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::BidirectionalSequenceLSTMOptions>
+build_circle_BidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &fb,
+ const tflite::Operator *op);
+
+} // namespace tflite2circle
+
+#endif // __BBO_BIDIRECTIONALSEQUENCE_LSTM_OPTIONS_H__
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/CastOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/CastOptions.cpp
index bc1445248..0f2422c05 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/CastOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/CastOptions.cpp
@@ -31,9 +31,9 @@ build_circle_CastOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Opera
circle::CastOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_in_data_type(
- get_circle_tensortype(tflite_builtin_options->in_data_type()));
+ get_circle_tensortype(tflite_builtin_options->in_data_type()));
builtin_options_builder.add_out_data_type(
- get_circle_tensortype(tflite_builtin_options->out_data_type()));
+ get_circle_tensortype(tflite_builtin_options->out_data_type()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/ConcatenationOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/ConcatenationOptions.cpp
index 933e7cf66..becc63bf6 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/ConcatenationOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/ConcatenationOptions.cpp
@@ -30,7 +30,7 @@ build_circle_ConcatenationOptions(flatbuffers::FlatBufferBuilder &fb, const tfli
circle::ConcatenationOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_axis(tflite_builtin_options->axis());
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/Conv2DOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/Conv2DOptions.cpp
index ace63dd26..ec0cffeda 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/Conv2DOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/Conv2DOptions.cpp
@@ -32,7 +32,7 @@ build_circle_Conv2DOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Ope
builtin_options_builder.add_stride_w(tflite_builtin_options->stride_w());
builtin_options_builder.add_stride_h(tflite_builtin_options->stride_h());
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
builtin_options_builder.add_dilation_w_factor(tflite_builtin_options->dilation_w_factor());
builtin_options_builder.add_dilation_h_factor(tflite_builtin_options->dilation_h_factor());
return builtin_options_builder.Finish();
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/DepthwiseConv2DOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/DepthwiseConv2DOptions.cpp
index 2aa35abc6..910a6ead9 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/DepthwiseConv2DOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/DepthwiseConv2DOptions.cpp
@@ -33,7 +33,7 @@ build_circle_DepthwiseConv2DOptions(flatbuffers::FlatBufferBuilder &fb, const tf
builtin_options_builder.add_stride_h(tflite_builtin_options->stride_h());
builtin_options_builder.add_depth_multiplier(tflite_builtin_options->depth_multiplier());
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
builtin_options_builder.add_dilation_w_factor(tflite_builtin_options->dilation_w_factor());
builtin_options_builder.add_dilation_h_factor(tflite_builtin_options->dilation_h_factor());
return builtin_options_builder.Finish();
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/DivOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/DivOptions.cpp
index 4272fe144..3678928a5 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/DivOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/DivOptions.cpp
@@ -29,7 +29,7 @@ flatbuffers::Offset<circle::DivOptions> build_circle_DivOptions(flatbuffers::Fla
assert(tflite_builtin_options);
circle::DivOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/FakeQuantOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/FakeQuantOptions.cpp
new file mode 100644
index 000000000..e38600f82
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/FakeQuantOptions.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "FillOptions.h"
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::FakeQuantOptions>
+build_circle_FakeQuantOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *op)
+{
+ auto tflite_builtin_options = op->builtin_options_as_FakeQuantOptions();
+ assert(tflite_builtin_options);
+ circle::FakeQuantOptionsBuilder builtin_options_builder{fb};
+ builtin_options_builder.add_min(tflite_builtin_options->min());
+ builtin_options_builder.add_max(tflite_builtin_options->max());
+ builtin_options_builder.add_num_bits(tflite_builtin_options->num_bits());
+ builtin_options_builder.add_narrow_range(tflite_builtin_options->narrow_range());
+ return builtin_options_builder.Finish();
+}
+
+} // namespace tflite2circle
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/FakeQuantOptions.h b/compiler/tflite2circle/src/BuildBuiltinOptions/FakeQuantOptions.h
new file mode 100644
index 000000000..1f5f12b86
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/FakeQuantOptions.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BBO_FAKEQUANT_OPTIONS_H__
+#define __BBO_FAKEQUANT_OPTIONS_H__
+
+#include <mio/tflite/schema_generated.h>
+#include <mio/circle/schema_generated.h>
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::FakeQuantOptions>
+build_circle_FakeQuantOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *op);
+
+} // namespace tflite2circle
+
+#endif // __BBO_FAKEQUANT_OPTIONS_H__
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/FullyConnectedOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/FullyConnectedOptions.cpp
index 098a96a40..2619b73eb 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/FullyConnectedOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/FullyConnectedOptions.cpp
@@ -29,14 +29,14 @@ build_circle_FullyConnectedOptions(flatbuffers::FlatBufferBuilder &fb, const tfl
assert(tflite_builtin_options);
circle::FullyConnectedOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
// Get FullyConnectedOptionsWeightsFormat
auto tflite_weight_format = tflite_builtin_options->weights_format();
if (tflite_weight_format == tflite::FullyConnectedOptionsWeightsFormat_DEFAULT)
builtin_options_builder.add_weights_format(circle::FullyConnectedOptionsWeightsFormat_DEFAULT);
else if (tflite_weight_format == tflite::FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8)
builtin_options_builder.add_weights_format(
- circle::FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8);
+ circle::FullyConnectedOptionsWeightsFormat_SHUFFLED4x16INT8);
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/L2NormalizeOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/L2NormalizeOptions.cpp
index d58aed83d..f5121a811 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/L2NormalizeOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/L2NormalizeOptions.cpp
@@ -29,7 +29,7 @@ build_circle_L2NormOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Ope
assert(tflite_builtin_options);
circle::L2NormOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/MulOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/MulOptions.cpp
index 009daea8b..3d4b9deb5 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/MulOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/MulOptions.cpp
@@ -29,7 +29,7 @@ flatbuffers::Offset<circle::MulOptions> build_circle_MulOptions(flatbuffers::Fla
assert(tflite_builtin_options);
circle::MulOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/Pool2DOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/Pool2DOptions.cpp
index 6b0bd1288..d796eadfa 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/Pool2DOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/Pool2DOptions.cpp
@@ -34,7 +34,7 @@ build_circle_Pool2DOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Ope
builtin_options_builder.add_filter_width(tflite_builtin_options->filter_width());
builtin_options_builder.add_filter_height(tflite_builtin_options->filter_height());
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/SubOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/SubOptions.cpp
index 2e55f4dab..982f3fd68 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/SubOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/SubOptions.cpp
@@ -29,7 +29,7 @@ flatbuffers::Offset<circle::SubOptions> build_circle_SubOptions(flatbuffers::Fla
assert(tflite_builtin_options);
circle::SubOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.cpp
index 64ceb5a74..6e8143be9 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/UnidirectionalSequenceLSTMOptions.cpp
@@ -29,12 +29,12 @@ build_circle_UnidirectionalSequenceLSTMOptions(flatbuffers::FlatBufferBuilder &f
auto tflite_builtin_options = op->builtin_options_as_UnidirectionalSequenceLSTMOptions();
circle::UnidirectionalSequenceLSTMOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_fused_activation_function(
- get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
+ get_circle_activation_function_type(tflite_builtin_options->fused_activation_function()));
builtin_options_builder.add_cell_clip(tflite_builtin_options->cell_clip());
builtin_options_builder.add_proj_clip(tflite_builtin_options->proj_clip());
builtin_options_builder.add_time_major(tflite_builtin_options->time_major());
builtin_options_builder.add_asymmetric_quantize_inputs(
- tflite_builtin_options->asymmetric_quantize_inputs());
+ tflite_builtin_options->asymmetric_quantize_inputs());
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/UniqueOptions.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/UniqueOptions.cpp
index 96ddc15ad..f7ddeffcb 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions/UniqueOptions.cpp
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/UniqueOptions.cpp
@@ -29,7 +29,7 @@ build_circle_UniqueOptions(flatbuffers::FlatBufferBuilder &fb, const tflite::Ope
assert(tflite_builtin_options);
circle::UniqueOptionsBuilder builtin_options_builder{fb};
builtin_options_builder.add_idx_out_type(
- get_circle_tensortype(tflite_builtin_options->idx_out_type()));
+ get_circle_tensortype(tflite_builtin_options->idx_out_type()));
return builtin_options_builder.Finish();
}
diff --git a/compiler/tflite2circle/src/CircleModel.cpp b/compiler/tflite2circle/src/CircleModel.cpp
index a95c37089..9ab884e75 100644
--- a/compiler/tflite2circle/src/CircleModel.cpp
+++ b/compiler/tflite2circle/src/CircleModel.cpp
@@ -126,13 +126,13 @@ Offset<SubGraphLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_fla
flatbuffers::Offset<flatbuffers::Vector<int32_t>> traversal_order;
flatbuffers::Offset<flatbuffers::Vector<int32_t>> block_map;
flatbuffers::Offset<flatbuffers::Vector<flatbuffers::Offset<circle::DimensionMetadata>>>
- dim_metadata;
+ dim_metadata;
// traversal_order
if (it->sparsity()->traversal_order())
{
auto traversal_order_vec = std::vector<int32_t>{
- it->sparsity()->traversal_order()->begin(), it->sparsity()->traversal_order()->end()};
+ it->sparsity()->traversal_order()->begin(), it->sparsity()->traversal_order()->end()};
traversal_order = fb->CreateVector(traversal_order_vec);
}
@@ -152,16 +152,16 @@ Offset<SubGraphLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_fla
// array_segments
auto tflite_array_segments_type = it->array_segments_type();
auto circle_array_segments =
- get_circle_sparse_index_vector(*fb, it->array_segments(), tflite_array_segments_type);
+ get_circle_sparse_index_vector(*fb, it->array_segments(), tflite_array_segments_type);
auto circle_array_segments_type =
- get_circle_sparse_index_vector_type(tflite_array_segments_type);
+ get_circle_sparse_index_vector_type(tflite_array_segments_type);
// array_indices
auto tflite_array_indices_type = it->array_indices_type();
auto circle_array_indices =
- get_circle_sparse_index_vector(*fb, it->array_indices(), tflite_array_indices_type);
+ get_circle_sparse_index_vector(*fb, it->array_indices(), tflite_array_indices_type);
auto circle_array_indices_type =
- get_circle_sparse_index_vector_type(tflite_array_indices_type);
+ get_circle_sparse_index_vector_type(tflite_array_indices_type);
auto circle_dim_metadata_builder = circle::DimensionMetadataBuilder{*fb};
@@ -184,7 +184,7 @@ Offset<SubGraphLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite_fla
if (it->shape_signature())
{
auto shape_signature_vec =
- std::vector<int32_t>({it->shape_signature()->begin(), it->shape_signature()->end()});
+ std::vector<int32_t>({it->shape_signature()->begin(), it->shape_signature()->end()});
shape_signature = fb->CreateVector(shape_signature_vec);
}
@@ -297,7 +297,7 @@ Offset<OperatorCodeLink>::Offset(FlatBufBuilder &fb, const TFLFlatBufVec *tflite
}
CircleModel::CircleModel(FlatBufBuilder &fb, TFLModel &model)
- : _version{0}, _description{fb->CreateString("nnpackage")}, _fb{fb}
+ : _version{0}, _description{fb->CreateString("nnpackage")}, _fb{fb}
{
const tflite::Model *tfl_model = model.load_model();
// verify flatbuffers
@@ -309,11 +309,11 @@ CircleModel::CircleModel(FlatBufBuilder &fb, TFLModel &model)
}
_operator_codes_offset =
- std::make_unique<Offset<OperatorCodeLink>>(fb, tfl_model->operator_codes());
+ std::make_unique<Offset<OperatorCodeLink>>(fb, tfl_model->operator_codes());
_subGraphs_offset = std::make_unique<Offset<SubGraphLink>>(fb, tfl_model->subgraphs());
_buffers_offset = std::make_unique<Offset<BufferLink>>(fb, tfl_model->buffers());
_metadata_buffer_offset =
- std::make_unique<Offset<MetaDataBufferLink>>(fb, tfl_model->metadata_buffer());
+ std::make_unique<Offset<MetaDataBufferLink>>(fb, tfl_model->metadata_buffer());
model_build();
}
diff --git a/compiler/tflite2circle/src/DataLookup.cpp b/compiler/tflite2circle/src/DataLookup.cpp
index f8dd75f4c..c5ed62e31 100644
--- a/compiler/tflite2circle/src/DataLookup.cpp
+++ b/compiler/tflite2circle/src/DataLookup.cpp
@@ -148,7 +148,7 @@ get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb, const void *v
{
const tflite::Int32Vector *i32_array = static_cast<const tflite::Int32Vector *>(v_array);
auto values_vec_int32 =
- std::vector<int32_t>{i32_array->values()->begin(), i32_array->values()->end()};
+ std::vector<int32_t>{i32_array->values()->begin(), i32_array->values()->end()};
auto values_int32 = fb.CreateVector(values_vec_int32);
circle::Int32VectorBuilder int32_vector_builder{fb};
int32_vector_builder.add_values(values_int32);
@@ -158,7 +158,7 @@ get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb, const void *v
{
const tflite::Uint16Vector *u16_array = static_cast<const tflite::Uint16Vector *>(v_array);
auto values_vec_uint16 =
- std::vector<uint16_t>{u16_array->values()->begin(), u16_array->values()->end()};
+ std::vector<uint16_t>{u16_array->values()->begin(), u16_array->values()->end()};
auto values_uint16 = fb.CreateVector(values_vec_uint16);
circle::Uint16VectorBuilder uint16_vector_builder{fb};
uint16_vector_builder.add_values(values_uint16);
@@ -168,7 +168,7 @@ get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb, const void *v
{
const tflite::Uint8Vector *u8_array = static_cast<const tflite::Uint8Vector *>(v_array);
auto values_vec_uint8 =
- std::vector<uint8_t>{u8_array->values()->begin(), u8_array->values()->end()};
+ std::vector<uint8_t>{u8_array->values()->begin(), u8_array->values()->end()};
auto values_uint8 = fb.CreateVector(values_vec_uint8);
circle::Uint8VectorBuilder uint8_vector_builder{fb};
uint8_vector_builder.add_values(values_uint8);
diff --git a/compiler/tflite2circle/src/DataLookup.h b/compiler/tflite2circle/src/DataLookup.h
index 58a357703..601d014dd 100644
--- a/compiler/tflite2circle/src/DataLookup.h
+++ b/compiler/tflite2circle/src/DataLookup.h
@@ -27,19 +27,19 @@ namespace tflite2circle
* @brief Returns circle builtin_code according to tflite.
*
* @note You can see a list of currently supported BuiltinOperator in TFLOperator.lst file.
-*/
+ */
circle::BuiltinOperator get_circle_builtin_code(tflite::BuiltinOperator tfl_bop);
/**
* @brief Returns circle TensorType according to tflite.
*
* @note You can see a list of currently supported TensorType in TFLTensorType.lst file.
-*/
+ */
circle::TensorType get_circle_tensortype(tflite::TensorType tfl_tt);
/**
* @brief Returns circle Padding enum according to tflite.
-*/
+ */
circle::Padding get_circle_padding(tflite::Padding tfl_p);
/**
@@ -47,7 +47,7 @@ circle::Padding get_circle_padding(tflite::Padding tfl_p);
*
* @note You can see a list of currently supported ActivationFunctionType in
* TFLActivationFunctionType.lst file.
-*/
+ */
circle::ActivationFunctionType
get_circle_activation_function_type(tflite::ActivationFunctionType tfl_aft);
@@ -60,7 +60,7 @@ get_circle_activation_function_type(tflite::ActivationFunctionType tfl_aft);
* This function calls the build_circle_##BuiltinOptions internally(e.g.
* build_circle_AbsOptions, build_circle_AddOptions, etc.), so refer to it for a more
* detailed implementation.
-*/
+ */
flatbuffers::Offset<void> get_circle_builtin_options(flatbuffers::FlatBufferBuilder &fb,
const tflite::Operator *op);
@@ -68,29 +68,29 @@ flatbuffers::Offset<void> get_circle_builtin_options(flatbuffers::FlatBufferBuil
* @brief Returns circle builtin_options_type according to tflite.
*
* @note You can see a list of currently supported BuiltinOptions in TFLBuiltinOptions.lst file.
-*/
+ */
circle::BuiltinOptions get_circle_builtin_options_type(const tflite::Operator *op);
/**
* @brief Returns circle MirrorPadMode according to tflite.
-*/
+ */
circle::MirrorPadMode get_circle_mirrorpad_mode(tflite::MirrorPadMode tfl_mode);
/**
* @brief Returns circle DimensionType according to tflite.
-*/
+ */
circle::DimensionType get_circle_dimension_type(tflite::DimensionType tfl_dim_type);
/**
* @brief Returns circle SparseIndexVector according to tflite.
-*/
+ */
flatbuffers::Offset<void>
get_circle_sparse_index_vector(flatbuffers::FlatBufferBuilder &fb, const void *values,
const tflite::SparseIndexVector &tfl_sparse_index_vector_type);
/**
* @brief Returns circle SparseIndexVector type according to tflite.
-*/
+ */
circle::SparseIndexVector
get_circle_sparse_index_vector_type(const tflite::SparseIndexVector &tfl_sparse_index_vector_type);
diff --git a/compiler/tflite2circle/src/TFLBuiltinOptions.lst b/compiler/tflite2circle/src/TFLBuiltinOptions.lst
index 4bc101f8e..f2de7e046 100644
--- a/compiler/tflite2circle/src/TFLBuiltinOptions.lst
+++ b/compiler/tflite2circle/src/TFLBuiltinOptions.lst
@@ -63,7 +63,7 @@ TFL_BUILTIN_OPTIONS(NotEqualOptions)
TFL_BUILTIN_OPTIONS(ShapeOptions)
TFL_BUILTIN_OPTIONS(PowOptions)
TFL_BUILTIN_OPTIONS(ArgMinOptions)
-//TFL_BUILTIN_OPTIONS(FakeQuantOptions)
+TFL_BUILTIN_OPTIONS(FakeQuantOptions)
TFL_BUILTIN_OPTIONS(PackOptions)
TFL_BUILTIN_OPTIONS(LogicalOrOptions)
TFL_BUILTIN_OPTIONS(OneHotOptions)
@@ -74,7 +74,7 @@ TFL_BUILTIN_OPTIONS(FloorDivOptions)
TFL_BUILTIN_OPTIONS(SquareOptions)
TFL_BUILTIN_OPTIONS(ZerosLikeOptions)
TFL_BUILTIN_OPTIONS(FillOptions)
-//TFL_BUILTIN_OPTIONS(BidirectionalSequenceLSTMOptions)
+TFL_BUILTIN_OPTIONS(BidirectionalSequenceLSTMOptions)
//TFL_BUILTIN_OPTIONS(BidirectionalSequenceRNNOptions)
TFL_BUILTIN_OPTIONS(UnidirectionalSequenceLSTMOptions)
TFL_BUILTIN_OPTIONS(FloorModOptions)
diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt
index 595bbfd99..882f6507b 100644
--- a/compiler/vconone/CMakeLists.txt
+++ b/compiler/vconone/CMakeLists.txt
@@ -1,5 +1,5 @@
if (NOT VCONONE_VERSION)
- set(VCONONE_VERSION 0x00000000000c0001)
+ set(VCONONE_VERSION 0x00000000000f0001)
# NOTE order is [build patch minor major]
# if VCONONE_VERSION is set with -D option, it will be cached
# you may have to remove cache file if you remove -D option
diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp
index 9b693c621..d94a7ada6 100644
--- a/compiler/vconone/src/version.cpp
+++ b/compiler/vconone/src/version.cpp
@@ -54,7 +54,7 @@ std::string get_string(void)
std::string get_copyright(void)
{
std::string str;
- str = "Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved\r\n";
+ str = "Copyright (c) 2020-2021 Samsung Electronics Co., Ltd. All Rights Reserved\r\n";
str += "Licensed under the Apache License, Version 2.0\r\n";
str += "https://github.com/Samsung/ONE";
return str;
diff --git a/compute/.clang-format b/compute/.clang-format
deleted file mode 120000
index 0ff66f331..000000000
--- a/compute/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../.clang-format.8 \ No newline at end of file
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
index 4a3717885..d3e116381 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/CLKernelLibraryEx.h
@@ -264,5 +264,5 @@ private:
_program_source_map; /**< Contains sources for all programs.
Used for compile-time kernel inclusion. >*/
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLKERNELLIBRARY_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
index a0aa0560b..46d4ae858 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h
@@ -40,7 +40,7 @@
#ifndef ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
#define ARM_COMPUTE_CLARGMINMAXLAYERKERNELEX_H
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
index bb6fcb8f5..eac866b67 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h
@@ -41,8 +41,8 @@
#ifndef __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
#define __ARM_COMPUTE_CLBINARYLOGICALOPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/TypesEx.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
index ed668fd9c..cf671102e 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastBoolKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
#define __ARM_COMPUTE_CLCASTBOOLKERNEL_H__
-#include "arm_compute/core/CL/ICLSimple3DKernel.h"
+#include "src/core/CL/ICLSimple3DKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
index fb689f747..6729fb0f1 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
#define __ARM_COMPUTE_CLEMBEDDINGLOOKUPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 000000000..64908ab59
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H
+#define ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H
+
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+/** Interface to add a bias to each row of the input tensor
+ *
+ */
+class CLGEMMMatrixAccumulateBiasesKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLGEMMMatrixAccumulateBiasesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAccumulateBiasesKernel(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLGEMMMatrixAccumulateBiasesKernel &
+ operator=(const CLGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAccumulateBiasesKernel(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLGEMMMatrixAccumulateBiasesKernel &operator=(CLGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types
+ * supported: Same as @p input
+ */
+ void configure(ICLTensor *accum, const ICLTensor *biases);
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in, out] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data
+ * types supported: Same as @p input
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *accum,
+ const ICLTensor *biases);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLGEMMMatrixAccumulateBiasesKernel
+ *
+ * @param[in] accum The accumulate tensor to convert. Data types supported: F16/F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D tensor. Data types
+ * supported: Same as @p input
+ * @param[in] gpu_target GPU target
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *accum, const ITensorInfo *biases, GPUTarget gpu_target);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_accum;
+ const ICLTensor *_biases;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLGEMMMatrixAccumulateBiasesKernel_H */
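For orientation, here is a minimal usage sketch of the kernel declared above. The shapes, the CLScheduler::get().default_init() bootstrap, and the enqueue/sync calls are illustrative assumptions drawn from the usual ARM Compute CL workflow, not part of this patch:

    #include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
      CLScheduler::get().default_init(); // default OpenCL context and queue

      CLTensor accum, biases;
      accum.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32)); // 4 rows of 16
      biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));    // one bias per column

      CLGEMMMatrixAccumulateBiasesKernel kernel;
      kernel.configure(&accum, &biases); // in-place: accum += biases, row by row

      accum.allocator()->allocate();
      biases.allocator()->allocate();
      // ... map and fill both tensors here ...

      CLScheduler::get().enqueue(kernel);
      CLScheduler::get().sync();
      return 0;
    }

Note that validate() also takes a GPUTarget, so a configuration can be checked against the device before any OpenCL allocation happens.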
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
index 6630c7be7..a55f2401d 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGatherExKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLGATHEREXKERNEL_H__
#define __ARM_COMPUTE_CLGATHEREXKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
index 96f830898..f9d6f7cc5 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLHashtableLookupKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
#define __ARM_COMPUTE_CLHASHTABLELOOKUPKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
index f57e799ad..7da9e9a4c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNELEX_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h
new file mode 100644
index 000000000..4befdd05c
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMemsetKernel.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLMEMSETKERNEL_H
+#define ARM_COMPUTE_CLMEMSETKERNEL_H
+
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/CL/ICLKernel.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for filling the planes of a tensor */
+class CLMemsetKernel : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLMemsetKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMemsetKernel(const CLMemsetKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLMemsetKernel &operator=(const CLMemsetKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ CLMemsetKernel(CLMemsetKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ CLMemsetKernel &operator=(CLMemsetKernel &&) = default;
+ /** Default destructor */
+ ~CLMemsetKernel() = default;
+
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in,out] tensor Input tensor to fill. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default
+ * is nullptr.
+ */
+ void configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window = nullptr);
+ /** Initialise the kernel's tensor and filling value
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] tensor Input tensor to fill. Supported data types: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default
+ * is nullptr.
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *tensor,
+ const PixelValue &constant_value, Window *window = nullptr);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLMemsetKernel
+ *
+ * @param[in] tensor Source tensor info. Data types supported: All.
+ * @param[in] constant_value The value used to fill the planes of the tensor
+ * @param[in] window Window to be used in case setting only part of a tensor. Default is
+ * nullptr.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value,
+ Window *window = nullptr);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ ICLTensor *_tensor;
+ Window _full_window;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLMEMSETKERNEL_H */
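A similar hedged sketch for this kernel; the 8x8 shape and the zero fill value are assumptions for illustration:

    #include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
      CLScheduler::get().default_init();

      CLTensor t;
      t.allocator()->init(TensorInfo(TensorShape(8U, 8U), 1, DataType::F32));

      CLMemsetKernel memset_kernel;
      memset_kernel.configure(&t, PixelValue(0.f)); // no Window given: fill the whole tensor

      t.allocator()->allocate();
      CLScheduler::get().enqueue(memset_kernel); // every element of t becomes 0.0f
      CLScheduler::get().sync();
      return 0;
    }

Per the parameter documentation above, passing a Window to configure() restricts the fill to a sub-region of the tensor.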
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
index 90e8b5705..5394a062c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
#define __ARM_COMPUTE_CLMULTIPLYSCALEFACTORKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
index fa383c0d0..384050aff 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLNegKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLNEGKERNEL_H__
#define __ARM_COMPUTE_CLNEGKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
index a512057b9..1d64f9f7d 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLOneHotKernel.h
@@ -39,7 +39,7 @@
*/
#ifndef __ARM_COMPUTE_CLONEHOTKERNEL_H__
#define __ARM_COMPUTE_CLONEHOTKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h
new file mode 100644
index 000000000..d4230aaf3
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPadLayerKernelEx.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPADLAYERKERNELEX_H
+#define ARM_COMPUTE_CLPADLAYERKERNELEX_H
+
+#include "src/core/CL/ICLKernel.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Interface for the PadLayer function. */
+class CLPadLayerKernelEx : public ICLKernel
+{
+public:
+ /** Default constructor */
+ CLPadLayerKernelEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernelEx(const CLPadLayerKernelEx &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerKernelEx &operator=(const CLPadLayerKernelEx &) = delete;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernelEx(CLPadLayerKernelEx &&) = default;
+ /** Allow instances of this class to be moved */
+ CLPadLayerKernelEx &operator=(CLPadLayerKernelEx &&) = default;
+ /** Default destructor */
+ ~CLPadLayerKernelEx() = default;
+ /** Set the input and output tensor.
+ *
+ * @param[in] input Source tensor. Data types supported: U8, S8, QASYMM8,
+ * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Set the input and output tensor.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The
+ * pair padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPadLayerKernelEx
+ *
+ * @param[in] input Source tensor info. Data types supported: U8, S8, QASYMM8,
+ * QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32.
+ * @param[in] output Output tensor info. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ // Inherited methods overridden:
+ void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ const ICLTensor *_input;
+ ICLTensor *_output;
+ int _input_start_x;
+ int _input_start_y;
+ bool _4d_enabled;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPADLAYERKERNELEX_H */
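A kernel-level pad sketch with assumed shapes; the validate-then-configure order follows the declarations above:

    #include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    int main()
    {
      CLScheduler::get().default_init();

      const PaddingList padding = {{1, 1}, {1, 1}}; // one element front and back in both dimensions

      CLTensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(6U, 6U), 1, DataType::F32)); // 4 + 1 + 1 per side

      Status s = CLPadLayerKernelEx::validate(in.info(), out.info(), padding);
      if (s.error_code() != ErrorCode::OK)
        return 1;

      CLPadLayerKernelEx pad;
      pad.configure(&in, &out, padding, PixelValue(0.f), PaddingMode::CONSTANT);

      in.allocator()->allocate();
      out.allocator()->allocate();
      CLScheduler::get().enqueue(pad);
      CLScheduler::get().sync();
      return 0;
    }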
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
index 4e1b56cba..3f60db7bb 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
#define __ARM_COMPUTE_CLQUANTIZATIONSYMMETRICKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
index 4f9042e41..548f29a27 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLReduceOperationKernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
#define __ARM_COMPUTE_CLREDUCEOPERATIONKERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
index 4d4478ece..5f5b7f9b8 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
#define __ARM_COMPUTE_CLSCALEFACTORSYMM8KERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
index aa4a14812..09073af7c 100644
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTopKV2Kernel.h
@@ -47,7 +47,7 @@
#ifndef __ARM_COMPUTE_CLTOPKV2KERNEL_H__
#define __ARM_COMPUTE_CLTOPKV2KERNEL_H__
-#include "arm_compute/core/CL/ICLKernel.h"
+#include "src/core/CL/ICLKernel.h"
// these parameters can be changed
#define _ITEMS 16 // number of items in a group
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
index 8c544cda8..c46b26170 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h
@@ -41,15 +41,19 @@
#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
#define __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__
-#include "arm_compute/core/NEON/kernels/NEElementwiseOperationKernel.h"
#include "arm_compute/core/TypesEx.h"
+#include "src/core/cpu/kernels/CpuElementwiseKernel.h"
+
namespace arm_compute
{
-class NEBinaryLogicalOperationKernel : public NEElementwiseOperationKernel
+class NEBinaryLogicalOperationKernel : public cpu::kernels::CpuComparisonKernel
{
public:
+ const char *name() const override { return "NEBinaryLogicalOperationKernel"; }
+
+ NEBinaryLogicalOperationKernel() = default;
/** Default destructor */
~NEBinaryLogicalOperationKernel() = default;
@@ -81,6 +85,10 @@ protected:
// Inherited methods overridden:
static Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2,
const ITensorInfo &output);
+
+ std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output,
+ const Window &window)>
+ _function;
};
} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEBINARYLOGICALOPERATIONKERNEL_H__ */
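The _function member added in this hunk is the usual select-once, call-many dispatch idiom: configuration picks a concrete element-wise routine and run() merely forwards to it. A self-contained illustration of the idiom with simplified stand-in types (not the actual kernel code):

    #include <functional>
    #include <iostream>

    // Stand-ins for ITensor and Window, only to show the shape of the pattern.
    struct Tensor { int value; };
    struct Window {};

    class BinaryOpKernel
    {
    public:
      // configure() selects the concrete routine exactly once...
      void configure_logical_and()
      {
        _function = [](const Tensor *a, const Tensor *b, Tensor *out, const Window &) {
          out->value = a->value & b->value;
        };
      }
      // ...and run() only invokes whatever was selected.
      void run(const Tensor *a, const Tensor *b, Tensor *out, const Window &w)
      {
        _function(a, b, out, w);
      }

    private:
      std::function<void(const Tensor *, const Tensor *, Tensor *, const Window &)> _function;
    };

    int main()
    {
      Tensor a{0b1100}, b{0b1010}, out{0};
      BinaryOpKernel k;
      k.configure_logical_and();
      k.run(&a, &b, &out, Window{});
      std::cout << out.value << '\n'; // prints 8 (0b1000)
      return 0;
    }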
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
index 101f6ac8e..036d56e69 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastBoolKernel.h
@@ -40,7 +40,7 @@
#ifndef __ARM_COMPUTE_NECASTBOOLKERNEL_H__
#define __ARM_COMPUTE_NECASTBOOLKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
index 88f21c96e..621500eb8 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
#define __ARM_COMPUTE_NEEMBEDDINGLOOKUPKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
new file mode 100644
index 000000000..f8f7ac567
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
+#define ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H
+
+#include "src/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+/** NEON kernel to add a bias to each row of the input tensor */
+class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel
+{
+public:
+ const char *name() const override { return "NEGEMMMatrixAccumulateBiasesKernel"; }
+ /** Default constructor */
+ NEGEMMMatrixAccumulateBiasesKernel();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMMatrixAccumulateBiasesKernel &
+ operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Allow instances of this class to be moved */
+ NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default;
+ /** Default destructor */
+ ~NEGEMMMatrixAccumulateBiasesKernel() = default;
+ /** Set the accumulate buffer and the biases of the kernel.
+ *
+ * @param[in, out] accum The accumulate tensor to convert. Data type supported: F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type
+ * supported: Same as @p input
+ */
+ void configure(ITensor *accum, const ITensor *biases);
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * NEGEMMMatrixAccumulateBiasesKernel
+ *
+ * @param[in] accum The accumulate tensor to convert. Data type supported: F32
+ * @param[in] biases The shared biases tensor to append. It must be 1D Tensor. Data type
+ * supported: Same as @p input
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *accum, const ITensorInfo *biases);
+
+ // Inherited methods overridden:
+ void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+ ITensor *_accum;
+ const ITensor *_biases;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H */
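A matching NEON-side usage sketch; shapes and the NEScheduler::get().schedule() call are assumptions taken from the standard ARM Compute NEON workflow:

    #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Window.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
      Tensor accum, biases;
      accum.allocator()->init(TensorInfo(TensorShape(16U, 4U), 1, DataType::F32));
      biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32)); // 1D, F32 as documented

      NEGEMMMatrixAccumulateBiasesKernel kernel;
      kernel.configure(&accum, &biases);

      accum.allocator()->allocate();
      biases.allocator()->allocate();
      // ... fill both tensors here ...

      NEScheduler::get().schedule(&kernel, Window::DimY); // split the work across rows
      return 0;
    }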
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
index 5acfde5a8..a03e08ade 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEGatherKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEGATHERKERNELEX_H__
#define __ARM_COMPUTE_NEGATHERKERNELEX_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
index cb2a485d5..fb3a72725 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
#define __ARM_COMPUTE_NEHASHTABLELOOKUPKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
index 8724cc69b..1d786b59e 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
#define __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNELEX_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
index 198b0be9d..ab534fe96 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
#define __ARM_COMPUTE_NEMULTIPLYSCALEFACTORKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
index 963d7b821..c1c9f7a3c 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEOneHotKernel.h
@@ -39,7 +39,7 @@
*/
#ifndef __ARM_COMPUTE_NEONEHOTKERNEL_H__
#define __ARM_COMPUTE_NEONEHOTKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Types.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
index 0b080cf73..1fd5362ae 100644
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
+++ b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h
@@ -41,7 +41,7 @@
#ifndef __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
#define __ARM_COMPUTE_NEQUANTIZATIONSYMMETRICKERNEL_H__
-#include "arm_compute/core/NEON/INEKernel.h"
+#include "src/core/NEON/INEKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
index d57e8fcf5..d7ec1b4f0 100644
--- a/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
+++ b/compute/ARMComputeEx/arm_compute/core/UtilsEx.h
@@ -67,5 +67,5 @@ transposeconv_output_dimensions(unsigned int in_width, unsigned int in_height,
unsigned int kernel_width, unsigned int kernel_height,
const PadStrideInfo &info, unsigned int invalid_right,
unsigned int invalid_top);
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_UTILSEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index 484ebfd0b..664b8b3b1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -26,6 +26,7 @@
#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
#include <arm_compute/runtime/CL/functions/CLNeg.h>
#include <arm_compute/runtime/CL/functions/CLOneHot.h>
+#include <arm_compute/runtime/CL/functions/CLPadLayerEx.h>
#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
#include <arm_compute/runtime/CL/functions/CLSplitVEx.h>
#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
index b1ee52bf9..05bcc4075 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgMinMaxLayerEx.h
@@ -41,8 +41,9 @@
#define __ARM_COMPUTE_CLARGMINMAXLAYEREX_H__
#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
+
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -100,7 +101,7 @@ private:
std::vector<CLTensor> _results_vector;
CLTensor _not_reshaped_output;
std::vector<CLArgMinMaxLayerKernelEx> _reduction_kernels_vector;
- CLReshapeLayerKernel _reshape_kernel;
+ CLReshapeLayer _reshape_kernel;
unsigned int _num_of_stages;
unsigned int _reduction_axis;
};
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
index 88a9b00ec..fc4322798 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h
@@ -43,6 +43,7 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
#include "arm_compute/core/TypesEx.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
index d6150684a..854ddce52 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCastBool.h
@@ -67,5 +67,5 @@ public:
*/
void configure(ICLTensor *input, ICLTensor *output);
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_CLCASTBOOL_H */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
index fbee7e40e..b0149cb09 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLEmbeddingLookup.h
@@ -73,5 +73,5 @@ public:
*/
void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *lookups);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
index f3266f688..c75ae9a50 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
@@ -43,14 +43,14 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
namespace arm_compute
{
@@ -182,5 +182,5 @@ private:
bool _is_prepared;
const ICLTensor *_original_weights;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLFULLYCONNECTEDHYBRIDLAYER_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
index f27e9913e..c08da526a 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h
@@ -43,16 +43,14 @@
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
#include "arm_compute/runtime/CL/functions/CLGEMM.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "src/core/CL/kernels/CLTransposeKernel.h"
namespace arm_compute
{
@@ -132,9 +130,6 @@ private:
* transpose_weights is set to true ) (called once)
* -# @ref CLGEMMMatrixMultiplyKernel or @ref CLGEMMLowpMatrixMultiplyCore (if quantized
* asymmetric)
- * -# @ref CLGEMMMatrixAccumulateBiasesKernel or @ref
- * CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if quantized asymmetric) (if @p biases is
- * not equal to nullptr)
*
* @note The fully connected layer accepts "weights" tensors only with 2 dimensions.
*/
@@ -157,40 +152,36 @@ public:
* @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
* @param[in] weights Weights tensor. The weights must be 2 dimensional.
* If this function is called after a Convolution Layer, the (transposed)
- * weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed)
- * weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
+ * weights will have as many rows as the product of the first 3 input's dimensions. If it is
+ * called after another FullyConnected Layer, the (transposed) weights will have as many rows as
+ * the input's first dimension. Data type supported: Same as @p input.
* @param[in] biases Bias tensor. Can be nullptr. Data type supported:Same as @p input.
* @param[out] output Destination tensor. Its shape should be equal to the output of a matrix
* multiplication between:
* - The output of im2col on the input and the (transposed) 2D weights, if the
* function is called after a Convolution Layer
* - The input tensor and the (transposed) 2D weights, if the function is
- * called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
+ * called after another FullyConnected Layer. Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*/
void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
ICLTensor *output, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
/** Static function to check if given info will lead to a valid configuration of @ref
- * CLFullyConnectedLayerEx
+ * CLFullyConnectedLayer
*
* @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
* @param[in] weights Weights tensor info. The weights must be 2 dimensional.
* If this function is called after a Convolution Layer, the (transposed)
- * weights will have as many rows as the product of the first 3 input's dimensions.
- * If it is called after another FullyConnected Layer, the (transposed)
- * weights will have as many rows as the input's first dimension.
- * Data type supported: Same as @p input.
+ * weights will have as many rows as the product of the first 3 input's dimensions. If it is
+ * called after another FullyConnected Layer, the (transposed) weights will have as many rows as
+ * the input's first dimension. Data type supported: Same as @p input.
* @param[in] biases Bias tensor info. Can be nullptr. Data type supported:Same as @p input.
* @param[out] output Destination tensor info. Its shape should be equal to the output of a
* matrix multiplication between:
* - The output of im2col on the input and the (transposed) 2D weights, if the
* function is called after a Convolution Layer
* - The input tensor and the (transposed) 2D weights, if the function is
- * called after another FullyConnected Layer.
- * Data type supported: Same as @p input.
+ * called after another FullyConnected Layer. Data type supported: Same as @p input.
* @param[in] fc_info (Optional) Fully connected layer additional info
*
* @return a status
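Given the configure() contract documented above, a function-level usage sketch; the 128-to-64 layer shapes and the default FullyConnectedLayerInfo are illustrative assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h"

    using namespace arm_compute;

    int main()
    {
      CLScheduler::get().default_init();

      CLTensor input, weights, biases, output;
      input.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));        // 128 features
      weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32)); // 2D weights
      biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

      CLFullyConnectedLayerEx fc;
      fc.configure(&input, &weights, &biases, &output); // default FullyConnectedLayerInfo

      for (auto *t : {&input, &weights, &biases, &output})
        t->allocator()->allocate();
      // ... fill input, weights and biases here ...

      fc.run();
      CLScheduler::get().sync();
      return 0;
    }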
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
index 167554c9e..385eb0b2c 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGatherEx.h
@@ -47,11 +47,14 @@
#ifndef __ARM_COMPUTE_CLGATHEREX_H__
#define __ARM_COMPUTE_CLGATHEREX_H__
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/**
* @brief Class to run @ref CLGatherKernel.
@@ -81,5 +84,5 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *indices,
const ITensorInfo *output, int axis = 0);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLGATHEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
index 6618f5aa4..5e172a4c7 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLHashtableLookup.h
@@ -78,5 +78,5 @@ public:
void configure(const ICLTensor *lookups, const ICLTensor *keys, const ICLTensor *intput,
ICLTensor *output, ICLTensor *hits);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
index 887e7aaa5..02ae6d719 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h
@@ -41,11 +41,14 @@
#ifndef __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
#define __ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYEREX_H__
+#include "arm_compute/core/Error.h"
#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
namespace arm_compute
{
+class CLCompileContext;
class ICLTensor;
+class ITensorInfo;
/** Basic function to perform an Instance normalization.
*
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
index 2bbfca821..62a36f06d 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLOneHot.h
@@ -39,9 +39,11 @@
*/
#ifndef __ARM_COMPUTE_CLONEHOT_H__
#define __ARM_COMPUTE_CLONEHOT_H__
-#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/runtime/IFunction.h"
+
namespace arm_compute
{
class ICLTensor;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
new file mode 100644
index 000000000..ee1879aaa
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPadLayerEx.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CLPADLAYEREX_H
+#define ARM_COMPUTE_CLPADLAYEREX_H
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+#include "src/core/gpu/cl/kernels/ClCopyKernel.h"
+// #include "arm_compute/runtime/CL/functions/CLCopy.h"
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+
+/** Basic function to pad a tensor. This function calls the following OpenCL functions/kernels:
+ *
+ * -# @ref CLPadLayerKernelEx if there is padding to be added
+ * -# @ref CLCopyKernel otherwise
+ */
+class CLPadLayerEx : public IFunction
+{
+public:
+ /** Default constructor */
+ CLPadLayerEx();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerEx(const CLPadLayerEx &) = delete;
+ /** Default move constructor */
+ CLPadLayerEx(CLPadLayerEx &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLPadLayerEx &operator=(const CLPadLayerEx &) = delete;
+ /** Default move assignment operator */
+ CLPadLayerEx &operator=(CLPadLayerEx &&) = default;
+
+ /** Initialize the function
+ *
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+ /** Initialize the function
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] input Source tensor. Data types supported: All.
+ * @param[out] output Output tensor. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The
+ * pair padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding.
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLPadLayerEx.
+ *
+ * @param[in] input Source tensor info. Data types supported: All.
+ * @param[in] output Output tensor info. Data type supported: same as @p input
+ * @param[in] padding The padding for each spatial dimension of the input tensor. The pair
+ * padding[i] specifies the front and the end padding in the i-th dimension.
+ * @param[in] constant_value (Optional) Constant value to be used for the padding
+ * @param[in] mode (Optional) Controls whether the padding should be filled with @p
+ * constant_value using CONSTANT, or reflect the input, either including the border values
+ * (SYMMETRIC) or not (REFLECT).
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value = PixelValue(),
+ PaddingMode mode = PaddingMode::CONSTANT);
+
+ // Inherited methods overridden:
+ void run() override;
+
+private:
+ void configure_reflect_mode(ICLTensor *input, ICLTensor *output);
+
+ std::unique_ptr<CLPadLayerKernelEx> _pad_kernel;
+ std::unique_ptr<opencl::kernels::ClCopyKernel> _copy_kernel;
+ bool _perform_pad;
+};
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CLPADLAYEREX_H */
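Finally, a function-level sketch for the new pad function; shapes and fill value are assumed, and the comment on run() paraphrases the kernel-selection note in the class documentation:

    #include "arm_compute/core/PixelValue.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"

    using namespace arm_compute;

    int main()
    {
      CLScheduler::get().default_init();

      CLTensor in, out;
      in.allocator()->init(TensorInfo(TensorShape(4U, 4U), 1, DataType::F32));
      out.allocator()->init(TensorInfo(TensorShape(6U, 6U), 1, DataType::F32)); // 4x4 plus 1 on each side

      CLPadLayerEx pad;
      pad.configure(&in, &out, PaddingList{{1, 1}, {1, 1}}, PixelValue(0.f));

      in.allocator()->allocate();
      out.allocator()->allocate();
      // ... fill `in` here ...

      pad.run(); // CLPadLayerKernelEx when there is padding, ClCopyKernel otherwise
      CLScheduler::get().sync();
      return 0;
    }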
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
index bb852e404..45eb72bef 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLReduceOperation.h
@@ -116,5 +116,5 @@ private:
std::unique_ptr<CLReduceOperationKernel[]> _reduce_kernels{nullptr};
CLReshapeLayer _reshape;
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_CLREDUCEOPERATION_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
index bb741d98d..3023df3f0 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSplitVEx.h
@@ -46,6 +46,9 @@
#include <vector>
#include <memory>
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/runtime/CPP/functions/CPPSplit.h"
+
namespace arm_compute
{
class ICLTensor;
@@ -82,5 +85,5 @@ private:
unsigned int _num_splits;
std::vector<CLSlice> _slice_functions;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLSPLITVEX__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
index e301a5152..f426a4d75 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTopKV2.h
@@ -160,5 +160,5 @@ private:
CLTopKV2Store _store_kernel;
#endif
};
-}
+} // namespace arm_compute
#endif // __ARM_COMPUTE_CLTOPK_V2_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
index efc296d6c..d0ddc2609 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -16,7 +16,6 @@
#ifndef __ARM_COMPUTE_NEFUNCTIONSEX_H__
#define __ARM_COMPUTE_NEFUNCTIONSEX_H__
-#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
#include <arm_compute/runtime/NEON/functions/NECastBool.h>
#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
index 026d30098..8d931f08d 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h
@@ -41,8 +41,10 @@
#ifndef __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
#define __ARM_COMPUTE_NEBINARYLOGICALOPERATION_H__
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/TypesEx.h"
#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/core/ITensorInfo.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
index c8b08af8d..dd62645ee 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECastBool.h
@@ -41,16 +41,17 @@
#define __ARM_COMPUTE_NECASTBOOL_H__
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/**
- * @brief Class to run @ref NECastBoolKernel.
+ * @brief Class to run @ref INESimpleFunctionNoBorder.
*/
-class NECastBool : public INESimpleFunction
+class NECastBool : public INESimpleFunctionNoBorder
{
public:
/** Initialize the function's source, destination
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
index 63f7714aa..82a789e86 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h
@@ -48,12 +48,14 @@
#define __ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/core/Error.h"
#include <vector>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/**
* @brief Class to perform EmbeddingLookup operation
@@ -84,5 +86,5 @@ public:
static Status validate(const ITensorInfo *input, const ITensorInfo *output,
const ITensorInfo *lookups);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEEMBEDDINGLOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
index 56548a479..214592710 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -44,11 +44,11 @@
#include "arm_compute/core/NEON/kernels/NEQuantizationSymmetricKernel.h"
#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
#include "arm_compute/runtime/Tensor.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
index 8f98f220a..2bbb1fea1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h
@@ -43,16 +43,16 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFlattenLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
-#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
-#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
#include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
+#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+#include "src/core/NEON/kernels/NETransposeKernel.h"
namespace arm_compute
{
@@ -79,11 +79,11 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayerEx(const NEFullyConnectedLayerEx &) = delete;
/** Default move constructor */
- NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = default;
+ NEFullyConnectedLayerEx(NEFullyConnectedLayerEx &&) = delete;
/** Prevent instances of this class from being copied (As this class contains pointers) */
NEFullyConnectedLayerEx &operator=(const NEFullyConnectedLayerEx &) = delete;
/** Default move assignment operator */
- NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = default;
+ NEFullyConnectedLayerEx &operator=(NEFullyConnectedLayerEx &&) = delete;
/** Set the input and output tensors.
*
* @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32.
@@ -141,7 +141,7 @@ private:
void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output);
MemoryGroup _memory_group;
- NEFlattenLayerKernel _flatten_kernel;
+ NEFlattenLayer _flatten_kernel;
NEConvertFullyConnectedWeights _convert_weights;
NEFullyConnectedLayerReshapeWeights _reshape_weights_function;
NEGEMM _mm_gemm;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
index 155a1b837..6944c77f6 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGatherEx.h
@@ -47,6 +47,7 @@
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to run @ref NEGatherKernelEx */
class NEGatherEx : public INESimpleFunctionNoBorder
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
index 521a05ad9..f6fda60a9 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEHashtableLookup.h
@@ -48,12 +48,14 @@
#define __ARM_COMPUTE_NEHASHTABLELOOKUP_H__
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/core/Error.h"
#include <vector>
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/**
* @brief Class to perform HashtableLookup operation
@@ -96,5 +98,5 @@ public:
const ITensorInfo *input, const ITensorInfo *output,
const ITensorInfo *hits);
};
-}
+} // namespace arm_compute
#endif /*__ARM_COMPUTE_NEHASHTABLELOOKUP_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
index 18e813923..0ee967698 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h
@@ -54,6 +54,7 @@
namespace arm_compute
{
class ITensor;
+class ITensorInfo;
/** Basic function to perform an Instance normalization.
*
@@ -112,5 +113,5 @@ private:
Tensor _permuted_input;
Tensor _permuted_output;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYEREX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
index 1a68f801a..668f024a1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEOneHot.h
@@ -45,6 +45,8 @@ namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
+
/** Basic function to run @ref NEOneHotKernel */
class NEOneHot : public INESimpleFunctionNoBorder
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
index 91eec815c..9858e6c09 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceOperation.h
@@ -43,7 +43,7 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
index 48b416923..f34a8f8af 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceSum.h
@@ -43,11 +43,13 @@
#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/NEON/functions/NEDequantizationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
+#include "arm_compute/runtime/Tensor.h"
namespace arm_compute
{
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
index 7a08dae97..f82579a45 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -102,9 +102,9 @@ public:
/** Prevent instances of this class from being copied (As this class contains pointers) */
NETransposeConvLayer &operator=(const NETransposeConvLayer &) = delete;
/** Allow instances of this class to be moved */
- NETransposeConvLayer(NETransposeConvLayer &&) = default;
+ NETransposeConvLayer(NETransposeConvLayer &&) = delete;
/** Allow instances of this class to be moved */
- NETransposeConvLayer &operator=(NETransposeConvLayer &&) = default;
+ NETransposeConvLayer &operator=(NETransposeConvLayer &&) = delete;
/** Default destructor */
virtual ~NETransposeConvLayer() = default;
@@ -171,5 +171,5 @@ private:
PadStrideInfo _info;
bool _is_prepared;
};
-} // arm_compute
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__ */
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 1a8ff3e71..1a180a35b 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -66,12 +66,16 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"gather_ex_1d", "gather_ex.cl"},
{"gather_ex_1d_out", "gather_ex.cl"},
{"gemmlowp_mm_midgard_ex", "gemmlowp_ex.cl"},
+ {"gemm_accumulate_biases", "gemm.cl"},
{"hashtable_lookup", "hashtable_lookup.cl"},
{"instance_normalization_ex", "instance_normalization_ex.cl"},
+ {"memset", "memset.cl"},
{"multiply_scale_factor", "multiply_scale_factor.cl"},
{"neg_tensor", "neg_tensor.cl"},
{"one_hot", "one_hot.cl"},
{"one_hot_only_on_value", "one_hot.cl"},
+ {"pad_layer_constant", "pad_layer.cl"},
+ {"pad_layer_symmetric_reflect", "pad_layer.cl"},
{"quantization_symm8", "quantization_symm8.cl"},
{"reduce_min_max", "reduce_operation.cl"},
{"reduce_sum_mean", "reduce_operation.cl"},
@@ -90,10 +94,18 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
#ifdef EMBEDDED_KERNELS
{
+ "activation_float_helpers.h",
+#include "./cl_kernels/activation_float_helpers.hembed"
+ },
+ {
"arg_min_max_ex.cl",
#include "./cl_kernels/arg_min_max_ex.clembed"
},
{
+ "binary_logical_op.cl",
+#include "./cl_kernels/binary_logical_op.clembed"
+ },
+ {
"cast.cl",
#include "./cl_kernels/cast.clembed"
},
@@ -110,6 +122,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/gemmlowp_ex.clembed"
},
{
+ "gemm_helpers.h",
+#include "./cl_kernels/gemm_helpers.hembed"
+ },
+ {
"hashtable_lookup.cl",
#include "./cl_kernels/hashtable_lookup.clembed"
},
@@ -126,8 +142,12 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/instance_normalization_ex.clembed"
},
{
- "binary_logical_op.cl",
-#include "./cl_kernels/binary_logical_op.clembed"
+ "gemm.cl",
+#include "./cl_kernels/gemm.clembed"
+ },
+ {
+ "memset.cl",
+#include "./cl_kernels/memset.clembed"
},
{
"multiply_scale_factor.cl",
@@ -142,6 +162,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/one_hot.clembed"
},
{
+ "pad_layer.cl",
+#include "./cl_kernels/pad_layer.clembed"
+ },
+ {
"quantization_symm8.cl",
#include "./cl_kernels/quantization_symm8.clembed"
},
@@ -150,6 +174,10 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/reduce_operation.clembed"
},
{
+ "repeat.h",
+#include "./cl_kernels/repeat.hembed"
+ },
+ {
"scale_factor.cl",
#include "./cl_kernels/scale_factor.clembed"
},
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h
new file mode 100644
index 000000000..3c3ff8419
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "helpers.h"
+
+#if GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) (fma(c, b, a))
+#else // GPU_ARCH == GPU_ARCH_BIFROST
+#define MLA(a, b, c) ((b) * (c) + (a))
+#endif // GPU_ARCH == GPU_ARCH_BIFROST
+
+// Hard-Swish
+#define hard_swish_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
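+
+// Illustrative note (not part of the upstream kernel): 0.166666667 approximates
+// 1/6, so this macro computes the usual hard-swish formula
+//   hard_swish(x) = x * relu6(x + 3) / 6, with relu6(v) = min(max(v, 0), 6).
+// For example, hard_swish_op(float, 3.0f, A_VAL, B_VAL) evaluates to
+// 3.0f * (6.0f * 0.166666667f), which is approximately 3.0f.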
+
+// Logistic Activation
+#define logistic_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
+
+// Hyperbolic Tangent Activation
+#define tanh_op(DATA_TYPE, x, A_VAL, B_VAL) ((DATA_TYPE)A_VAL * tanh((DATA_TYPE)B_VAL * x))
+
+// RELU Activation
+#define relu_op(DATA_TYPE, x, A_VAL, B_VAL) (max((DATA_TYPE)0.0, x))
+
+// Bounded RELU Activation
+#define brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)0.0, x)))
+
+// Lower Upper Bounded RELU Activation
+#define lu_brelu_op(DATA_TYPE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
+
+// Leaky RELU Activation
+#define lrelu_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+
+// Soft RELU Activation
+#define srelu_op(DATA_TYPE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
+
+// ELU Activation
+#define elu_op(DATA_TYPE, x, A_VAL, B_VAL) \
+ (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, isgreaterequal(x, (DATA_TYPE)0.0)))
+
+// Absolute Activation
+#define abs_op(DATA_TYPE, x, A_VAL, B_VAL) (fabs(x))
+
+// Square Activation
+#define square_op(DATA_TYPE, x, A_VAL, B_VAL) (x * x)
+
+// Square-root Activation
+#define sqrt_op(DATA_TYPE, x, A_VAL, B_VAL) (sqrt(x))
+
+// Linear Activation
+#define linear_op(DATA_TYPE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
+
+// Identity Activation
+#define identity_op(DATA_TYPE, x, A_VAL, B_VAL) (x)
+
+#define ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL) op##_op(DATA_TYPE, x, A_VAL, B_VAL)
+
+#define ACTIVATION(op, DATA_TYPE, x, A_VAL, B_VAL) ACT_OP(op, DATA_TYPE, x, A_VAL, B_VAL)
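+
+// Illustrative sketch (an assumption about intent, not upstream text): the
+// ACT_OP/ACTIVATION two-level indirection lets `op` itself be a macro that is
+// expanded before token pasting. For instance, with -DACT=relu:
+//   ACTIVATION(ACT, float, x, A_VAL, B_VAL)
+//     -> ACT_OP(relu, float, x, A_VAL, B_VAL)
+//     -> relu_op(float, x, A_VAL, B_VAL)
+//     -> (max((float)0.0, x))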
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl
new file mode 100644
index 000000000..9b826a2bd
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm.cl
@@ -0,0 +1,7210 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "repeat.h"
+
+#if defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+#define INC2 (VEC_DATA_TYPE(uint, 2))(0, 1)
+#define INC3 (VEC_DATA_TYPE(uint, 3))(0, 1, 2)
+#define INC4 (VEC_DATA_TYPE(uint, 4))(0, 1, 2, 3)
+#define INC8 (VEC_DATA_TYPE(uint, 8))(0, 1, 2, 3, 4, 5, 6, 7)
+#define INC16 (VEC_DATA_TYPE(uint, 16))(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+#define CONCAT_INC(K0) INC##K0
+#define INC(K0) CONCAT_INC(K0)
+
+#if (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) \
+ ({ \
+ a = select( \
+ 0, a, \
+ CONVERT(((x * (VEC_DATA_TYPE(uint, K0))K0 + INC(K0)) < (VEC_DATA_TYPE(uint, K0))SRC_WIDTH), \
+ VEC_DATA_TYPE(DATA_TYPE, K0))); \
+ })
+#else // (SRC_WIDTH % K0)
+#define BOUNDARY_CONDITION_X(x, a) ({})
+#endif // (SRC_WIDTH % K0)
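+
+// Illustrative worked example (not part of the upstream kernel): with K0 = 4
+// and SRC_WIDTH = 10, the block at x = 2 covers columns {8, 9, 10, 11}. The
+// predicate (x * K0 + INC(K0)) < SRC_WIDTH yields (true, true, false, false),
+// so select() keeps lanes 0-1 of `a` and zeroes lanes 2-3, padding the
+// out-of-range columns with 0 instead of reading past the end of the row.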
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks
+ * of size M0xK0 and stores each one (not transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g.
+ * -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g.
+ * -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DV0 (e.g. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer
+ * 1x1), the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at
+ * compile time.
+ *
+ * @param[in] src_ptr Pointer to the source LHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) +
+ ((y / (uint)V0) * (uint)dst_stride_y) +
+ ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+ // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src_stride_z by DEPTH_GEMM3D
+
+ input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+  // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ output_ptr += z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+ // Load values from the LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+ BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+ BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+ BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+ BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+ BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+ BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+ BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+ BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+ // ---------------------------Store output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
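+
+// Illustrative worked example (not part of the upstream kernel): with M0 = 2,
+// K0 = 2, V0 = 1 and no -DINTERLEAVE, the 4x4 LHS matrix
+//   | a b c d |
+//   | e f g h |
+//   | i j k l |
+//   | m n o p |
+// is split into 2x2 blocks, each unrolled row-major onto one output row, so
+// the first output row becomes: a b e f c d g h.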
+
+#if M0 == 2
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 3 // M0 == 3
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 4 // M0 == 4
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#elif M0 == 5 // M0 == 5
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ DATA_TYPE res1 = a4.s##i; \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ *((__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4) = res1; \
+ })
+#elif M0 == 6 // M0 == 6
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VEC_DATA_TYPE(DATA_TYPE, 2) \
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i); \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ VSTORE(2) \
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+ })
+#elif M0 == 7 // M0 == 7
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, 4) \
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i); \
+ VEC_DATA_TYPE(DATA_TYPE, 3) \
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i); \
+ VSTORE(4) \
+ (res0, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ VSTORE(3) \
+ (res1, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE)) + 4); \
+ })
+#elif M0 == 8 // M0 == 8
+#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, M0) \
+ res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, \
+ a6.s##i, a7.s##i); \
+ VSTORE(M0) \
+ (res, 0, (__global DATA_TYPE *)(output_ptr + 0x##i * output_step_x * sizeof(DATA_TYPE))); \
+ })
+#else // M0 not supported
+#error "M0 value not supported"
+#endif // M0 conditions
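+
+// Illustrative expansion (not part of the upstream kernel): for M0 == 2,
+//   TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0)
+// expands to roughly
+//   res = (VEC_DATA_TYPE(DATA_TYPE, 2))(a0.s0, a1.s0);
+//   vstore2(res, 0, (__global DATA_TYPE *)(output_ptr + 0x0 * OUTPUT_STEP_X * sizeof(DATA_TYPE)));
+// i.e. column `i` of the loaded M0xK0 block becomes one contiguous row of the
+// transposed output.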
+
+/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks
+ * of size M0xK0 and stores each one (transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g.
+ * -DSRC_WIDTH=16)
+ * @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g.
+ * -DM0=2, -DK0=2).
+ * @note The number of M0xK0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DV0 (e.g. -DV0=2)
+ * @note Only the following values for M0, K0 and V0 are supported:
+ * M0: 2,3,4,5,6,7,8
+ * K0: 2,3,4,8,16
+ * V0: greater than 0
+ * @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer
+ * 1x1), the following information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ * @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at
+ * compile time.
+ *
+ * @param[in] src_ptr Pointer to the source LHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_INPUT_AS_3D)
+ */
+__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+)
+{
+ // Block size
+#define BLOCK_SIZE ((M0) * (K0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (M0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (M0) * (V0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (M0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) +
+ ((y / (uint)V0) * (uint)dst_stride_y) +
+ ((y % V0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));
+
+ // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src_stride_z by DEPTH_GEMM3D
+
+ input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;
+
+  // The plane (zin) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ input_ptr += z * (uint)src_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ output_ptr += z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+ // Load values from the LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
+ BOUNDARY_CONDITION_X(x, a0);
+#if M0 > 1
+ BOUNDARY_CONDITION_X(x, a1);
+#endif // M0 > 1
+#if M0 > 2
+ BOUNDARY_CONDITION_X(x, a2);
+#endif // M0 > 2
+#if M0 > 3
+ BOUNDARY_CONDITION_X(x, a3);
+#endif // M0 > 3
+#if M0 > 4
+ BOUNDARY_CONDITION_X(x, a4);
+#endif // M0 > 4
+#if M0 > 5
+ BOUNDARY_CONDITION_X(x, a5);
+#endif // M0 > 5
+#if M0 > 6
+ BOUNDARY_CONDITION_X(x, a6);
+#endif // M0 > 6
+#if M0 > 7
+ BOUNDARY_CONDITION_X(x, a7);
+#endif // M0 > 7
+ // ---------------------------Transpose and store block -----------------------
+
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
+#if K0 > 2
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
+#endif // K0 > 2
+#if K0 > 3
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
+#endif // K0 > 3
+#if K0 > 4
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
+#endif // K0 > 4
+#if K0 > 8
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
+ TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
+#endif // K0 > 8
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
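+
+// Illustrative note (not part of the upstream kernel): without -DINTERLEAVE the
+// V0 blocks that share an output row are laid out back to back
+// (OUTPUT_OFFSET_X == BLOCK_SIZE); with -DINTERLEAVE each block starts only M0
+// elements after the previous one (OUTPUT_OFFSET_X == M0) and its columns step
+// by M0 * V0 (OUTPUT_STEP_X), so corresponding columns of the V0 blocks end up
+// adjacent in memory.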
+#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH)
+
+#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks
+ * of size K0xN0 and stores each one (not transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g.
+ * -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g.
+ * -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at
+ * compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 1,2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source RHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (N0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (N0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (N0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y +
+ z * (uint)src_stride_z;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) +
+ ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) +
+ ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+
+ REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a,
+                           0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
+
+ // Load values from the RHS matrix
+ a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+#if K0 > 1
+ if (y * (uint)K0 + 1 < SRC_HEIGHT)
+ {
+ a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ }
+#endif // K0 > 1
+#if K0 > 2
+ if (y * (uint)K0 + 2 < SRC_HEIGHT)
+ {
+ a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+ }
+#endif // K0 > 2
+#if K0 > 3
+ if (y * (uint)K0 + 3 < SRC_HEIGHT)
+ {
+ a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+ }
+#endif // K0 > 3
+#if K0 > 4
+ if (y * (uint)K0 + 4 < SRC_HEIGHT)
+ {
+ a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+ }
+ if (y * (uint)K0 + 5 < SRC_HEIGHT)
+ {
+ a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+ }
+ if (y * (uint)K0 + 6 < SRC_HEIGHT)
+ {
+ a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+ }
+ if (y * (uint)K0 + 7 < SRC_HEIGHT)
+ {
+ a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+ }
+#endif // K0 > 4
+#if K0 > 8
+ if (y * (uint)K0 + 8 < SRC_HEIGHT)
+ {
+ a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+ }
+ if (y * (uint)K0 + 9 < SRC_HEIGHT)
+ {
+ a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+ }
+ if (y * (uint)K0 + 10 < SRC_HEIGHT)
+ {
+ aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+ }
+ if (y * (uint)K0 + 11 < SRC_HEIGHT)
+ {
+ aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+ }
+ if (y * (uint)K0 + 12 < SRC_HEIGHT)
+ {
+ aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+ }
+ if (y * (uint)K0 + 13 < SRC_HEIGHT)
+ {
+ aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+ }
+ if (y * (uint)K0 + 14 < SRC_HEIGHT)
+ {
+ aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+ }
+ if (y * (uint)K0 + 15 < SRC_HEIGHT)
+ {
+ aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+ }
+#endif // K0 > 8
+
+ // ---------------------------Store output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
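+
+// Illustrative note (not part of the upstream kernel): a0 is loaded
+// unconditionally, while rows 1..K0-1 are guarded by `y * K0 + i < SRC_HEIGHT`
+// because SRC_HEIGHT need not be a multiple of K0; out-of-range rows keep the
+// zero set by REPEAT_VAR_INIT_TO_CONST, so the reshaped RHS is zero-padded.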
+
+#if defined(TRANSPOSE)
+/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks
+ * of size K0xN0 and stores each one (transposed) in the output matrix unrolling the values.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g.
+ * -DSRC_HEIGHT=16)
+ * @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g.
+ * -DK0=2, -DN0=2).
+ * @note The number of K0xN0 vertical blocks to store on the same output row must be passed at
+ * compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at
+ * compile time.
+ * @note The option -DTRANSPOSE must be passed at compile time.
+ * @note Only the following values for K0, N0 and H0 are supported:
+ * N0: 2,3,4,8,16
+ * K0: 2,3,4,8,16
+ * H0: greater than 0
+ *
+ * @param[in] src_ptr Pointer to the source RHS tensor. Supported data
+ * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Block size
+#define BLOCK_SIZE ((K0) * (N0))
+
+ // Output offset X
+#if defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (K0)
+#else // defined(INTERLEAVE)
+#define OUTPUT_OFFSET_X (BLOCK_SIZE)
+#endif // defined(INTERLEAVE)
+
+ // Output step X
+#if defined(INTERLEAVE)
+#define OUTPUT_STEP_X (K0) * (H0)
+#else // Do not interleave
+#define OUTPUT_STEP_X (K0)
+#endif // defined(INTERLEAVE)
+
+ // Compute source and destination addresses
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+ // ------------------ Compute input/output addresses ---------------------------
+
+ // Compute the input address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes +
+ x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y +
+ z * (uint)src_stride_z;
+
+ // Compute the output address
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes +
+ (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) +
+ ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) +
+ ((x / (uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;
+
+ // ---------------------------Load input values --------------------------------
+ REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
+
+ // Load values from the RHS matrix
+ a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ if (y * (uint)K0 + 1 < SRC_HEIGHT)
+ {
+ a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+ }
+#if K0 > 2
+ if (y * (uint)K0 + 2 < SRC_HEIGHT)
+ {
+ a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
+ }
+#endif // K0 > 2
+#if K0 > 3
+ if (y * (uint)K0 + 3 < SRC_HEIGHT)
+ {
+ a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
+ }
+#endif // K0 > 3
+#if K0 > 4
+ if (y * (uint)K0 + 4 < SRC_HEIGHT)
+ {
+ a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
+ }
+ if (y * (uint)K0 + 5 < SRC_HEIGHT)
+ {
+ a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
+ }
+ if (y * (uint)K0 + 6 < SRC_HEIGHT)
+ {
+ a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
+ }
+ if (y * (uint)K0 + 7 < SRC_HEIGHT)
+ {
+ a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
+ }
+#endif // K0 > 4
+#if K0 > 8
+ if (y * (uint)K0 + 8 < SRC_HEIGHT)
+ {
+ a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
+ }
+ if (y * (uint)K0 + 9 < SRC_HEIGHT)
+ {
+ a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
+ }
+ if (y * (uint)K0 + 10 < SRC_HEIGHT)
+ {
+ aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
+ }
+ if (y * (uint)K0 + 11 < SRC_HEIGHT)
+ {
+ aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
+ }
+ if (y * (uint)K0 + 12 < SRC_HEIGHT)
+ {
+ aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
+ }
+ if (y * (uint)K0 + 13 < SRC_HEIGHT)
+ {
+ aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
+ }
+ if (y * (uint)K0 + 14 < SRC_HEIGHT)
+ {
+ aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
+ }
+ if (y * (uint)K0 + 15 < SRC_HEIGHT)
+ {
+ aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
+ }
+#endif // K0 > 8
+
+ // ---------------------------Transpose the block ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(
+ N0, VEC_DATA_TYPE(DATA_TYPE, K0), res,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;
+
+#if K0 == 2
+ // This part computes the following transpositions:
+ // 2x2 -> 2x2
+ // 2x4 -> 4x2
+ // 2x8 -> 8x2
+ // 2x16 -> 16x2
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);
+#endif // N0 > 8
+
+#elif K0 == 3 // K0 == 3
+ // This part computes the following transpositions:
+ // 3x2 -> 2x3
+ // 3x4 -> 4x3
+ // 3x8 -> 8x3
+ // 3x16 -> 16x3
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);
+#endif // N0 > 8
+
+#elif K0 == 4 // K0 == 4
+ // This part computes the following transpositions:
+ // 4x2 -> 2x4
+ // 4x4 -> 4x4
+ // 4x8 -> 8x4
+ // 4x16 -> 16x4
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);
+#endif // N0 > 8
+
+#elif K0 == 8 // K0 == 8
+ // This part computes the following transpositions:
+ // 8x2 -> 2x8
+ // 8x4 -> 4x8
+ // 8x8 -> 8x8
+ // 8x16 -> 16x8
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);
+#endif // N0 > 8
+
+#elif K0 == 16 // K0 == 16
+
+ // This part computes the following transpositions:
+ // 16x2 -> 2x16
+ // 16x4 -> 4x16
+ // 16x8 -> 8x16
+ // 16x16 -> 16x16
+ res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,
+ a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);
+ res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,
+ a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);
+#if N0 > 2
+ res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,
+ a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);
+#endif // N0 > 2
+#if N0 > 3
+ res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,
+ a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);
+#endif // N0 > 3
+#if N0 > 4
+ res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,
+ a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);
+ res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,
+ a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);
+ res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,
+ a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);
+ res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,
+ a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);
+#endif // N0 > 4
+#if N0 > 8
+ res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,
+ a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);
+ res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,
+ a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);
+ resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,
+ a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);
+ resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,
+ a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);
+ resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,
+ a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);
+ resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,
+ a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);
+ resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,
+ a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);
+ resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,
+ a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);
+#endif // N0 > 8
+
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+
+ // ---------------------------Store the output values ------------------------------
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
+ STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);
+
+#undef BLOCK_SIZE
+#undef OUTPUT_OFFSET_X
+#undef OUTPUT_STEP_X
+}
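+
+// Illustrative worked example (not part of the upstream kernel): with K0 = 2
+// and N0 = 2, a loaded block a0 = (p, q), a1 = (r, s) is transposed into
+// res0 = (p, r) and res1 = (q, s) before STORE_BLOCK, i.e. the 2x2 -> 2x2
+// case listed in the comment above the K0 == 2 branch.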
+#endif // defined(TRANSPOSE)
+#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && \
+ defined(M) && defined(N) && defined(K)
+
+#define CONCAT(a, b) a##b
+
+#define ARM_DOT1(a, b, c) ({ c = fma(a, b, c); })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c = fma((a.s2), (b.s2), c); \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c = fma((a.s3), (b.s3), c); \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
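+
+// Illustrative expansion (comment only): ARM_DOT4(a, b, c) unrolls to four fused
+// multiply-adds,
+//   c = fma(a.s0, b.s0, c); c = fma(a.s1, b.s1, c);
+//   c = fma(a.s2, b.s2, c); c = fma(a.s3, b.s3, c);
+// and ARM_DOT8/ARM_DOT16 recurse on the .lo/.hi halves of their vector operands.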
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##0), (c.s0)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##1), (c.s1)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##2), (c.s2)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##3), (c.s3)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##4), (c.s4)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##5), (c.s5)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##6), (c.s6)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##7), (c.s7)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##8), (c.s8)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##9), (c.s9)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##A), (c.sA)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##B), (c.sB)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##C), (c.sC)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##D), (c.sD)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##E), (c.sE)); \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
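+
+// Example of the CONCAT dispatch above (illustrative only): for N0 == 2,
+// ARM_DOT_K0XN0(4, a0, b, c0) expands to
+//   ARM_DOT4((a0), (b0), (c0.s0));
+//   ARM_DOT4((a0), (b1), (c0.s1));
+// i.e. one K0-wide dot product per column of the accumulator.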
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of the LHS matrix must be passed at compile time using -DK (e.g.
+ * -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(K0, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(K0, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(K0, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(K0, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(K0, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(K0, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(K0, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(K0, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS reshaped matrix
+ LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(1, a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(1, a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(1, a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(1, a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(1, a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(1, a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(1, a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(1, a7, b, c7);
+#endif // M0 > 7
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
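+
+// Build-options sketch (illustrative, values taken from the examples in the kernel
+// documentation above): a host could compile gemm_mm_reshaped_only_rhs_t with e.g.
+//   -DDATA_TYPE=float -DM=52 -DN=30 -DK=90 -DM0=4 -DN0=8 -DK0=4 -DH0=2
+// adding -DRHS_INTERLEAVE and/or -DDUMMY_WORK_ITEMS when the RHS reshape and the
+// NDRange dispatch were done accordingly.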
+
+#define VFMA(a, b, c) ({ c = fma(a, b, c); })
+
+#if M0 == 1
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ })
+#elif M0 == 2 // M0 == 2
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define LD_RHS_VFMA_M0xN0(i, a, c) \
+ ({ \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ b = VLOAD(N0)( \
+ 0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0x##i * RHS_STEP_X * sizeof(DATA_TYPE))); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
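+
+// Note on LD_RHS_VFMA_M0xN0: the parameter 'i' must be a single hex digit. '0x##i'
+// pastes it into the RHS row index (e.g. i = A gives 0xA == 10), while '.s##i'
+// selects the matching component of each LHS vector, so one macro body covers
+// rows 0..F.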
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90).
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS reshaped matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); // uint zin0=0,zin1=0,zin2=0,... zin7=0;
+  REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); // uint zero0=0,zero1=0,zero2=0,... zero15=0;
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+
+ // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+                           0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);
+
+ LD_RHS_VFMA_M0xN0(0, a, c);
+ LD_RHS_VFMA_M0xN0(1, a, c);
+#if K0 > 2
+ LD_RHS_VFMA_M0xN0(2, a, c);
+#endif // K0 > 2
+#if K0 > 3
+ LD_RHS_VFMA_M0xN0(3, a, c);
+#endif // K0 > 3
+#if K0 > 4
+ LD_RHS_VFMA_M0xN0(4, a, c);
+ LD_RHS_VFMA_M0xN0(5, a, c);
+ LD_RHS_VFMA_M0xN0(6, a, c);
+ LD_RHS_VFMA_M0xN0(7, a, c);
+#endif // K0 > 4
+#if K0 > 8
+ LD_RHS_VFMA_M0xN0(8, a, c);
+ LD_RHS_VFMA_M0xN0(9, a, c);
+ LD_RHS_VFMA_M0xN0(A, a, c);
+ LD_RHS_VFMA_M0xN0(B, a, c);
+ LD_RHS_VFMA_M0xN0(C, a, c);
+ LD_RHS_VFMA_M0xN0(D, a, c);
+ LD_RHS_VFMA_M0xN0(E, a, c);
+ LD_RHS_VFMA_M0xN0(F, a, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
+#endif // M0 > 7
+
+ LD_RHS_VFMA_M0xN0(0, a, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) &&
+ // defined(M) && defined(N) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && \
+ defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)
+
+#if defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ c += a.s4 * b.s4; \
+ c += a.s5 * b.s5; \
+ c += a.s6 * b.s6; \
+ c += a.s7 * b.s7; \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c += a.s0 * b.s0; \
+ c += a.s1 * b.s1; \
+ c += a.s2 * b.s2; \
+ c += a.s3 * b.s3; \
+ c += a.s4 * b.s4; \
+ c += a.s5 * b.s5; \
+ c += a.s6 * b.s6; \
+ c += a.s7 * b.s7; \
+ c += a.s8 * b.s8; \
+ c += a.s9 * b.s9; \
+ c += a.sA * b.sA; \
+ c += a.sB * b.sB; \
+ c += a.sC * b.sC; \
+ c += a.sD * b.sD; \
+ c += a.sE * b.sE; \
+ c += a.sF * b.sF; \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#else // defined(MIXED_PRECISION)
+#if K0 == 2
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ })
+#elif K0 == 3 // K0 == 3
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ })
+#elif K0 == 4 // K0 == 4
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ })
+#elif K0 == 8 // K0 == 8
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ })
+#elif K0 == 16 // K0 == 16
+#define ARM_DOT_K0(a, b, c) \
+ ({ \
+ c = fma(a.s0, b.s0, c); \
+ c = fma(a.s1, b.s1, c); \
+ c = fma(a.s2, b.s2, c); \
+ c = fma(a.s3, b.s3, c); \
+ c = fma(a.s4, b.s4, c); \
+ c = fma(a.s5, b.s5, c); \
+ c = fma(a.s6, b.s6, c); \
+ c = fma(a.s7, b.s7, c); \
+ c = fma(a.s8, b.s8, c); \
+ c = fma(a.s9, b.s9, c); \
+ c = fma(a.sA, b.sA, c); \
+ c = fma(a.sB, b.sB, c); \
+ c = fma(a.sC, b.sC, c); \
+ c = fma(a.sD, b.sD, c); \
+ c = fma(a.sE, b.sE, c); \
+ c = fma(a.sF, b.sF, c); \
+ })
+#else // K0 not supported
+#error "K0 value not supported"
+#endif // K0 conditions
+#endif // defined(MIXED_PRECISION)
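+
+// In the MIXED_PRECISION variant above the accumulator 'c' has type
+// DATA_TYPE_ACCUMULATOR, so each '+=' widens the DATA_TYPE product before
+// accumulating; the plain variant keeps everything in DATA_TYPE and can use fma()
+// directly.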
+
+#if N0 == 2
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ })
+#elif N0 == 3 // N0 == 3
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ })
+#elif N0 == 4 // N0 == 4
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ })
+#elif N0 == 8 // N0 == 8
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ })
+#elif N0 == 16 // N0 == 16
+#define ARM_DOT_K0XN0(a, b, c) \
+ ({ \
+ ARM_DOT_K0((a), (b##0), (c.s0)); \
+ ARM_DOT_K0((a), (b##1), (c.s1)); \
+ ARM_DOT_K0((a), (b##2), (c.s2)); \
+ ARM_DOT_K0((a), (b##3), (c.s3)); \
+ ARM_DOT_K0((a), (b##4), (c.s4)); \
+ ARM_DOT_K0((a), (b##5), (c.s5)); \
+ ARM_DOT_K0((a), (b##6), (c.s6)); \
+ ARM_DOT_K0((a), (b##7), (c.s7)); \
+ ARM_DOT_K0((a), (b##8), (c.s8)); \
+ ARM_DOT_K0((a), (b##9), (c.s9)); \
+ ARM_DOT_K0((a), (b##A), (c.sA)); \
+ ARM_DOT_K0((a), (b##B), (c.sB)); \
+ ARM_DOT_K0((a), (b##C), (c.sC)); \
+ ARM_DOT_K0((a), (b##D), (c.sD)); \
+ ARM_DOT_K0((a), (b##E), (c.sE)); \
+ ARM_DOT_K0((a), (b##F), (c.sF)); \
+ })
+#else // N0 not supported
+#error "N0 value not supported"
+#endif // N0 conditions
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks
+ * must NOT be transposed. The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel
+ * and the K0xN0 blocks must be transposed.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
+ * @note The data type used for the accumulators must be passed at compile time using
+ * -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
+ * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION
+ * passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (get_global_id(1) / V0) * (uint)lhs_stride_y +
+ (get_global_id(2) * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (get_global_id(0) / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += get_global_id(2) * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+ for (int i = 0; i < k; i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);
+
+ // Accumulate
+ ARM_DOT_K0XN0(a0, b, c0);
+#if M0 > 1
+ ARM_DOT_K0XN0(a1, b, c1);
+#endif // M0 > 1
+#if M0 > 2
+ ARM_DOT_K0XN0(a2, b, c2);
+#endif // M0 > 2
+#if M0 > 3
+ ARM_DOT_K0XN0(a3, b, c3);
+#endif // M0 > 3
+#if M0 > 4
+ ARM_DOT_K0XN0(a4, b, c4);
+#endif // M0 > 4
+#if M0 > 5
+ ARM_DOT_K0XN0(a5, b, c5);
+#endif // M0 > 5
+#if M0 > 6
+ ARM_DOT_K0XN0(a6, b, c6);
+#endif // M0 > 6
+#if M0 > 7
+ ARM_DOT_K0XN0(a7, b, c7);
+#endif // M0 > 7
+
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // The plane (zout) is calculated dividing M (get_global_id(1) * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D,
+ dst_cross_plane_pad, dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += get_global_id(2) * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
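+
+// Build-options sketch (illustrative, values taken from the examples in the kernel
+// documentation above): an F16 run with float accumulation could compile
+// gemm_mm_reshaped_lhs_nt_rhs_t with e.g.
+//   -DDATA_TYPE=half -DDATA_TYPE_ACCUMULATOR=float -DMIXED_PRECISION
+//   -DM=52 -DN=90 -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2
+// where M0/N0/K0/V0/H0 must match the reshape applied to the LHS and RHS.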
+
+#if defined(LHS_TRANSPOSE)
+
+#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)
+
+#if defined(MIXED_PRECISION)
+
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) \
+ c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * \
+ (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) \
+ c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), \
+ (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#else // defined(MIXED_PRECISION)
+
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+#endif // defined(MIXED_PRECISION)
+
+#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) ({ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); })
+#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
+ })
+#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
+ })
+#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
+ })
+#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \
+ ({ \
+ ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
+ ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
+ })
+
+// Factory macro for the column-vector (transposed) by row-vector (not transposed)
+// multiplication. K0 = 1.
+// a is the column-vector (transposed)
+// b is the row-vector (not transposed)
+// C is the output matrix
+// Lower case is a vector (a, b); upper case is a matrix (C)
+#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)
+
+#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \
+ ({ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); })
+#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
+ ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
+ })
+#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, A, B, C) \
+ ({ \
+ ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C); \
+    ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##8), (B##8), C); \
+    ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##9), (B##9), C); \
+    ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##A), (B##A), C); \
+    ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##B), (B##B), C); \
+    ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##C), (B##C), C); \
+    ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##D), (B##D), C); \
+    ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##E), (B##E), C); \
+    ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##F), (B##F), C); \
+ })
+
+// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
+// The dimensions for this matrix multiplication are defined through M0, N0 and K0
+// The dimensions supported are:
+// M0: 1, 2, 3, 4, 8
+// N0: 1, 2, 3, 4, 8, 16
+// K0: 1, 2, 3, 4, 8, 16
+// This macro calls the vector-by-matrix macro K0 times
+// A, B and C are matrices
+#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
+ CONCAT(ARM_MM_T_NT_M0xN0x, K0) \
+ (M0, N0, TYPE, A, B, C)
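+
+// Example (illustrative only): ARM_MM_T_NT(4, 8, 2, float, A, B, C) concatenates to
+// ARM_MM_T_NT_M0xN0x2(4, 8, float, A, B, C), i.e. two rank-1 updates of the 4x8
+// accumulator block C: one with (A0, B0) and one with (A1, B1).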
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks
+ * must be transposed. The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and
+ * the K0xN0 blocks must NOT be transposed.
+ *
+ * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g.
+ * -DLHS_TRANSPOSE).
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (e.g. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition.
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * reshaped matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (M0)
+#define LHS_STEP_X ((M0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (M0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (N0)
+#define RHS_STEP_X ((N0) * (H0))
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (N0)
+#endif // defined(RHS_INTERLEAVE)
+
+ const uint x = get_global_id(0);
+ const uint y = get_global_id(1);
+ const uint z = get_global_id(2);
+
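+  // When the NDRange has been padded with "dummy" work items (see the
+  // -DDUMMY_WORK_ITEMS note above), the guard below makes work items that
+  // fall outside the MxN output (x * N0 >= N or y * M0 >= M) return early.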
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);
+
+ __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
+ __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);
+
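+  // Each iteration of the loop below consumes K0 columns of the reshaped LHS
+  // block and K0 rows of the reshaped RHS block. The ARM_MM_T_NT calls are
+  // manually unrolled through the K0 > n preprocessor guards, so exactly K0
+  // rank-1 updates of the M0xN0 accumulator block are issued per iteration.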
+ for (int i = 0; i < k; i += K0)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, M0)
+ a0 = VLOAD(M0)(0, lhs);
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+#if K0 > 1
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 1
+
+#if K0 > 2
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 2
+
+#if K0 > 3
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 3
+
+#if K0 > 4
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 4
+
+#if K0 > 8
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+
+ a0 = VLOAD(M0)(0, lhs);
+ b0 = VLOAD(N0)(0, rhs);
+
+ ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);
+
+ lhs += LHS_STEP_X;
+ rhs += RHS_STEP_X;
+#endif // K0 > 8
+
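+    // Without interleaving, the K0-wide blocks of the V0 (resp. H0) tiles that
+    // share a reshaped row are stored back to back, so skip the blocks of the
+    // other V0 - 1 (resp. H0 - 1) tiles to reach this tile's next block.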
+#ifndef LHS_INTERLEAVE
+ lhs += (M0 * K0 * (V0 - 1));
+#endif // LHS_INTERLEAVE
+
+#ifndef RHS_INTERLEAVE
+ rhs += (N0 * K0 * (H0 - 1));
+#endif // RHS_INTERLEAVE
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr =
+ bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+#endif // defined(MIXED_PRECISION)
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) +
+ z * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+#if defined(MIXED_PRECISION)
+ CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
+ ADD_BLOCK(M0, c, bias_hp);
+#else // defined(MIXED_PRECISION)
+ ADD_BLOCK(M0, c, bias);
+#endif // defined(MIXED_PRECISION)
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+#if defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);
+#else // defined(MIXED_PRECISION)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(MIXED_PRECISION)
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+#if defined(MIXED_PRECISION)
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#else // defined(MIXED_PRECISION)
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+#endif // defined(MIXED_PRECISION)
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef LHS_STEP_LOOP
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
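+
+// A minimal host-side sketch (not part of the kernel source): example build
+// options for gemm_mm_reshaped_lhs_t_rhs_nt above, assuming F32 data and the
+// purely illustrative sizes M=64, N=64. Depending on the surrounding
+// preprocessor guards, further defines (e.g. -DK, -DLHS_TRANSPOSE) may also
+// be required:
+//   -DDATA_TYPE=float -DDATA_TYPE_ACCUMULATOR=float -DM=64 -DN=64
+//   -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2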
+
+#endif // defined(LHS_TRANSPOSE)
+
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) &&
+ // defined(DATA_TYPE)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#define VFMA(a, b, c) ({ c = fma(a, b, c); })
+
+#if M0 == 1
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); })
+#elif M0 == 2 // M0 == 2
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ })
+#elif M0 == 3 // M0 == 3
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ })
+#elif M0 == 4 // M0 == 4
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ })
+#elif M0 == 5 // M0 == 5
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ })
+#elif M0 == 6 // M0 == 6
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ })
+#elif M0 == 7 // M0 == 7
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ })
+#elif M0 == 8 // M0 == 8
+#define RHS_VFMA_M0xN0(i, a, b, c) \
+ ({ \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
+ VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
+ })
+#else // M0 not supported
+#error "M0 not supported"
+#endif // M0 not supported
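+
+// For reference, a sketch of the expansion for M0 == 2: the call
+// RHS_VFMA_M0xN0(0, a, b0, c) expands to
+//   c0 = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a0.s0), b0, c0);
+//   c1 = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a1.s0), b0, c1);
+// i.e. component i of each LHS row vector is broadcast to an N0-wide vector
+// and accumulated against a single N0-wide row of the RHS.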
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped.
+ * The RHS matrix is NOT reshaped.
+ *
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK
+ * (e.g. -DM=52, -DN=30 and -DK=90)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g.,
+ * -DK0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the input or output has to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *       (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of columns of the LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type:
+ * F16/F32
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)
+ * @param[in] lhs_step_x lhs_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)
+ * @param[in] lhs_step_y lhs_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix
+ * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type:
+ * same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)
+ * @param[in] rhs_step_x rhs_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)
+ * @param[in] rhs_step_y rhs_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix
+ * @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] bias_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] bias_step_x (Optional) bias_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] bias_step_y (Optional) bias_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)
+ * @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix
+ * in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+#if defined(BETA)
+ IMAGE_DECLARATION(bias),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z,
+#if defined(BETA)
+ uint bias_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+  // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+ for (; i <= (K - K0); i += K0)
+ {
+ // Supported cases (M0, K0):
+ // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
+ // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
+ // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
+ // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
+ // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
+ // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
+ // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
+ // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);
+
+ RHS_VFMA_M0xN0(0, a, b0, c);
+ RHS_VFMA_M0xN0(1, a, b1, c);
+#if K0 > 2
+ RHS_VFMA_M0xN0(2, a, b2, c);
+#endif // K0 > 2
+#if K0 > 3
+ RHS_VFMA_M0xN0(3, a, b3, c);
+#endif // K0 > 3
+#if K0 > 4
+ RHS_VFMA_M0xN0(4, a, b4, c);
+ RHS_VFMA_M0xN0(5, a, b5, c);
+ RHS_VFMA_M0xN0(6, a, b6, c);
+ RHS_VFMA_M0xN0(7, a, b7, c);
+#endif // K0 > 4
+#if K0 > 8
+ RHS_VFMA_M0xN0(8, a, b8, c);
+ RHS_VFMA_M0xN0(9, a, b9, c);
+ RHS_VFMA_M0xN0(A, a, bA, c);
+ RHS_VFMA_M0xN0(B, a, bB, c);
+ RHS_VFMA_M0xN0(C, a, bC, c);
+ RHS_VFMA_M0xN0(D, a, bD, c);
+ RHS_VFMA_M0xN0(E, a, bE, c);
+ RHS_VFMA_M0xN0(F, a, bF, c);
+#endif // K0 > 8
+
+ lhs_offset += K0 * sizeof(DATA_TYPE);
+ rhs_offset += K0 * rhs_stride_y;
+ }
+
+ // Left-over accumulations
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
+#if M0 > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
+#endif // M0 > 1
+#if M0 > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
+#endif // M0 > 2
+#if M0 > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
+#endif // M0 > 3
+#if M0 > 4
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
+#endif // M0 > 4
+#if M0 > 5
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
+#endif // M0 > 5
+#if M0 > 6
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
+#endif // M0 > 6
+#if M0 > 7
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
+#endif // M0 > 7
+
+ VEC_DATA_TYPE(DATA_TYPE, N0)
+ b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
+ RHS_VFMA_M0xN0(0, a, b, c);
+
+ lhs_offset += sizeof(DATA_TYPE);
+ rhs_offset += rhs_stride_y;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+  // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+#if defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(M0, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)M0 * bias_stride_y) +
+ get_global_id(2) * bias_stride_z;
+
+ LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(M0, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
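+
+// A minimal host-side sketch (illustrative values, not part of the source):
+// example build options for gemm_mm_native above, assuming F32 data:
+//   -DDATA_TYPE=float -DM=64 -DN=64 -DK=64 -DM0=4 -DN0=4 -DK0=4
+// Here M0 may range over 1..8, while N0 and K0 are restricted to 2, 3, 4, 8
+// or 16 (see the kernel documentation above).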
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)
+
+#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
+/** This OpenCL kernel is optimized for Midgard. It computes the matrix multiplication between
+ * matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *       (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of columns of matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src0_step_x                           src0_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src0_step_y                           src0_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src1_step_x                           src1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src1_step_y                           src1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global float *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float4 c0 = 0.0f;
+ float4 c1 = 0.0f;
+ float4 c2 = 0.0f;
+ float4 c3 = 0.0f;
+
+ for (; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ c0 += (float4)a0.s0 * b0;
+ c1 += (float4)a0.s1 * b0;
+ c2 += (float4)a0.s2 * b0;
+ c3 += (float4)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
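+
+  // Worked example (illustrative numbers): with HEIGHT_GEMM3D = 24 and
+  // DEPTH_GEMM3D = 2, output rows 0..23 fall in plane 0 and rows 24..47 in
+  // plane 1; a tile starting at row get_global_id(1) * 4 = 24 therefore gets
+  // zout = 1, and its row addresses are shifted by cross_plane_pad *
+  // dst_stride_y bytes.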
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x4 block
+ vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
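+
+// A host-side sketch (illustrative values, not part of the source): example
+// build options for gemm_mm_interleaved_transposed_f32 above:
+//   -DCOLS_B=64 -DMULT_TRANSPOSE1XW_WIDTH=2 -DMULT_INTERLEAVE4X4_HEIGHT=2
+// with -DALPHA=... added only when alpha scaling is actually required.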
+
+/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between
+ * matrix A reshaped (src0) and matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *       (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of columns of matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src0_step_x                           src0_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src0_step_y                           src0_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src1_step_x                           src1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src1_step_y                           src1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);
+ __global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float4 c0 = 0.0f;
+ float4 c1 = 0.0f;
+ float4 c2 = 0.0f;
+ float4 c3 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))
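+
+  // COLS_MTX_B counts the 4-element blocks of the reshaped RHS consumed per
+  // output tile; the first loop below is manually unrolled to process four
+  // such blocks per iteration, and the second loop handles the remainder.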
+
+ int i = 0;
+ for (; i <= (int)(COLS_MTX_B - 4); i += 4)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+ }
+
+ for (; i < (int)(COLS_MTX_B); ++i)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = vload4(0, src_addr_a);
+ float4 b0 = vload4(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0.s0 = fma(a0.s0, b0.s0, c0.s0);
+ c0.s1 = fma(a0.s0, b0.s1, c0.s1);
+ c0.s2 = fma(a0.s0, b0.s2, c0.s2);
+ c0.s3 = fma(a0.s0, b0.s3, c0.s3);
+
+ c1.s0 = fma(a0.s1, b0.s0, c1.s0);
+ c1.s1 = fma(a0.s1, b0.s1, c1.s1);
+ c1.s2 = fma(a0.s1, b0.s2, c1.s2);
+ c1.s3 = fma(a0.s1, b0.s3, c1.s3);
+
+ c2.s0 = fma(a0.s2, b0.s0, c2.s0);
+ c2.s1 = fma(a0.s2, b0.s1, c2.s1);
+ c2.s2 = fma(a0.s2, b0.s2, c2.s2);
+ c2.s3 = fma(a0.s2, b0.s3, c2.s3);
+
+ c3.s0 = fma(a0.s3, b0.s0, c3.s0);
+ c3.s1 = fma(a0.s3, b0.s1, c3.s1);
+ c3.s2 = fma(a0.s3, b0.s2, c3.s2);
+ c3.s3 = fma(a0.s3, b0.s3, c3.s3);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x4 block
+ vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and
+ * matrix B reshaped (src1)
+ *
+ * @note The number of columns of matrix B and the optional alpha value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ *       (HEIGHT_GEMM3D * DEPTH_GEMM3D) = number of columns of matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src0_step_x                           src0_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src0_step_y                           src0_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in]  src1_step_x                           src1_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in]  src1_step_y                           src1_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr                               Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ half8 c0 = 0.0f;
+ half8 c1 = 0.0f;
+ half8 c2 = 0.0f;
+ half8 c3 = 0.0f;
+
+ for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);
+ b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ c0 += (half8)a0.s0 * b0;
+ c1 += (half8)a0.s1 * b0;
+ c2 += (half8)a0.s2 * b0;
+ c3 += (half8)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
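+  // Worked example with hypothetical values HEIGHT_GEMM3D=2, DEPTH_GEMM3D=3 and
+  // get_global_id(1)=1: the tile rows are M = (4,5,6,7), so
+  //   zout = (4,5,6,7) / 2    -> (2,2,3,3)
+  //   zout = min(3 - 1, zout) -> (2,2,2,2)
+  // i.e. every row of this 4x8 tile lands in plane 2.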
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // Multiply by the weight (alpha) of the matrix-matrix product
+#if defined(ALPHA)
+ SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
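+  // REPEAT_VAR_INIT_TO_CONST declares zero0..zero3 = 0; LOAD_BLOCK below uses them as
+  // per-row Z offsets, which stay 0 here because the bias tile is read as a plain 2D block.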
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
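+  // Each work-item reads the 8-element slice of the single bias row that matches its
+  // 8-column output block, hence the get_global_id(0) * 8 * sizeof(half) offset.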
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
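+// A minimal sketch of the build options the kernel above expects (hypothetical values; the
+// host picks the real ones when the program is built):
+//   -DARM_COMPUTE_OPENCL_FP16_ENABLED -DCOLS_B=256
+//   -DMULT_TRANSPOSE1XW_WIDTH=2 -DMULT_INTERLEAVE4X4_HEIGHT=2
+//   -DALPHA=1.0f -DBETA=1.0f -DACTIVATION_TYPE=RELU -DA_VAL=0.0f -DB_VAL=0.0f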
+
+/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and
+ * matrix B reshaped (src1) while accumulating the result in a 32-bit floating point variable.
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in units of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+  // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ float8 c0 = 0.0f;
+ float8 c1 = 0.0f;
+ float8 c2 = 0.0f;
+ float8 c3 = 0.0f;
+
+ for (; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH));
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
+ b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+ }
+
+ for (; src_addr_b < src_end_addr_b;
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ float4 a0 = convert_float4(vload4(0, src_addr_a));
+ float8 b0 = convert_float8(vload8(0, src_addr_b));
+
+ c0 += (float8)a0.s0 * b0;
+ c1 += (float8)a0.s1 * b0;
+ c2 += (float8)a0.s2 * b0;
+ c3 += (float8)a0.s3 * b0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // Multiply by the weight (alpha) of the matrix-matrix product
+#if defined(ALPHA)
+ SCALE_BLOCK(4, float, c, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+ float8 bias_f1 = convert_float8(bias1);
+ float8 bias_f2 = convert_float8(bias2);
+ float8 bias_f3 = convert_float8(bias3);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ half8 c_h0 = convert_half8(c0);
+ half8 c_h1 = convert_half8(c1);
+ half8 c_h2 = convert_half8(c2);
+ half8 c_h3 = convert_half8(c3);
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
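+// Note on the acc32 variant above: half products are accumulated in float8 registers and
+// converted back to half only at the store. A rough illustration of why: half has an
+// 11-bit significand, so once a running sum reaches 2048.0h, adding 1.0h rounds back to
+// 2048.0h and the contribution is lost; float accumulation avoids this.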
+
+/** This OpenCL kernel, optimized for Bifrost architectures, computes the matrix multiplication
+ * between matrix A reshaped (src0) and matrix B reshaped (src1).
+ *
+ * @note The number of columns of matrix B and the optional alpha's value need to be passed at
+ * compile time using -DCOLS_B and -DALPHA
+ * @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be
+ * passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)
+ * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at
+ * compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] cross_plane_pad (Optional) Bottom paddings in units of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;
+ int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;
+ int z = get_global_id(2);
+
+ // Offset
+ const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
+ const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;
+
+ // src_addr_a = address of matrix A
+ // src_addr_b = address of matrix B
+ int src0_addr_in_bytes =
+ z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;
+ int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;
+
+#if defined(MATRIX_B_DEPTH)
+  // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src1_addr_in_bytes += z * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ __global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);
+ __global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);
+
+ // Compute end row address for matrix B
+ __global half *src_end_addr_b = src_addr_b + COLS_B;
+
+ src_addr_a += offset_row_a;
+ src_addr_b += offset_row_b;
+
+ // Reset accumulators
+ half8 c0 = 0.0f;
+ half8 c1 = 0.0f;
+ half8 c2 = 0.0f;
+ half8 c3 = 0.0f;
+
+#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))
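+// COLS_MTX_B is the number of half8 column blocks per row of the reshaped matrix B. With
+// hypothetical values COLS_B=128 and MULT_TRANSPOSE1XW_WIDTH=2, COLS_MTX_B = 128 / 16 = 8,
+// so the unrolled loop below runs twice (i = 0, 4) and the remainder loop never executes.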
+
+ int i = 0;
+ for (; i <= (int)(COLS_MTX_B - 4); i += 4)
+ {
+#if MULT_INTERLEAVE4X4_HEIGHT == 1
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half8 a0 = vload8(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix B (transposed)
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s4, b0, c0);
+ c1 = fma((half8)a0.s5, b0, c1);
+ c2 = fma((half8)a0.s6, b0, c2);
+ c3 = fma((half8)a0.s7, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload8(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix B (transposed)
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s4, b0, c0);
+ c1 = fma((half8)a0.s5, b0, c1);
+ c2 = fma((half8)a0.s6, b0, c2);
+ c3 = fma((half8)a0.s7, b0, c3);
+#else // MULT_INTERLEAVE4X4_HEIGHT == 1
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ a0 = vload4(0, src_addr_a);
+ b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
+ }
+
+ for (; i < (int)(COLS_MTX_B); ++i)
+ {
+ // Load values from matrix A (interleaved) and matrix B (transposed)
+ half4 a0 = vload4(0, src_addr_a);
+ half8 b0 = vload8(0, src_addr_b);
+
+ src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;
+ src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;
+
+ c0 = fma((half8)a0.s0, b0, c0);
+ c1 = fma((half8)a0.s1, b0, c1);
+ c2 = fma((half8)a0.s2, b0, c2);
+ c3 = fma((half8)a0.s3, b0, c3);
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // Multiply by the weight (alpha) of the matrix-matrix product
+#if defined(ALPHA)
+ SCALE_BLOCK(4, half, c, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(4, c, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(4, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(4, c, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store 4x8 block
+ vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));
+ vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));
+ vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));
+ vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));
+}
+
+// Undefine local defines
+#undef COLS_MTX_B
+
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)
+
+#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && \
+  defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+#if defined(DATA_TYPE)
+#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)
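+// e.g. with hypothetical build options -DDATA_TYPE=float and
+// -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4, VECTOR_TYPE expands to float4.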
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) when neither matrix has been reshaped.
+ *
+ * @note This OpenCL kernel works with floating point data types (F16/F32)
+ * @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g.
+ * -DDATA_TYPE=float)
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y
+ * @note The number of matrix A columns and the optional alpha's value need to be passed at compile
+ * time using -DCOLS_A and -DALPHA
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16/F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in units of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in units of elements for
+ * the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z, uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+  // Compute starting address for matrix A and matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(DATA_TYPE);
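+  // Each work-item's output tile starts idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X
+  // elements into a row of B and get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y rows into A.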
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+  // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));
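+  // The main loop below consumes two columns of A (two rows of B) per iteration; the scalar
+  // tail loop handles any leftover column. With a hypothetical COLS_A=5 this gives two
+  // vectorized iterations plus one remainder iteration.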
+
+ VECTOR_TYPE acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VECTOR_TYPE acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VECTOR_TYPE acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VECTOR_TYPE acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE));
+ src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0,
+ src0_stride_y, zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ VEC_DATA_TYPE(DATA_TYPE, 2)
+ a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ VECTOR_TYPE b0 =
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+ VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(
+ 0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0.s0;
+ acc0 += b1 * (VECTOR_TYPE)a0.s1;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1.s0;
+ acc1 += b1 * (VECTOR_TYPE)a1.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2.s0;
+ acc2 += b1 * (VECTOR_TYPE)a2.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3.s0;
+ acc3 += b1 * (VECTOR_TYPE)a3.s1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ VECTOR_TYPE b0 =
+ VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));
+
+ // Accumulate
+ acc0 += b0 * (VECTOR_TYPE)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 += b0 * (VECTOR_TYPE)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 += b0 * (VECTOR_TYPE)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 += b0 * (VECTOR_TYPE)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zout) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+  // Multiply by the weight (alpha) of the matrix-matrix product
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));
+
+ LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y,
+ zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes +
+ (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias,
+ src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);
+#endif // UNIT_BETA
+
+ // c = c + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc,
+ dst_addr, dst_stride_y, zout.s);
+}
+#endif // defined(DATA_TYPE)
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) when neither matrix has been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma
+ * units.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This
+ * kernel processes a fixed number of elements along x: -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note In case matrix B has 3 dimensions and matrix A has more than 3, in order to avoid
+ * out-of-bounds reads, the number of channels of matrix B must be passed at compile time using
+ * MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16). This case can happen when GEMM is used to perform
+ * the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we
+ * have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be
+ * passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively. The activation
+ * function is performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p lhs_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in units of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in units of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for matrix B
+ src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+  // The plane (zin) is calculated by dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+ // by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+  // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ // Initialize accumulators
+ float4 acc0 = 0.0f;
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float4 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float4 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float4 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // A and B src indices get incremented at the same time.
+ int i = 0;
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A and matrix B
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A and matrix B
+ float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // Load values from matrix A and matrix B
+ b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+
+ acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+
+ acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);
+
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(float);
+ }
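+  // Each pass of the unrolled loop above performed a rank-4 update: four columns of A
+  // (a0.s0..a0.s3 per row) against four consecutive rows of B. The loop below handles
+  // the remaining columns one at a time.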
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0, b0.s1, acc0.s1);
+ acc0.s2 = fma(a0, b0.s2, acc0.s2);
+ acc0.s3 = fma(a0, b0.s3, acc0.s3);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1.s0 = fma(a1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1, b0.s1, acc1.s1);
+ acc1.s2 = fma(a1, b0.s2, acc1.s2);
+ acc1.s3 = fma(a1, b0.s3, acc1.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2.s0 = fma(a2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2, b0.s1, acc2.s1);
+ acc2.s2 = fma(a2, b0.s2, acc2.s2);
+ acc2.s3 = fma(a2, b0.s3, acc2.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3.s0 = fma(a3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3, b0.s1, acc3.s1);
+ acc3.s2 = fma(a3, b0.s2, acc3.s2);
+ acc3.s3 = fma(a3, b0.s3, acc3.s3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float);
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) *
+ // NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
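+ // Worked example (illustrative values, not set by this kernel): with HEIGHT_GEMM3D=8,
+ // NUM_ELEMS_PROCESSED_PER_THREAD_Y=4 and get_global_id(1)=1, the tile covers rows 4..7,
+ // so zout = (4,5,6,7) / 8 = (0,0,0,0); the next tile (rows 8..11) falls on plane 1.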
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of the matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));
+
+ LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma
+ * units. This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less
+ * than or equal to 1000.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if
+ * alpha!=1.0f.
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of
+ * matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in
+ * order to avoid out-of-bounds reads. This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also
+ * be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
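+// A possible set of build options for this kernel, for illustration only (all values are
+// hypothetical; see the notes above for the meaning of each define):
+//   -DCOLS_A=1024 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4
+//   -DALPHA=1.5f -DBETA=1.0f -DACTIVATION_TYPE=RELU -DA_VAL=0.0f -DB_VAL=0.0f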
+__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Requires NUM_ELEMS_PROCESSED_PER_THREAD_X == 2: C is a vector of 2, A a vector of 4, and B is
+ // loaded with two vload2 calls. TODO: fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(float);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) *
+ // NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
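+ // At this point zin no longer holds plane indices but per-row byte offsets: each row is
+ // shifted by its plane index times src_cross_plane_pad padded rows.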
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
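+ // Example (illustrative): with -DMATRIX_B_DEPTH=16 and more than 16 batches of matrix A,
+ // batch b of A is paired with slice (b % 16) of B instead of reading past the end of B.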
+
+ // Initialize accumulators
+ float2 acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float2 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float2 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float2 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ // A and B src indices get incremented at the same time.
+ int i = 0;
+ for (; i <= ((int)COLS_A - 8); i += 8)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);
+ acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);
+ acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);
+ acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);
+ acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);
+ acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);
+ acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);
+ acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);
+
+ acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);
+ acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);
+ acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);
+ acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);
+ acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);
+ acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);
+ acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);
+ acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);
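+ // Each chain of eight fma calls above accumulates an 8-deep partial dot product between
+ // row 0 of the A tile (a0) and one column of B (the .s0/.s1 lanes of b0..b7).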
+
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);
+ acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);
+ acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);
+ acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);
+ acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);
+ acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);
+ acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);
+ acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);
+
+ acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);
+ acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);
+ acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);
+ acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);
+ acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);
+ acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);
+ acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);
+ acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);
+ acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);
+ acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);
+ acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);
+ acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);
+ acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);
+ acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);
+ acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);
+
+ acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);
+ acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);
+ acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);
+ acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);
+ acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);
+ acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);
+ acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);
+ acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#if defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+ acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);
+ acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);
+ acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);
+ acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);
+ acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);
+ acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);
+ acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);
+ acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);
+
+ acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);
+ acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);
+ acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);
+ acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);
+ acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);
+ acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);
+ acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);
+ acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float) * 8;
+ }
+ // Left-over loop: process the remaining columns of matrix A one float at a time
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Multiply and accumulate
+ acc0.s0 = fma(a0, b0.s0, acc0.s0);
+ acc0.s1 = fma(a0, b0.s1, acc0.s1);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1.s0 = fma(a1, b0.s0, acc1.s0);
+ acc1.s1 = fma(a1, b0.s1, acc1.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2.s0 = fma(a2, b0.s0, acc2.s0);
+ acc2.s1 = fma(a2, b0.s1, acc2.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3.s0 = fma(a3, b0.s0, acc3.s0);
+ acc3.s1 = fma(a3, b0.s1, acc3.s1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += sizeof(float);
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) *
+ // NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of the matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));
+
+ LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+}
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates
+ * the result in 32-bit floating point variables.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of
+ * matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in
+ * order to avoid out-of-bounds reads. This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also
+ * be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
+__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),
+ IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) *
+ // NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ float8 acc0 = 0.0f;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float8 acc1 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float8 acc2 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float8 acc3 = 0.0f;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
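+ // Note: the accumulators are deliberately fp32 even though the inputs are fp16. Alpha and
+ // beta*bias are applied in fp32 below, and the result is only narrowed back to half (with
+ // convert_half8) right before the activation and the final store.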
+
+ int i = 0;
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+
+ // Accumulate
+ acc0 = fma(b0, (float8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (float8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(half);
+ }
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));
+
+ src_addr += (int2)(sizeof(half), src1_stride_y);
+
+ // Accumulate
+ acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) *
+ // NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of the matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);
+#endif // defined(ALPHA)
+
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+ float8 bias_f0 = convert_float8(bias0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ float8 bias_f1 = convert_float8(bias1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ float8 bias_f2 = convert_float8(bias2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ float8 bias_f3 = convert_float8(bias3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+ half8 acc_h0 = convert_half8(acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half8 acc_h1 = convert_half8(acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half8 acc_h2 = convert_half8(acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half8 acc_h3 = convert_half8(acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);
+}
+
+/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and
+ * matrix B (src1) in case both matrices have not been reshaped
+ *
+ * @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma
+ * units.
+ * @note The number of elements processed along the x and y directions must be passed at compile
+ * time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y. This kernel
+ * optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.
+ * @note The number of matrix A columns must be passed at compile time using -DCOLS_A.
+ * @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha
+ * @note If matrix B has 3 dimensions and matrix A has more than 3, the number of channels of
+ * matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16) in
+ * order to avoid out-of-bounds reads. This case can happen when GEMM is used to perform the
+ * element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have
+ * multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])
+ *
+ * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g.
+ * -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must also
+ * be passed at compile time using -DA_VAL= and -DB_VAL= respectively. The activation function is
+ * performed after the bias addition
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F16
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported
+ * data type: same as @p src0_ptr
+ * @param[in] src2_stride_x (Optional) Stride of the bias matrix in X
+ * dimension (in bytes)
+ * @param[in] src2_step_x (Optional) src2_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y
+ * dimension (in bytes)
+ * @param[in] src2_step_y (Optional) src2_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the
+ * bias matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] src0_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z
+ * dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for
+ * the input tensor (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
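+// A possible set of build options for this fp16 kernel, for illustration only (the kernel is
+// compiled only when ARM_COMPUTE_OPENCL_FP16_ENABLED is defined):
+//   -DCOLS_A=256 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4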
+__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0), IMAGE_DECLARATION(src1),
+#if defined(BETA)
+ IMAGE_DECLARATION(src2),
+#endif // defined(BETA)
+ IMAGE_DECLARATION(dst), uint src0_stride_z,
+ uint src1_stride_z,
+#if defined(BETA)
+ uint src2_stride_z,
+#endif // defined(BETA)
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint src_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
+
+ // Compute starting address for matrix A and Matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
+
+ // Update address for the matrix A
+ src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
+
+ // Update address for the matrix B
+ src_addr.s1 += idx * sizeof(half);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zin) is calculated by dividing M (get_global_id(1) *
+ // NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zin = min(DEPTH_GEMM3D - 1, zin);
+
+ // Add offset due to the cross plane paddings
+ zin *= (src_cross_plane_pad * src0_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply src0_stride_z by DEPTH_GEMM3D
+ src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ src_addr.s0 += get_global_id(2) * src0_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ src_addr.s1 += get_global_id(2) * src1_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ half8 acc0 = 0.0h;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half8 acc1 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half8 acc2 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half8 acc3 = 0.0h;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ int i = 0;
+ for (; i <= ((int)COLS_A - 4); i += 4)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y,
+ zin.s);
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+
+ // Accumulate
+ acc0 = fma(b0, (half8)a0.s0, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s0, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s0, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s0, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s1, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s1, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s1, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s1, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s2, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s2, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s2, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s2, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+ src_addr.s1 += src1_stride_y;
+ acc0 = fma(b0, (half8)a0.s3, acc0);
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1.s3, acc1);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2.s3, acc2);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3.s3, acc3);
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+
+ src_addr.s0 += 4 * sizeof(half);
+ }
+
+ for (; i < (int)COLS_A; ++i)
+ {
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#else // defined(REINTERPRET_INPUT_AS_3D)
+ // Load values from matrix A
+ half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Load values from matrix B
+ half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));
+
+ src_addr += (int2)(sizeof(half), src1_stride_y);
+
+ // Accumulate
+ acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+ acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+ acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
+#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;
+#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
+ }
+
+ int z = get_global_id(2);
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ // Compute dst address
+ __global uchar *dst_addr = offset(&dst, 0, 0);
+
+ uint4 zout = 0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across
+ // the z dimension in order to take into account the presence of possible cross plane paddings
+ //
+ // | |
+ // | plane0 |
+ // | |
+ // |__________________|
+ // |******************|
+ // | cross_plane_pad |
+ // |******************|
+ // | |
+ // | plane1 |
+ // | |
+ // |__________________|
+
+ // The plane (zout) is calculated by dividing M (get_global_id(1) *
+ // NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D
+ zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) /
+ (uint4)HEIGHT_GEMM3D;
+ zout = min(DEPTH_GEMM3D - 1, zout);
+
+ // Add offset due to the cross plane paddings
+ zout *= (dst_cross_plane_pad * dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Multiply by the weight of the matrix-matrix product and store the result
+#if defined(ALPHA)
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);
+#endif // defined(ALPHA)
+
+ // Add beta*bias
+#if defined(BETA)
+ REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);
+
+#if defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));
+
+ LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(1, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias[broadcasted]
+ ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);
+
+#else // defined(BROADCAST_BIAS)
+ __global uchar *src2_addr =
+ src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) +
+ (get_global_id(1) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) +
+ get_global_id(2) * src2_stride_z;
+
+ LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);
+
+#ifndef UNIT_BETA
+ SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);
+#endif // UNIT_BETA
+
+ // acc = acc + bias
+ ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);
+
+#endif // defined(BROADCAST_BIAS)
+#endif // defined(BETA)
+
+#if defined(ACTIVATION_TYPE)
+ ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);
+#endif // defined(ACTIVATION_TYPE)
+
+ // Store the output block
+ STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+
+#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) &&
+       // defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y)
+
+#if defined(BETA)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account
+ * that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The beta value needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types:
+ * F32
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Load values from A x B
+ float4 alpha_ab = vload4(0, (__global float *)dst.ptr);
+
+ // Load values from Matrix C
+ float4 c = vload4(0, (__global float *)src.ptr);
+
+ // Computes alpha * axb + beta * c
+ float4 out = alpha_ab + (float4)BETA * c;
+
+ // Store final result in axb matrix
+ vstore4(out, 0, (__global float *)dst.ptr);
+}
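+
+// Usage sketch (illustrative): when built with e.g. "-DBETA=0.5f", each work-item
+// of gemm_ma_f32 updates four consecutive floats of dst in place:
+//   dst[0..3] = dst[0..3] + 0.5f * src[0..3]
+// where dst already holds alpha * (A x B) and src holds matrix C.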
+
+#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account
+ * that the second matrix might be weighted by a scalar value beta:
+ *
+ * @note The beta value needs to be passed at compile time using -DBETA
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types:
+ * F16
+ * @param[in] src_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Load values from A x B
+ half8 alpha_ab = vload8(0, (__global half *)dst.ptr);
+
+ // Load values from Matrix C
+ half8 c = vload8(0, (__global half *)src.ptr);
+
+ // Computes alpha * axb + beta * c
+ half8 out = alpha_ab + (half8)BETA * c;
+
+ // Store final result in axb matrix
+ vstore8(out, 0, (__global half *)dst.ptr);
+}
+#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)
+#endif // defined(BETA)
+
+#if defined(WIDTH_VECTOR_A)
+/** This OpenCL kernel computes the vector-by-matrix multiplication between each row of A (src0)
+ * and matrix B (src1), as used for the locally connected layer
+ *
+ * @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A
+ *
+ * @note The input A and matrix B must not be reshaped
+ *
+ * @param[in] src0_ptr Pointer to the source matrix. Supported data
+ * types: F32
+ * @param[in] src0_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src0_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src0_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src0_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[in] src1_ptr Pointer to the source matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] src1_stride_x Stride of the source matrix in X dimension (in
+ * bytes)
+ * @param[in] src1_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_y Stride of the source matrix in Y dimension (in
+ * bytes)
+ * @param[in] src1_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src1_stride_z Stride of the source matrix in Z dimension (in
+ * bytes)
+ * @param[in] src1_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix. Supported data
+ * types: same as @p src0_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_gx_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ */
+__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0), TENSOR3D_DECLARATION(src1),
+ IMAGE_DECLARATION(dst))
+{
+ int idx = get_global_id(0) * 4;
+ int idy = get_global_id(1);
+
+ // Compute the address for the vector A and matrix B
+ int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy,
+ src1_offset_first_element_in_bytes + src1_stride_z * idy));
+ src_addr.s1 += idx * sizeof(float);
+
+ int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));
+
+ float4 acc = 0.0f;
+
+ for (; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float));
+ src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))
+ {
+ float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+ float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));
+
+ acc += b0 * (float4)a0.s0;
+ acc += b1 * (float4)a0.s1;
+ }
+
+ for (; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))
+ {
+ float a0 = *((__global float *)(src0_ptr + src_addr.s0));
+ float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));
+
+ acc += b0 * (float4)a0;
+ }
+
+ // Compute destination address
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));
+}
+#endif // defined(WIDTH_VECTOR_A)
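+
+// Usage sketch (illustrative): gemm_lc_vm_f32 above is expected to be built with
+// e.g. "-DWIDTH_VECTOR_A=64"; each work-item then accumulates one float4 block of
+// output columns (idx = get_global_id(0) * 4) for the row of A selected by idy,
+// consuming two elements of A per iteration of the main loop and handling any
+// leftover element in the scalar tail loop.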
+
+/** This kernel accumulates each row with the biases vector.
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=short.
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE e.g. -DVECTOR_SIZE=16.
+ *
+ * @param[in, out] accum_ptr Pointer to the accumulate tensor. Supported
+ * data type: U8/S8/U16/S16/F16/U32/S32/F32
+ * @param[in] accum_stride_x Stride of the accumulate tensor in X
+ * dimension (in bytes)
+ * @param[in] accum_step_x accum_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] accum_stride_y Stride of the accumulate tensor in Y
+ * dimension (in bytes)
+ * @param[in] accum_step_y accum_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] accum_offset_first_element_in_bytes The offset of the first element in the
+ * accumulate tensor
+ * @param[in] biases_ptr Pointer to the biases vector. Same as @p
+ * accum_ptr
+ * @param[in] biases_stride_x Stride of the biases vector in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x biases_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the
+ * biases vector
+ */
+#if defined(DATA_TYPE) && defined(VECTOR_SIZE)
+__kernel void gemm_accumulate_biases(IMAGE_DECLARATION(accum), VECTOR_DECLARATION(biases))
+{
+ Image accum = CONVERT_TO_IMAGE_STRUCT(accum);
+ Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
+
+ // VECTOR_SIZE is the number of elements loaded and stored per work-item.
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ accum_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)accum.ptr);
+ VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+ biases_value = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)biases.ptr);
+ accum_value = biases_value + accum_value;
+ // Store result in the accumulate buffer
+ VSTORE(VECTOR_SIZE)
+ (accum_value, 0, (__global DATA_TYPE *)accum.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VECTOR_SIZE)
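+
+// Usage sketch (illustrative): building this kernel with e.g.
+//   "-DDATA_TYPE=float -DVECTOR_SIZE=16"
+// makes gemm_accumulate_biases add a float16 slice of the biases vector to the
+// matching float16 slice of each accumulator row, in place.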
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h
new file mode 100644
index 000000000..0c75d061f
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemm_helpers.h
@@ -0,0 +1,1235 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+
+/** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ROW_n
+ *
+ * @param[in] N0 The number of columns to load
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
+
+#define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
+
+#define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
+
+#define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
+
+#define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
+
+#define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
+
+#define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
+
+#define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
+
+#define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
+
+#define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
+
+#define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
+
+#define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
+
+#define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
+
+#define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
+
+#define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
+
+#define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
+
+/** @}*/ // end of group LOAD_ROW_n
+
+/** Load Blocks (consecutive rows and columns) with Z offset.
+ * @name LOAD_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of consecutive rows
+ * @param[in] N0 The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @param[in] Z The z-axis offset vector
+ * @{
+ */
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+ LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+/** @} */ // end of group LOAD_BLOCK
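+
+// Expansion sketch (illustrative): LOAD_BLOCK(2, 4, float, a, ptr, off, stride_y, zin)
+// expands to the equivalent of
+//   float4 a0 = vload4(0, (__global float *)(ptr + off + 0 * stride_y + zin0));
+//   float4 a1 = vload4(0, (__global float *)(ptr + off + 1 * stride_y + zin1));
+// assuming zin0 and zin1 are the per-row z offsets described above.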
+
+/** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
+ * @name LOAD_ELEMENT_n
+ *
+ * @param[in] N0 The width of the destination vectors
+ * @param[in] DATA_TYPE The data type of variables
+ * @param[in] BASENAME The basename of the destination variables for the loaded rows
+ * @param[in] PTR The base pointer
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @{
+ */
+#define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##0 = *((__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y));
+
+#define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##1 = *((__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y));
+
+#define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##2 = *((__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y));
+
+#define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##3 = *((__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y));
+
+#define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##4 = *((__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y));
+
+#define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##5 = *((__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y));
+
+#define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##6 = *((__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y));
+
+#define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##7 = *((__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y));
+
+#define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##8 = *((__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y));
+
+#define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##9 = *((__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y));
+
+#define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##A = *((__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y));
+
+#define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##B = *((__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y));
+
+#define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##C = *((__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y));
+
+#define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##D = *((__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y));
+
+#define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##E = *((__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y));
+
+#define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ VEC_DATA_TYPE(DATA_TYPE, N0) \
+ BASENAME##F = *((__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y));
+
+/** @}*/ // end of group LOAD_ELEMENT_n
+
+/** Load Scalar as Vector (consecutive elements).
+ * @name LOAD_SCALAR_AS_VECTOR
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
+ * The data to load is expected to have consecutive names for each row.
+ * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
+ *
+ * @param[in] M0 The number of consecutive rows
+ * @param[in] N0 The number of consecutive columns
+ * @param[in] DATA_TYPE The data type of the target
+ * @param[in] BASENAME The basename of the result variables
+ * @param[in] PTR The base pointer for the data
+ * @param[in] OFFSET The offset within a row
+ * @param[in] STRIDE_Y The stride in y-axis direction
+ * @{
+ */
+#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+ LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+/** @} */ // end of group LOAD_SCALAR_AS_VECTOR
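+
+// Expansion sketch (illustrative): LOAD_SCALAR_AS_VECTOR(2, 4, float, a, ptr, off, stride_y)
+// reads one scalar per row and broadcasts it into a float4:
+//   float4 a0 = *((__global float *)(ptr + off + 0 * stride_y));
+//   float4 a1 = *((__global float *)(ptr + off + 1 * stride_y));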
+
+/** Basic macros to calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET_n
+ *
+ * @param[in] M0 The number of offset values to calculate
+ * @param[in] DATA_TYPE The data type of the results
+ * @param[in] Z The basename of the result variables
+ * @param[in] Y The work-item ID of the y-axis
+ * @param[in] HEIGHT_GEMM3D The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ *
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##0 = (0 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##0 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##0); \
+ Z##0 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##1 = (1 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##1 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##1); \
+ Z##1 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##2 = (2 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##2 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##2); \
+ Z##2 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##3 = (3 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##3 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##3); \
+ Z##3 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##4 = (4 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##4 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##4); \
+ Z##4 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##5 = (5 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##5 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##5); \
+ Z##5 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##6 = (6 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##6 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##6); \
+ Z##6 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ Z##7 = (7 + (DATA_TYPE)(Y * (DATA_TYPE)M0)) / (DATA_TYPE)HEIGHT_GEMM3D; \
+ Z##7 = min((DATA_TYPE)(DEPTH_GEMM3D - 1), Z##7); \
+ Z##7 *= (CROSS_PLANE_PAD * STRIDE_Y);
+
+/** @} */ // end of group CALCULATE_Z_OFFSET_n
+
+/** Calculate Z offset values from Z0 to Zn-1
+ * @name CALCULATE_Z_OFFSET
+ *
+ * The Z offsets are expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected names of the Z offsets are zin0, zin1 and zin2.
+ * Note that CROSS_PLANE_PAD (cross plane padding) is required to take into account
+ * the possible cross plane paddings in case the plane changes across the z-dimension.
+ *
+ * <!--
+ * | |
+ * | plane0 |
+ * | |
+ * |__________________|
+ * |******************|
+ * | cross_plane_pad |
+ * |******************|
+ * | |
+ * | plane1 |
+ * | |
+ * |__________________|
+ * -->
+ *
+ * @param[in] M0 The number of offset values to calculate
+ * @param[in] DATA_TYPE The data type of the results
+ * @param[in] Z The basename of the result variables
+ * @param[in] Y The work-item ID of the y-axis
+ * @param[in] HEIGHT_GEMM3D The height of GEMM3D
+ * @param[in] DEPTH_GEMM3D The depth of GEMM3D
+ * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @{
+ */
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y) \
+ CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, \
+ STRIDE_Y)
+/** @} */ // end of group CALCULATE_Z_OFFSET
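+
+// Expansion sketch (illustrative): assuming zin0 and zin1 are already-declared uints,
+// CALCULATE_Z_OFFSET(2, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, pad, stride_y)
+// is equivalent to
+//   zin0 = min((uint)(DEPTH_GEMM3D - 1), (0 + y * 2u) / (uint)HEIGHT_GEMM3D) * (pad * stride_y);
+//   zin1 = min((uint)(DEPTH_GEMM3D - 1), (1 + y * 2u) / (uint)HEIGHT_GEMM3D) * (pad * stride_y);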
+
+/** Store the 0th to (n-1)th rows of the given variables
+ * @name STORE_ROW_n
+ *
+ * @param[in] N0 The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+/** @} */ // end of group STORE_ROW_n
+
+/** Convert and store the 0th to (n-1)th rows of the given variables
+ * @name CONVERT_STORE_ROW_n
+ *
+ * @param[in] N0 The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##0), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
+
+#define CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_1(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##1), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
+
+#define CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_2(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##2), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
+
+#define CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_3(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##3), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
+
+#define CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_4(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##4), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
+
+#define CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_5(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##5), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
+
+#define CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_6(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##6), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
+
+#define CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_7(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##7), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
+
+#define CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_8(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##8), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
+
+#define CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_9(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##9), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
+
+#define CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_10(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##A), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
+
+#define CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_11(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##B), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
+
+#define CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_12(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##C), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
+
+#define CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_13(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##D), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
+
+#define CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_14(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##E), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
+
+#define CONVERT_STORE_ROW_16(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_15(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ VSTORE(N0) \
+ (CONVERT_SAT((BASENAME##F), VEC_DATA_TYPE(DATA_TYPE, N0)), 0, \
+ (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
+
+/** @} */ // end of group CONVERT_STORE_ROW_n
+
+/** Store a block of the given size M0xN0
+ * @name STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store
+ * @param[in] N0 The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group STORE_BLOCK
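+
+// Expansion sketch (illustrative): STORE_BLOCK(2, 8, half, acc, dst_addr, dst_stride_y, zout)
+// expands to the equivalent of
+//   vstore8(acc0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout0));
+//   vstore8(acc1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout1));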
+
+/** Convert and store a block of the given size M0xN0
+ * @name CONVERT_STORE_BLOCK
+ *
+ * Supported cases are M0=1,2,3,...,16 and N0=2,3,4,8,16.
+ * The data to store is expected to have consecutive names for each row.
+ * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
+ * The Z offset is expected to have consecutive names.
+ * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
+ *
+ * @param[in] M0 The number of rows to store
+ * @param[in] N0 The size of each vector
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] PTR The base pointer
+ * @param[in] STRIDE_Y The stride value in y-axis direction
+ * @param[in] Z The offset in z-axis direction
+ * @{
+ */
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+ CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+/** @} */ // end of group CONVERT_STORE_BLOCK
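+
+// Expansion sketch (illustrative): CONVERT_STORE_BLOCK(1, 4, uchar, c, ptr, stride_y, zout)
+// saturate-converts before storing, expanding roughly to
+//   vstore4(convert_uchar4_sat(c0), 0, (__global uchar *)(ptr + 0 * stride_y + zout0));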
+
+/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
+ * @name SCALE_ROW_n
+ *
+ * @param[in] DATA_TYPE The data type of the variables
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] SCALE The scale factor
+ * @{
+ */
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##1 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##2 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##3 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##4 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##5 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##6 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##7 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##8 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##9 *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##A *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##B *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##C *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##D *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##E *= (DATA_TYPE)SCALE;
+
+#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
+ SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
+ BASENAME##F *= (DATA_TYPE)SCALE;
+/** @} */ // end of group SCALE_ROW_n
+
+/** Scale elements stored in a block (BASENAME)
+ * @name SCALE_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N The number of rows in the block
+ * @param[in] DATA_TYPE The data type of the block
+ * @param[in] BASENAME The basename of the block
+ * @param[in] SCALE The scale factor
+ * @{
+ */
+#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+/** @} */ // end of group SCALE_BLOCK
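+
+// Expansion sketch (illustrative): SCALE_BLOCK(2, half, acc, ALPHA) expands to
+//   acc0 *= (half)ALPHA;
+//   acc1 *= (half)ALPHA;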
+
+/** Create a new vector containing the values at the given index for a set of given vectors
+ * @name COLUMN_VECTORn
+ *
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
+ TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 2) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
+#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 3) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
+#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 4) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, \
+ (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \
+ (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \
+ (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \
+ (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, \
+ (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, \
+ (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, \
+ (X##F).s##IDX_COL);
+/** @} */ // end of group COLUMN_VECTORn
+
+/** Create a new vector containing the values at the given index. Utility macros for transposing a
+ * column-vector
+ * @name COLUMN_VECTOR_SCALARn
+ *
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] X The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ * @{
+ */
+#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
+#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 2) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
+#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 3) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0), (X##1), (X##2));
+#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 4) \
+ BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0), (X##1), (X##2), (X##3));
+#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 8) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
+#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
+ VEC_DATA_TYPE(TYPE, 16) \
+ BASENAME##IDX_COL = \
+ (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
+ (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
+/** @} */ // end of group COLUMN_VECTOR_SCALARn
+
+/** Create transposed vectors of the given vectors
+ * @name TRANSPOSE_K0Xn
+ *
+ * @param[in] K0 The size of the source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B The basename of source vectors for transposition
+ * @param[in] TYPE The data type of the transposed vectors
+ * @{
+ */
+#define TRANSPOSE_K0X1(K0, BASENAME, B, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X2(K0, BASENAME, B, TYPE) \
+ COLUMN_VECTOR(K0, 0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 1, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X3(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X2(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 2, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X4(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X3(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 3, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X8(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X4(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 4, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 5, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 6, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 7, BASENAME, B, TYPE);
+#define TRANSPOSE_K0X16(K0, BASENAME, B, TYPE) \
+ TRANSPOSE_K0X8(K0, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 8, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, 9, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, A, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, B, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, C, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, D, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, E, BASENAME, B, TYPE); \
+ COLUMN_VECTOR(K0, F, BASENAME, B, TYPE);
+
+/** @} */ // end of group TRANSPOSE_K0Xn
+
+/** Create column vectors to contain the values at the given index for a set of given vectors
+ *
+ * @param[in] K0 The number of source vectors
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ */
+#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, B, TYPE) \
+ CONCAT(COLUMN_VECTOR, K0) \
+ (IDX_COL, BASENAME, B, TYPE);
+
+/** Create column vectors to contain the values at the given index. Utility macro for transposing a
+ * column-vector
+ *
+ * @param[in] K0 The number of source vectors
+ * @param[in] IDX_COL The index value
+ * @param[in] BASENAME The basename of the destination vectors
+ * @param[in] B The basename of the source vectors
+ * @param[in] TYPE The data type of the destination vectors
+ */
+#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, B, TYPE) \
+ CONCAT(COLUMN_VECTOR_SCALAR, K0) \
+ (IDX_COL, BASENAME, B, TYPE);
+
+/** Create transposed vectors from the given source vectors
+ *
+ * @param[in] K0 The size of source vectors
+ * @param[in] N0 The number of source vectors
+ * @param[in] BASENAME The basename of transposed vectors
+ * @param[in] B The basename of source vectors for transposition
+ * @param[in] TYPE The data type of the transposed vectors
+ *
+ */
+#define TRANSPOSE_K0XN0(K0, N0, BASENAME, B, TYPE) \
+ CONCAT(TRANSPOSE_K0X, N0) \
+ (K0, BASENAME, B, TYPE);
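+
+// Expansion sketch (illustrative): given four row vectors b0..b3, each with at least
+// two components, TRANSPOSE_K0XN0(4, 2, b_t, b, float) creates the column vectors
+//   float4 b_t0 = (float4)(b0.s0, b1.s0, b2.s0, b3.s0);
+//   float4 b_t1 = (float4)(b0.s1, b1.s1, b2.s1, b3.s1);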
+
+/** Add the variables (BIAS0 to BIASn-1) to the others (BASENAME0 to BASENAMEn-1)
+ * @name ADD_ROW_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The basename of the added variables
+ * @{
+ */
+#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
+
+#define ADD_ROW_2(BASENAME, BIAS) \
+ ADD_ROW_1(BASENAME, BIAS) \
+ BASENAME##1 += BIAS##1;
+
+#define ADD_ROW_3(BASENAME, BIAS) \
+ ADD_ROW_2(BASENAME, BIAS) \
+ BASENAME##2 += BIAS##2;
+
+#define ADD_ROW_4(BASENAME, BIAS) \
+ ADD_ROW_3(BASENAME, BIAS) \
+ BASENAME##3 += BIAS##3;
+
+#define ADD_ROW_5(BASENAME, BIAS) \
+ ADD_ROW_4(BASENAME, BIAS) \
+ BASENAME##4 += BIAS##4;
+
+#define ADD_ROW_6(BASENAME, BIAS) \
+ ADD_ROW_5(BASENAME, BIAS) \
+ BASENAME##5 += BIAS##5;
+
+#define ADD_ROW_7(BASENAME, BIAS) \
+ ADD_ROW_6(BASENAME, BIAS) \
+ BASENAME##6 += BIAS##6;
+
+#define ADD_ROW_8(BASENAME, BIAS) \
+ ADD_ROW_7(BASENAME, BIAS) \
+ BASENAME##7 += BIAS##7;
+
+#define ADD_ROW_9(BASENAME, BIAS) \
+ ADD_ROW_8(BASENAME, BIAS) \
+ BASENAME##8 += BIAS##8;
+
+#define ADD_ROW_10(BASENAME, BIAS) \
+ ADD_ROW_9(BASENAME, BIAS) \
+ BASENAME##9 += BIAS##9;
+
+#define ADD_ROW_11(BASENAME, BIAS) \
+ ADD_ROW_10(BASENAME, BIAS) \
+ BASENAME##A += BIAS##A;
+
+#define ADD_ROW_12(BASENAME, BIAS) \
+ ADD_ROW_11(BASENAME, BIAS) \
+ BASENAME##B += BIAS##B;
+
+#define ADD_ROW_13(BASENAME, BIAS) \
+ ADD_ROW_12(BASENAME, BIAS) \
+ BASENAME##C += BIAS##C;
+
+#define ADD_ROW_14(BASENAME, BIAS) \
+ ADD_ROW_13(BASENAME, BIAS) \
+ BASENAME##D += BIAS##D;
+
+#define ADD_ROW_15(BASENAME, BIAS) \
+ ADD_ROW_14(BASENAME, BIAS) \
+ BASENAME##E += BIAS##E;
+
+#define ADD_ROW_16(BASENAME, BIAS) \
+ ADD_ROW_15(BASENAME, BIAS) \
+ BASENAME##F += BIAS##F;
+
+/** @} */ // end of group ADD_ROW_n
+
+/** Add the block (BIAS) to another block (BASENAME)
+ * @name ADD_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The basename of the added variables
+ * @{
+ */
+#define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK
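+
+// Expansion sketch (illustrative): ADD_BLOCK(2, acc, bias) expands to
+//   acc0 += bias0;
+//   acc1 += bias1;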
+
+/** Broadcast (add a single value) to each element of the destination variables
+ * @name ADD_ROW_BROADCAST_n
+ *
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The variable containing the value to add
+ * @{
+ */
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
+
+#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
+ BASENAME##1 += BIAS;
+
+#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
+ BASENAME##2 += BIAS;
+
+#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
+ BASENAME##3 += BIAS;
+
+#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
+ BASENAME##4 += BIAS;
+
+#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
+ BASENAME##5 += BIAS;
+
+#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
+ BASENAME##6 += BIAS;
+
+#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
+ BASENAME##7 += BIAS;
+
+#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
+ BASENAME##8 += BIAS;
+
+#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
+ BASENAME##9 += BIAS;
+
+#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
+ BASENAME##A += BIAS;
+
+#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
+ BASENAME##B += BIAS;
+
+#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
+ BASENAME##C += BIAS;
+
+#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
+ BASENAME##D += BIAS;
+
+#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
+ BASENAME##E += BIAS;
+
+#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
+ ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
+ BASENAME##F += BIAS;
+
+/** @} */ // end of group ADD_ROW_BROADCAST_n
+
+/** Broadcast (add a value) to each element of the destination block (BASENAME)
+ * @name ADD_BLOCK_BROADCAST
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] BASENAME The basename of the destination variables
+ * @param[in] BIAS The variable containing the value to add
+ * @{
+ */
+#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+/** @} */ // end of group ADD_BLOCK_BROADCAST
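+
+// Expansion sketch (illustrative): ADD_BLOCK_BROADCAST(2, acc, bias0) adds the single
+// variable bias0 to every row, as used in the FP16 GEMM kernel above:
+//   acc0 += bias0;
+//   acc1 += bias0;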
+
+/** Apply activation to the given variables
+ * @name ACTIVATION_ROW_n
+ *
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] A_VAL Additional value required by the activation
+ * @param[in] B_VAL Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##0, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##1, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##2, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##3, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##4, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##5, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##6, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##7, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##8, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##9, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##A, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##B, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##C, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##D, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##E, A_VAL, B_VAL);
+
+#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, BASENAME##F, A_VAL, B_VAL);
+/** @} */ // end of group ACTIVATION_ROW_n
+
+/** Apply activation to a block (BASENAME)
+ * @name ACTIVATION_BLOCK
+ *
+ * Supported cases are N=1,2,3,...,16.
+ *
+ * @param[in] N The number of vectors in the block
+ * @param[in] ACTIVATION_TYPE The type of the activation
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME The basename of the variables
+ * @param[in] A_VAL Additional value required by the activation
+ * @param[in] B_VAL Additional value required by the activation
+ * @{
+ */
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL) \
+ ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, BASENAME, A_VAL, B_VAL)
+/** @} */ // end of group ACTIVATION_BLOCK
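+
+// Illustrative expansion (editorial note; RELU stands in for any supported activation type):
+// ACTIVATION_BLOCK(2, RELU, DATA_TYPE, c, A_VAL, B_VAL) resolves to ACTIVATION_ROW_2 and emits
+//   c0 = ACTIVATION(RELU, DATA_TYPE, c0, A_VAL, B_VAL);
+//   c1 = ACTIVATION(RELU, DATA_TYPE, c1, A_VAL, B_VAL);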
+
+/** Apply convert_<data_type> to the given variables
+ * @name CONVERT_ROW_n
+ *
+ * @param[in] N The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ * @{
+ */
+#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));
+
+#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ VEC_DATA_TYPE(DATA_TYPE, N) \
+ BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
+/** @} */ // end of group CONVERT_ROW_n
+
+/** Apply convert_<data_type> to a block (BASENAME_SRC) and save to another block (BASENAME_DST)
+ * @name CONVERT_BLOCK
+ *
+ * Supported cases are M=1,2,3,...,16.
+ *
+ * @param[in] M The number of vectors to convert
+ * @param[in] N The size of the vectors
+ * @param[in] DATA_TYPE The data type of the vectors
+ * @param[in] BASENAME_SRC The basename of the source variables
+ * @param[in] BASENAME_DST The basename of the destination variables
+ * @{
+ */
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+ CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+/** @} */ // end of group CONVERT_BLOCK
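+
+// Illustrative expansion (editorial note): CONVERT_BLOCK(2, 4, int, c, c_int) resolves to
+// CONVERT_ROW_2(4, int, c, c_int) and declares
+//   VEC_DATA_TYPE(int, 4) c_int0 = CONVERT(c0, VEC_DATA_TYPE(int, 4)); // i.e. int4
+//   VEC_DATA_TYPE(int, 4) c_int1 = CONVERT(c1, VEC_DATA_TYPE(int, 4));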
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl
new file mode 100644
index 000000000..2d9acc753
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/gemmlowp.cl
@@ -0,0 +1,2733 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "gemm_helpers.h"
+#include "helpers_asymm.h"
+#include "repeat.h"
+
+#if defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && \
+ defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), (val));
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+#define ARM_DOT(x, y, val) val += arm_dot((x), (y));
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) &&
+ // defined(cl_arm_integer_dot_product_accumulate_int8)
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size K0
+ * [1,16]. These macros use the dot8 instruction. */
+#define ARM_DOT1(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 3))0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 3))0), c); \
+ })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (VEC_DATA_TYPE(DATA_TYPE, 2))0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (VEC_DATA_TYPE(DATA_TYPE, 2))0), c); \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT((VEC_DATA_TYPE(DATA_TYPE, 4))(a, (DATA_TYPE)0), \
+ (VEC_DATA_TYPE(DATA_TYPE, 4))(b, (DATA_TYPE)0), c); \
+ })
+#define ARM_DOT4(a, b, c) ({ ARM_DOT(a, b, c); })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
+
+#else // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform the dot product instruction between two vectors of size K0 [1,16]
+ * without using the dot8 instruction. */
+#define ARM_DOT1(a, b, c) ({ c += (ACC_DATA_TYPE)a * b; })
+#define ARM_DOT2(a, b, c) \
+ ({ \
+ c += (ACC_DATA_TYPE)a.s0 * b.s0; \
+ c += (ACC_DATA_TYPE)a.s1 * b.s1; \
+ })
+#define ARM_DOT3(a, b, c) \
+ ({ \
+ ARM_DOT2(a, b, c); \
+ c += (ACC_DATA_TYPE)a.s2 * b.s2; \
+ })
+#define ARM_DOT4(a, b, c) \
+ ({ \
+ ARM_DOT3(a, b, c); \
+ c += (ACC_DATA_TYPE)a.s3 * b.s3; \
+ })
+#define ARM_DOT8(a, b, c) \
+ ({ \
+ ARM_DOT4((a.lo), (b.lo), c); \
+ ARM_DOT4((a.hi), (b.hi), c); \
+ })
+#define ARM_DOT16(a, b, c) \
+ ({ \
+ ARM_DOT8((a.lo), (b.lo), c); \
+ ARM_DOT8((a.hi), (b.hi), c); \
+ })
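+
+// Editorial note: in this fallback path ARM_DOT4(a, b, c) accumulates the four widened products,
+// i.e. c += a.s0 * b.s0 + a.s1 * b.s1 + a.s2 * b.s2 + a.s3 * b.s3 (each product promoted to
+// ACC_DATA_TYPE), and ARM_DOT8/ARM_DOT16 recurse over the .lo/.hi halves of wider vectors.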
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+
+/** Specialized macros to perform a broadcast dot product operation between one vector "a" and N0
+ * vectors "b" of size K0 [1,16] */
+#define ARM_DOT_K0X1(k0, a, b, c) ({ ARM_DOT_K0(k0, (a), (b##0), (c)); })
+#define ARM_DOT_K0X2(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0(k0, (a), (b##0), (c.s0)); \
+ ARM_DOT_K0(k0, (a), (b##1), (c.s1)); \
+ })
+#define ARM_DOT_K0X3(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X2(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##2), (c.s2)); \
+ })
+#define ARM_DOT_K0X4(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X3(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##3), (c.s3)); \
+ })
+#define ARM_DOT_K0X8(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X4(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##4), (c.s4)); \
+ ARM_DOT_K0(k0, (a), (b##5), (c.s5)); \
+ ARM_DOT_K0(k0, (a), (b##6), (c.s6)); \
+ ARM_DOT_K0(k0, (a), (b##7), (c.s7)); \
+ })
+#define ARM_DOT_K0X16(k0, a, b, c) \
+ ({ \
+ ARM_DOT_K0X8(k0, a, b, c); \
+ ARM_DOT_K0(k0, (a), (b##8), (c.s8)); \
+ ARM_DOT_K0(k0, (a), (b##9), (c.s9)); \
+ ARM_DOT_K0(k0, (a), (b##A), (c.sA)); \
+ ARM_DOT_K0(k0, (a), (b##B), (c.sB)); \
+ ARM_DOT_K0(k0, (a), (b##C), (c.sC)); \
+ ARM_DOT_K0(k0, (a), (b##D), (c.sD)); \
+ ARM_DOT_K0(k0, (a), (b##E), (c.sE)); \
+ ARM_DOT_K0(k0, (a), (b##F), (c.sF)); \
+ })
+
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_K0XN0X1(n0, k0, a, b, c) ({ ARM_DOT_K0XN0(n0, k0, (a##0), b, (c##0)); })
+#define ARM_MM_K0XN0X2(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X1(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_K0XN0X3(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X2(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_K0XN0X4(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X3(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_K0XN0X5(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X4(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_K0XN0X6(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X5(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_K0XN0X7(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X6(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_K0XN0X8(n0, k0, a, b, c) \
+ ({ \
+ ARM_MM_K0XN0X7(n0, k0, a, b, c); \
+ ARM_DOT_K0XN0(n0, k0, (a##7), b, (c##7)); \
+ })
+
+#define ARM_DOT_K0(k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT, k0) \
+ ((a), (b), (c)); \
+ })
+
+#define ARM_DOT_K0XN0(n0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_DOT_K0X, n0) \
+ (k0, (a), b, (c)); \
+ })
+
+#define ARM_MM_K0XN0XM0(m0, n0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_K0XN0X, m0) \
+ (n0, k0, a, b, c); \
+ })
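+
+// Illustrative expansion (editorial note): ARM_MM_K0XN0XM0(2, 4, 8, a, b, c) resolves to
+// ARM_MM_K0XN0X2(4, 8, a, b, c); each accumulator row ci (i = 0, 1) is a 4-lane vector whose
+// lane j receives the size-8 dot product of LHS row ai with RHS vector bj.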
+
+/** Specialized macros to perform a broadcast multiply-accumulate operation between one vector "a"
+ * of size K0 and K0 vectors "b" of size N0 [1,16], without using the dot product instruction */
+#define ARM_MUL_N0X1(VECTOR_ACC_TYPE, a, b, c) ({ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a; })
+#define ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ c += CONVERT(b##0, VECTOR_ACC_TYPE) * a.s##0; \
+ c += CONVERT(b##1, VECTOR_ACC_TYPE) * a.s##1; \
+ })
+#define ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X2(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##2, VECTOR_ACC_TYPE) * a.s##2; \
+ })
+#define ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X3(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##3, VECTOR_ACC_TYPE) * a.s##3; \
+ })
+#define ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X4(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##4, VECTOR_ACC_TYPE) * a.s##4; \
+ c += CONVERT(b##5, VECTOR_ACC_TYPE) * a.s##5; \
+ c += CONVERT(b##6, VECTOR_ACC_TYPE) * a.s##6; \
+ c += CONVERT(b##7, VECTOR_ACC_TYPE) * a.s##7; \
+ })
+#define ARM_MUL_N0X16(VECTOR_ACC_TYPE, a, b, c) \
+ ({ \
+ ARM_MUL_N0X8(VECTOR_ACC_TYPE, a, b, c); \
+ c += CONVERT(b##8, VECTOR_ACC_TYPE) * a.s##8; \
+ c += CONVERT(b##9, VECTOR_ACC_TYPE) * a.s##9; \
+ c += CONVERT(b##A, VECTOR_ACC_TYPE) * a.s##A; \
+ c += CONVERT(b##B, VECTOR_ACC_TYPE) * a.s##B; \
+ c += CONVERT(b##C, VECTOR_ACC_TYPE) * a.s##C; \
+ c += CONVERT(b##D, VECTOR_ACC_TYPE) * a.s##D; \
+ c += CONVERT(b##E, VECTOR_ACC_TYPE) * a.s##E; \
+ c += CONVERT(b##F, VECTOR_ACC_TYPE) * a.s##F; \
+ })
+/** Specialized macros to perform a partial matrix multiplication with dimensions M0,N0,K0 */
+#define ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##0), b, (c##0)); })
+#define ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X1(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##1), b, (c##1)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X2(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##2), b, (c##2)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X3(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##3), b, (c##3)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X4(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##4), b, (c##4)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X5(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##5), b, (c##5)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X6(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##6), b, (c##6)); \
+ })
+#define ARM_MM_NATIVE_N0XK0X8(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ ARM_MM_NATIVE_N0XK0X7(VECTOR_ACC_TYPE, k0, a, b, c); \
+ ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, (a##7), b, (c##7)); \
+ })
+#define ARM_MUL_N0XK0(VECTOR_ACC_TYPE, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MUL_N0X, k0) \
+ (VECTOR_ACC_TYPE, (a), b, (c)); \
+ })
+#define ARM_MM_NATIVE_N0XK0XM0(VECTOR_ACC_TYPE, m0, k0, a, b, c) \
+ ({ \
+ CONCAT(ARM_MM_NATIVE_N0XK0X, m0) \
+ (VECTOR_ACC_TYPE, k0, a, b, c); \
+ })
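+
+// Illustrative expansion (editorial note): ARM_MM_NATIVE_N0XK0XM0(VEC_TYPE, 2, 4, a, b, c),
+// where VEC_TYPE abbreviates VEC_DATA_TYPE(ACC_DATA_TYPE, N0), accumulates for i = 0, 1
+//   ci += CONVERT(b0, VEC_TYPE) * ai.s0 + ... + CONVERT(b3, VEC_TYPE) * ai.s3;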
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && \
+ defined(N)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with
+ * QASYMM8/QASYMM8_SIGNED data type. The LHS matrix must be reshaped with @ref
+ * CLGEMMReshapeLHSMatrixKernel and the M0xK0 blocks must NOT be transposed. The RHS matrix must
+ * be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 blocks must be transposed.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items"
+ * support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
+ * @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52
+ * and -DN=90).
+ * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0)
+ * must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).
+ * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS
+ * matrix must be passed at compile time using -DV0 (i.e. -DV0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option
+ * -DLHS_INTERLEAVE must be passed at compile time.
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - V0 >= 1
+ * - H0 >= 1
+ *
+ * @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution
+ * layer), the following information must be passed at compile time:
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of the LHS matrix NOT reshaped
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped
+ * matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] k Number of columns in LHS matrix and rows in RHS
+ * matrix not reshaped.
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements
+ * (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
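+// Host-side illustration (hypothetical values taken from the examples in the notes above, not
+// part of this kernel): the program could be built with
+//   -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint -DM=52 -DN=90 -DM0=4 -DN0=8 -DK0=4 -DV0=2 -DH0=2
+// plus -DLHS_INTERLEAVE / -DRHS_INTERLEAVE when the reshaped blocks are interleaved.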
+__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint k, uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define LHS_BLOCK_SIZE ((K0) * (M0))
+
+#if defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (K0)
+#define LHS_STEP_X ((K0) * (V0))
+#define LHS_STEP_LOOP (1)
+#else // defined(LHS_INTERLEAVE)
+#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
+#define LHS_STEP_X (K0)
+#define LHS_STEP_LOOP (V0)
+#endif // defined(LHS_INTERLEAVE)
+
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
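+
+ // Editorial note on the defines above: with interleaving, the K0-wide sub-rows of the V0 (LHS)
+ // or H0 (RHS) blocks that share an output row alternate, so consecutive sub-rows of one block
+ // sit K0 * V0 (or K0 * H0) elements apart; without interleaving each block is contiguous and
+ // the step between its sub-rows is simply K0.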
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ __global DATA_TYPE *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes +
+ (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y +
+ (z * lhs_stride_z);
+
+ // Compute RHS matrix address
+ __global DATA_TYPE *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes +
+ (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_addr += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < k; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ // Update address
+ lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);
+ rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+#undef LHS_BLOCK_SIZE
+#undef LHS_OFFSET_X
+#undef LHS_STEP_X
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) &&
+       // defined(N)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is
+ * transposed
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of the LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped
+ * matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in
+ * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
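+// Host-side illustration (hypothetical values taken from the examples in the notes above): this
+// kernel could be built with, for example,
+//   -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint -DK=64 -DM0=2 -DN0=8 -DK0=4 -DH0=2
+// and -DRHS_INTERLEAVE when the K0xN0 blocks of the reshaped RHS matrix are interleaved.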
+__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < K; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ lhs_offset += K0;
+ rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+
+#if defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices with fused output stage
+ * using fixed-point arithmetic. The LHS matrix is NOT reshaped. The RHS matrix is reshaped with
+ * @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed.
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at
+ * compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS
+ * matrix must be passed at compile time using -DH0 (i.e. -DH0=2)
+ * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option
+ * -DRHS_INTERLEAVE must be passed at compile time.
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ * - H0 >= 1
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of the LHS matrix
+ *
+ * @note The offset, scalar scale factor and number of bits to shift right of the output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ * @note In case of per-channel quantization of matrix B, -DPER_CHANNEL_QUANTIZATION must be passed
+ * at compile time.
+ *
+ * @param[in] lhs_ptr Pointer to the LHS reshaped matrix.
+ * Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] lhs_stride_x Stride of the LHS reshaped matrix in
+ * X dimension (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS reshaped matrix in
+ * Y dimension (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in
+ * the LHS reshaped matrix
+ * @param[in] rhs_ptr Pointer to the RHS reshaped matrix.
+ * Supported data type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS reshaped matrix in
+ * X dimension (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS reshaped matrix in
+ * Y dimension (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in
+ * the RHS reshaped matrix
+ * @param[out] dst_ptr Pointer to the destination matrix
+ * Supported data type: same as @p lhs_ptr
+ * @param[in] dst_stride_x Stride of the destination matrix in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination matrix
+ * @param[in] lhs_stride_z Stride of the LHS reshaped matrix in
+ * Z dimension (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS reshaped matrix in
+ * Z dimension (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in
+ * Z dimension (in bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS
+ * matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the
+ * output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: S32
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: S32
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: S32
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
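+// Host-side illustration (hypothetical values, not part of this kernel): on top of the GEMM
+// defines, the fused output stage could be configured with, for example,
+//   -DRESULT_OFFSET=-128 -DRESULT_MULTIPLIER=1073741824 -DRESULT_SHIFT=5 -DOUTPUT_DATA_TYPE=uchar
+// plus -DADD_BIAS, -DMIN_BOUND/-DMAX_BOUND or -DPER_CHANNEL_QUANTIZATION as required.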
+__kernel void gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint(
+ IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs), IMAGE_DECLARATION(dst), uint lhs_stride_z,
+ uint rhs_stride_z, uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers), VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+ // Block size
+#define RHS_BLOCK_SIZE ((K0) * (N0))
+
+ // RHS offset and step X
+#if defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (K0)
+#define RHS_STEP_X ((K0) * (H0))
+#define RHS_STEP_LOOP (1)
+#else // defined(RHS_INTERLEAVE)
+#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
+#define RHS_STEP_X (K0)
+#define RHS_STEP_LOOP (H0)
+#endif // defined(RHS_INTERLEAVE)
+
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X +
+ (x / (uint)H0) * rhs_stride_y;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); // uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ for (int i = 0; i < K; i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);
+
+ lhs_offset += K0;
+ rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;
+ }
+
+ // The fused output stage stores DATA_TYPE elements, hence sizeof(DATA_TYPE) in the address math
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(DATA_TYPE) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert result of matrix multiplication to S32
+ REPEAT_VAR_INIT_CONVERT_SAT(M0, VEC_DATA_TYPE(int, N0), c, c_int);
+
+ int batch_id = z;
+#if defined(DEPTH_GEMM3D)
+ batch_id /= (int)DEPTH_GEMM3D;
+#endif // defined(DEPTH_GEMM3D)
+
+ // Offset contribution: c += (A_OFFSET * sum_col) + (B_OFFSET * sum_row) + K_OFFSET;
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(int, N0), offset_s32_, K_OFFSET);
+
+#if defined(A_OFFSET)
+ // Compute the offset contribution due to A_OFFSET
+ __global uchar *sum_col_addr =
+ sum_col_ptr + sum_col_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+#if defined(SUM_COL_HAS_BATCHES)
+ sum_col_addr += z * sum_col_stride_y;
+#endif // defined(SUM_COL_HAS_BATCHES)
+ VEC_DATA_TYPE(int, N0)
+ a_offset_s32 = VLOAD(N0)(0, (__global int *)sum_col_addr);
+ a_offset_s32 *= (VEC_DATA_TYPE(int, N0))A_OFFSET;
+
+ REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, a_offset_s32);
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+ // Compute the offset contribution due to B_OFFSET
+ __global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes +
+ (y * (uint)M0) * sizeof(int) + z * sum_row_stride_y;
+
+#if defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D)
+ sum_row_addr += (batch_id % (int)DEPTH_GEMM3D) * (int)HEIGHT_GEMM3D * sizeof(int);
+#endif // defined(HEIGHT_GEMM3D) && defined(DEPTH_GEMM3D)
+ LOAD_SCALAR_AS_VECTOR(M0, N0, int, b_offset_s32_, sum_row_addr, 0, sum_row_stride_x);
+
+ REPEAT_MLA_VAR_WITH_CONST_VEC(M0, offset_s32_, b_offset_s32_, (VEC_DATA_TYPE(int, N0))B_OFFSET);
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr =
+ biases_ptr + biases_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+ VEC_DATA_TYPE(int, N0)
+ bias_values = VLOAD(N0)(0, (__global int *)bias_addr);
+ REPEAT_ADD_VECTOR_TO_VAR(M0, offset_s32_, bias_values);
+#endif // defined(ADD_BIAS)
+
+ REPEAT_ADD_TWO_VARS(M0, c_int, offset_s32_);
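+
+ // Editorial note: at this point c_int holds the S32 GEMM result plus the offset and bias
+ // contributions. The code below applies the gemmlowp-style fixed-point downscale, conceptually
+ //   c_int = rounded_right_shift(c_int * RESULT_MULTIPLIER, RESULT_SHIFT) + RESULT_OFFSET;
+ // followed by optional clamping to [MIN_BOUND, MAX_BOUND] and a saturating convert to the
+ // output data type.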
+
+ // Multiply by result_mult_int and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr = result_multipliers_ptr +
+ result_multipliers_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int);
+
+ VEC_DATA_TYPE(int, N0)
+ res_mul = VLOAD(N0)(0, (__global int *)result_multipliers_addr);
+ VEC_DATA_TYPE(int, N0)
+ res_shift = VLOAD(N0)(0, (__global int *)result_shifts_addr);
+
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(M0, N0, c_int, res_mul, res_shift);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER,
+ RESULT_SHIFT);
+#else // RESULT_SHIFT >= 0
+ REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(M0, N0, c_int, RESULT_MULTIPLIER,
+ RESULT_SHIFT);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ // Add the offset terms to GEMM's result
+ REPEAT_ADD_CONST_TO_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, RESULT_OFFSET);
+
+#if defined(MIN_BOUND)
+ REPEAT_MAX_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ REPEAT_MIN_CONST_VAR(M0, VEC_DATA_TYPE(int, N0), c_int, MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Convert and store output block (the convert saturates)
+ CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c_int, dst_addr, dst_stride_y, zout);
+
+#undef RHS_BLOCK_SIZE
+#undef RHS_OFFSET_X
+#undef RHS_STEP_X
+}
+#endif // defined(RESULT_OFFSET) && defined(RESULT_SHIFT) && defined(RESULT_MULTIPLIER)
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)
+
+#if defined(M0) && defined(N0) && defined(K0) && defined(K)
+
+/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
+ * The LHS matrix is NOT reshaped
+ * The RHS matrix is NOT reshaped
+ *
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (i.e.
+ * -DDATA_TYPE=uchar)
+ * @note The accumulator data type must be passed at compile time using -DACC_DATA_TYPE (i.e.
+ * -DACC_DATA_TYPE=uint)
+ * @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)
+ * @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)
+ * @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)
+ * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e.,
+ * -DK0=2)
+ * @note Only the following configurations of M0, N0 and K0 are currently supported:
+ * - M0 = 1, 2, 3, 4, 5, 6, 7, 8
+ * - N0 = 2, 3, 4, 8, 16
+ * - K0 = 2, 3, 4, 8, 16
+ *
+ * @note In case the input or output have to be reinterpreted as a 3D tensor, the following
+ * information must be passed at compile time:
+ * -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
+ * -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
+ * -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D
+ * tensor.
+ * -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
+ * (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns of the LHS matrix
+ *
+ * @param[in] lhs_ptr Pointer to the LHS matrix. Supported data
+ * type: QASYMM8
+ * @param[in] lhs_stride_x Stride of the LHS matrix in X dimension
+ * (in bytes)
+ * @param[in] lhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension
+ * (in bytes)
+ * @param[in] lhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS
+ * matrix
+ * @param[in] rhs_ptr Pointer to the RHS matrix. Supported data
+ * type: same as @p lhs_ptr
+ * @param[in] rhs_stride_x Stride of the RHS matrix in X dimension
+ * (in bytes)
+ * @param[in] rhs_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension
+ * (in bytes)
+ * @param[in] rhs_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS
+ * matrix
+ * @param[out] dst_ptr Pointer to the destination matrix Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination matrix in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * matrix
+ * @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension
+ * (in bytes)
+ * @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension
+ * (in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit
+ * of elements (only if defined REINTERPRET_INPUT_AS_3D)
+ * @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in
+ * unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
+ */
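+// Host-side illustration (hypothetical values taken from the examples in the notes above): this
+// kernel could be built with, for example,
+//   -DDATA_TYPE=uchar -DACC_DATA_TYPE=uint -DK=64 -DM0=2 -DN0=2 -DK0=2
+// and GPU_ARCH selects between the native multiply path and the transpose-plus-dot path below.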
+__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs), IMAGE_DECLARATION(rhs),
+ IMAGE_DECLARATION(dst), uint lhs_stride_z, uint rhs_stride_z,
+ uint dst_stride_z
+#if defined(REINTERPRET_INPUT_AS_3D)
+ ,
+ uint lhs_cross_plane_pad
+#endif // REINTERPRET_INPUT_AS_3D
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ ,
+ uint dst_cross_plane_pad
+#endif // REINTERPRET_OUTPUT_AS_3D
+)
+{
+ uint x = get_global_id(0);
+ uint y = get_global_id(1);
+ uint z = get_global_id(2);
+
+#if defined(DUMMY_WORK_ITEMS)
+ if ((x * N0 >= N) || (y * M0 >= M))
+ {
+ return;
+ }
+#endif // defined(DUMMY_WORK_ITEMS)
+
+ // Compute LHS matrix address
+ uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;
+
+ // Compute RHS matrix address
+ uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0;
+
+#if defined(MATRIX_B_DEPTH)
+ // Do not slide matrix B if matrix B has 3 dimensions and matrix A has more than 3
+ rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
+#else // defined(MATRIX_B_DEPTH)
+ rhs_offset += z * rhs_stride_z;
+#endif // defined(MATRIX_B_DEPTH)
+
+ REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);
+ REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);
+
+#if defined(REINTERPRET_INPUT_AS_3D)
+ // The plane (zlhs) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad,
+ lhs_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply lhs_stride_z by DEPTH_GEMM3D
+ lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ lhs_offset += z * lhs_stride_z;
+
+#endif // defined(REINTERPRET_INPUT_AS_3D)
+
+ // Initialize the accumulators
+ REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(ACC_DATA_TYPE, N0), c,
+ 0); // VEC_DATA_TYPE(ACC_DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;
+
+ int i = 0;
+
+ for (; i <= (K - K0); i += K0)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+ // Partial matrix multiplication M0,N0,K0
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, K0, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ // Transpose the values from RHS matrix
+ TRANSPOSE_K0XN0(K0, N0, b_t, b, DATA_TYPE);
+
+ ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ // Update the offset
+ lhs_offset += K0;
+ rhs_offset += K0 * rhs_stride_y;
+ }
+
+ // Left-over for loop
+ for (; i < K; ++i)
+ {
+ // Load values from LHS matrix
+ LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);
+
+ // Load values from RHS matrix
+ LOAD_BLOCK(1, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);
+
+ // Partial matrix multiplication M0,N0,1
+#if (GPU_ARCH == GPU_ARCH_MIDGARD)
+ ARM_MM_NATIVE_N0XK0XM0(VEC_DATA_TYPE(ACC_DATA_TYPE, N0), M0, 1, a, b, c);
+#else // GPU_ARCH == GPU_ARCH_MIDGARD
+ // Transpose the values from RHS matrix
+ TRANSPOSE_K0XN0(1, N0, b_t, b, DATA_TYPE);
+
+ ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);
+#endif // GPU_ARCH == GPU_ARCH_MIDGARD
+
+ // Update the offset
+ lhs_offset += 1;
+ rhs_offset += rhs_stride_y;
+ }
+
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes +
+ (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);
+
+ REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); // uint zout0=0,zout1=0,zout2=0,... zout7=0;
+
+#if defined(REINTERPRET_OUTPUT_AS_3D)
+ // The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D
+ CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad,
+ dst_stride_y);
+
+ // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
+ // multiply dst_stride_z by DEPTH_GEMM3D
+ dst_addr += z * dst_stride_z * DEPTH_GEMM3D;
+
+#else // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Add offset for batched GEMM
+ dst_addr += z * dst_stride_z;
+
+#endif // defined(REINTERPRET_OUTPUT_AS_3D)
+
+ // Convert and store output block
+ CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);
+}
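+
+/* For reference, the blocked loops above compute, per work-item, the plain
+ * M0 x N0 x K block product sketched below. This scalar form is illustrative
+ * only (it is never compiled or called, and strides are assumed to be given
+ * in elements rather than bytes to keep it minimal):
+ */
+#if 0 /* illustrative reference, not compiled */
+inline void gemmlowp_mm_native_reference(const __global DATA_TYPE *lhs, /* M0 x K tile */
+                                         const __global DATA_TYPE *rhs, /* K x N0 tile */
+                                         __global int *dst,             /* M0 x N0 tile */
+                                         uint lhs_ld, uint rhs_ld, uint dst_ld)
+{
+  for (uint m = 0; m < M0; ++m)
+  {
+    for (uint n = 0; n < N0; ++n)
+    {
+      ACC_DATA_TYPE acc = 0;
+      for (uint k = 0; k < (uint)K; ++k)
+      {
+        acc += (ACC_DATA_TYPE)lhs[m * lhs_ld + k] * (ACC_DATA_TYPE)rhs[k * rhs_ld + n];
+      }
+      dst[m * dst_ld + n] = (int)acc;
+    }
+  }
+}
+#endif /* illustrative reference */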
+#endif // defined(M0) && defined(N0) && defined(K0) && defined(K)
+
+#if defined(COLS_A)
+/** OpenCL kernel used to compute the sum of all the entries in each row of Matrix A.
+ * It is also possible to multiply each reduced row by a scalar value, if SCALAR is passed at
+ * compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (e.g.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (e.g. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling, the scalar value must be passed at compile time using -DSCALAR (e.g.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+ sum_row_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))0;
+ ACC_DATA_TYPE sum_row = 0;
+
+ __global const DATA_TYPE *matrix_a =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y +
+ get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+ // This for loop performs 16 accumulations
+ for (; i <= ((int)COLS_A - 16); i += 16)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) a0 = vload16(0, matrix_a + i);
+
+ sum_row_32 += CONVERT(a0.s0123, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.s4567, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.s89AB, VEC_DATA_TYPE(ACC_DATA_TYPE, 4)) +
+ CONVERT(a0.sCDEF, VEC_DATA_TYPE(ACC_DATA_TYPE, 4));
+ }
+
+ // This for loop performs the leftover accumulations
+ for (; i < COLS_A; ++i)
+ {
+ sum_row += (ACC_DATA_TYPE)matrix_a[i];
+ }
+
+ sum_row += sum_row_32.s0 + sum_row_32.s1 + sum_row_32.s2 + sum_row_32.s3;
+
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
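+
+/* The vectorized loops above are equivalent to the scalar reduction sketched
+ * below (illustrative only, never compiled): one work-item reduces one row of
+ * Matrix A to a single int32 sum, optionally scaled by SCALAR.
+ */
+#if 0 /* illustrative reference, not compiled */
+inline int gemmlowp_row_sum_reference(const __global DATA_TYPE *row)
+{
+  ACC_DATA_TYPE sum = 0;
+  for (uint col = 0; col < (uint)COLS_A; ++col)
+  {
+    sum += (ACC_DATA_TYPE)row[col];
+  }
+#if defined(SCALAR)
+  sum *= (ACC_DATA_TYPE)SCALAR;
+#endif
+  return (int)sum;
+}
+#endif /* illustrative reference */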
+
+#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+/** OpenCL kernel used to compute the sum of all the entries in each row of Matrix A
+ * using the arm dot product instruction. It is also possible to multiply each reduced row by a
+ * scalar value, if SCALAR is passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (e.g.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (e.g. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling, the scalar value must be passed at compile time using -DSCALAR (e.g.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ ACC_DATA_TYPE sum_row = 0;
+
+ __global const DATA_TYPE *matrix_a =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(0) * src_stride_y +
+ get_global_id(1) * src_stride_z);
+
+ int i = 0;
+
+ // This for loop performs 32 accumulations
+ for (; i <= ((int)COLS_A - 32); i += 32)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ a0 = vload16(0, matrix_a + i);
+
+ sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+
+ a0 = vload16(1, matrix_a + i);
+
+ sum_row += arm_dot(a0.s0123, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s4567, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.s89AB, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ sum_row += arm_dot(a0.sCDEF, (VEC_DATA_TYPE(DATA_TYPE, 4))(1));
+ }
+
+ // This for loop performs the leftover accumulations
+ for (; i < COLS_A; ++i)
+ {
+ sum_row += (ACC_DATA_TYPE)matrix_a[i];
+ }
+
+#if defined(SCALAR)
+ sum_row *= (int)SCALAR;
+#endif // defined(SCALAR)
+ *((__global int *)dst.ptr) = (int)sum_row;
+}
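+
+/* Note on the reduction above: each arm_dot(v, (1, 1, 1, 1)) computes
+ * v.s0 + v.s1 + v.s2 + v.s3, i.e. the horizontal sum of four elements in a
+ * single dot-product instruction. Every iteration issues eight such dots over
+ * two vload16 results, which is why the main loop advances 32 columns at a
+ * time.
+ */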
+#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
+#endif // defined(COLS_A)
+
+#if defined(COLS_B) && defined(ROWS_B)
+/** OpenCL kernel used to compute the sum of all the entries in each column of
+ * Matrix B. It is also possible to multiply each reduced column by a scalar value, if SCALAR is
+ * passed at compile time.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ *
+ * @attention The number of matrix B columns and rows needs to be passed at compile time using
+ * -DCOLS_B and -DROWS_B
+ * @note The input data type must be passed at compile time using -DDATA_TYPE (e.g.
+ * -DDATA_TYPE=uchar)
+ * @note The data type for the accumulation must be passed at compile time using -DACC_DATA_TYPE
+ * (e.g. -DACC_DATA_TYPE=uint)
+ * @note In case of scaling, the scalar value must be passed at compile time using -DSCALAR (e.g.
+ * -DSCALAR=3)
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data type:
+ * QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data
+ * type: S32
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * tensor
+ */
+__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src), IMAGE_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Image dst = CONVERT_TO_IMAGE_STRUCT(dst);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 16)
+ sum_col_32 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))0;
+
+ __global const DATA_TYPE *matrix_b =
+ (__global const DATA_TYPE *)(src.ptr + get_global_id(1) * src_stride_z);
+
+ int i = 0;
+ // This for loop performs 4 accumulations
+ for (; i <= ((int)ROWS_B - 4); i += 4)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b + 0 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b1 = vload16(0, matrix_b + 1 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b2 = vload16(0, matrix_b + 2 * src_stride_y);
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b3 = vload16(0, matrix_b + 3 * src_stride_y);
+
+ sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b1, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b2, VEC_DATA_TYPE(ACC_DATA_TYPE, 16)) +
+ CONVERT(b3, VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+
+ matrix_b += 4 * src_stride_y;
+ }
+
+ // This for loop performs the leftover accumulations
+ for (; i < (int)ROWS_B; ++i)
+ {
+ const VEC_DATA_TYPE(DATA_TYPE, 16) b0 = vload16(0, matrix_b);
+
+ sum_col_32 += CONVERT(b0, VEC_DATA_TYPE(ACC_DATA_TYPE, 16));
+
+ matrix_b += src_stride_y;
+ }
+
+#if defined(SCALAR)
+ sum_col_32 *= (VEC_DATA_TYPE(ACC_DATA_TYPE, 16))SCALAR;
+#endif // defined(SCALAR)
+ VSTORE(16)
+ (convert_int16(sum_col_32), 0, (__global int *)dst.ptr);
+}
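+
+/* Scalar view of the reduction above (illustrative only, never compiled): one
+ * column sum of Matrix B, walking ROWS_B rows. The kernel computes 16 such
+ * sums at once per work-item; the row stride here is assumed to be given in
+ * elements rather than bytes.
+ */
+#if 0 /* illustrative reference, not compiled */
+inline int gemmlowp_col_sum_reference(const __global DATA_TYPE *col, uint row_stride)
+{
+  ACC_DATA_TYPE sum = 0;
+  for (uint r = 0; r < (uint)ROWS_B; ++r)
+  {
+    sum += (ACC_DATA_TYPE)col[r * row_stride];
+  }
+#if defined(SCALAR)
+  sum *= (ACC_DATA_TYPE)SCALAR;
+#endif
+  return (int)sum;
+}
+#endif /* illustrative reference */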
+#endif // defined(COLS_B) && defined(ROWS_B)
+
+#endif // defined(DATA_TYPE) && defined(ACC_DATA_TYPE)
+
+#if defined(K_OFFSET)
+
+/** Helper function used to calculate the offset contribution after matrix multiplication.
+ *
+ * This function takes a final int32 accumulator value (the output of matrix multiplication),
+ * and calculates the offset contribution of matrix A and matrix B.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually,
+ * if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
+ *
+ * @param[in] x get_global_id(0) * 4
+ * @param[in] y get_global_id(1)
+ * @param[in] z get_global_id(2)
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ */
+inline int4 offset_contribution(int x, int y, int z
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+ int4 a_offset_s32 = (int4)0;
+ int4 b_offset_s32 = (int4)0;
+
+ int batch_id = z;
+#if defined(DEPTH_INPUT3D)
+ batch_id /= (int)DEPTH_INPUT3D;
+#endif // defined(DEPTH_INPUT3D)
+
+#if defined(A_OFFSET)
+ // Compute the address of the sum_col vector
+ __global uchar *sum_col_addr =
+ sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);
+
+ // Compute the offset contribution due to A_OFFSET
+#if defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));
+#else // defined(SUM_COL_HAS_BATCHES)
+ a_offset_s32 = vload4(0, (__global int *)sum_col_addr);
+#endif // defined(SUM_COL_HAS_BATCHES)
+
+ a_offset_s32 *= (int4)A_OFFSET;
+#endif // defined(A_OFFSET)
+
+#if defined(B_OFFSET)
+ // Compute the address of the sum_row vector
+ __global uchar *sum_row_addr =
+ sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);
+
+ // Compute the offset contribution due to B_OFFSET
+#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) +
+ (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);
+#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));
+#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)
+ b_offset_s32 *= (int4)B_OFFSET;
+#endif // defined(B_OFFSET)
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ b_offset_s32 += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;
+}
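+
+/* In scalar form, the value returned above for output element (x, y) is
+ *
+ *   offset_term(x, y) = K_OFFSET
+ *                     + A_OFFSET * sum_col[x]   (only if A_OFFSET is defined)
+ *                     + B_OFFSET * sum_row[y]   (only if B_OFFSET is defined)
+ *                     + bias[x]                 (only if ADD_BIAS is defined)
+ *
+ * where K_OFFSET = a_offset * b_offset * k folds the constant term of the
+ * gemmlowp offset expansion into a single compile-time constant.
+ */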
+
+/** OpenCL kernel used to add the offset contribution after matrix multiplication. The computation
+ * is performed in-place.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication),
+ * and adds to it the offset contribution of matrix A and matrix B in-place.
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually,
+ * if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
+ *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of elements along
+ * X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of elements along
+ * Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of elements along
+ * Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source tensor.
+ * Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ */
+__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ VECTOR_DECLARATION(biases)
+#endif // defined(ADD_BIAS)
+)
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // Store the result with the offset contribution
+ vstore4(in_s32, 0, (__global int *)mm_result_addr);
+}
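+
+/* Worked example for the compile-time constants used above, with the values
+ * from the documentation examples (any consistent set works): for a_offset = 1,
+ * b_offset = 6 and k = 200 matrix A columns, the host would build this kernel
+ * with
+ *
+ *   -DA_OFFSET=1 -DB_OFFSET=6 -DK_OFFSET=1200   (since 1 * 6 * 200 = 1200)
+ *
+ * so the constant term is folded once at compile time instead of being
+ * recomputed per work-item.
+ */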
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) && \
+ defined(OUTPUT_DATA_TYPE)
+/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and
+ * quantize the result down to uint8.
+ *
+ * This kernel takes a final int32 accumulator value (the output of
+ * @ref CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B
+ * and quantizes to uint8 through the output stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually,
+ * if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the
+ * following operations:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND
+ * are passed at compile time)
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor.
+ * Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of
+ * elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in
+ * the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor.
+ * Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in
+ * Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
+__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+)
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Add the output stage offset (RESULT_OFFSET)
+ in_s32 += (int4)RESULT_OFFSET;
+
+ // Multiply by the result multiplier and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr =
+ result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+ int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
+ int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
+
+ in_s32 *= result_multipliers_values;
+ in_s32 >>= result_shifts_values;
+#else // defined(PER_CHANNEL_QUANTIZATION)
+ in_s32 *= RESULT_MULTIPLIER;
+
+ in_s32 >>= RESULT_SHIFT;
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
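+
+/* Scalar sketch of the output stage applied above (illustrative only, never
+ * compiled); v is one int32 accumulator that already includes the offset
+ * contribution, and the per-channel quantization path is omitted:
+ */
+#if 0 /* illustrative reference, not compiled */
+inline OUTPUT_DATA_TYPE gemmlowp_quantize_down_reference(int v)
+{
+  v += RESULT_OFFSET;     /* output stage offset */
+  v *= RESULT_MULTIPLIER; /* integer scale factor */
+  v >>= RESULT_SHIFT;     /* divide by 2^RESULT_SHIFT */
+  OUTPUT_DATA_TYPE res = CONVERT_SAT(v, OUTPUT_DATA_TYPE); /* saturating cast */
+#if defined(MIN_BOUND)
+  res = max(res, (OUTPUT_DATA_TYPE)MIN_BOUND);
+#endif
+#if defined(MAX_BOUND)
+  res = min(res, (OUTPUT_DATA_TYPE)MAX_BOUND);
+#endif
+  return res;
+}
+#endif /* illustrative reference */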
+
+/** OpenCL kernel used to add the offset contribution after matrix multiplication and quantize
+ * the result down to uint8 using a fixed-point output stage.
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), adds to
+ * it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output
+ * stage.
+ *
+ *
+ * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns)
+ * needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
+ * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at
+ * compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
+ * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at
+ * compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
+ * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually,
+ * if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
+ *
+ * The result before the output stage is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ * (sum_col[k] * A_OFFSET) +
+ * (sum_row[i] * B_OFFSET) +
+ * (K_OFFSET)
+ *
+ * This result is quantized down to uint8/int8 using the output stage. The output stage computes the
+ * following operations:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] mm_result_ptr Pointer to the source tensor.
+ * Supported data type: S32
+ * @param[in] mm_result_stride_x Stride of the source tensor in X
+ * dimension (in bytes)
+ * @param[in] mm_result_step_x mm_result_stride_x * number of
+ * elements along X processed per workitem(in bytes)
+ * @param[in] mm_result_stride_y Stride of the source tensor in Y
+ * dimension (in bytes)
+ * @param[in] mm_result_step_y mm_result_stride_y * number of
+ * elements along Y processed per workitem(in bytes)
+ * @param[in] mm_result_stride_z Stride of the source tensor in Z
+ * dimension (in bytes)
+ * @param[in] mm_result_step_z mm_result_stride_z * number of
+ * elements along Z processed per workitem(in bytes)
+ * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in
+ * the source tensor
+ * @param[in] sum_col_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_col_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_col_step_x (Optional) sum_col_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_col_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_col_step_y (Optional) sum_col_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] sum_row_ptr (Optional) Pointer to the source
+ * tensor. Supported data type: same as @p mm_result_ptr
+ * @param[in] sum_row_stride_x (Optional) Stride of the source
+ * tensor in X dimension (in bytes)
+ * @param[in] sum_row_step_x (Optional) sum_row_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] sum_row_stride_y (Optional) Stride of the source
+ * tensor in Y dimension (in bytes)
+ * @param[in] sum_row_step_y (Optional) sum_row_stride_y * number
+ * of elements along Y processed per workitem(in bytes)
+ * @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the source tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases
+ * tensor. Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases
+ * tensor in X dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number
+ * of elements along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor.
+ * Supported data type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in
+ * X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in
+ * Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements
+ * along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in
+ * Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements
+ * along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in
+ * the destination tensor
+ * @param[in] result_multipliers_ptr (Optional) Pointer to the output
+ * multipliers vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_multipliers_stride_x (Optional) Stride of the output
+ * multipliers vector in X dimension (in bytes)
+ * @param[in] result_multipliers_step_x (Optional)
+ * output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_multipliers_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output multipliers vector
+ * @param[in] result_shifts_ptr (Optional) Pointer to the output
+ * shifts vector for per-channel quantization. Supported data types: S32
+ * @param[in] result_shifts_stride_x (Optional) Stride of the output
+ * shifts vector in X dimension (in bytes)
+ * @param[in] result_shifts_step_x (Optional) output_shifts_stride_x *
+ * number of elements along X processed per workitem(in bytes)
+ * @param[in] result_shifts_offset_first_element_in_bytes (Optional) The offset of the first
+ * element in the output shifts vector
+ */
+__kernel void
+ gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)
+#if defined(A_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_col)
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ IMAGE_DECLARATION(sum_row)
+#endif // defined(B_OFFSET)
+ ,
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst)
+#if defined(PER_CHANNEL_QUANTIZATION)
+ ,
+ VECTOR_DECLARATION(result_multipliers),
+ VECTOR_DECLARATION(result_shifts)
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+ )
+{
+ const int x = get_global_id(0) * 4;
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Compute offset contribution
+ int4 offset_term_s32 = offset_contribution(
+ x, y, z
+#if defined(A_OFFSET)
+ ,
+ sum_col_ptr, sum_col_stride_x, sum_col_step_x, sum_col_stride_y, sum_col_step_y,
+ sum_col_offset_first_element_in_bytes
+#endif // defined(A_OFFSET)
+#if defined(B_OFFSET)
+ ,
+ sum_row_ptr, sum_row_stride_x, sum_row_step_x, sum_row_stride_y, sum_row_step_y,
+ sum_row_offset_first_element_in_bytes
+#endif // defined(B_OFFSET)
+#if defined(ADD_BIAS)
+ ,
+ biases_ptr, biases_stride_x, biases_step_x, biases_offset_first_element_in_bytes
+#endif // defined(ADD_BIAS)
+ );
+
+ __global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes +
+ x * sizeof(int) + y * mm_result_stride_y +
+ z * mm_result_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 in_s32 = vload4(0, (__global int *)mm_result_addr);
+
+ // Add the offset terms to GEMM's result
+ in_s32 += offset_term_s32;
+
+ // -------------- OUTPUT STAGE
+
+ // Multiply by the result multiplier and shift
+#if defined(PER_CHANNEL_QUANTIZATION)
+ __global uchar *result_multipliers_addr =
+ result_multipliers_ptr + result_multipliers_offset_first_element_in_bytes + x * sizeof(int);
+ __global uchar *result_shifts_addr =
+ result_shifts_ptr + result_shifts_offset_first_element_in_bytes + x * sizeof(int);
+ int4 result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
+ int4 result_shifts_values = vload4(0, (__global int *)result_shifts_addr);
+
+ int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ in_s32, result_multipliers_values, result_shifts_values, 4);
+ int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ in_s32, result_multipliers_values, result_shifts_values, 4);
+ in_s32 = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+ in_s32 =
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+#endif // defined(PER_CHANNEL_QUANTIZATION)
+
+ // Add the output stage offset (RESULT_OFFSET, applied after the shift)
+ in_s32 += (int4)RESULT_OFFSET;
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(in_s32, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
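+
+/* Note on the fixed-point path above: the requantization scale is decomposed
+ * as scale = (RESULT_MULTIPLIER / 2^31) * 2^(-RESULT_SHIFT), with
+ * RESULT_MULTIPLIER held as a 32-bit integer in [2^30, 2^31).
+ * ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE performs a rounding-doubling
+ * high multiply followed by a rounding right shift, while the
+ * GREATER_THAN_ONE variant first shifts left when RESULT_SHIFT is negative.
+ * For example, a real scale of roughly 1/2048 can be encoded as
+ * RESULT_MULTIPLIER = 1073741824 (0.5 in Q31) with RESULT_SHIFT = 10.
+ */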
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT) &&
+ // defined(OUTPUT_DATA_TYPE)
+
+#endif // defined(K_OFFSET)
+
+#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value and processes it to obtain the final
+ * QASYMM8/QASYMM8_SIGNED value. The following computations will be performed by the kernel:
+ *
+ * -# Add offset terms to final result
+ * -# Multiply each entry of result by result_mult_int
+ * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
+ * -# Shift the int32 accumulator by result_shift
+ * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND
+ * are passed at compile time)
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Add the output stage offset (RESULT_OFFSET)
+ input_values += (int4)RESULT_OFFSET;
+
+ // Multiply by result_mult_int and shift
+ input_values *= RESULT_MULT_INT;
+
+#if RESULT_SHIFT < 0
+ input_values >>= -RESULT_SHIFT;
+#else // RESULT_SHIFT >= 0
+ input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
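+
+/* Example build options for the kernel above (values are illustrative only):
+ * for a QASYMM8 output with zero point 10, an integer scale of 2 and a 3-bit
+ * right shift, clamped to a ReLU-like range, the host could compile with
+ *
+ *   -DRESULT_OFFSET=10 -DRESULT_MULT_INT=2 -DRESULT_SHIFT=3
+ *   -DOUTPUT_DATA_TYPE=uchar -DMIN_BOUND=10 -DMAX_BOUND=255 -DADD_BIAS
+ *
+ * matching the computation steps listed in the documentation above.
+ */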
+#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
+
+#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && \
+ defined(RESULT_SHIFT)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. The following computations will be
+ * performed by the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Add offset to each result
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
+ * must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER
+ * and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_fixedpoint_multiplier and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+ // Add the output stage offset (RESULT_OFFSET_AFTER_SHIFT)
+ input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
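+
+/* Note: the "round to nearest division by a power-of-two" above differs from
+ * the plain arithmetic shift used by the non-fixedpoint output stage. For a
+ * positive shift s, the rounded form is
+ *
+ *   result = (x + (1 << (s - 1))) >> s
+ *
+ * with an extra correction for negative x so that ties round away from zero;
+ * this is what the ASYMM_* helper macros used above implement internally.
+ */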
+#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) &&
+ // defined(RESULT_SHIFT)
+
+#if defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
+
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QSYMM16
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QSYMM16 value. The following computations will be performed by
+ * the kernel:
+ *
+ * -# Compute fixed point multiplication between each entry of input by
+ * result_fixedpoint_multiplier
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Round to nearest division by a power-of-two using result_shift
+ * -# Clamp the value between the specified min and max bounds
+ * -# Saturate the resulting int32 values to the [-32768..32767] range and cast to QSYMM16.
+ *
+ * @attention The scalar scale factor and the number of bits to shift right must be passed at
+ * compile time using -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data
+ * type: QSYMM16
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+ TENSOR3D_DECLARATION(dst))
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x * 2 + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else // RESULT_SHIFT >= 0
+ input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(
+ input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
+ short4 res = convert_short4_sat(input_values);
+
+#if defined(MIN_BOUND)
+ res = max(res, (short4)MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (short4)MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global short *)dst_addr);
+}
+#endif // defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
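
RESULT_FIXEDPOINT_MULTIPLIER and RESULT_SHIFT are normally derived on the host from the
floating-point rescale factor (typically input_scale * weight_scale / output_scale). A sketch of
the usual TFLite/gemmlowp-style derivation; the helper name is ours and is not part of this patch:

#include <cmath>
#include <cstdint>

// Splits a positive real multiplier into a Q31 mantissa and a right-shift amount
// such that real ~= (mantissa / 2^31) * 2^(-right_shift).
static void quantize_multiplier(double real_multiplier, int32_t *quantized_multiplier,
                                int32_t *right_shift)
{
  int exponent = 0;
  const double q = std::frexp(real_multiplier, &exponent); // real = q * 2^exponent, q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
  if (q_fixed == (1ll << 31)) // rounding pushed q up to 1.0: renormalize
  {
    q_fixed /= 2;
    ++exponent;
  }
  *quantized_multiplier = static_cast<int32_t>(q_fixed);
  *right_shift = -exponent; // >= 0 selects the LESS_THAN_ONE branch above, < 0 the other
}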
+
+#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
+/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to
+ * QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of matrix multiplication), and
+ * processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. The following computations will be
+ * performed by the kernel:
+ *
+ * -# Add bias to final result if bias tensor is not a nullptr
+ * -# Multiply each entry of the input by the real (floating-point) multiplier
+ * -# Add the output offset and round to the nearest integer
+ * -# Clamp the value between the specified min and max bounds
+ * -# Clamp the resulting int32 values:
+ * - to the [0..255] range and cast to QASYMM8.
+ * - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ *
+ * @attention The offset and scalar scale factor must be passed at compile time using
+ * -DOUTPUT_OFFSET and -DREAL_MULTIPLIER
+ *
+ * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile
+ * time
+ * @note The output datatype should be passed at compile time using -DOUTPUT_DATA_TYPE
+ * @note In case the clamping of the result is required, the min and max bounds can be passed at
+ * compile time using -DMIN_BOUND and -DMAX_BOUND. These values can be used to implement "rectified
+ * linear unit" activation functions
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data
+ * type: S32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source
+ * tensor
+ * @param[in] biases_ptr (Optional) Pointer to the biases tensor.
+ * Supported data type: same as @p src_ptr
+ * @param[in] biases_stride_x (Optional) Stride of the biases tensor in X
+ * dimension (in bytes)
+ * @param[in] biases_step_x (Optional) biases_stride_x * number of elements
+ * along X processed per workitem(in bytes)
+ * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in
+ * the biases tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data
+ * type: QASYMM8/QASYMM8_SIGNED
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension
+ * (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W
+ * processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ */
+__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),
+#if defined(ADD_BIAS)
+ VECTOR_DECLARATION(biases),
+#endif // defined(ADD_BIAS)
+#if defined(DST_HEIGHT)
+ TENSOR4D_DECLARATION(dst))
+#else // defined(DST_HEIGHT)
+ TENSOR3D_DECLARATION(dst))
+#endif // defined(DST_HEIGHT)
+{
+ // Compute source and destination addresses
+ int x = get_global_id(0) * 4;
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) +
+ y * src_stride_y + z * src_stride_z;
+
+ __global uchar *dst_addr =
+ dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+ int4 input_values = vload4(0, (__global int *)src_addr);
+
+#if defined(ADD_BIAS)
+ // Add bias
+ __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+ int4 biases_values = vload4(0, (__global int *)bias_addr);
+ input_values += (int4)biases_values;
+#endif // defined(ADD_BIAS)
+
+ // Convert to float
+ float4 input_values_f = convert_float4(input_values);
+ input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
+
+ VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4)
+ res = CONVERT_SAT(input_values_f, VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4));
+
+#if defined(MIN_BOUND)
+ res = max(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MIN_BOUND);
+#endif // defined(MIN_BOUND)
+#if defined(MAX_BOUND)
+ res = min(res, (VEC_DATA_TYPE(OUTPUT_DATA_TYPE, 4))MAX_BOUND);
+#endif // defined(MAX_BOUND)
+
+ // Store the result
+ vstore4(res, 0, (__global OUTPUT_DATA_TYPE *)dst_addr);
+}
+#endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
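
For comparison with the fixed-point kernels above, the float path boils down to one
multiply-round per lane. A scalar reference sketch (ours) of the QASYMM8 case:

#include <algorithm>
#include <cmath>
#include <cstdint>

// One lane of the float output stage: bias, rescale, offset, round, clamp, cast.
static uint8_t quantize_down_float_one_lane(int32_t acc, int32_t bias, float real_multiplier,
                                            int32_t output_offset)
{
  float v = std::round(static_cast<float>(acc + bias) * real_multiplier +
                       static_cast<float>(output_offset));
  v = std::min(std::max(v, 0.0f), 255.0f); // use [-128, 127] for QASYMM8_SIGNED
  return static_cast<uint8_t>(v);
}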
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl
new file mode 100644
index 000000000..51919c8a5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/memset.cl
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2018-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(CONSTANT_VALUE) // Check for compile time constants
+
+/** Fill the tensor's planes with a given value
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+ * -# -DCONSTANT_VALUE = The value used to fill the tensor's planes
+ * -# -DVEC_SIZE = Vector size
+ * -# -DLAST_ACCESSED_X = The last element accessible along X (work-items that would write past
+ * it must step back to stay within bounds)
+ *
+ * @param[out] tensor_ptr Pointer to the tensor to fill. Data types
+ * supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
+ * @param[in] tensor_stride_x Stride of the tensor in X dimension (in
+ * bytes)
+ * @param[in] tensor_step_x tensor_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] tensor_stride_y Stride of the tensor in Y dimension (in
+ * bytes)
+ * @param[in] tensor_step_y tensor_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] tensor_stride_z Stride of the tensor in Z dimension (in
+ * bytes)
+ * @param[in] tensor_step_z tensor_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] tensor_offset_first_element_in_bytes The offset of the first element in the
+ * tensor
+ */
+__kernel void memset(TENSOR3D_DECLARATION(tensor))
+{
+ Tensor3D tensor = CONVERT_TO_TENSOR3D_STRUCT(tensor);
+
+#if defined(VEC_SIZE)
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ tensor.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * tensor_stride_x;
+#endif // defined(LAST_ACCESSED_X)
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = (DATA_TYPE)(CONSTANT_VALUE);
+
+ VSTORE(VEC_SIZE)
+ (data, 0, (__global DATA_TYPE *)tensor.ptr);
+#else // !defined(VEC_SIZE)
+ *((__global DATA_TYPE *)(tensor.ptr)) = (DATA_TYPE)(CONSTANT_VALUE);
+#endif // defined(VEC_SIZE)
+}
+
+#endif // Check for compile time constants
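
The interplay of the compile-time flags is easiest to see from the host side: VEC_SIZE enables
the vectorized store, and LAST_ACCESSED_X handles a ragged tail by letting out-of-range
work-items step their pointer back. A sketch under assumed names; the real derivation appears in
CLMemsetKernel::configure later in this patch:

#include <algorithm>
#include <string>

// Assumed inputs: the element size in bytes and the number of elements to fill along X.
std::string memset_build_options(int element_size, int output_width_x)
{
  const int vec_size_x = 16 / element_size; // 16-byte vector stores
  const bool multi_access_x = output_width_x >= vec_size_x;
  const bool remainder_x = (output_width_x % vec_size_x) != 0;

  std::string opts = "-DDATA_TYPE=float -DCONSTANT_VALUE=0";
  if (multi_access_x)
    opts += " -DVEC_SIZE=" + std::to_string(vec_size_x);
  if (multi_access_x && remainder_x)
    // The last X index a full vector may start at; later work-items step back to it.
    opts += " -DLAST_ACCESSED_X=" + std::to_string(std::max(output_width_x - vec_size_x, 0));
  return opts;
}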
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl
new file mode 100644
index 000000000..96f2f9ef0
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/pad_layer.cl
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) && \
+ defined(SRC_WIDTH)
+
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_SELECT VEC_DATA_TYPE(SELECT_DT, VEC_SIZE)
+#define OFFSETS VEC_OFFS(VEC_SELECT, VEC_SIZE)
+
+#if defined(CONST_VAL)
+/** Perform a pad operation when PaddingMode is CONSTANT
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Constant value used to fill the pads must be passed using the -DCONST_VAL compile flag,
+ * e.g. -DCONST_VAL=1.27
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g.
+ * -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g.
+ * -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile
+ * flag, e.g. -DSELECT_DT=float
+ * @note In case pad left is more than the vector size, the number of threads to skip along the X
+ * axis must be passed using the -DNUM_THREADS_TO_SKIP_X compile flag, e.g.
+ * -DNUM_THREADS_TO_SKIP_X=1. This is defined as (PAD_X_BEFORE / VEC_SIZE)
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be
+ * passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g.
+ * -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If pad also needs to be added to the batch of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_W_BEFORE: Pad to add before the first batch of the input tensor (e.g.
+ * -DPAD_W_BEFORE=3)
+ * -# -DSRC_BATCH: Input tensor's batch size (e.g. -DSRC_BATCH=4)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types:
+ * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ * @param[in] batch (Optional) Batch index if 4D pad must be applied
+ */
+__kernel void pad_layer_constant(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst)
+#if defined(PAD_W_BEFORE)
+ ,
+ uint batch
+#endif // defined(PAD_W_BEFORE)
+)
+{
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ uint cond = 0;
+
+#if defined(PAD_W_BEFORE)
+ cond |= batch < PAD_W_BEFORE || batch >= (SRC_BATCH + PAD_W_BEFORE);
+#endif // defined(PAD_W_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ cond |= z < PAD_Z_BEFORE || z >= (SRC_DEPTH + PAD_Z_BEFORE);
+#endif // defined(PAD_Z_BEFORE)
+
+ if (cond)
+ {
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)CONST_VAL, 0, (__global DATA_TYPE *)dst.ptr);
+ }
+ else
+ {
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(NUM_THREADS_TO_SKIP_X)
+ /* In case the pad left is greater than the vector size, and we are past the threads operating
+ * solely on pad values, the input pointer must be brought back along the X axis to start from
+ * the first non-pad values.
+ *
+ * E.g. with VEC_SIZE=2, PAD_X_BEFORE=5, CONST_VAL=0 and 1D input |1 2 3 4 5 6|:
+ * -# The first thread will compute the output values |0 0| since it detects (x_outs == (0, 1))
+ * < PAD_X_BEFORE
+ * -# The second thread will compute the output values |0 0| since it detects (x_outs == (2,
+ * 3)) < PAD_X_BEFORE
+ * -# The third thread should compute |0 1|, however the input pointer is now ahead of ((x *
+ * VEC_SIZE) == 4) values, reading |4 5|
+ * -# To detect this, we use ((PAD_X_BEFORE / VEC_SIZE) == NUM_THREADS_TO_SKIP_X == 2) and
+ * check that it is >= to the current x
+ * -# So, we bring the pointer back of NUM_THREADS_TO_SKIP_X threads, which means multiplying
+ * this constant by the input's step along the X axis
+ * -# Now that the pointer is back of ((NUM_THREADS_TO_SKIP_X * src_step_x) == 4) values, it
+ * will read the desired values |0 1|
+ */
+ src.ptr -= select(0u, NUM_THREADS_TO_SKIP_X * src_step_x, x >= NUM_THREADS_TO_SKIP_X);
+#endif // defined(NUM_THREADS_TO_SKIP_X)
+#if defined(PAD_Z_BEFORE)
+ src.ptr -= PAD_Z_BEFORE * src_step_z;
+#endif // defined(PAD_Z_BEFORE)
+#if defined(PAD_W_BEFORE)
+ src.ptr -= PAD_W_BEFORE * SRC_DEPTH * src_step_z;
+#endif // defined(PAD_W_BEFORE)
+
+ VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+
+ VEC_INT xs_out = (VEC_INT)(x * VEC_SIZE) + CONVERT(OFFSETS, VEC_INT);
+ VEC_INT cond = xs_out < (VEC_INT)PAD_X_BEFORE || xs_out >= (VEC_INT)(SRC_WIDTH + PAD_X_BEFORE);
+#if defined(PAD_Y_BEFORE)
+ cond |=
+ (VEC_INT)y < (VEC_INT)PAD_Y_BEFORE || (VEC_INT)y >= (VEC_INT)(SRC_HEIGHT + PAD_Y_BEFORE);
+#endif // defined(PAD_Y_BEFORE)
+ VSTORE(VEC_SIZE)
+ (select(src_vals, (VEC_TYPE)CONST_VAL, CONVERT(cond, VEC_SELECT)), 0,
+ (__global DATA_TYPE *)dst.ptr);
+ }
+}
+#endif // defined(CONST_VAL)
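
Stripped of vectorization and the pointer-rewind bookkeeping, pad_layer_constant computes the
per-element mapping below; a scalar reference sketch of ours for the 3D case:

#include <cstddef>

// Returns the padded value at output coordinate (x, y, z) for a row-major source
// of extents (w, h, d) with pads (px, py, pz) before each axis.
template <typename T>
T pad_constant_ref(const T *src, int w, int h, int d, int px, int py, int pz, int x, int y, int z,
                   T const_val)
{
  const bool in_pad = x < px || x >= w + px || y < py || y >= h + py || z < pz || z >= d + pz;
  if (in_pad)
    return const_val;
  return src[static_cast<size_t>(z - pz) * w * h + static_cast<size_t>(y - py) * w + (x - px)];
}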
+
+#if defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) && \
+ defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) && \
+ defined(AFTER_PAD_FACT_X)
+
+#define SCALAR_COND(x) (VEC_SELECT) x == (VEC_SELECT)1
+#define ROTATE_REVERSE(x, n) ROTATE(REVERSE(x, VEC_SIZE), VEC_SIZE, n)
+#define SYMM_REFL_LEFT(x, n0, n1) \
+ select(ROTATE_REVERSE(x, n1), ROTATE(x, VEC_SIZE, n0), OFFSETS >= (VEC_SELECT)n0)
+#define SYMM_REFL_RIGHT(x, n0, n1) \
+ select(ROTATE(x, VEC_SIZE, n0), ROTATE_REVERSE(x, n1), OFFSETS >= (VEC_SELECT)n0)
+
+/** Perform a pad operation when PaddingMode is SYMMETRIC
+ *
+ * @note Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
+ * @note Vector size must be passed using the -DVEC_SIZE compile flag, e.g. -DVEC_SIZE=4
+ * @note Pad to add to the left must be passed using the -DPAD_X_BEFORE compile flag, e.g.
+ * -DPAD_X_BEFORE=5
+ * @note Input tensor's width must be passed using the -DSRC_WIDTH compile flag, e.g.
+ * -DSRC_WIDTH=224
+ * @note Data type to use for the select instruction must be passed using the -DSELECT_DT compile
+ * flag, e.g. -DSELECT_DT=float
+ * @note Number of values to the left when operating across left padding must be passed using the
+ * -DPAD_X_BEFORE_REMAINDER compile flag, e.g. -DPAD_X_BEFORE_REMAINDER=5
+ * @note Number of values to the left when operating across right padding must be passed using the
+ * -DPAD_X_AFTER_REMAINDER compile flag, e.g. -DPAD_X_AFTER_REMAINDER=6
+ * @note To rearrange the vectors properly, (PAD_X_BEFORE_REMAINDER + 1) must be passed when mode is
+ * REFLECT using the -DPAD_X_BEFORE_REMAINDER_REFL compile flag, e.g. -DPAD_X_BEFORE_REMAINDER_REFL=6
+ * @note To rearrange the vectors properly, (PAD_X_AFTER_REMAINDER - 1) must be passed using the
+ * -DPAD_X_AFTER_REMAINDER_REFL compile flag, e.g. -DPAD_X_AFTER_REMAINDER_REFL=5
+ * @note The X coordinate to start reading backward from, once past the right pad, must be passed
+ * using the -DAFTER_PAD_FACT_X compile flag, e.g. -DAFTER_PAD_FACT_X=253
+ * @note If padding mode is REFLECT, the -DIS_REFLECT compile flag must be set to 1, else it must be
+ * set to 0
+ * @note If pad also needs to be added to the top of the tensor, the following compile flags must be
+ * passed at compile time:
+ * -# -DPAD_Y_BEFORE: Pad to add to the top of the input tensor (e.g. -DPAD_Y_BEFORE=3)
+ * -# -DSRC_HEIGHT: Input tensor's height (e.g. -DSRC_HEIGHT=127)
+ * @note If pad also needs to be added to the depth of the tensor, the following compile flags must
+ * be passed at compile time:
+ * -# -DPAD_Z_BEFORE: Pad to add before the first plane of the input tensor (e.g.
+ * -DPAD_Z_BEFORE=3)
+ * -# -DSRC_DEPTH: Input tensor's depth (e.g. -DSRC_DEPTH=32)
+ * @note If the starting point to read backward from is less than the output's last element accessed
+ * in the X, the following compile flags must be passed at compile time to avoid negative offsets:
+ * -# -DAFTER_PAD_REM: Defines how much to rotate the vector if the backward calculation
+ * attempted to read from a negative offset (e.g. -DAFTER_PAD_REM=3)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types:
+ * U8, S8, QASYMM8, QASYMM8_SIGNED, U16, S16, U32, S32, F16, F32
+ * @param[in] src_stride_x Stride of the source image in X dimension (in
+ * bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in
+ * bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source image in Z dimension (in
+ * bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data
+ * types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in
+ * bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in
+ * bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed
+ * per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination image in Z dimension (in
+ * bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed
+ * per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination
+ * image
+ */
+__kernel void pad_layer_symmetric_reflect(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
+{
+ // Get current thread position
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+
+ // Define conditions based on the thread X position w.r.t. pad left and right
+ const int x_out_first = x * VEC_SIZE;
+ const int x_out_last = x_out_first + VEC_SIZE;
+ const int is_before_pad_left = (x_out_last <= PAD_X_BEFORE);
+ const int is_across_pad_left = (x_out_first < PAD_X_BEFORE) && (x_out_last > PAD_X_BEFORE);
+ const int is_inside_input =
+ (x_out_first >= PAD_X_BEFORE) && (x_out_last <= (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_across_pad_right =
+ (x_out_first < (SRC_WIDTH + PAD_X_BEFORE)) && (x_out_last > (SRC_WIDTH + PAD_X_BEFORE));
+ const int is_after_pad_right = (x_out_first >= (SRC_WIDTH + PAD_X_BEFORE));
+
+ // Calculate base pointers
+ __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes;
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ // Calculate input tensor's offset based on the defined conditions
+ int x_offset = 0;
+ x_offset = select(x_offset, PAD_X_BEFORE - x_out_last + IS_REFLECT, is_before_pad_left);
+ x_offset = select(x_offset, x_out_first - PAD_X_BEFORE, is_inside_input);
+ x_offset = select(x_offset, SRC_WIDTH - VEC_SIZE, is_across_pad_right);
+ x_offset = select(x_offset, AFTER_PAD_FACT_X - x_out_last, is_after_pad_right);
+
+#if defined(AFTER_PAD_REM)
+ int neg_offs = x_offset < 0;
+ x_offset = max(x_offset, 0);
+#endif // defined(AFTER_PAD_REM)
+
+ // Load input values from the computed offset
+ int y_in = y;
+ int z_in = z;
+#if defined(PAD_Y_BEFORE)
+ y_in = select(y - PAD_Y_BEFORE, PAD_Y_BEFORE - y + IS_REFLECT - 1, y < PAD_Y_BEFORE);
+ y_in = select(y_in, 2 * SRC_HEIGHT + PAD_Y_BEFORE - y - IS_REFLECT - 1,
+ y >= (SRC_HEIGHT + PAD_Y_BEFORE));
+#endif // defined(PAD_Y_BEFORE)
+#if defined(PAD_Z_BEFORE)
+ z_in = select(z - PAD_Z_BEFORE, PAD_Z_BEFORE - z + IS_REFLECT - 1, z < PAD_Z_BEFORE);
+ z_in = select(z_in, 2 * SRC_DEPTH + PAD_Z_BEFORE - z - IS_REFLECT - 1,
+ z >= (SRC_DEPTH + PAD_Z_BEFORE));
+#endif // defined(PAD_Z_BEFORE)
+
+ src_addr += x_offset * src_stride_x + y_in * src_step_y + z_in * src_step_z;
+
+#if SRC_WIDTH == 1
+ VSTORE(VEC_SIZE)
+ ((VEC_TYPE)(*(__global DATA_TYPE *)src_addr), 0, (__global DATA_TYPE *)dst.ptr);
+#else // SRC_WIDTH == 1
+
+ VEC_TYPE src_vals = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
+
+ // Choose rearrangement policy based on the defined conditions
+ src_vals =
+ select(src_vals, SYMM_REFL_LEFT(src_vals, PAD_X_BEFORE_REMAINDER, PAD_X_BEFORE_REMAINDER_REFL),
+ SCALAR_COND(is_across_pad_left));
+ src_vals =
+ select(src_vals, SYMM_REFL_RIGHT(src_vals, PAD_X_AFTER_REMAINDER, PAD_X_AFTER_REMAINDER_REFL),
+ SCALAR_COND(is_across_pad_right));
+ src_vals = select(src_vals, REVERSE(src_vals, VEC_SIZE),
+ SCALAR_COND((is_before_pad_left || is_after_pad_right)));
+#if defined(AFTER_PAD_REM)
+ src_vals = select(src_vals, ROTATE(src_vals, VEC_SIZE, AFTER_PAD_REM), SCALAR_COND(neg_offs));
+#endif // defined(AFTER_PAD_REM)
+
+ // Store
+ VSTORE(VEC_SIZE)
+ (src_vals, 0, (__global DATA_TYPE *)dst.ptr);
+#endif // SRC_WIDTH == 1
+}
+#endif // defined(PAD_X_BEFORE_REMAINDER) && defined(PAD_X_AFTER_REMAINDER) &&
+ // defined(PAD_X_BEFORE_REMAINDER_REFL) && defined(PAD_X_AFTER_REMAINDER_REFL) &&
+ // defined(AFTER_PAD_FACT_X)
+#endif // defined(DATA_TYPE) && defined(SELECT_DT) && defined(VEC_SIZE) && defined(PAD_X_BEFORE) &&
+ // defined(SRC_WIDTH)
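
The y_in/z_in select chains above implement a per-axis mirror mapping. The same logic in scalar
form, a sketch of ours where is_reflect stands in for -DIS_REFLECT:

// Maps an output coordinate to the source coordinate it mirrors. SYMMETRIC
// (is_reflect = 0) repeats the edge element; REFLECT (is_reflect = 1) does not.
int mirror_index(int out_coord, int pad_before, int src_size, int is_reflect)
{
  int i = out_coord - pad_before;          // position relative to the source
  if (i < 0)
    i = -i - 1 + is_reflect;               // left pad: mirror around the left edge
  else if (i >= src_size)
    i = 2 * src_size - i - 1 - is_reflect; // right pad: mirror around the right edge
  return i;
}

For a source [a b c] with two elements of left pad this yields "b a | a b c" in SYMMETRIC mode
and "c b | a b c" in REFLECT mode, matching the usual definitions of the two padding modes.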
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h
new file mode 100644
index 000000000..cfc811cce
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/repeat.h
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_REPEAT_H
+#define ARM_COMPUTE_REPEAT_H
+
+#include "helpers.h"
+
+/** Macros that help in loop unrolling */
+// Repeat macros with 3 param, excluding the implicit ID param
+#define REPEAT_3_1(P_X, P_A, P_B, P_C) P_X##_DEF(0, P_A, P_B, P_C)
+#define REPEAT_3_2(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(1, P_A, P_B, P_C); \
+ REPEAT_3_1(P_X, P_A, P_B, P_C)
+#define REPEAT_3_3(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(2, P_A, P_B, P_C); \
+ REPEAT_3_2(P_X, P_A, P_B, P_C)
+#define REPEAT_3_4(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(3, P_A, P_B, P_C); \
+ REPEAT_3_3(P_X, P_A, P_B, P_C)
+#define REPEAT_3_5(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(4, P_A, P_B, P_C); \
+ REPEAT_3_4(P_X, P_A, P_B, P_C)
+#define REPEAT_3_6(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(5, P_A, P_B, P_C); \
+ REPEAT_3_5(P_X, P_A, P_B, P_C)
+#define REPEAT_3_7(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(6, P_A, P_B, P_C); \
+ REPEAT_3_6(P_X, P_A, P_B, P_C)
+#define REPEAT_3_8(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(7, P_A, P_B, P_C); \
+ REPEAT_3_7(P_X, P_A, P_B, P_C)
+#define REPEAT_3_9(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(8, P_A, P_B, P_C); \
+ REPEAT_3_8(P_X, P_A, P_B, P_C)
+#define REPEAT_3_10(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(9, P_A, P_B, P_C); \
+ REPEAT_3_9(P_X, P_A, P_B, P_C)
+#define REPEAT_3_11(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(A, P_A, P_B, P_C); \
+ REPEAT_3_10(P_X, P_A, P_B, P_C)
+#define REPEAT_3_12(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(B, P_A, P_B, P_C); \
+ REPEAT_3_11(P_X, P_A, P_B, P_C)
+#define REPEAT_3_13(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(C, P_A, P_B, P_C); \
+ REPEAT_3_12(P_X, P_A, P_B, P_C)
+#define REPEAT_3_14(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(D, P_A, P_B, P_C); \
+ REPEAT_3_13(P_X, P_A, P_B, P_C)
+#define REPEAT_3_15(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(E, P_A, P_B, P_C); \
+ REPEAT_3_14(P_X, P_A, P_B, P_C)
+#define REPEAT_3_16(P_X, P_A, P_B, P_C) \
+ P_X##_DEF(F, P_A, P_B, P_C); \
+ REPEAT_3_15(P_X, P_A, P_B, P_C)
+
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
+ REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) // One level of indirection to ensure order of expansion
+ // does not affect preprocessing P_NUM
+#define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
+
+// Repeat macros with 4 param, excluding the implicit ID param
+#define REPEAT_4_1(P_X, P_A, P_B, P_C, P_D) P_X##_DEF(0, P_A, P_B, P_C, P_D)
+#define REPEAT_4_2(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(1, P_A, P_B, P_C, P_D); \
+ REPEAT_4_1(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_3(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(2, P_A, P_B, P_C, P_D); \
+ REPEAT_4_2(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_4(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(3, P_A, P_B, P_C, P_D); \
+ REPEAT_4_3(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_5(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(4, P_A, P_B, P_C, P_D); \
+ REPEAT_4_4(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_6(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(5, P_A, P_B, P_C, P_D); \
+ REPEAT_4_5(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_7(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(6, P_A, P_B, P_C, P_D); \
+ REPEAT_4_6(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_8(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(7, P_A, P_B, P_C, P_D); \
+ REPEAT_4_7(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_9(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(8, P_A, P_B, P_C, P_D); \
+ REPEAT_4_8(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_10(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(9, P_A, P_B, P_C, P_D); \
+ REPEAT_4_9(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_11(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(A, P_A, P_B, P_C, P_D); \
+ REPEAT_4_10(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_12(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(B, P_A, P_B, P_C, P_D); \
+ REPEAT_4_11(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_13(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(C, P_A, P_B, P_C, P_D); \
+ REPEAT_4_12(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_14(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(D, P_A, P_B, P_C, P_D); \
+ REPEAT_4_13(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_15(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(E, P_A, P_B, P_C, P_D); \
+ REPEAT_4_14(P_X, P_A, P_B, P_C, P_D)
+#define REPEAT_4_16(P_X, P_A, P_B, P_C, P_D) \
+ P_X##_DEF(F, P_A, P_B, P_C, P_D); \
+ REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
+
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+ REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) // One level of indirection to ensure order of
+ // expansion does not affect preprocessing P_NUM
+#define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
+
+// Macro for initializing N variables. Generates N statements that define VAR##N =
+// RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
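
A concrete expansion may help here (illustration only):

/*
 * REPEAT_VAR_INIT_TO_CONST(3, int4, acc, 0);
 *
 * goes through REPEAT_3_N -> REPEAT_DEF_3_N -> REPEAT_3_3 and emits:
 *
 *   int4 acc2 = 0;
 *   int4 acc1 = 0;
 *   int4 acc0 = 0;
 *
 * The REPEAT_DEF_3_N indirection forces the count argument to be fully
 * macro-expanded before it is pasted onto REPEAT_3_, so callers may pass
 * a macro such as M0 rather than a literal.
 */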
+
+// Macro for initializing N variables by converting the data type. Generates N statements that
+// define VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) \
+ TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+ REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+
+// Macro for adding a constant to N variables. Generates N statements that define VAR##N =
+// RHS_ACCESSOR_DEF(...)
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
+
+// Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables
+// (VAR_A). Generates N statements that define VAR_A##N = RHS_ACCESSOR_DEF(...)
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) \
+ REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
+
+// Macro for adding a vector to N variables. Generates N statements that define VAR##N =
+// RHS_ACCESSOR_DEF(...)
+#define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+
+// Macro for adding two sets of N variables together. Generates N statements that define
+// VAR_A##N += VAR_B##N
+#define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+
+// Macro for performing Max between a constant and N variables. Generates N statements that define
+// VAR##N = RHS_ACCESSOR_DEF(...)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
+
+// Macro for performing Min between a constant and N variables. Generates N statements that define
+// VAR##N = RHS_ACCESSOR_DEF(...)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
+
+// Macro for applying ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N
+// statements that define VAR##N = RHS_ACCESSOR_DEF(...)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+
+// Macro for applying ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N
+// statements that define VAR##N = RHS_ACCESSOR_DEF(...)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+
+// Macro for applying the per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables. The vector
+// width is taken from N0, which must be defined at the point of use; the SIZE parameter is unused.
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ ({ \
+ VEC_DATA_TYPE(int, N0) \
+ VAR##ID_shift_lt0 = \
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
+ VEC_DATA_TYPE(int, N0) \
+ VAR##ID_shift_gt0 = \
+ ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0); \
+ VAR##ID = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0); \
+ })
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+ REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
+
+#endif // ARM_COMPUTE_REPEAT_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl
new file mode 100644
index 000000000..8da8bfc8e
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/reshape_layer.cl
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** Perform tensor reshape
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g.
+ * -DDATA_TYPE=short
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported
+ * data types: All
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension
+ * (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension
+ * (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension
+ * (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first
+ * source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported
+ * data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension
+ * (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension
+ * (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y
+ * processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension
+ * (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z
+ * processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
+ * destination tensor
+ * @param[in] input_shape Input spatial shape
+ * @param[in] output_shape Output spatial shape
+ */
+__kernel void reshape_layer(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output),
+ int2 input_shape, int2 output_shape)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+
+ int3 id = (int3)(get_global_id(0), get_global_id(1), get_global_id(2));
+
+ // Linearize index
+ int linear_idx = id.x + id.y * input_shape.x + id.z * input_shape.x * input_shape.y;
+
+ // Translate to output
+ int3 out_id;
+ out_id.x = linear_idx % output_shape.x;
+ out_id.y = (linear_idx / output_shape.x) % output_shape.y;
+ out_id.z = linear_idx / (output_shape.x * output_shape.y);
+
+ // Store result
+ *((__global DATA_TYPE *)tensor3D_offset(&out, out_id.x, out_id.y, out_id.z)) =
+ *((__global DATA_TYPE *)in.ptr);
+}
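
The kernel's index arithmetic in scalar form (a reference sketch of ours): each work-item
linearizes its 3D coordinate against the input shape and re-expands it against the output shape,
so elements are copied in row-major order whatever the two shapes are.

// Maps an input coordinate (ix, iy, iz) to its output coordinate for a
// row-major reshape from (in_w, in_h, *) to (out_w, out_h, *).
void reshape_index_ref(int ix, int iy, int iz, int in_w, int in_h, int out_w, int out_h,
                       int *ox, int *oy, int *oz)
{
  const int linear = ix + iy * in_w + iz * in_w * in_h;
  *ox = linear % out_w;
  *oy = (linear / out_w) % out_h;
  *oz = linear / (out_w * out_h);
}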
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
index 45307fad7..987409739 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLArgMinMaxLayerKernelEx.cpp
@@ -39,16 +39,18 @@
*/
#include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernelEx.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
index ffa2c5a67..a5daa2410 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -43,6 +43,8 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/AccessWindowStatic.h"
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
index 3f2ae357d..dc06bfbb3 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLCastBoolKernel.cpp
@@ -41,13 +41,16 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include <cstddef>
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index e4c617c8d..4206f1fd4 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index 8b5885225..62da2376e 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -45,6 +45,10 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/core/UtilsEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index f0a761b97..03ca6ddcb 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/helpers/WindowHelpers.h"
#include "support/StringSupport.h"
using namespace arm_compute;
@@ -111,7 +112,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
_hits = hits;
// Make _lookup_indices tensor
- _lookup_indices = support::cpp14::make_unique<CLTensor>();
+ _lookup_indices = std::make_unique<CLTensor>();
_lookup_indices->allocator()->init(
TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
_lookup_indices->allocator()->allocate();
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
index dab6480b2..945af3c51 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -42,12 +42,16 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp
new file mode 100644
index 000000000..a00fc5e2e
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMemsetKernel.cpp
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2018-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+CLMemsetKernel::CLMemsetKernel() : ICLKernel(), _tensor(nullptr), _full_window() {}
+
+void CLMemsetKernel::configure(ICLTensor *tensor, const PixelValue &constant_value, Window *window)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, window);
+}
+
+void CLMemsetKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor,
+ const PixelValue &constant_value, Window *window)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+ ARM_COMPUTE_ERROR_THROW_ON(validate(tensor->info(), constant_value, window));
+
+ _tensor = tensor;
+
+ const DataType data_type = tensor->info()->data_type();
+ const int vec_size_x = 16 / tensor->info()->element_size();
+
+ // Create and update the window (if needed)
+ _full_window = calculate_max_window(*tensor->info());
+ Window win = _full_window;
+ if (window != nullptr)
+ {
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
+ win = *window;
+ }
+
+ const int output_width_x = win.num_iterations(0);
+ const bool multi_access_x = output_width_x >= vec_size_x;
+ const bool remainder_x = output_width_x % vec_size_x > 0;
+
+ if (multi_access_x)
+ {
+ win.set(
+ Window::DimX,
+ Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+ }
+ ICLKernel::configure_internal(win);
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option_if(multi_access_x && remainder_x,
+ "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+ std::max<int>(output_width_x - vec_size_x, 0)));
+
+ _kernel =
+ static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel("memset", build_opts.options()));
+}
+
+Status CLMemsetKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value,
+ Window *window)
+{
+ ARM_COMPUTE_UNUSED(tensor);
+ ARM_COMPUTE_UNUSED(constant_value);
+ if (window != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
+ }
+ return Status{};
+}
+
+void CLMemsetKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ // Collapse all the batches on the third dimension
+ Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
+ Window slice = collapsed.first_slice_window_3D();
+
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _tensor, slice);
+ enqueue(queue, *this, slice, lws_hint());
+ } while (collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
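
Annotation (not part of the patch): configure() above picks a vector width of 16 bytes / element_size, rounds the window end up to a multiple of it when the row is wide enough, and passes LAST_ACCESSED_X so the kernel can shift the final, possibly partial vector back inside the row. A minimal standalone sketch of that index math, with hypothetical values:

// Sketch (not from the patch): the window math used by CLMemsetKernel::configure().
#include <algorithm>
#include <cstdio>

int main()
{
  const int elem_size = 4;                 // e.g. F32
  const int width = 37;                    // elements along X
  const int vec_size_x = 16 / elem_size;   // 16-byte vectors -> 4 elements
  const bool multi_access_x = width >= vec_size_x;
  const bool remainder_x = width % vec_size_x > 0;
  // Last X index at which a full vector may start without overrunning the row:
  const int last_accessed_x = std::max(width - vec_size_x, 0);
  std::printf("vec=%d multi=%d rem=%d last=%d\n", vec_size_x, multi_access_x, remainder_x,
              last_accessed_x);
  return 0;
}
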
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
index 1d4b141a7..da7437e97 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -40,15 +40,19 @@
#include "arm_compute/core/CL/kernels/CLMultiplyScaleFactorKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
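
Annotation: this hunk and the similar ones below track the ARM Compute Library 21.02 header reorganization — AccessWindowStatic, CLValidate, CPP/Validate, the NEON wrappers, and the window/auto-configuration helpers moved from the public arm_compute/core tree to the library-internal src/core tree. The recurring pattern, condensed:

// Before (public pre-21.02 headers):
//   #include "arm_compute/core/AccessWindowStatic.h"
//   #include "arm_compute/core/CL/CLValidate.h"
// After (library-internal 21.02 headers):
#include "src/core/AccessWindowStatic.h"
#include "src/core/CL/CLValidate.h"
#include "src/core/helpers/WindowHelpers.h"     // calculate_max_window(), update_window_and_padding()
#include "src/core/helpers/AutoConfiguration.h" // auto_init_if_empty()
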
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
index ee633d437..cd5e571e9 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
index 0b8e7cc41..4c4cbe710 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLOneHotKernel.cpp
@@ -42,6 +42,10 @@
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include <string>
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp
new file mode 100644
index 000000000..b6efeac35
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLPadLayerKernelEx.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_UNUSED(constant_value);
+ ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > input->num_dimensions());
+ if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
+
+ const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
+ for (size_t i = 0; i < padding.size(); ++i)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
+ ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
+ }
+ }
+
+ if (output->total_size() > 0)
+ {
+ TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(output, input);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
+ }
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode,
+ unsigned int &num_elems_processed_per_iteration)
+{
+ ARM_COMPUTE_UNUSED(constant_value, mode);
+
+ const TensorShape padded_shape =
+ misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
+
+ num_elems_processed_per_iteration =
+ std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->data_type())));
+ if (input->dimension(0) < num_elems_processed_per_iteration)
+ {
+ num_elems_processed_per_iteration =
+ 1 << static_cast<unsigned int>(std::log2(input->dimension(0)));
+ }
+
+ // Configure kernel window
+ Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+
+ const int input_start_x =
+ mode == PaddingMode::CONSTANT ? -(padding.at(0).first % num_elems_processed_per_iteration) : 0;
+ const int input_start_y =
+ (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
+
+ AccessWindowRectangle input_access(input, input_start_x, input_start_y,
+ num_elems_processed_per_iteration, 1);
+ AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+
+ const bool window_changed = update_window_and_padding(win, input_access, output_access);
+ output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLPadLayerKernelEx::CLPadLayerKernelEx()
+ : _input(nullptr), _output(nullptr), _input_start_x(0), _input_start_y(0), _4d_enabled(false)
+{
+}
+
+void CLPadLayerKernelEx::configure(const ICLTensor *input, ICLTensor *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value,
+ mode);
+}
+
+void CLPadLayerKernelEx::configure(const CLCompileContext &compile_context, const ICLTensor *input,
+ ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate_arguments(input->info(), output->info(), padding, constant_value, mode));
+
+ _input = input;
+ _output = output;
+ _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
+
+ // Configure window
+ unsigned int vec_size;
+ auto win_config = validate_and_configure_window(input->info(), output->info(), padding,
+ constant_value, mode, vec_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Set build options
+ std::string kernel_name = "pad_layer_";
+
+ const DataType &data_type = input->info()->data_type();
+ const unsigned int input_width = input->info()->dimension(0);
+ const unsigned int input_height = input->info()->dimension(1);
+ const unsigned int input_depth = input->info()->dimension(2);
+ const unsigned int pad_x_before = padding.at(0).first;
+ const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+ const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+ const unsigned int pad_right_start = input_width + pad_x_before;
+
+ _input_start_x = mode == PaddingMode::CONSTANT ? -(pad_x_before % vec_size) : 0;
+ _input_start_y = (mode == PaddingMode::CONSTANT && padding.size() > 1) ? -padding.at(1).first : 0;
+
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DSELECT_DT=" + get_cl_select_type_from_data_type(data_type));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+ build_opts.add_option("-DPAD_X_BEFORE=" + support::cpp11::to_string(pad_x_before));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
+ if (padding.size() > 1)
+ {
+ build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
+
+ if (padding.size() > 2)
+ {
+ build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
+ build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
+ }
+ }
+
+ switch (mode)
+ {
+ case PaddingMode::CONSTANT:
+ {
+ kernel_name += "constant";
+
+ build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
+ build_opts.add_option_if(pad_x_before >= vec_size,
+ "-DNUM_THREADS_TO_SKIP_X=" +
+ support::cpp11::to_string(pad_x_before / vec_size));
+
+ if (_4d_enabled)
+ {
+ build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
+ build_opts.add_option("-DSRC_BATCH=" +
+ support::cpp11::to_string(input->info()->dimension(3)));
+ }
+
+ break;
+ }
+ case PaddingMode::SYMMETRIC:
+ case PaddingMode::REFLECT:
+ {
+ kernel_name += "symmetric_reflect";
+
+ const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
+
+ const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+ const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
+ const unsigned int after_pad_fact_x = (2 * input_width + pad_x_before) - is_reflect;
+ const unsigned int output_last_x =
+ ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+
+ build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
+ build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" +
+ support::cpp11::to_string(pad_x_before_remainder));
+ build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" +
+ support::cpp11::to_string(pad_x_after_remainder));
+ build_opts.add_option(
+ "-DPAD_X_BEFORE_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+ build_opts.add_option(
+ "-DPAD_X_AFTER_REMAINDER_REFL=" +
+ support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+ build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
+ build_opts.add_option_if(after_pad_fact_x < output_last_x,
+ "-DAFTER_PAD_REM=" +
+ support::cpp11::to_string(after_pad_fact_x % vec_size));
+
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Padding mode not supported.");
+ }
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
+}
+
+Status CLPadLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ unsigned int vec_size;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+ output->clone().get(), padding,
+ constant_value, mode, vec_size)
+ .first);
+
+ return Status{};
+}
+
+void CLPadLayerKernelEx::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+ Window win_in = window;
+ win_in.adjust(Window::DimX, _input_start_x, true);
+ win_in.adjust(Window::DimY, _input_start_y, true);
+
+ Window slice_out = window.first_slice_window_3D();
+ Window slice_in = win_in.first_slice_window_3D();
+ unsigned int batch = 0;
+ do
+ {
+ unsigned int idx = 0;
+ add_3D_tensor_argument(idx, _input, slice_in);
+ add_3D_tensor_argument(idx, _output, slice_out);
+ if (_4d_enabled)
+ {
+ add_argument<unsigned int>(idx, batch++);
+ }
+
+ enqueue(queue, *this, slice_out, lws_hint());
+ } while (window.slide_window_slice_3D(slice_out) && win_in.slide_window_slice_3D(slice_in));
+}
+} // namespace arm_compute
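
Annotation: validate_arguments() above enforces the REFLECT/SYMMETRIC bound — each per-dimension pad amount may be at most the dimension size for SYMMETRIC, and at most size - 1 for REFLECT, since REFLECT does not repeat the edge element. A standalone sketch of that check:

// Sketch (not from the patch): the REFLECT/SYMMETRIC bound enforced per dimension above.
#include <cstdio>

bool pad_ok(unsigned dim, unsigned before, unsigned after, bool reflect)
{
  const unsigned is_reflect = reflect ? 1u : 0u; // REFLECT may not mirror past dim - 1
  return before <= dim - is_reflect && after <= dim - is_reflect;
}

int main()
{
  std::printf("%d\n", pad_ok(4, 4, 0, /*reflect=*/false)); // SYMMETRIC: 1 (pad <= dim)
  std::printf("%d\n", pad_ok(4, 4, 0, /*reflect=*/true));  // REFLECT:   0 (pad <= dim - 1)
  return 0;
}
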
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
index b417a7103..9aa815f55 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -40,15 +40,19 @@
#include "arm_compute/core/CL/kernels/CLQuantizationSymmetricKernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/CLValidate.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index 3906009c2..70374ba61 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -43,6 +43,9 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+
#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
index 4a6374444..c9d6dc31c 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -40,7 +40,7 @@
#include "arm_compute/core/CL/kernels/CLScaleFactorSymm8Kernel.h"
-#include "arm_compute/core/AccessWindowStatic.h"
+#include "src/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
@@ -48,6 +48,10 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include "support/StringSupport.h"
#include <climits>
diff --git a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
index c88bef6d7..1d4d33ac2 100644
--- a/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/NEElementwiseOperationFuncs.cpp
@@ -42,7 +42,7 @@
#include <algorithm>
#include "arm_compute/core/Types.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
+#include "src/core/NEON/NEAsymm.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Window.h"
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
index a8464afce..0551fc7db 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEBinaryLogicalOperationKernel.cpp
@@ -43,10 +43,10 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
+#include "src/core/NEON/wrapper/wrapper.h"
#include <algorithm>
#include <arm_neon.h>
@@ -163,7 +163,7 @@ void elementwise_logic_op(const ITensor *in1, const ITensor *in2, ITensor *out,
std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)> configure_func(
const ITensor *input1, const ITensor *input2, ITensor *output,
- std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
+ std::map<std::string, cpu::kernels::CpuElementwiseKernel::ElementwiseFunction *> map_function)
{
std::string function_to_call("op_");
function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
@@ -185,9 +185,9 @@ template <BinaryLogicalOperation op>
std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
configure_logic_func(const ITensor *input1, const ITensor *input2, ITensor *output)
{
- static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function = {
- {"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
- {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
+ static std::map<std::string, cpu::kernels::CpuElementwiseKernel::ElementwiseFunction *>
+ map_function = {{"op_U8_U8_U8", &elementwise_logic_op<op, uint8_t, uint8x16_t>},
+ {"op_QASYMM8_QASYMM8_QASYMM8", &elementwise_logic_op<op, uint8_t, uint8x16_t>}};
return configure_func(input1, input2, output, map_function);
}
@@ -196,7 +196,7 @@ void NEBinaryLogicalOperationKernel::configure(BinaryLogicalOperation op, const
const ITensor *input2, ITensor *output)
{
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
- configure_common(input1, input2, output);
+ configure_common(input1->info(), input2->info(), output->info());
switch (op)
{
case BinaryLogicalOperation::AND:
@@ -251,5 +251,4 @@ Status NEBinaryLogicalOperationKernel::validate(BinaryLogicalOperation op,
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output));
return Status{};
}
-
} // namespace arm_compute
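
Annotation: two API adaptations in this hunk — the per-type function table is now keyed on cpu::kernels::CpuElementwiseKernel::ElementwiseFunction (the NEON elementwise kernels moved under the cpu namespace in ACL 21.02), and configure_common() now takes ITensorInfo* rather than ITensor*. The table itself is plain string-keyed dispatch: a key such as "op_U8_U8_U8" is assembled from the operand data types and looked up. A minimal standalone sketch of the pattern:

// Sketch (not from the patch): string-keyed dispatch as used by configure_func() above.
#include <cstdio>
#include <functional>
#include <map>
#include <string>

using Fn = std::function<void()>;

int main()
{
  std::map<std::string, Fn> table = {
    {"op_U8_U8_U8", [] { std::printf("u8 path\n"); }},
    {"op_QASYMM8_QASYMM8_QASYMM8", [] { std::printf("qasymm8 path\n"); }},
  };
  // The key is built from the data types of input1, input2 and output:
  const std::string key = "op_U8_U8_U8";
  auto it = table.find(key);
  if (it != table.end())
    it->second(); // -> "u8 path"
  return 0;
}
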
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
index f935596e6..87e716b4f 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NECastBoolKernel.cpp
@@ -39,16 +39,19 @@
*/
#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
+#include "src/core/NEON/NEMath.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/SaturateCast.h"
+#include "support/SaturateCast.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
+#include "src/core/NEON/INEKernel.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
index e3a77c6b1..3ad9ee945 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEEmbeddingLookupKernel.cpp
@@ -47,6 +47,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
using namespace arm_compute;
NEEmbeddingLookupKernel::NEEmbeddingLookupKernel()
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 000000000..375fa28e5
--- /dev/null
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2017-2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include <arm_neon.h>
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+using namespace arm_compute;
+
+namespace
+{
+inline Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(accum);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != accum->dimension(0));
+
+ return Status{};
+}
+
+inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *accum,
+ ITensorInfo *biases)
+{
+ constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+ bool window_changed = update_window_and_padding(
+ win, AccessWindowHorizontal(accum, 0, num_elems_processed_per_iteration),
+ AccessWindowStatic(biases, 0, 0,
+ ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration),
+ biases->tensor_shape().y()));
+
+ AccessWindowHorizontal output_access(accum, 0, num_elems_processed_per_iteration);
+
+ // Set the valid region for the accum tensor
+ Coordinates coord;
+ coord.set_num_dimensions(accum->num_dimensions());
+ output_access.set_valid_region(win, ValidRegion(coord, accum->tensor_shape()));
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
+
+ _biases = biases;
+ _accum = accum;
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(accum->info(), biases->info());
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ INEKernel::configure(win_config.second);
+}
+
+Status NEGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum,
+ const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ validate_and_configure_window(accum->clone().get(), biases->clone().get()).first);
+
+ return Status{};
+}
+
+std::mutex m;
+void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window, const ThreadInfo &info)
+{
+ std::lock_guard<std::mutex> lock_guard(m);
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+ Window win_biases;
+ win_biases.set(Window::DimX,
+ Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+ win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ Iterator in0_out(_accum, window);
+ Iterator in1(_biases, win_biases);
+
+ switch (_accum->info()->data_type())
+ {
+ case DataType::F32:
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &) {
+ const float32x4x4_t accum = vld4q_f32(reinterpret_cast<const float *>(in0_out.ptr()));
+ const float32x4x4_t biases = vld4q_f32(reinterpret_cast<const float *>(in1.ptr()));
+ const float32x4x4_t res = {
+ {vaddq_f32(accum.val[0], biases.val[0]), vaddq_f32(accum.val[1], biases.val[1]),
+ vaddq_f32(accum.val[2], biases.val[2]), vaddq_f32(accum.val[3], biases.val[3])}};
+
+ vst4q_f32(reinterpret_cast<float *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ case DataType::F16:
+ {
+ execute_window_loop(
+ window,
+ [&](const Coordinates &) {
+ const float16x8x2_t accum = vld2q_f16(reinterpret_cast<const float16_t *>(in0_out.ptr()));
+ const float16x8x2_t biases = vld2q_f16(reinterpret_cast<const float16_t *>(in1.ptr()));
+ const float16x8x2_t res = {
+ {vaddq_f16(accum.val[0], biases.val[0]), vaddq_f16(accum.val[1], biases.val[1])}};
+
+ vst2q_f16(reinterpret_cast<float16_t *>(in0_out.ptr()), res);
+ },
+ in0_out, in1);
+ break;
+ }
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported");
+ break;
+ }
+}
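
Annotation: run() above adds the 1-D bias row to every row of the accumulator in place, 16 floats per step in the F32 path (one float32x4x4_t load per tensor); the file-scope std::mutex presumably serializes concurrent run() calls that mutate _accum, though the patch does not say so. A scalar sketch of what the F32 path computes:

// Sketch (not from the patch): scalar equivalent of the F32 NEON path above.
#include <cstdio>
#include <vector>

void accumulate_biases(std::vector<float> &accum, const std::vector<float> &biases,
                       size_t rows, size_t cols)
{
  for (size_t y = 0; y < rows; ++y)     // one window row at a time
    for (size_t x = 0; x < cols; ++x)   // the NEON path does 16 floats per step
      accum[y * cols + x] += biases[x]; // bias row broadcast across all rows
}

int main()
{
  std::vector<float> accum(2 * 4, 1.0f);
  std::vector<float> biases = {0.f, 1.f, 2.f, 3.f};
  accumulate_biases(accum, biases, 2, 4);
  std::printf("%g %g\n", accum[0], accum[7]); // 1 4
  return 0;
}
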
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
index c9f0799d4..d4144e6b9 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEGatherKernelEx.cpp
@@ -40,7 +40,7 @@
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -50,6 +50,9 @@
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
namespace arm_compute
{
namespace
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
index 52b40e767..f178865b7 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEHashtableLookupKernel.cpp
@@ -47,6 +47,9 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <unordered_map>
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
index 4dc0f5535..7804f9c6a 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.cpp
@@ -40,17 +40,22 @@
#include "arm_compute/core/NEON/kernels/NEInstanceNormalizationLayerKernelEx.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEMath.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <arm_neon.h>
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
index ad4728175..8ad998313 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEMultiplyScaleFactorKernel.cpp
@@ -42,13 +42,15 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
index 0daff5c6a..e56fbf7f3 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEOneHotKernel.cpp
@@ -38,7 +38,7 @@
* SOFTWARE.
*/
#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
#include "arm_compute/core/Coordinates.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
@@ -47,6 +47,10 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
namespace arm_compute
{
namespace
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
index 2306228d5..420e5063c 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -42,13 +42,16 @@
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/INEKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "src/core/CPP/Validate.h"
+
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/WindowHelpers.h"
#include <arm_neon.h>
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
index b02a48ef2..6b9b0d4b4 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgMinMaxLayerEx.cpp
@@ -45,7 +45,9 @@
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/Utils.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/runtime/Utils.h"
namespace arm_compute
{
@@ -66,7 +68,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT
"Reduction axis greater than max number of dimensions");
ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
const unsigned int num_of_stages =
- calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
+ utils::calculate_number_of_stages_only_x_axis(input->dimension(0), axis);
DataType output_data_type = DataType::S32;
TensorInfo not_reshaped_output;
@@ -132,7 +134,7 @@ Status CLArgMinMaxLayerEx::validate(const ITensorInfo *input, int axis, const IT
ARM_COMPUTE_RETURN_ON_ERROR(CLArgMinMaxLayerKernelEx::validate(
input, &sums_vector[last_stage - 1], &not_reshaped_output, axis, op));
}
- ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayerKernel::validate(&not_reshaped_output, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&not_reshaped_output, output));
return Status{};
}
@@ -140,7 +142,7 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *
const ReductionOperation &op)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- _num_of_stages = calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
+ _num_of_stages = utils::calculate_number_of_stages_only_x_axis(input->info()->dimension(0), axis);
_reduction_axis = axis;
const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(
@@ -204,7 +206,8 @@ void CLArgMinMaxLayerEx::configure(const ICLTensor *input, int axis, ICLTensor *
&_not_reshaped_output, axis, op);
_results_vector[last_stage - 1].allocator()->allocate();
}
- _reshape_kernel.configure(&_not_reshaped_output, output);
+ _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(), &_not_reshaped_output,
+ output);
_not_reshaped_output.allocator()->allocate();
}
@@ -216,6 +219,6 @@ void CLArgMinMaxLayerEx::run()
{
CLScheduler::get().enqueue(_reduction_kernels_vector[i], false);
}
- CLScheduler::get().enqueue(_reshape_kernel, false);
+ _reshape_kernel.run();
}
} // namespace arm_compute
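
Annotation: this hunk follows the same 21.02 API — calculate_number_of_stages_only_x_axis() moved into the utils namespace, and CLReshapeLayerKernel left the public API, so the trailing reshape is now the CLReshapeLayer function: configured with an explicit compile context and run directly rather than enqueued through the scheduler. Condensed before/after:

// Before: reshape as a kernel, enqueued by the scheduler
//   _reshape_kernel.configure(&_not_reshaped_output, output);
//   CLScheduler::get().enqueue(_reshape_kernel, false);
// After: reshape as a function, run directly
//   _reshape_kernel.configure(CLKernelLibrary::get().get_compile_context(),
//                             &_not_reshaped_output, output);
//   _reshape_kernel.run();
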
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
index e5122ab8f..31c96b080 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -42,13 +42,14 @@
#include "arm_compute/core/CL/kernels/CLBinaryLogicalOpKernel.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
using namespace arm_compute;
void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
BinaryLogicalOperation op)
{
- auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ auto k = std::make_unique<CLBinaryLogicalOpKernel>();
k->configure(input1, input2, output, op);
_kernel = std::move(k);
@@ -57,7 +58,7 @@ void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTenso
ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
if (broadcasted_info->info()->dimension(0) == 1)
{
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+ _border_handler->configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
}
}
}
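
Annotation: from here on, the runtime functions drop ACL's C++11 back-port support::cpp14::make_unique (and its support/MemorySupport.h header) for plain std::make_unique, reflecting the move to a C++14 toolchain; _border_handler also became a pointer member in the base class, hence the -> access. The replacement in isolation, with a hypothetical Kernel type:

// Sketch (not from the patch): the make_unique migration in isolation.
#include <memory>
#include <utility>

struct Kernel
{
  void configure(int x) { (void)x; }
};

int main()
{
  // Before: auto k = arm_compute::support::cpp14::make_unique<Kernel>();
  auto k = std::make_unique<Kernel>(); // standard since C++14
  k->configure(0);
  std::unique_ptr<Kernel> kernel = std::move(k); // as with _kernel = std::move(k);
  return 0;
}
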
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
index c7d0ac8e2..96f9c17a9 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLCastBool.cpp
@@ -46,7 +46,7 @@ using namespace arm_compute;
void CLCastBool::configure(ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLCastBoolKernel>();
+ auto k = std::make_unique<CLCastBoolKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
index 6359b4bcb..464f60dee 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
@@ -45,6 +45,8 @@
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+
#include <memory>
#include <tuple>
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
index ae9d8afc6..003ec8042 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -39,7 +39,6 @@
*/
#include "arm_compute/runtime/CL/functions/CLEmbeddingLookup.h"
-
#include "arm_compute/core/CL/kernels/CLEmbeddingLookupKernel.h"
using namespace arm_compute;
@@ -47,7 +46,7 @@ using namespace arm_compute;
void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
const ICLTensor *lookups)
{
- auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ auto k = std::make_unique<CLEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
index 79d0929a9..af936e873 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -45,7 +45,6 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
#include <algorithm>
@@ -68,7 +67,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = std::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
index 13d3acbac..c6a88d340 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -42,11 +42,11 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/Cast.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
+
+#include "support/Cast.h"
#include <algorithm>
@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = std::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
index ac6982e6f..cda784541 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -19,6 +19,7 @@
#include <arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedLayerEx.h>
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp
new file mode 100644
index 000000000..cd7409417
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMMatrixAccumulateBiasesKernel.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+ * Copyright (c) 2017-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "arm_compute/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLKernelLibraryEx.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "support/StringSupport.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/AccessWindowStatic.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+using namespace arm_compute;
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *accum, const ITensorInfo *biases)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(accum);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum);
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() != 1);
+
+ return Status{};
+}
+
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *accum, ITensorInfo *biases, GPUTarget gpu_target,
+ unsigned int &num_elems_processed_per_iteration)
+{
+ // Select the vector size to use (8 for Bifrost; 16 for Midgard).
+ bool is_gpu_bifrost =
+ gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, GPUTarget::G51,
+ GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G52, GPUTarget::G52LIT);
+ num_elems_processed_per_iteration = is_gpu_bifrost ? 8 : 16;
+
+ // Configure kernel window
+ Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration));
+
+ AccessWindowStatic biases_access(
+ biases, 0, 0, ceil_to_multiple(biases->dimension(0), num_elems_processed_per_iteration),
+ biases->dimension(1));
+ AccessWindowHorizontal accum_access(accum, 0, num_elems_processed_per_iteration);
+
+ bool window_changed = update_window_and_padding(win, biases_access, accum_access);
+
+ Status err = (window_changed)
+ ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
+ : Status{};
+ return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMMatrixAccumulateBiasesKernel::CLGEMMMatrixAccumulateBiasesKernel()
+ : _accum(nullptr), _biases(nullptr)
+{
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(ICLTensor *accum, const ICLTensor *biases)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), accum, biases);
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::configure(const CLCompileContext &compile_context,
+ ICLTensor *accum, const ICLTensor *biases)
+{
+ ARM_COMPUTE_UNUSED(compile_context);
+ // Perform validation step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(accum, biases);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(accum->info(), biases->info()));
+
+ _biases = biases;
+ _accum = accum;
+
+ // Get the target GPU
+ GPUTarget gpu_target = get_target();
+ unsigned int vector_size = 0;
+
+ // Configure kernel window
+ auto win_config =
+ validate_and_configure_window(accum->info(), biases->info(), gpu_target, vector_size);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // Add build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(accum->info()->data_type()));
+ build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+
+ // Create kernel
+ _kernel = static_cast<cl::Kernel>(
+ CLKernelLibraryEx::get().create_kernel("gemm_accumulate_biases", build_opts.options()));
+}
+
+Status CLGEMMMatrixAccumulateBiasesKernel::validate(const ITensorInfo *accum,
+ const ITensorInfo *biases, GPUTarget gpu_target)
+{
+ unsigned int num_elems_processed_per_iteration = 0;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(accum, biases));
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(accum->clone().get(),
+ biases->clone().get(), gpu_target,
+ num_elems_processed_per_iteration)
+ .first);
+
+ return Status{};
+}
+
+void CLGEMMMatrixAccumulateBiasesKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+ Window accum_slice = window.first_slice_window_2D();
+
+ Window biases_slice(accum_slice);
+ biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+ // Run kernel
+ do
+ {
+ // Set arguments
+ unsigned int idx = 0;
+ add_2D_tensor_argument(idx, _accum, accum_slice);
+ add_1D_tensor_argument(idx, _biases, biases_slice);
+
+ enqueue(queue, *this, accum_slice, lws_hint());
+ } while (window.slide_window_slice_2D(accum_slice));
+}
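
Annotation: validate_and_configure_window() above processes 8 elements per iteration on the listed Bifrost GPUs and 16 otherwise, then widens the bias access window to a multiple of that count. A standalone sketch of the rounding, with ceil_to_multiple reimplemented here for illustration:

// Sketch (not from the patch): sizing the bias access window.
#include <cstdio>

unsigned ceil_to_multiple(unsigned value, unsigned step)
{
  return ((value + step - 1) / step) * step;
}

int main()
{
  const bool is_bifrost = true;               // e.g. G71, per the list above
  const unsigned vec = is_bifrost ? 8u : 16u; // elements per iteration, as chosen above
  std::printf("%u\n", ceil_to_multiple(30u, vec)); // 32: bias row padded to a vec multiple
  return 0;
}
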
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
index e0b833b04..f380e3e2c 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -41,6 +41,8 @@
#include "arm_compute/runtime/CL/functions/CLGatherEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
+
#include "arm_compute/core/CL/kernels/CLGatherExKernel.h"
using namespace arm_compute;
@@ -48,7 +50,7 @@ using namespace arm_compute;
void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
int axis)
{
- auto k = support::cpp14::make_unique<CLGatherExKernel>();
+ auto k = std::make_unique<CLGatherExKernel>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
index 65b89a389..9896abd4b 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
{
- auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
+ auto k = std::make_unique<CLHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
index 5a7e40839..ca45a57f8 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
ICLTensor *gamma, ICLTensor *beta, float epsilon)
{
- auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ auto k = std::make_unique<CLInstanceNormalizationLayerKernelEx>();
k->configure(input, output, gamma, beta, epsilon);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
index 28e5bc0da..2bdc451b3 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLNeg.cpp
@@ -46,7 +46,7 @@ using namespace arm_compute;
void CLNeg::configure(ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLNegKernel>();
+ auto k = std::make_unique<CLNegKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
index aa9f32ec6..759a19ff3 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLOneHot.cpp
@@ -41,7 +41,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLOneHotKernel.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/MemorySupport.h"
+
namespace arm_compute
{
CLOneHot::CLOneHot() : _memset_kernel(), _onehot_kernel(), _has_to_memset(false) {}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
new file mode 100644
index 000000000..4d940e966
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLPadLayerEx.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLPadLayerEx.h"
+#include "arm_compute/core/CL/kernels/CLPadLayerKernelEx.h"
+
+namespace arm_compute
+{
+CLPadLayerEx::CLPadLayerEx()
+ : _pad_kernel(std::make_unique<CLPadLayerKernelEx>()),
+ _copy_kernel(std::make_unique<opencl::kernels::ClCopyKernel>()), _perform_pad(false)
+{
+}
+
+void CLPadLayerEx::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value,
+ mode);
+}
+
+void CLPadLayerEx::configure(const CLCompileContext &compile_context, ICLTensor *input,
+ ICLTensor *output, const PaddingList &padding,
+ PixelValue constant_value, PaddingMode mode)
+{
+ ARM_COMPUTE_ERROR_THROW_ON(
+ validate(input->info(), output->info(), padding, constant_value, mode));
+
+ _perform_pad = std::any_of(padding.begin(), padding.end(),
+ [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
+
+ if (_perform_pad)
+ {
+ _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
+ }
+ else
+ {
+ Window copy_window = Window();
+ copy_window.use_tensor_dimensions(output->info()->tensor_shape());
+ // Copy the input to the whole output if no padding is applied
+ _copy_kernel->configure(compile_context, input->info(), output->info(), &copy_window);
+ }
+}
+Status CLPadLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
+ const PaddingList &padding, PixelValue constant_value,
+ PaddingMode mode)
+{
+ bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) {
+ return info.first > 0 || info.second > 0;
+ });
+
+ if (perform_pad)
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLPadLayerKernelEx::validate(input, output, padding, constant_value, mode));
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClCopyKernel::validate(input, output));
+ }
+ return Status{};
+}
+void CLPadLayerEx::run()
+{
+ if (_perform_pad)
+ {
+ CLScheduler::get().enqueue(*_pad_kernel);
+ }
+ else
+ {
+ CLScheduler::get().enqueue(*_copy_kernel);
+ }
+}
+} // namespace arm_compute
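
For orientation, a hypothetical call sequence for the new function follows; CL runtime initialization, tensor shapes, and allocation are elided, and the explicit CONSTANT mode mirrors ACL's CLPadLayer convention rather than anything shown in the patch:

// Hypothetical usage sketch (not from the patch).
arm_compute::CLTensor input, output; // assume shapes and allocators are set up
arm_compute::PaddingList padding = {{1, 1}, {0, 2}}; // (before, after) per dimension
arm_compute::CLPadLayerEx pad;
pad.configure(&input, &output, padding, arm_compute::PixelValue(0.f),
              arm_compute::PaddingMode::CONSTANT);
pad.run(); // enqueues the pad kernel, or a plain copy when all pads are zero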
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
index c246041bb..6740835a8 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -61,7 +61,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
ARM_COMPUTE_RETURN_ERROR_ON(num_of_kernels < 1);
// Create temporary tensor infos
- auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+ auto interm_tensors = std::make_unique<TensorInfo[]>(num_of_interm_tensors);
// Create intermediate tensor info
TensorShape shape{input->tensor_shape()};
@@ -124,8 +124,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
throw std::runtime_error("CLReduceOperation: there is no axis to reduce");
}
- _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
- _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+ _interm_tensors = std::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels = std::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
// Set a vector that is ordered ICLTensors sequentially.
std::vector<ICLTensor *> tensors;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
index 12c0aa829..73f5f6eb1 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLSplitVEx.cpp
@@ -47,6 +47,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include <cassert>
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
index 0754fd813..f3f093c18 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -79,7 +79,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC
{
case DeconvolutionMethod::DIRECT:
{
- auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
+ auto f = std::make_unique<CLDirectTransposeConvLayer>();
f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
invalid_bottom, weights_info);
_function = std::move(f);
@@ -87,7 +87,7 @@ void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, IC
}
case DeconvolutionMethod::GEMM:
{
- auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+ auto f = std::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
f->configure(compile_context, input, weights, bias, output, deconv_info);
_function = std::move(f);
break;
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
index 2fc94b267..e6b7329d1 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -38,11 +38,10 @@
* SOFTWARE.
*/
-#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
+#include "arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h"
#include "arm_compute/core/ITensor.h"
-#include "support/MemorySupport.h"
#include <utility>
@@ -53,7 +52,7 @@ template <BinaryLogicalOperation COP>
void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
ITensor *output)
{
- auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ auto k = std::make_unique<NEBinaryLogicalOperationKernel>();
k->configure(COP, input1, input2, output);
_kernel = std::move(k);
}
@@ -69,7 +68,7 @@ Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
BinaryLogicalOperation op)
{
- auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ auto k = std::make_unique<NEBinaryLogicalOperationKernel>();
k->configure(op, input1, input2, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
index 6ad3e1b12..f6eec2603 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NECastBool.cpp
@@ -40,13 +40,12 @@
#include "arm_compute/runtime/NEON/functions/NECastBool.h"
#include "arm_compute/core/NEON/kernels/NECastBoolKernel.h"
-#include "support/MemorySupport.h"
using namespace arm_compute;
void NECastBool::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<NECastBoolKernel>();
+ auto k = std::make_unique<NECastBoolKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
index e0ab3e025..99fc5c579 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -41,13 +41,12 @@
#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
-#include "support/MemorySupport.h"
using namespace arm_compute;
void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
{
- auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ auto k = std::make_unique<NEEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
index e212a03c7..fbd88fff0 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
{
- auto k = support::cpp14::make_unique<NETransposeKernel>();
+ auto k = std::make_unique<NETransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
index a639f2979..758f7dc59 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedLayerEx.cpp
@@ -50,7 +50,8 @@
#include <algorithm>
#include <cmath>
-using namespace arm_compute;
+namespace arm_compute
+{
using namespace arm_compute::misc::shape_calculator;
namespace
@@ -164,9 +165,8 @@ void NEFullyConnectedLayerEx::configure(const ITensor *input, const ITensor *wei
const ITensor *biases, ITensor *output,
FullyConnectedLayerInfo fc_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
// Perform validate step
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayerEx::validate(
input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
fc_info));
@@ -348,7 +348,7 @@ Status NEFullyConnectedLayerEx::validate(const ITensorInfo *input, const ITensor
(input->dimension(0) * input->dimension(1) * input->dimension(2))));
// Validate flatten kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &flatten_input));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &flatten_input));
input_to_use = &flatten_input;
}
else
@@ -374,9 +374,13 @@ void NEFullyConnectedLayerEx::run()
if (!_is_prepared)
{
if (!_are_weights_reshaped)
+ {
_reshape_weights_output.allocator()->allocate();
+ }
if (!_are_weights_converted)
+ {
_converted_weights_output.allocator()->allocate();
+ }
_is_prepared = true;
}
@@ -407,7 +411,7 @@ void NEFullyConnectedLayerEx::run()
// Linearize input if it comes from a convolutional layer
if (_is_fc_after_conv)
{
- NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
+ _flatten_kernel.run();
}
// Run matrix multiply
@@ -490,3 +494,4 @@ void NEFullyConnectedLayerEx::prepare()
}
#endif
}
+} // namespace arm_compute
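
The changes in this file track ACL's API reshuffle: kernel headers moved under src/, so the flatten step now goes through the NEFlattenLayer function rather than scheduling NEFlattenLayerKernel by hand, and the out-of-line definitions are wrapped in namespace arm_compute instead of relying on a using-directive. The control-flow difference, in brief:

// before (kernel scheduled manually):
//   NEScheduler::get().schedule(&_flatten_kernel, Window::DimY);
// after (the function object owns its own scheduling):
//   _flatten_kernel.run();   // _flatten_kernel is now an NEFlattenLayer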
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
index 234c783f9..2199839fb 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -19,6 +19,8 @@
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedLayerEx.h>
+#include "src/core/helpers/AutoConfiguration.h"
+#include <cassert>
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
index 433c35d58..e5607ab9a 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -41,7 +41,6 @@
#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "support/MemorySupport.h"
#include <utility>
@@ -49,7 +48,7 @@ namespace arm_compute
{
void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
{
- auto k = support::cpp14::make_unique<NEGatherKernelEx>();
+ auto k = std::make_unique<NEGatherKernelEx>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
index 52d58accf..7cc6c89e7 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -41,14 +41,13 @@
#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
-#include "support/MemorySupport.h"
using namespace arm_compute;
void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
ITensor *output, ITensor *hits)
{
- auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
+ auto k = std::make_unique<NEHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
index 275c55024..e0620bad2 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEOneHot.cpp
@@ -39,14 +39,14 @@
*/
#include "arm_compute/runtime/NEON/functions/NEOneHot.h"
#include "arm_compute/core/NEON/kernels/NEOneHotKernel.h"
-#include "support/MemorySupport.h"
+
#include <utility>
namespace arm_compute
{
void NEOneHot::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
const ITensor *off_value, ITensor *output, int axis)
{
- auto k = arm_compute::support::cpp14::make_unique<NEOneHotKernel>();
+ auto k = std::make_unique<NEOneHotKernel>();
k->configure(indices, depth, on_value, off_value, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
index c45c335b3..a30c00ea1 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceOperation.cpp
@@ -40,11 +40,13 @@
#include "arm_compute/runtime/NEON/functions/NEReduceOperation.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/runtime/Tensor.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
index b21717e86..7a1342644 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceSum.cpp
@@ -40,9 +40,13 @@
#include "arm_compute/runtime/NEON/functions/NEReduceSum.h"
-#include "arm_compute/core/CPP/Validate.h"
+#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
index 50311071b..4675121b2 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -44,6 +44,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute::misc::shape_calculator;
diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
index 10f3ecbd3..c5dd63b5b 100644
--- a/compute/cker/include/cker/Types.h
+++ b/compute/cker/include/cker/Types.h
@@ -111,6 +111,8 @@ struct SoftmaxParams
int32_t zero_point;
float scale;
float *table;
+ uint8_t *uint8_table1;
+ uint8_t *uint8_table2;
};
struct PackParams
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h
index f73c01523..9aae0a957 100644
--- a/compute/cker/include/cker/Utils.h
+++ b/compute/cker/include/cker/Utils.h
@@ -20,6 +20,8 @@
#include "Shape.h"
+#include "neon/neon_check.h"
+
#include <algorithm>
#include <cstdint>
#include <fixedpoint/fixedpoint.h>
@@ -29,6 +31,11 @@ namespace nnfw
namespace cker
{
+template <typename T> struct is_quant8
+{
+ static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value;
+};
+
template <typename T>
inline T ActivationFunctionWithMinMax(T x, T output_activation_min, T output_activation_max)
{
@@ -106,6 +113,34 @@ inline int32_t MultiplyByQuantizedMultiplierSmallerThanOneExp(int32_t x,
gemmlowp::SaturatingRoundingDoublingHighMul(x, quantized_multiplier), -left_shift);
}
+#ifdef USE_NEON
+inline int32x4x4_t MultiplyByQuantizedMultiplier4Rows(int32x4x4_t input_val,
+ int32_t quantized_multiplier, int32_t shift)
+{
+ const int left_shift = std::max(shift, 0);
+ const int right_shift = std::min(shift, 0);
+ int32x4x4_t result;
+
+ int32x4_t multiplier_dup = vdupq_n_s32(quantized_multiplier);
+ int32x4_t left_shift_dup = vdupq_n_s32(left_shift);
+ int32x4_t right_shift_dup = vdupq_n_s32(right_shift);
+
+ result.val[0] = vrshlq_s32(
+ vqrdmulhq_s32(vshlq_s32(input_val.val[0], left_shift_dup), multiplier_dup), right_shift_dup);
+
+ result.val[1] = vrshlq_s32(
+ vqrdmulhq_s32(vshlq_s32(input_val.val[1], left_shift_dup), multiplier_dup), right_shift_dup);
+
+ result.val[2] = vrshlq_s32(
+ vqrdmulhq_s32(vshlq_s32(input_val.val[2], left_shift_dup), multiplier_dup), right_shift_dup);
+
+ result.val[3] = vrshlq_s32(
+ vqrdmulhq_s32(vshlq_s32(input_val.val[3], left_shift_dup), multiplier_dup), right_shift_dup);
+
+ return result;
+}
+#endif
+
inline int NodeOffset(int b, int h, int w, int height, int width)
{
return (b * height + h) * width + w;
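
MultiplyByQuantizedMultiplier4Rows applies the usual gemmlowp requantization per NEON lane: an optional left shift, a saturating rounding doubling high multiply (vqrdmulhq_s32), then a rounding right shift (vrshlq_s32 with a negative count). A scalar sketch of what each lane computes, using 64-bit math where the intrinsics saturate at 32 bits:

#include <algorithm>
#include <cstdint>

// shift > 0 means multiply by 2^shift; shift < 0 means a rounding divide by 2^-shift.
inline int32_t MultiplyByQuantizedMultiplierScalar(int32_t x, int32_t quantized_multiplier,
                                                   int shift)
{
  const int left_shift = std::max(shift, 0);
  const int right_shift = std::min(shift, 0);

  // vqrdmulhq_s32: saturating rounding doubling high multiply.
  const int64_t ab = (static_cast<int64_t>(x) << left_shift) * quantized_multiplier;
  const int64_t nudge = ab >= 0 ? (int64_t{1} << 30) : (1 - (int64_t{1} << 30));
  int32_t result = static_cast<int32_t>((ab + nudge) / (int64_t{1} << 31));

  // vrshlq_s32 with a negative count: rounding shift right.
  if (right_shift < 0)
  {
    const int n = -right_shift;
    result = static_cast<int32_t>(
      (static_cast<int64_t>(result) + (int64_t{1} << (n - 1))) >> n);
  }
  return result;
}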
diff --git a/compute/cker/include/cker/operation/AveragePool.h b/compute/cker/include/cker/operation/AveragePool.h
index a70e39cc9..e10f02ad4 100644
--- a/compute/cker/include/cker/operation/AveragePool.h
+++ b/compute/cker/include/cker/operation/AveragePool.h
@@ -395,6 +395,129 @@ void AveragePool<uint8_t>(const PoolParams &params, const Shape &input_shape,
}
}
+template <>
+void AveragePool<int8_t>(const PoolParams &params, const Shape &input_shape,
+ const int8_t *input_data, const Shape &output_shape, int8_t *output_data)
+{
+ // Here, and in other pooling ops, in order to maintain locality of reference,
+ // to minimize some recalculations, and to load into NEON vector registers, we
+ // use an inner loop down the depth. Since depths can be large and hence we
+ // would need arbitrarily large temporary storage, we divide the work up into
+ // depth tranches just within the batch loop.
+ static constexpr int kPoolingAccTrancheSize = 256;
+
+ assert(params.quantized_activation_min <= params.quantized_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int depth = MatchingDim(input_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+ const int stride_height = params.stride_height;
+ const int stride_width = params.stride_width;
+
+ int32_t acc[kPoolingAccTrancheSize];
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ // We proceed through the depth in tranches (see comment above). The
+ // depth_base is the depth at the beginning of the tranche. The
+ // tranche_depth is the depth dimension of the tranche.
+ for (int depth_base = 0; depth_base < depth; depth_base += kPoolingAccTrancheSize)
+ {
+ const int tranche_depth = std::min(depth - depth_base, kPoolingAccTrancheSize);
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - params.padding_values.width;
+ const int in_y_origin = (out_y * stride_height) - params.padding_values.height;
+ const int filter_x_start = std::max(0, -in_x_origin);
+ const int filter_x_end = std::min(params.filter_width, input_width - in_x_origin);
+ const int filter_y_start = std::max(0, -in_y_origin);
+ const int filter_y_end = std::min(params.filter_height, input_height - in_y_origin);
+ const int filter_count =
+ (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
+ memset(acc, 0, tranche_depth * sizeof(acc[0]));
+ const int8_t *input_ptr =
+ input_data + depth_base +
+ depth * (in_x_origin + input_width * (in_y_origin + input_height * batch));
+ for (int fy = filter_y_start; fy < filter_y_end; fy++)
+ {
+ const int8_t *input_row_ptr = input_ptr + depth * (fy * input_width + filter_x_start);
+ for (int fx = filter_x_start; fx < filter_x_end; fx++)
+ {
+ const int8_t *input_channel_ptr = input_row_ptr;
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= tranche_depth - 16; channel += 16)
+ {
+ int16x4_t acc_reg[4];
+ int8x16_t input_reg = vld1q_s8(input_channel_ptr);
+ input_channel_ptr += 16;
+ acc_reg[0] = vget_low_s16(vmovl_s8(vget_low_s8(input_reg)));
+ acc_reg[1] = vget_high_s16(vmovl_s8(vget_low_s8(input_reg)));
+ acc_reg[2] = vget_low_s16(vmovl_s8(vget_high_s8(input_reg)));
+ acc_reg[3] = vget_high_s16(vmovl_s8(vget_high_s8(input_reg)));
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc + channel + 4 * i,
+ vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
+ }
+ }
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ int16x4_t acc_reg[2];
+ int16x8_t input_reg = vmovl_s8(vld1_s8(input_channel_ptr));
+ input_channel_ptr += 8;
+ acc_reg[0] = vget_low_s16(input_reg);
+ acc_reg[1] = vget_high_s16(input_reg);
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc + channel + 4 * i,
+ vaddw_s16(vld1q_s32(acc + channel + 4 * i), acc_reg[i]));
+ }
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ acc[channel] += *input_channel_ptr++;
+ }
+ input_row_ptr += depth;
+ }
+ }
+ int8_t *output_ptr = output_data + Offset(output_shape, batch, out_y, out_x, depth_base);
+ int channel = 0;
+#ifdef USE_NEON
+ for (; channel <= tranche_depth - 8; channel += 8)
+ {
+ int16_t buf[8];
+ for (int i = 0; i < 8; i++)
+ {
+ buf[i] = acc[channel + i] > 0 ? (acc[channel + i] + filter_count / 2) / filter_count
+ : (acc[channel + i] - filter_count / 2) / filter_count;
+ }
+ int8x8_t buf8 = vqmovn_s16(vld1q_s16(buf));
+ buf8 = vmin_s8(buf8, vdup_n_s8(params.quantized_activation_max));
+ buf8 = vmax_s8(buf8, vdup_n_s8(params.quantized_activation_min));
+ vst1_s8(output_ptr + channel, buf8);
+ }
+#endif
+ for (; channel < tranche_depth; ++channel)
+ {
+ int16_t a = acc[channel] > 0 ? (acc[channel] + filter_count / 2) / filter_count
+ : (acc[channel] - filter_count / 2) / filter_count;
+ a = std::max<int16_t>(a, params.quantized_activation_min);
+ a = std::min<int16_t>(a, params.quantized_activation_max);
+ output_ptr[channel] = static_cast<int8_t>(a);
+ }
+ }
+ }
+ }
+ }
+}
+
} // namespace cker
} // namespace nnfw
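
The division at the end of the new AveragePool<int8_t> rounds to nearest, away from zero, by pre-biasing the accumulator with half the divisor. Extracted as a tiny helper for clarity (a sketch; the kernel inlines this per channel):

#include <cstdint>

// acc = 7,  filter_count = 4  ->  (7 + 2) / 4  =  2   (7/4 = 1.75 rounds up)
// acc = -7, filter_count = 4  ->  (-7 - 2) / 4 = -2
inline int32_t RoundedAverage(int32_t acc, int32_t filter_count)
{
  return acc > 0 ? (acc + filter_count / 2) / filter_count
                 : (acc - filter_count / 2) / filter_count;
}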
diff --git a/compute/cker/include/cker/operation/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
index fe5f87746..c7878496a 100644
--- a/compute/cker/include/cker/operation/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/BinaryArithmeticOps.h
@@ -190,34 +190,34 @@ inline bool ProcessBroadcastShapes(const Shape &shape0, const Shape &shape1,
}
template <BinaryArithmeticOpType op_type, typename T>
-inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const T *input1_data, const Shape &input2_shape,
- const T *input2_data, const Shape &output_shape, T *output_data)
+inline typename std::enable_if_t<!is_quant8<T>::value>
+BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
reference::BinaryArithmeticOp(params, input1_shape, input1_data, input2_shape, input2_data,
output_shape, output_data, GetBinaryArtithmeticFn<op_type, T>());
}
-template <BinaryArithmeticOpType op_type>
-inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const uint8_t *input1_data, const Shape &input2_shape,
- const uint8_t *input2_data, const Shape &output_shape,
- uint8_t *output_data)
+template <BinaryArithmeticOpType op_type, typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
switch (op_type)
{
case nnfw::cker::BinaryArithmeticOpType::ADD:
case nnfw::cker::BinaryArithmeticOpType::SUB:
- optimized::AddQuant8(params, input1_shape, input1_data, input2_shape, input2_data,
- output_shape, output_data);
+ optimized::Add(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::MUL:
- optimized::MulQuant8(params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape,
- const_cast<uint8_t *>(input2_data), output_shape, output_data);
+ optimized::Mul(params, input1_shape, input1_data, input2_shape, input2_data, output_shape,
+ output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::DIV:
throw std::runtime_error{"Quant8 Asymm NYI"};
-
default:
assert(false);
break;
@@ -256,33 +256,32 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shap
}
template <BinaryArithmeticOpType op_type, typename T>
-inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const T *input1_data, const Shape &input2_shape,
- const T *input2_data, const Shape &output_shape,
- T *output_data)
+inline typename std::enable_if_t<!is_quant8<T>::value>
+BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
input2_data, output_shape, output_data,
GetBinaryArtithmeticFn<op_type, T>());
}
-template <BinaryArithmeticOpType op_type>
-inline void BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const uint8_t *input1_data, const Shape &input2_shape,
- const uint8_t *input2_data, const Shape &output_shape,
- uint8_t *output_data)
+template <BinaryArithmeticOpType op_type, typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+BroadcastBinaryArithmeticOp(BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
switch (op_type)
{
case nnfw::cker::BinaryArithmeticOpType::ADD:
case nnfw::cker::BinaryArithmeticOpType::SUB:
- optimized::BroadcastAddDispatchQuant8(params, input1_shape, input1_data, input2_shape,
- input2_data, output_shape, output_data);
+ optimized::BroadcastAddDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::MUL:
- optimized::BroadcastMulDispatchQuant8(
- params, input1_shape, const_cast<uint8_t *>(input1_data), input2_shape,
- const_cast<uint8_t *>(input2_data), output_shape, output_data);
+ optimized::BroadcastMulDispatch(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data);
break;
case nnfw::cker::BinaryArithmeticOpType::DIV:
case nnfw::cker::BinaryArithmeticOpType::POW:
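
With both quantized operand types now routed through one template, the uint8_t/int8_t overloads are selected by SFINAE on the is_quant8 trait added to cker/Utils.h earlier in this patch. A standalone sketch of the selection mechanics:

#include <cstdint>
#include <type_traits>

// Mirrors the is_quant8 trait added to cker/Utils.h above.
template <typename T> struct is_quant8
{
  static constexpr bool value = std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value;
};

// Exactly one overload is viable for a given T, so the call is unambiguous.
template <typename T>
typename std::enable_if_t<!is_quant8<T>::value, const char *> Dispatch(T)
{
  return "generic elementwise path";
}

template <typename T>
typename std::enable_if_t<is_quant8<T>::value, const char *> Dispatch(T)
{
  return "quantized uint8/int8 path";
}

// Dispatch(1.0f)      -> "generic elementwise path"
// Dispatch(int8_t{1}) -> "quantized uint8/int8 path"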
diff --git a/compute/cker/include/cker/operation/BroadcastTo.h b/compute/cker/include/cker/operation/BroadcastTo.h
index 5068eca96..145deda29 100644
--- a/compute/cker/include/cker/operation/BroadcastTo.h
+++ b/compute/cker/include/cker/operation/BroadcastTo.h
@@ -126,7 +126,7 @@ template <typename Device, typename T> struct BroadcastTo
}
}
};
-} // functor
+} // namespace functor
template <typename T>
inline void BroadcastTo(const Shape &input_shape, T *input_data, const Shape &output_shape,
diff --git a/compute/cker/include/cker/operation/Conv.h b/compute/cker/include/cker/operation/Conv.h
index b20bac3ac..16c937a27 100644
--- a/compute/cker/include/cker/operation/Conv.h
+++ b/compute/cker/include/cker/operation/Conv.h
@@ -138,6 +138,17 @@ public:
}
}
+ void operator()(const ConvParams &params, const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, int8_t *output_data)
+ {
+ reference::Conv(params, _per_channel_output_multiplier.data(), _per_channel_output_shift.data(),
+ input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
+ output_shape, output_data);
+ }
+ std::vector<int32_t> &per_channel_output_multiplier() { return _per_channel_output_multiplier; }
+ std::vector<int> &per_channel_output_shift() { return _per_channel_output_shift; }
+
private:
bool usableMultiThreaded(PaddingType padding_type, uint32_t dilation_width_factor,
int32_t dilation_height_factor)
@@ -180,6 +191,9 @@ private:
Shape _im2col_shape;
bool _need_im2col;
bool _prepared;
+ // Per channel output multiplier and shift.
+ std::vector<int32_t> _per_channel_output_multiplier;
+ std::vector<int> _per_channel_output_shift;
};
} // namespace cker
} // namespace nnfw
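
The new operator() routes int8 inputs to the per-channel reference kernel: unlike the uint8 path's single output multiplier, each output channel carries its own (multiplier, shift) pair. How the vectors are conventionally filled, following TFLite's per-channel scheme (the patch itself does not show the setup code):

// For each output channel c:
//   effective_scale[c] = input_scale * filter_scale[c] / output_scale
// quantized into the (int32 multiplier, int shift) pair exposed through
// per_channel_output_multiplier()[c] and per_channel_output_shift()[c].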
diff --git a/compute/cker/include/cker/operation/DepthwiseConv.h b/compute/cker/include/cker/operation/DepthwiseConv.h
index 436ddd8c9..06ee780bb 100644
--- a/compute/cker/include/cker/operation/DepthwiseConv.h
+++ b/compute/cker/include/cker/operation/DepthwiseConv.h
@@ -24,6 +24,7 @@
#include "cker/neon/neon_check.h"
#include "cker/operation/optimized/DepthwiseConvFloat.h"
#include "cker/operation/optimized/DepthwiseConvUint8.h"
+#include "cker/operation/optimized/integer_ops/DepthwiseConvInt8.h"
#include "cker/CpuBackendThreadpool.h"
namespace nnfw
diff --git a/compute/cker/include/cker/operation/Einsum.h b/compute/cker/include/cker/operation/Einsum.h
index 13fccfd15..6721a7508 100644
--- a/compute/cker/include/cker/operation/Einsum.h
+++ b/compute/cker/include/cker/operation/Einsum.h
@@ -177,7 +177,7 @@ inline Shape copyShape(const Shape &shape)
{
return Shape::ExtendedShape(shape.DimensionsCount(), shape);
}
-}
+} // namespace
class Einsum
{
diff --git a/compute/cker/include/cker/operation/Fill.h b/compute/cker/include/cker/operation/Fill.h
index d657acc12..f88c3a5fb 100644
--- a/compute/cker/include/cker/operation/Fill.h
+++ b/compute/cker/include/cker/operation/Fill.h
@@ -24,7 +24,8 @@ namespace nnfw
{
namespace cker
{
-template <typename T> inline void Fill(const T value_data, const Shape &output_shape, T output_data)
+template <typename T>
+inline void Fill(const T *value_data, const Shape &output_shape, T *output_data)
{
int output_size = output_shape.FlatSize();
for (int i = 0; i < output_size; i++)
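
The old signature took the value and output buffer by value (T rather than T *), which could not compile for any caller passing buffers; the fix makes both pointers. A usage sketch, assuming the cker headers are on the include path and that cker::Shape accepts an initializer list as TFLite's RuntimeShape does:

#include <vector>
#include <cker/Shape.h>
#include <cker/operation/Fill.h>

int main()
{
  nnfw::cker::Shape out_shape({2, 3}); // assumes the RuntimeShape-style ctor
  std::vector<float> out(out_shape.FlatSize());
  const float value = 1.5f;
  nnfw::cker::Fill(&value, out_shape, out.data()); // out = six copies of 1.5f
  return 0;
}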
diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h
index cbebff142..f16e5019d 100644
--- a/compute/cker/include/cker/operation/Helper/RandomDistributions.h
+++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h
@@ -772,7 +772,7 @@ PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1)
}
} // namespace random
-} // namespace tensorflow
-}
+} // namespace cker
+} // namespace nnfw
#endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h
index 7dc51fe94..6b7049ddf 100644
--- a/compute/cker/include/cker/operation/Helper/RandomOp.h
+++ b/compute/cker/include/cker/operation/Helper/RandomOp.h
@@ -47,6 +47,6 @@ template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution>
};
} // namespace functor
-} // namespace tensorflow
-}
+} // namespace cker
+} // namespace nnfw
#endif // __NNFW_CKER_HELPER_RANDOM_OP_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
index 6e9ffbdfd..c99f69709 100644
--- a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
+++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
@@ -157,7 +157,7 @@ operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *d
} // namespace functor
-} // end namespace tensorflow
-}
+} // namespace cker
+} // namespace nnfw
#endif // __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h
index 5c82d111f..8e5fc22bb 100644
--- a/compute/cker/include/cker/operation/Quantize.h
+++ b/compute/cker/include/cker/operation/Quantize.h
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,8 +21,10 @@
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
-#include <stdexcept>
+#include <cassert>
#include <iostream>
+#include <stdexcept>
+
namespace nnfw
{
namespace cker
@@ -41,6 +44,251 @@ inline void Quantize(const Shape &input_shape, const InputT *input_data, const S
output_data[i] = clamped;
}
}
+
+inline void Quantize(const int32_t *multiplier, const int32_t *shift, int32_t channel_size,
+ int32_t total_size, int32_t output_zp, int32_t output_min, int32_t output_max,
+ int32_t *scratch, int8_t *output)
+{
+ // Here we're trying to quantize the raw accumulators:
+ // output_channels
+ // data data data data data
+ // rows data data data data data
+ // data data data data data
+ // ....
+ //
+ // In order to minimize the reload of the multipliers & shifts, once we load
+ // the multipliers & shifts, we load & quantize the raw accumulators for every
+ // row.
+#ifdef USE_NEON
+ const int32x4_t output_offset_vec = vdupq_n_s32(output_zp);
+ const int32x4_t output_activation_min_vec = vdupq_n_s32(output_min);
+ const int32x4_t output_activation_max_vec = vdupq_n_s32(output_max);
+ const int32x4_t zeros = vdupq_n_s32(0);
+#endif
+
+ assert(total_size % channel_size == 0);
+ const int32_t rows = total_size / channel_size;
+
+ int c = 0;
+
+#ifdef USE_NEON
+ using gemmlowp::RoundingDivideByPOT;
+ for (; c <= channel_size - 8; c += 8)
+ {
+ int32x4_t out_shift_1 = vld1q_s32(shift + c);
+ int32x4_t out_shift_2 = vld1q_s32(shift + c + 4);
+ int32x4_t left_shift_1 = vmaxq_s32(out_shift_1, zeros);
+ int32x4_t left_shift_2 = vmaxq_s32(out_shift_2, zeros);
+
+ // Right shift will be performed as left shift with negative values.
+ int32x4_t right_shift_1 = vminq_s32(out_shift_1, zeros);
+ int32x4_t right_shift_2 = vminq_s32(out_shift_2, zeros);
+
+ int32x4_t out_mul_1 = vld1q_s32(multiplier + c);
+ int32x4_t out_mul_2 = vld1q_s32(multiplier + c + 4);
+ for (int n = 0; n < rows; ++n)
+ {
+ int loc = n * channel_size + c;
+ int32x4_t acc_1 = vld1q_s32(scratch + loc);
+ int32x4_t acc_2 = vld1q_s32(scratch + loc + 4);
+
+ // Saturating Rounding Doubling High Mul.
+ acc_1 = vshlq_s32(acc_1, left_shift_1);
+ acc_1 = vqrdmulhq_s32(acc_1, out_mul_1);
+ acc_2 = vshlq_s32(acc_2, left_shift_2);
+ acc_2 = vqrdmulhq_s32(acc_2, out_mul_2);
+
+ // Rounding Dividing By POT.
+ acc_1 = vrshlq_s32(acc_1, right_shift_1);
+ acc_2 = vrshlq_s32(acc_2, right_shift_2);
+
+ // Add the output offset.
+ acc_1 = vaddq_s32(acc_1, output_offset_vec);
+ acc_2 = vaddq_s32(acc_2, output_offset_vec);
+
+ // Apply the activation function.
+ acc_1 = vmaxq_s32(acc_1, output_activation_min_vec);
+ acc_1 = vminq_s32(acc_1, output_activation_max_vec);
+ acc_2 = vmaxq_s32(acc_2, output_activation_min_vec);
+ acc_2 = vminq_s32(acc_2, output_activation_max_vec);
+
+ // Saturating cast to int8 and store to destination.
+ const int16x4_t acc_s16_1 = vqmovn_s32(acc_1);
+ const int16x4_t acc_s16_2 = vqmovn_s32(acc_2);
+ const int16x8_t res_s16 = vcombine_s16(acc_s16_1, acc_s16_2);
+ const int8x8_t res_s8 = vqmovn_s16(res_s16);
+ vst1_s8(output + loc, res_s8);
+ }
+ }
+
+#endif // USE_NEON
+ // Handle leftover values, one by one. This is very slow.
+ for (; c < channel_size; c++)
+ {
+ for (int n = 0; n < rows; ++n)
+ {
+ int loc = n * channel_size + c;
+ int32_t acc = scratch[loc];
+ acc = MultiplyByQuantizedMultiplier(acc, multiplier[c], shift[c]);
+ acc += output_zp;
+ acc = std::max(acc, output_min);
+ acc = std::min(acc, output_max);
+ output[loc] = static_cast<int8_t>(acc);
+ }
+ }
+}
+
+template <typename input_type, typename output_type>
+inline void Requantize(const input_type *input_data, int32_t size,
+ int32_t effective_scale_multiplier, int32_t effective_scale_shift,
+ int32_t input_zeropoint, int32_t output_zeropoint, output_type *output_data)
+{
+ assert(!"Requantize: not supported type. It shouldn't reach here.");
+ UNUSED_ALL(input_data, size, effective_scale_multiplier, effective_scale_shift, input_zeropoint,
+ output_zeropoint, output_data);
+}
+
+template <>
+inline void Requantize<uint8_t, int8_t>(const uint8_t *input_data, int32_t size,
+ int32_t effective_scale_multiplier,
+ int32_t effective_scale_shift, int32_t input_zeropoint,
+ int32_t output_zeropoint, int8_t *output_data)
+{
+ static constexpr int32_t kMinOutput = std::numeric_limits<int8_t>::min();
+ static constexpr int32_t kMaxOutput = std::numeric_limits<int8_t>::max();
+
+ int i = 0;
+#ifdef USE_NEON
+ // Constants.
+ const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
+ const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
+ const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
+ const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
+
+ for (; i <= size - 16; i += 16)
+ {
+ const uint8x16_t input_vec = vld1q_u8(input_data + i);
+ const uint16x8_t first_half = vmovl_u8(vget_low_u8(input_vec));
+ const uint16x8_t second_half = vmovl_u8(vget_high_u8(input_vec));
+ int32x4x4_t input;
+ input.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(first_half)));
+ input.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(first_half)));
+ input.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(second_half)));
+ input.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(second_half)));
+ input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
+ input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
+ input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
+ input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
+
+ int32x4x4_t result =
+ MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
+
+ result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
+ result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
+ result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
+ result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
+ result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
+ result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
+ result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
+ result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
+
+ const int16x4_t narrowed_val_1 = vqmovn_s32(result.val[0]);
+ const int16x4_t narrowed_val_2 = vqmovn_s32(result.val[1]);
+ const int16x4_t narrowed_val_3 = vqmovn_s32(result.val[2]);
+ const int16x4_t narrowed_val_4 = vqmovn_s32(result.val[3]);
+ const int16x8_t output_first_half = vcombine_s16(narrowed_val_1, narrowed_val_2);
+ const int16x8_t output_second_half = vcombine_s16(narrowed_val_3, narrowed_val_4);
+ const int8x8_t narrowed_first_half = vqmovn_s16(output_first_half);
+ const int8x8_t narrowed_second_half = vqmovn_s16(output_second_half);
+ const int8x16_t narrowed_result = vcombine_s8(narrowed_first_half, narrowed_second_half);
+ vst1q_s8(output_data + i, narrowed_result);
+ }
+
+#endif
+ for (; i < size; ++i)
+ {
+ const int32_t input = input_data[i] - input_zeropoint;
+ const int32_t output =
+ MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
+ output_zeropoint;
+ const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
+ output_data[i] = static_cast<int8_t>(clamped_output);
+ }
+}
+
+template <>
+inline void Requantize<int8_t, uint8_t>(const int8_t *input_data, int32_t size,
+ int32_t effective_scale_multiplier,
+ int32_t effective_scale_shift, int32_t input_zeropoint,
+ int32_t output_zeropoint, uint8_t *output_data)
+{
+ static constexpr int32_t kMinOutput = std::numeric_limits<uint8_t>::min();
+ static constexpr int32_t kMaxOutput = std::numeric_limits<uint8_t>::max();
+
+ int i = 0;
+#ifdef USE_NEON
+ // Constants.
+ const int32x4_t input_zero_point_dup = vdupq_n_s32(-input_zeropoint);
+ const int32x4_t output_zero_point_dup = vdupq_n_s32(output_zeropoint);
+ const int32x4_t min_val_dup = vdupq_n_s32(kMinOutput);
+ const int32x4_t max_val_dup = vdupq_n_s32(kMaxOutput);
+
+ for (; i <= size - 16; i += 16)
+ {
+ const int8x16_t input_vec = vld1q_s8(input_data + i);
+ const int16x8_t first_half = vmovl_s8(vget_low_s8(input_vec));
+ const int16x8_t second_half = vmovl_s8(vget_high_s8(input_vec));
+ int32x4x4_t input;
+ input.val[0] = vmovl_s16(vget_low_s16(first_half));
+ input.val[1] = vmovl_s16(vget_high_s16(first_half));
+ input.val[2] = vmovl_s16(vget_low_s16(second_half));
+ input.val[3] = vmovl_s16(vget_high_s16(second_half));
+ input.val[0] = vaddq_s32(input.val[0], input_zero_point_dup);
+ input.val[1] = vaddq_s32(input.val[1], input_zero_point_dup);
+ input.val[2] = vaddq_s32(input.val[2], input_zero_point_dup);
+ input.val[3] = vaddq_s32(input.val[3], input_zero_point_dup);
+
+ int32x4x4_t result =
+ MultiplyByQuantizedMultiplier4Rows(input, effective_scale_multiplier, effective_scale_shift);
+
+ result.val[0] = vaddq_s32(result.val[0], output_zero_point_dup);
+ result.val[1] = vaddq_s32(result.val[1], output_zero_point_dup);
+ result.val[2] = vaddq_s32(result.val[2], output_zero_point_dup);
+ result.val[3] = vaddq_s32(result.val[3], output_zero_point_dup);
+ result.val[0] = vmaxq_s32(vminq_s32(result.val[0], max_val_dup), min_val_dup);
+ result.val[1] = vmaxq_s32(vminq_s32(result.val[1], max_val_dup), min_val_dup);
+ result.val[2] = vmaxq_s32(vminq_s32(result.val[2], max_val_dup), min_val_dup);
+ result.val[3] = vmaxq_s32(vminq_s32(result.val[3], max_val_dup), min_val_dup);
+
+ const uint32x4_t result_val_1_unsigned = vreinterpretq_u32_s32(result.val[0]);
+ const uint32x4_t result_val_2_unsigned = vreinterpretq_u32_s32(result.val[1]);
+ const uint32x4_t result_val_3_unsigned = vreinterpretq_u32_s32(result.val[2]);
+ const uint32x4_t result_val_4_unsigned = vreinterpretq_u32_s32(result.val[3]);
+
+ const uint16x4_t narrowed_val_1 = vqmovn_u32(result_val_1_unsigned);
+ const uint16x4_t narrowed_val_2 = vqmovn_u32(result_val_2_unsigned);
+ const uint16x4_t narrowed_val_3 = vqmovn_u32(result_val_3_unsigned);
+ const uint16x4_t narrowed_val_4 = vqmovn_u32(result_val_4_unsigned);
+ const uint16x8_t output_first_half = vcombine_u16(narrowed_val_1, narrowed_val_2);
+ const uint16x8_t output_second_half = vcombine_u16(narrowed_val_3, narrowed_val_4);
+ const uint8x8_t narrowed_first_half = vqmovn_u16(output_first_half);
+ const uint8x8_t narrowed_second_half = vqmovn_u16(output_second_half);
+ const uint8x16_t narrowed_result = vcombine_u8(narrowed_first_half, narrowed_second_half);
+ vst1q_u8(output_data + i, narrowed_result);
+ }
+
+#endif
+ for (; i < size; ++i)
+ {
+ const int32_t input = input_data[i] - input_zeropoint;
+ const int32_t output =
+ MultiplyByQuantizedMultiplier(input, effective_scale_multiplier, effective_scale_shift) +
+ output_zeropoint;
+ const int32_t clamped_output = std::max(std::min(output, kMaxOutput), kMinOutput);
+ output_data[i] = static_cast<uint8_t>(clamped_output);
+ }
+}
+
} // namespace cker
} // namespace nnfw
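
Requantize consumes an effective scale (input_scale / output_scale) pre-encoded as a fixed-point multiplier plus a power-of-two shift. A sketch of the usual encoding, mirroring TFLite's QuantizeMultiplier (this helper is illustrative and not part of the patch):

#include <cmath>
#include <cstdint>

// Encodes a positive real scale as scale ~= multiplier * 2^(shift - 31),
// with multiplier in [2^30, 2^31).
inline void QuantizeMultiplierSketch(double scale, int32_t *multiplier, int *shift)
{
  if (scale == 0.0)
  {
    *multiplier = 0;
    *shift = 0;
    return;
  }
  const double q = std::frexp(scale, shift); // scale = q * 2^shift, q in [0.5, 1)
  int64_t q_fixed = static_cast<int64_t>(std::round(q * (int64_t{1} << 31)));
  if (q_fixed == (int64_t{1} << 31)) // rounding pushed q to exactly 1.0
  {
    q_fixed /= 2;
    ++*shift;
  }
  *multiplier = static_cast<int32_t>(q_fixed);
}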
diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h
index dbf938147..f54f2e6f1 100644
--- a/compute/cker/include/cker/operation/Reduce.h
+++ b/compute/cker/include/cker/operation/Reduce.h
@@ -46,6 +46,7 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape
input_size *= input_dims[idx];
}
reduce_size = input_dims[input_num_dims - 1];
+ int offset = 0;
for (int idx = 0; idx < input_size; idx++)
{
int r_idx = 0;
@@ -55,14 +56,14 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape
float32x4_t tmp_data_32x4 = vld1q_f32(tmp_data);
for (; r_idx <= reduce_size - 32; r_idx += 32)
{
- float32x4_t a10 = vld1q_f32(input_data + r_idx);
- float32x4_t a11 = vld1q_f32(input_data + r_idx + 4);
- float32x4_t a12 = vld1q_f32(input_data + r_idx + 8);
- float32x4_t a13 = vld1q_f32(input_data + r_idx + 12);
- float32x4_t a20 = vld1q_f32(input_data + r_idx + 16);
- float32x4_t a21 = vld1q_f32(input_data + r_idx + 20);
- float32x4_t a22 = vld1q_f32(input_data + r_idx + 24);
- float32x4_t a23 = vld1q_f32(input_data + r_idx + 28);
+ float32x4_t a10 = vld1q_f32(input_data + offset + r_idx);
+ float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4);
+ float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8);
+ float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12);
+ float32x4_t a20 = vld1q_f32(input_data + offset + r_idx + 16);
+ float32x4_t a21 = vld1q_f32(input_data + offset + r_idx + 20);
+ float32x4_t a22 = vld1q_f32(input_data + offset + r_idx + 24);
+ float32x4_t a23 = vld1q_f32(input_data + offset + r_idx + 28);
float32x4_t x0 = vaddq_f32(a10, a20);
float32x4_t x1 = vaddq_f32(a11, a21);
@@ -74,10 +75,23 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape
float32x4_t y2 = vaddq_f32(y0, y1);
tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y2);
}
+ for (; r_idx <= reduce_size - 16; r_idx += 16)
+ {
+ float32x4_t a10 = vld1q_f32(input_data + offset + r_idx);
+ float32x4_t a11 = vld1q_f32(input_data + offset + r_idx + 4);
+ float32x4_t a12 = vld1q_f32(input_data + offset + r_idx + 8);
+ float32x4_t a13 = vld1q_f32(input_data + offset + r_idx + 12);
+
+ float32x4_t x0 = vaddq_f32(a10, a11);
+ float32x4_t x1 = vaddq_f32(a12, a13);
+
+ float32x4_t y0 = vaddq_f32(x0, x1);
+ tmp_data_32x4 = vaddq_f32(tmp_data_32x4, y0);
+ }
for (; r_idx <= reduce_size - 8; r_idx += 8)
{
- float32x4_t a1 = vld1q_f32(input_data + r_idx);
- float32x4_t a2 = vld1q_f32(input_data + r_idx + 4);
+ float32x4_t a1 = vld1q_f32(input_data + offset + r_idx);
+ float32x4_t a2 = vld1q_f32(input_data + offset + r_idx + 4);
float32x4_t x = vaddq_f32(a1, a2);
tmp_data_32x4 = vaddq_f32(tmp_data_32x4, x);
}
@@ -88,13 +102,14 @@ inline void OptimizedReduceSum(const float *input_data, const Shape &input_shape
{
if (r_idx == 0)
{
- output_data[idx] = input_data[idx * reduce_size];
+ output_data[idx] = input_data[offset];
}
else
{
- output_data[idx] += input_data[idx * reduce_size + r_idx];
+ output_data[idx] += input_data[offset + r_idx];
}
}
+ offset += reduce_size;
}
}
#endif // NEON
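
The fix above threads a running offset through the NEON path: previously every output element loaded from input_data + r_idx, that is, from row 0, instead of its own row. The scalar equivalent of the corrected indexing:

// Scalar reference of the fixed loop: each output element sums its own
// row [offset, offset + reduce_size).
inline void ReduceSumLastDimRef(const float *input_data, int input_size, int reduce_size,
                                float *output_data)
{
  int offset = 0;
  for (int idx = 0; idx < input_size; ++idx)
  {
    float sum = 0.0f;
    for (int r = 0; r < reduce_size; ++r)
    {
      sum += input_data[offset + r];
    }
    output_data[idx] = sum;
    offset += reduce_size;
  }
}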
diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h
index 8d9a7495f..ae5af7bb3 100644
--- a/compute/cker/include/cker/operation/ResizeBilinear.h
+++ b/compute/cker/include/cker/operation/ResizeBilinear.h
@@ -264,6 +264,91 @@ void ResizeBilinear(ResizeBilinearParams &params, const Shape &input_shape,
batches, input_height, input_width, depth, params.output_height, params.output_width,
height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers);
}
+
+inline void ComputeInterpolationValues(const int32_t value, const int32_t scale_10,
+ const bool half_pixel_centers, int32_t input_size,
+ int32_t *scaled_value, int32_t *lower_bound,
+ int32_t *upper_bound)
+{
+ if (half_pixel_centers)
+ {
+ *scaled_value = value * scale_10 + scale_10 / 2 - (1 << 9);
+ }
+ else
+ {
+ *scaled_value = value * scale_10;
+ }
+ *lower_bound = std::max(*scaled_value / (1 << 10), 0);
+ *upper_bound = std::min(*scaled_value / (1 << 10) + 1, input_size - 1);
+}
+
+inline void ResizeBilinear(const ResizeBilinearParams &op_params,
+ const Shape &unextended_input_shape, const int8_t *input_data,
+ const Shape &unextended_output_shape, int8_t *output_data)
+{
+ // If half_pixel_centers is True, align_corners must be False.
+ assert(!op_params.half_pixel_centers || !op_params.align_corners);
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ const int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int32_t input_height = input_shape.Dims(1);
+ const int32_t input_width = input_shape.Dims(2);
+ const int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+ const int32_t output_height = op_params.output_height;
+ const int32_t output_width = op_params.output_width;
+
+ int32_t height_scale_10 = ((1 << 10) * input_height + output_height / 2) / output_height;
+ int32_t width_scale_10 = ((1 << 10) * input_width + output_width / 2) / output_width;
+ if (op_params.align_corners && output_height > 1)
+ {
+ height_scale_10 =
+ ((1 << 10) * (input_height - 1) + (output_height - 1) / 2) / (output_height - 1);
+ }
+ if (op_params.align_corners && output_width > 1)
+ {
+ width_scale_10 = ((1 << 10) * (input_width - 1) + (output_width - 1) / 2) / (output_width - 1);
+ }
+
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int y = 0; y < output_height; ++y)
+ {
+ int32_t input_y, y0, y1;
+ ComputeInterpolationValues(y, height_scale_10, op_params.half_pixel_centers, input_height,
+ &input_y, &y0, &y1);
+ for (int x = 0; x < output_width; ++x)
+ {
+ int32_t input_x, x0, x1;
+ ComputeInterpolationValues(x, width_scale_10, op_params.half_pixel_centers, input_width,
+ &input_x, &x0, &x1);
+ for (int c = 0; c < depth; ++c)
+ {
+ const int64_t output_20_ll =
+ static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x0, c)]) *
+ ((1 << 10) - (input_y - (1 << 10) * y0)) * ((1 << 10) - (input_x - (1 << 10) * x0));
+ const int64_t output_20_lu =
+ static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x0, c)]) *
+ (input_y - (1 << 10) * y0) * ((1 << 10) - (input_x - (1 << 10) * x0));
+ const int64_t output_20_rl =
+ static_cast<int64_t>(input_data[Offset(input_shape, b, y0, x1, c)]) *
+ ((1 << 10) - (input_y - (1 << 10) * y0)) * (input_x - (1 << 10) * x0);
+ const int64_t output_20_ru =
+ static_cast<int64_t>(input_data[Offset(input_shape, b, y1, x1, c)]) *
+ (input_y - (1 << 10) * y0) * (input_x - (1 << 10) * x0);
+ const int64_t output_20 = output_20_ll + output_20_lu + output_20_rl + output_20_ru;
+ const int64_t round = (output_20 > 0) ? (1 << 19) : -(1 << 19);
+ const int8_t interpolation = static_cast<int8_t>((output_20 + round) / (1 << 20));
+ output_data[Offset(output_shape, b, y, x, c)] = interpolation;
+ }
+ }
+ }
+ }
+}
+
} // namespace cker
} // namespace nnfw
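
The int8 ResizeBilinear works in Q10 fixed point: scales are precomputed as scale_10 ~= 1024 * input / output, corner weights live in Q10, each corner product is Q20, and the final +/-(1 << 19) bias before dividing by 1 << 20 rounds to nearest. A small worked example:

// input_width = 4, output_width = 8, align_corners = false:
//   width_scale_10 = ((1 << 10) * 4 + 8 / 2) / 8 = 512        // 0.5 in Q10
// output x = 3, half_pixel_centers = false:
//   input_x = 3 * 512 = 1536  ->  x0 = 1536 >> 10 = 1, x1 = 2
//   fractional weight = input_x - (x0 << 10) = 512            // 0.5 in Q10
// Each corner contributes value * wy * wx with wy, wx in Q10, so the sum is
// Q20; (sum +/- (1 << 19)) / (1 << 20) rounds to the nearest int8.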
diff --git a/compute/cker/include/cker/operation/SoftMax.h b/compute/cker/include/cker/operation/SoftMax.h
index 620c1f968..35ecde4ba 100644
--- a/compute/cker/include/cker/operation/SoftMax.h
+++ b/compute/cker/include/cker/operation/SoftMax.h
@@ -23,6 +23,10 @@
#include "cker/Types.h"
#include "cker/eigen/Utils.h"
+#if __aarch64__ && __clang__
+#define TFLITE_SOFTMAX_USE_UINT16_LUT
+#endif
+
#include <Eigen/Core>
#include <fixedpoint/fixedpoint.h>
#include <cmath>
@@ -69,7 +73,7 @@ inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const
}
}
}
-}
+} // namespace reference
// Performs softmax along the input of size (input_size * batch_size).
inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta,
@@ -127,87 +131,306 @@ inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const
out_mat.array().rowwise() *= scale;
}
-inline void Softmax(const SoftmaxParams &params, const Shape &input_shape,
- const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
-{
- const int32_t input_beta_multiplier = params.input_multiplier;
- const int32_t input_beta_left_shift = params.input_left_shift;
- const int diff_min = params.diff_min;
- // The representation chosen for the input to the exp() function is Q5.26.
- // We need to leave extra space since values that we skip might be as large as
- // -32 before multiplying by input_beta_multiplier, and therefore as large as
- // -16 afterwards. Note that exp(-8) is definitely not insignificant to
- // accumulation, but exp(-16) definitely is.
- static const int kScaledDiffIntegerBits = 5;
- static const int kAccumulationIntegerBits = 12;
- using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
- using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
- using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;
+template <typename T> inline int32_t QuantizeSoftmaxOutput(float prob_rescaled, int32_t zero_point)
+{
+ const int32_t prob_rnd = static_cast<int32_t>(std::round(prob_rescaled));
+ return prob_rnd + zero_point;
+}
+
+#if !__aarch64__
+// With ARM64, rounding is faster than add + truncation.
+template <> inline int32_t QuantizeSoftmaxOutput<uint8_t>(float prob_rescaled, int32_t)
+{
+ return static_cast<int32_t>(prob_rescaled + 0.5f);
+}
+#endif
+
+inline void PopulateSoftmaxLookupTable(float *table, float input_scale, float beta)
+{
+ const float scale = -input_scale * beta;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ for (int32_t val = 0; val <= max_uint8; ++val)
+ {
+ table[max_uint8 - val] = expf(scale * val);
+ }
+}
+template <typename In, typename Out>
+inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const In *input_data,
+ const Shape &output_shape, Out *output_data)
+{
const int trailing_dim = input_shape.DimensionsCount() - 1;
- const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
- const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+ const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
- for (int i = 0; i < outer_size; ++i)
+ const int32_t clamp_max = std::numeric_limits<Out>::max();
+ const int32_t clamp_min = std::numeric_limits<Out>::min();
+ for (int i = 0; i < excluding_last_dim; ++i)
{
- uint8_t max_in_row = 0;
- for (int c = 0; c < depth; ++c)
+ int32_t max_val = std::numeric_limits<In>::min();
+ // Find max quantized value.
+ for (int j = 0; j < last_dim; ++j)
{
- max_in_row = std::max(max_in_row, input_data[i * depth + c]);
+ max_val = std::max(max_val, static_cast<int32_t>(input_data[j]));
}
- FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
- for (int c = 0; c < depth; ++c)
+ float sum_exp = 0.0f;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ const float *table_offset = &params.table[max_uint8 - max_val];
+ // Calculate normalizer sum(exp(x)).
+ for (int j = 0; j < last_dim; ++j)
{
- int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
- if (input_diff >= diff_min)
- {
- const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
- input_diff, input_beta_multiplier, input_beta_left_shift);
- const FixedPointScaledDiff scaled_diff_f8 =
- FixedPointScaledDiff::FromRaw(input_diff_rescaled);
- sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
- exp_on_negative_values(scaled_diff_f8));
- }
+ sum_exp += table_offset[input_data[j]];
}
- int32_t fixed_sum_of_exps = sum_of_exps.raw();
- int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps));
- // This is the number of bits to the left of the binary point above 1.0.
- // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8 and
- // no later adjustment will be needed.
- int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
- int32_t shifted_sum_minus_one =
- static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) -
- (static_cast<uint32_t>(1) << 31));
+ const float inv_sum_exp = 1.0f / (sum_exp * params.scale);
+ // Normalize and quantize probabilities.
+ for (int j = 0; j < last_dim; ++j)
+ {
+ const float prob_rescaled = table_offset[input_data[j]] * inv_sum_exp;
+ const int32_t prob_quantized = QuantizeSoftmaxOutput<Out>(prob_rescaled, params.zero_point);
+ output_data[j] = static_cast<Out>(std::max(std::min(clamp_max, prob_quantized), clamp_min));
+ }
+ input_data += last_dim;
+ output_data += last_dim;
+ }
+}
+
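A minimal usage sketch for the table-driven path above (hypothetical scale, zero point, and Shape construction; it assumes SoftmaxParams exposes the table, scale, and zero_point fields referenced by the kernel):

float table[256];
PopulateSoftmaxLookupTable(table, /*input_scale=*/0.05f, /*beta=*/1.0f);

SoftmaxParams params;
params.table = table;           // filled once, reused for every invocation
params.scale = 1.0f / 256.0f;   // hypothetical output scale
params.zero_point = 0;          // hypothetical output zero point

const Shape shape{2, 8};        // batch of 2, last dim of 8 (assumed ctor)
uint8_t input[16] = {};         // quantized logits
uint8_t output[16];
Softmax(params, shape, input, shape, output);
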
+#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
+// Looks up each element of <indices> in <table>, returns them in a vector.
+inline uint8x16_t aarch64_lookup_vector(const uint8x16x4_t table[4], uint8x16_t indices)
+{
+ // Look up in 1st quarter of the table: top 2 bits of indices == 00
+ uint8x16_t output1 = vqtbl4q_u8(table[0], indices);
+ // Look up in 2nd quarter of the table: top 2 bits of indices == 01
+ uint8x16_t output2 = vqtbl4q_u8(table[1], veorq_u8(indices, vdupq_n_u8(0x40)));
+ // Look up in 3rd quarter of the table: top 2 bits of indices == 10
+ uint8x16_t output3 = vqtbl4q_u8(table[2], veorq_u8(indices, vdupq_n_u8(0x80)));
+ // Look up in 4th quarter of the table: top 2 bits of indices == 11
+ uint8x16_t output4 = vqtbl4q_u8(table[3], veorq_u8(indices, vdupq_n_u8(0xc0)));
+
+ // Combine result of the 4 lookups.
+ return vorrq_u8(vorrq_u8(output1, output2), vorrq_u8(output3, output4));
+}
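
A scalar sketch of what the four-quarter lookup computes (illustrative only, using a flat 256-entry table instead of the preloaded uint8x16x4_t registers): vqtbl4q_u8 returns 0 for indices >= 64, so XOR-ing the top two bits selects exactly one quarter per index, and OR-ing the partial results reconstructs the full lookup.

inline uint8_t scalar_lookup(const uint8_t table[256], uint8_t index)
{
  uint8_t result = 0;
  for (int quarter = 0; quarter < 4; ++quarter)
  {
    // XOR maps in-quarter indices into [0, 64); the other quarters see an
    // out-of-range index and contribute 0, mirroring vqtbl4q_u8.
    const uint8_t shuffled = index ^ static_cast<uint8_t>(quarter * 0x40);
    if (shuffled < 64)
      result |= table[quarter * 64 + shuffled];
  }
  return result;
}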
- FixedPoint0 shifted_scale =
- one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one));
+inline void PopulateSoftmaxUInt8LookupTable(uint8_t *uint8_table1, uint8_t *uint8_table2,
+ float input_scale, float beta)
+{
+ const float scale = input_scale * beta;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ const int32_t max_uint16 = std::numeric_limits<uint16_t>::max();
- for (int c = 0; c < depth; ++c)
+ for (int32_t val = 0; val <= max_uint8; ++val)
+ {
+ float input_to_exp = scale * (val - max_uint8);
+ int32_t temp = static_cast<int>(expf(input_to_exp) * max_uint16 + 0.5);
+ temp = std::min(max_uint16, temp);
+ uint8_t part1 = temp >> 8;
+ uint8_t part2 = temp & 0xff;
+ uint8_table1[val] = static_cast<uint8_t>(part1);
+ uint8_table2[val] = static_cast<uint8_t>(part2);
+ }
+}
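
An illustrative round-trip check (hypothetical scale values, assumes <cassert> in context): each exp result is quantized to 16 bits and split byte-wise across the two tables, and the consumer below reassembles it as (part1 << 8) + part2 with no loss.

uint8_t t1[256], t2[256];
PopulateSoftmaxUInt8LookupTable(t1, t2, /*input_scale=*/0.05f, /*beta=*/1.0f);
for (int i = 0; i < 256; ++i)
{
  const int32_t exp_value = (t1[i] << 8) + t2[i]; // 16-bit value restored
  assert(exp_value >= 0 && exp_value <= 65535);
}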
+
+inline int FindMaxValue(int size, const uint8_t *input_data, uint8_t offset)
+{
+ int32_t max_val = std::numeric_limits<uint8_t>::min();
+ int j = 0;
+
+ uint8x16_t max_val_dup = vdupq_n_u8(max_val);
+ uint8x16_t offset_dup = vdupq_n_u8(offset);
+ for (; j <= size - 16; j += 16)
+ {
+ uint8x16_t input_value = vld1q_u8(input_data + j);
+ input_value = veorq_u8(input_value, offset_dup);
+ max_val_dup = vmaxq_u8(input_value, max_val_dup);
+ }
+ max_val = std::max(max_val, static_cast<int32_t>(vmaxvq_u8(max_val_dup)));
+
+ for (; j < size; ++j)
+ {
+ max_val = std::max(max_val, static_cast<int32_t>(input_data[j] ^ offset));
+ }
+ return max_val;
+}
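
Why the XOR offset is enough (illustrative): flipping the sign bit maps int8 [-128, 127] monotonically onto uint8 [0, 255], so the unsigned NEON max over value ^ 0x80 finds the signed maximum.

static_assert(static_cast<uint8_t>(static_cast<int8_t>(-128) ^ 0x80) == 0, "");
static_assert(static_cast<uint8_t>(static_cast<int8_t>(0) ^ 0x80) == 128, "");
static_assert(static_cast<uint8_t>(static_cast<int8_t>(127) ^ 0x80) == 255, "");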
+
+#ifdef USE_NEON
+// Value_to_store layout:
+// [high_high, high_low, low_high, low_low].
+inline void StoreValue(int32x4x4_t value_to_store, int8_t *output)
+{
+ const int16x8_t result_1 =
+ vcombine_s16(vqmovn_s32(value_to_store.val[1]), vqmovn_s32(value_to_store.val[0]));
+ const int16x8_t result_2 =
+ vcombine_s16(vqmovn_s32(value_to_store.val[3]), vqmovn_s32(value_to_store.val[2]));
+ const int8x16_t result = vcombine_s8(vqmovn_s16(result_2), vqmovn_s16(result_1));
+ vst1q_s8(output, result);
+}
+
+// Value_to_store layout:
+// [high_high, high_low, low_high, low_low].
+inline void StoreValue(int32x4x4_t value_to_store, uint8_t *output)
+{
+ const uint16x8_t result_1 =
+ vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[1])),
+ vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[0])));
+ const uint16x8_t result_2 =
+ vcombine_u16(vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[3])),
+ vqmovn_u32(vreinterpretq_u32_s32(value_to_store.val[2])));
+ const uint8x16_t result = vcombine_u8(vqmovn_u16(result_2), vqmovn_u16(result_1));
+ vst1q_u8(output, result);
+}
+
+#endif
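
A scalar equivalent of the store path (illustrative): vqmovn_s32 saturates each lane to int16 and vqmovn_s16 then saturates to int8, which composes to a direct clamp into the int8 range.

#include <algorithm>
#include <cstdint>

inline int8_t saturating_narrow(int32_t v)
{
  // Matches vqmovn_s32 followed by vqmovn_s16 for a single lane.
  return static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, v)));
}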
+
+template <typename In, typename Out>
+inline void SoftmaxInt8LUT(const SoftmaxParams &params, const Shape &input_shape,
+ const In *input_data, const Shape &output_shape, Out *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int excluding_last_dim = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int last_dim = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+
+ const int32_t clamp_max = std::numeric_limits<Out>::max();
+ const int32_t clamp_min = std::numeric_limits<Out>::min();
+
+ // The offset is used to reinterpret the input data as uint8.
+ // If the input is uint8, the data is unchanged (offset 0).
+ // If the input is int8, it is reinterpreted as uint8, so an offset of
+ // 0x80 is applied to restore the original ordering.
+ // e.g., int8 127 has the offset applied to become 255 in uint8.
+ uint8_t offset = 0;
+ if (std::is_same<In, int8_t>::value)
+ {
+ offset = 0x80;
+ }
+
+ const uint8_t *input_data_uint = reinterpret_cast<const uint8_t *>(input_data);
+
+ // This code uses ARM64-only instructions.
+ // TODO(b/143709993): Port to ARMv7
+
+ // Load the tables into registers. (4*4 128-bit registers)
+ uint8x16x4_t table1[4];
+ table1[0] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 0);
+ table1[1] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 1);
+ table1[2] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 2);
+ table1[3] = vld1q_u8_x4(params.uint8_table1 + 16 * 4 * 3);
+
+ uint8x16x4_t table2[4];
+ table2[0] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 0);
+ table2[1] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 1);
+ table2[2] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 2);
+ table2[3] = vld1q_u8_x4(params.uint8_table2 + 16 * 4 * 3);
+
+ for (int i = 0; i < excluding_last_dim; ++i)
+ {
+ // Find max quantized value.
+ int32_t max_val = FindMaxValue(last_dim, input_data_uint, offset);
+
+ int32_t sum_exp = 0;
+ const int32_t max_uint8 = std::numeric_limits<uint8_t>::max();
+ const uint8_t table_offset = max_uint8 - max_val;
+
+ // Calculate normalizer sum(exp(x)).
+ int sum_j = 0;
+ uint8x16_t table_offset_dup = vdupq_n_u8(table_offset);
+ uint8x16_t offset_dup = vdupq_n_u8(offset);
+ uint32x4_t sum_4 = vdupq_n_u32(0);
+ const int multiplier_shift = 8;
+ for (; sum_j <= last_dim - 16; sum_j += 16)
{
- int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
- if (input_diff >= diff_min)
- {
- const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
- input_diff, input_beta_multiplier, input_beta_left_shift);
- const FixedPointScaledDiff scaled_diff_f8 =
- FixedPointScaledDiff::FromRaw(input_diff_rescaled);
-
- FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
- int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(),
- num_bits_over_unit + 31 - 8);
-
- output_data[i * depth + c] = static_cast<uint8_t>(
- std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0)));
- }
- else
- {
- output_data[i * depth + c] = 0;
- }
+ uint8x16_t input_value = vld1q_u8(input_data_uint + sum_j);
+ input_value = veorq_u8(input_value, offset_dup);
+ input_value = vaddq_u8(input_value, table_offset_dup);
+
+ const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value);
+ const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value);
+
+ uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift);
+ uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift);
+
+ exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2));
+ exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2));
+
+ sum_4 = vpadalq_u16(sum_4, exp_value1);
+ sum_4 = vpadalq_u16(sum_4, exp_value2);
+ }
+ int temp = vgetq_lane_u32(sum_4, 0) + vgetq_lane_u32(sum_4, 1) + vgetq_lane_u32(sum_4, 2) +
+ vgetq_lane_u32(sum_4, 3);
+ sum_exp += temp;
+
+ for (; sum_j < last_dim; ++sum_j)
+ {
+ const uint8_t index = (input_data_uint[sum_j] ^ offset) + table_offset;
+
+ uint8_t part1 = params.uint8_table1[index];
+ uint8_t part2 = params.uint8_table2[index];
+ sum_exp += ((part1 << 8) + part2);
+ }
+
+ const float inv_sum_exp = 1.0f / (sum_exp * params.scale);
+
+ int32_t multiplier, shift;
+ QuantizeMultiplier(inv_sum_exp, &multiplier, &shift);
+
+ // Normalize and quantize probabilities.
+ int j = 0;
+ const int32x4_t output_zp_dup = vdupq_n_s32(params.zero_point);
+ const int32x4_t max_val_dup = vdupq_n_s32(clamp_max);
+ const int32x4_t min_val_dup = vdupq_n_s32(clamp_min);
+
+ for (; j <= last_dim - 16; j += 16)
+ {
+ uint8x16_t input_value = vld1q_u8(input_data_uint + j);
+ input_value = veorq_u8(input_value, offset_dup);
+ input_value = vaddq_u8(input_value, table_offset_dup);
+
+ const uint8x16_t output1 = aarch64_lookup_vector(table1, input_value);
+ const uint8x16_t output2 = aarch64_lookup_vector(table2, input_value);
+
+ uint16x8_t exp_value1 = vshll_n_u8(vget_high_u8(output1), multiplier_shift);
+ uint16x8_t exp_value2 = vshll_n_u8(vget_low_u8(output1), multiplier_shift);
+
+ exp_value1 = vaddw_u8(exp_value1, vget_high_u8(output2));
+ exp_value2 = vaddw_u8(exp_value2, vget_low_u8(output2));
+
+ int32x4x4_t output_value;
+ output_value.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value1)));
+ output_value.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value1)));
+ output_value.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(exp_value2)));
+ output_value.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(exp_value2)));
+
+ int32x4x4_t temp_val = MultiplyByQuantizedMultiplier4Rows(output_value, multiplier, shift);
+
+ temp_val.val[0] = vaddq_s32(temp_val.val[0], output_zp_dup);
+ temp_val.val[1] = vaddq_s32(temp_val.val[1], output_zp_dup);
+ temp_val.val[2] = vaddq_s32(temp_val.val[2], output_zp_dup);
+ temp_val.val[3] = vaddq_s32(temp_val.val[3], output_zp_dup);
+
+ temp_val.val[0] = vmaxq_s32(vminq_s32(temp_val.val[0], max_val_dup), min_val_dup);
+ temp_val.val[1] = vmaxq_s32(vminq_s32(temp_val.val[1], max_val_dup), min_val_dup);
+ temp_val.val[2] = vmaxq_s32(vminq_s32(temp_val.val[2], max_val_dup), min_val_dup);
+ temp_val.val[3] = vmaxq_s32(vminq_s32(temp_val.val[3], max_val_dup), min_val_dup);
+
+ StoreValue(temp_val, output_data + j);
+ }
+ for (; j < last_dim; ++j)
+ {
+ const uint8_t index = (input_data_uint[j] ^ offset) + table_offset;
+ const uint8_t part1 = params.uint8_table1[index];
+ const uint8_t part2 = params.uint8_table2[index];
+ const int32_t exp_value = (part1 << 8) + part2;
+ const int32_t output_value = MultiplyByQuantizedMultiplier(exp_value, multiplier, shift);
+
+ output_data[j] = static_cast<Out>(
+ std::max(std::min(clamp_max, output_value + params.zero_point), clamp_min));
}
+ input_data_uint += last_dim;
+ output_data += last_dim;
}
}
+#endif
} // namespace cker
} // namespace nnfw
diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h
index cdd812a08..dcf649ca1 100644
--- a/compute/cker/include/cker/operation/StatelessRandomUniform.h
+++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h
@@ -72,8 +72,8 @@ void Fill(random::PhiloxRandom random, Tensor *output)
Distribution());
}
-inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data,
- const Shape &seed_shape, const int *seed_data,
+inline void StatelessRandomUniform(const Shape &shape_shape, const int32_t *shape_data,
+ const Shape &seed_shape, const int32_t *seed_data,
const Shape &output_shape, float *output_data)
{
Tensor shape_t;
diff --git a/compute/cker/include/cker/operation/Transpose.h b/compute/cker/include/cker/operation/Transpose.h
index 62eb432ae..52c826c39 100644
--- a/compute/cker/include/cker/operation/Transpose.h
+++ b/compute/cker/include/cker/operation/Transpose.h
@@ -288,7 +288,7 @@ size_t Flatten(const Shape &input_shape, const Shape &output_shape, const Transp
return flat_size;
}
-} // namespace anonymous (util)
+} // namespace
// Transpose2D only deals with typical 2D matrix transpose ops.
// Perform transpose by transposing 4x4 blocks of the input, proceeding from
diff --git a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
index 8c1d31b56..1fe3e1517 100644
--- a/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/optimized/BinaryArithmeticOps.h
@@ -35,6 +35,7 @@ namespace cker
namespace optimized
{
+// Old version: for Sub(float) and Div.
template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params, bool switch_inputs,
const Shape & /* unswitched_input1_shape */,
@@ -122,8 +123,108 @@ inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &params, bool
}
}
-inline int32_t quant8_sum(const BinaryArithmeticOpParam &params, const uint8_t input1_data,
- const uint8_t input2_data)
+// New version: for Mul, Add, and Sub(quant8).
+template <typename ElementwiseF, typename ScalarBroadcastF, typename T>
+inline void BinaryBroadcastFiveFold(const BinaryArithmeticOpParam &unswitched_params,
+ const Shape & /* unswitched_input1_shape */,
+ const T *unswitched_input1_data,
+ const Shape & /* unswitched_input2_shape */,
+ const T *unswitched_input2_data,
+ const Shape & /* output_shape */, T *output_data,
+ ElementwiseF elementwise_f, ScalarBroadcastF scalar_broadcast_f)
+{
+ BinaryArithmeticOpParam switched_params = unswitched_params;
+ switched_params.input1_offset = unswitched_params.input2_offset;
+ switched_params.input1_multiplier = unswitched_params.input2_multiplier;
+ switched_params.input1_shift = unswitched_params.input2_shift;
+ switched_params.input2_offset = unswitched_params.input1_offset;
+ switched_params.input2_multiplier = unswitched_params.input1_multiplier;
+ switched_params.input2_shift = unswitched_params.input1_shift;
+
+ const bool use_unswitched =
+ unswitched_params.broadcast_category == BroadcastableOpCategory::kFirstInputBroadcastsFast;
+
+ const BinaryArithmeticOpParam &params = use_unswitched ? unswitched_params : switched_params;
+ const T *input1_data = use_unswitched ? unswitched_input1_data : unswitched_input2_data;
+ const T *input2_data = use_unswitched ? unswitched_input2_data : unswitched_input1_data;
+
+ // Fivefold nested loops. The second input resets its position for each
+ // iteration of the second loop. The first input resets its position at the
+ // beginning of the fourth loop. The innermost loop is an elementwise add of
+ // sections of the arrays.
+ T *output_data_ptr = output_data;
+ const T *input1_data_ptr = input1_data;
+ const T *input2_data_reset = input2_data;
+ // In the fivefold pattern, y0, y2 and y4 are not broadcast, and so shared
+ // between input shapes. y3 for input 1 is always broadcast, and so the
+ // dimension there is 1, whereas optionally y1 might be broadcast for
+ // input 2. Put another way, input1.shape.FlatSize = y0 * y1 * y2 * y4,
+ // input2.shape.FlatSize = y0 * y2 * y3 * y4.
+ int y0 = params.broadcast_shape[0];
+ int y1 = params.broadcast_shape[1];
+ int y2 = params.broadcast_shape[2];
+ int y3 = params.broadcast_shape[3];
+ int y4 = params.broadcast_shape[4];
+ if (y4 > 1)
+ {
+ // General fivefold pattern, with y4 > 1 so there is a non-broadcast inner
+ // dimension.
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const T *input2_data_ptr = nullptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ for (int i3 = 0; i3 < y3; ++i3)
+ {
+ elementwise_f(y4, params, input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y4;
+ output_data_ptr += y4;
+ }
+ // We have broadcast y4 of input1 data y3 times, and now move on.
+ input1_data_ptr += y4;
+ }
+ }
+ // We have broadcast y2*y3*y4 of input2 data y1 times, and now move on.
+ input2_data_reset = input2_data_ptr;
+ }
+ }
+ else
+ {
+ // Special case of y4 == 1, in which the innermost loop is a single
+ // element and can be combined with the next (y3) as an inner broadcast.
+ //
+ // Note that this handles the case of pure scalar broadcast when
+ // y0 == y1 == y2 == 1. With low overhead it handles cases such as scalar
+ // broadcast with batch (as y2 > 1).
+ //
+ // NOTE The process is the same as the above general case except
+ // simplified for y4 == 1 and the loop over y3 is contained within the
+ // AddScalarBroadcast function.
+ for (int i0 = 0; i0 < y0; ++i0)
+ {
+ const T *input2_data_ptr = nullptr;
+ for (int i1 = 0; i1 < y1; ++i1)
+ {
+ input2_data_ptr = input2_data_reset;
+ for (int i2 = 0; i2 < y2; ++i2)
+ {
+ scalar_broadcast_f(y3, params, *input1_data_ptr, input2_data_ptr, output_data_ptr);
+ input2_data_ptr += y3;
+ output_data_ptr += y3;
+ input1_data_ptr += 1;
+ }
+ }
+ input2_data_reset = input2_data_ptr;
+ }
+ }
+}
+
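A worked instance of the decomposition described in the comment above (hypothetical shapes):

// broadcast_shape = {y0=2, y1=4, y2=3, y3=6, y4=5}
// input1.FlatSize = y0*y1*y2*y4 = 2*4*3*5 = 120   (y3 is broadcast for input1)
// input2.FlatSize = y0*y2*y3*y4 = 2*3*6*5 = 180   (y1 is broadcast for input2)
// output FlatSize = y0*y1*y2*y3*y4 = 720
// elementwise_f runs over y4-element strips; when y4 == 1 the inner loops
// collapse and scalar_broadcast_f handles y3-element strips instead.
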
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value, int32_t>
+quant8_sum(const BinaryArithmeticOpParam &params, const T input1_data, const T input2_data)
{
const int32_t input1_val = params.input1_offset + input1_data;
const int32_t input2_val = params.input2_offset + input2_data;
@@ -142,9 +243,9 @@ inline int32_t quant8_sum(const BinaryArithmeticOpParam &params, const uint8_t i
return clamped_output;
}
-inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam &params,
- const uint8_t *input1_data, const uint8_t *input2_data,
- uint8_t *output_data)
+inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t *input1_data, const uint8_t *input2_data,
+ uint8_t *output_data)
{
int i = 0;
@@ -218,6 +319,119 @@ inline void AddElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
}
}
+inline void AddElementwise(int size, const BinaryArithmeticOpParam &params,
+ const int8_t *input1_data, const int8_t *input2_data,
+ int8_t *output_data)
+{
+ int i = 0;
+#ifdef USE_NEON
+ const int8x16_t output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
+ const int8x16_t output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
+
+ const int input1_left_shift = params.left_shift + params.input1_shift;
+ const int input2_left_shift = params.left_shift + params.input2_shift;
+ const int32x4_t input1_left_dup = vdupq_n_s32(input1_left_shift);
+ const int32x4_t input2_left_dup = vdupq_n_s32(input2_left_shift);
+
+ const int16x8_t input1_offset_dup = vdupq_n_s16(params.input1_offset);
+ const int16x8_t input2_offset_dup = vdupq_n_s16(params.input2_offset);
+
+ for (; i <= size - 16; i += 16)
+ {
+ const int8x16_t input1_val_original = vld1q_s8(input1_data + i);
+ const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
+
+ const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original));
+ const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original));
+
+ const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
+ const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
+ const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_dup);
+ const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_dup);
+ const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_dup);
+ const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_dup);
+ const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high);
+ const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high);
+ const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low);
+ const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low);
+ const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high);
+ const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high);
+ const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low);
+ const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low);
+ int32x4_t x111 = vmovl_s16(input1_val_low_low);
+ int32x4_t x112 = vmovl_s16(input1_val_low_high);
+ int32x4_t x121 = vmovl_s16(input1_val_high_low);
+ int32x4_t x122 = vmovl_s16(input1_val_high_high);
+ int32x4_t x211 = vmovl_s16(input2_val_low_low);
+ int32x4_t x212 = vmovl_s16(input2_val_low_high);
+ int32x4_t x221 = vmovl_s16(input2_val_high_low);
+ int32x4_t x222 = vmovl_s16(input2_val_high_high);
+
+ x111 = vshlq_s32(x111, input1_left_dup);
+ x112 = vshlq_s32(x112, input1_left_dup);
+ x121 = vshlq_s32(x121, input1_left_dup);
+ x122 = vshlq_s32(x122, input1_left_dup);
+ x211 = vshlq_s32(x211, input2_left_dup);
+ x212 = vshlq_s32(x212, input2_left_dup);
+ x221 = vshlq_s32(x221, input2_left_dup);
+ x222 = vshlq_s32(x222, input2_left_dup);
+ x111 = vqrdmulhq_n_s32(x111, params.input1_multiplier);
+ x112 = vqrdmulhq_n_s32(x112, params.input1_multiplier);
+ x121 = vqrdmulhq_n_s32(x121, params.input1_multiplier);
+ x122 = vqrdmulhq_n_s32(x122, params.input1_multiplier);
+ x211 = vqrdmulhq_n_s32(x211, params.input2_multiplier);
+ x212 = vqrdmulhq_n_s32(x212, params.input2_multiplier);
+ x221 = vqrdmulhq_n_s32(x221, params.input2_multiplier);
+ x222 = vqrdmulhq_n_s32(x222, params.input2_multiplier);
+ int32x4_t s11 = vaddq_s32(x111, x211);
+ int32x4_t s12 = vaddq_s32(x112, x212);
+ int32x4_t s21 = vaddq_s32(x121, x221);
+ int32x4_t s22 = vaddq_s32(x122, x222);
+ s11 = vqrdmulhq_n_s32(s11, params.output_multiplier);
+ s12 = vqrdmulhq_n_s32(s12, params.output_multiplier);
+ s21 = vqrdmulhq_n_s32(s21, params.output_multiplier);
+ s22 = vqrdmulhq_n_s32(s22, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ s11 = RoundingDivideByPOT(s11, -params.output_shift);
+ s12 = RoundingDivideByPOT(s12, -params.output_shift);
+ s21 = RoundingDivideByPOT(s21, -params.output_shift);
+ s22 = RoundingDivideByPOT(s22, -params.output_shift);
+ const int16x4_t s11_narrowed = vmovn_s32(s11);
+ const int16x4_t s12_narrowed = vmovn_s32(s12);
+ const int16x4_t s21_narrowed = vmovn_s32(s21);
+ const int16x4_t s22_narrowed = vmovn_s32(s22);
+ const int16x8_t s1 =
+ vaddq_s16(vcombine_s16(s11_narrowed, s12_narrowed), vdupq_n_s16(params.output_offset));
+ const int16x8_t s2 =
+ vaddq_s16(vcombine_s16(s21_narrowed, s22_narrowed), vdupq_n_s16(params.output_offset));
+ const int8x16_t s = vcombine_s8(vqmovn_s16(s1), vqmovn_s16(s2));
+
+ const int8x16_t clamped =
+ vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, s));
+ vst1q_s8(output_data + i, clamped);
+ }
+#endif // NEON
+
+ for (; i < size; ++i)
+ {
+ const int32_t input1_val = params.input1_offset + input1_data[i];
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+ const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
+ const int32_t clamped_output = std::min(params.quantized_activation_max,
+ std::max(params.quantized_activation_min, raw_output));
+ output_data[i] = static_cast<int8_t>(clamped_output);
+ }
+}
+
struct BinaryOpFuncAddFloat
{
#ifdef USE_NEON
@@ -473,12 +687,13 @@ getBinaryOpWithActivationImplFloat(const BinaryArithmeticOpParam &params)
BinaryOpScalarBroadcast<FUNC, BinaryOpActivationFloatMinMax>);
}
-inline void AddQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const uint8_t *input1_data, const Shape &input2_shape,
- const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+Add(const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
{
const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
- AddElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data);
+ AddElementwise(flat_size, params, input1_data, input2_data, output_data);
}
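
A usage sketch for the templated entry point (hypothetical parameter values and an assumed Shape{...} constructor): the enable_if constraint selects this overload for the quantized element types covered by is_quant8.

BinaryArithmeticOpParam params{}; // offsets/multipliers set up by the caller
const Shape shape{1, 8};
int8_t lhs[8] = {}, rhs[8] = {}, out[8];
Add(params, shape, lhs, shape, rhs, shape, out); // dispatches to AddElementwise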
inline void Add(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
@@ -493,9 +708,9 @@ inline void Add(const BinaryArithmeticOpParam &params, const Shape &input1_shape
// Scalar-broadcast add that can be used for inner loop of more general
// broadcast add, so that, for example, scalar-broadcast with batch will still
// be fast.
-inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &params,
- uint8_t broadcast_value, const uint8_t *input2_data,
- uint8_t *output_data)
+inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam &params,
+ uint8_t broadcast_value, const uint8_t *input2_data,
+ uint8_t *output_data)
{
int i = 0;
int32_t clamped_output;
@@ -506,31 +721,115 @@ inline void AddScalarBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa
}
}
-inline void BroadcastAddDispatchQuant8(const BinaryArithmeticOpParam &params,
- const Shape &input1_shape, const uint8_t *input1_data,
- const Shape &input2_shape, const uint8_t *input2_data,
- const Shape &output_shape, uint8_t *output_data)
+// Scalar-broadcast add that can be used for inner loop of more general
+// broadcast add, so that, for example, scalar-broadcast with batch will still
+// be fast.
+inline void AddScalarBroadcast(int size, const BinaryArithmeticOpParam &params, int8_t input1_data,
+ const int8_t *input2_data, int8_t *output_data)
{
- if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
+ using gemmlowp::RoundingDivideByPOT;
+ int i = 0;
+#ifdef USE_NEON
+ const int32x4_t left_shift_dup = vdupq_n_s32(params.left_shift);
+ const int8x8_t output_activation_min_vector = vdup_n_s8(params.quantized_activation_min);
+ const int8x8_t output_activation_max_vector = vdup_n_s8(params.quantized_activation_max);
+
+ // Process broadcast scalar.
+ const int8x8_t input1_val_original = vdup_n_s8(input1_data);
+ const int16x8_t input1_val_s16 = vmovl_s8(input1_val_original);
+ const int16x8_t input1_val = vaddq_s16(input1_val_s16, vdupq_n_s16(params.input1_offset));
+ const int16x4_t input1_val_high = vget_high_s16(input1_val);
+ const int16x4_t input1_val_low = vget_low_s16(input1_val);
+ int32x4_t x11 = vmovl_s16(input1_val_low);
+ int32x4_t x12 = vmovl_s16(input1_val_high);
+ x11 = vshlq_s32(x11, left_shift_dup);
+ x12 = vshlq_s32(x12, left_shift_dup);
+ x11 = vqrdmulhq_n_s32(x11, params.input1_multiplier);
+ x12 = vqrdmulhq_n_s32(x12, params.input1_multiplier);
+ const int32x4_t input1_shift_dup = vdupq_n_s32(params.input1_shift);
+ x11 = vshlq_s32(x11, input1_shift_dup);
+ x12 = vshlq_s32(x12, input1_shift_dup);
+
+ for (; i <= size - 8; i += 8)
{
- const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)>
- fn =
- [](const BinaryArithmeticOpParam &params, const uint8_t &a, const uint8_t &b) -> uint8_t {
- return static_cast<uint8_t>(quant8_sum(params, a, b));
- };
- reference::BroadcastBinaryArithmeticOpSlowQuant8(
- params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn);
+ const int8x8_t input2_val_original = vld1_s8(input2_data + i);
+ const int16x8_t input2_val_s16 = vmovl_s8(input2_val_original);
+ const int16x8_t input2_val = vaddq_s16(input2_val_s16, vdupq_n_s16(params.input2_offset));
+ const int16x4_t input2_val_high = vget_high_s16(input2_val);
+ const int16x4_t input2_val_low = vget_low_s16(input2_val);
+ int32x4_t x21 = vmovl_s16(input2_val_low);
+ int32x4_t x22 = vmovl_s16(input2_val_high);
+ x21 = vshlq_s32(x21, left_shift_dup);
+ x22 = vshlq_s32(x22, left_shift_dup);
+ x21 = vqrdmulhq_n_s32(x21, params.input2_multiplier);
+ x22 = vqrdmulhq_n_s32(x22, params.input2_multiplier);
+ const int32x4_t input2_shift_dup = vdupq_n_s32(params.input2_shift);
+ x21 = vshlq_s32(x21, input2_shift_dup);
+ x22 = vshlq_s32(x22, input2_shift_dup);
+ int32x4_t s1 = vaddq_s32(x11, x21);
+ int32x4_t s2 = vaddq_s32(x12, x22);
+ s1 = vqrdmulhq_n_s32(s1, params.output_multiplier);
+ s2 = vqrdmulhq_n_s32(s2, params.output_multiplier);
+ s1 = RoundingDivideByPOT(s1, -params.output_shift);
+ s2 = RoundingDivideByPOT(s2, -params.output_shift);
+ const int16x4_t s1_narrowed = vmovn_s32(s1);
+ const int16x4_t s2_narrowed = vmovn_s32(s2);
+ const int16x8_t s =
+ vaddq_s16(vcombine_s16(s1_narrowed, s2_narrowed), vdupq_n_s16(params.output_offset));
+ const int8x8_t clamped =
+ vmax_s8(output_activation_min_vector, vmin_s8(output_activation_max_vector, vqmovn_s16(s)));
+ vst1_s8(output_data + i, clamped);
}
- else
+#endif // NEON
+
+ if (i < size)
{
- BinaryBroadcastFiveFold(
- params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
- input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
- uint8_t *)>(AddElementwiseQuant8),
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
- uint8_t *)>(AddScalarBroadcastQuant8));
+ // Process broadcast scalar.
+ const int32_t input1_val = params.input1_offset + input1_data;
+ const int32_t shifted_input1_val = input1_val * (1 << params.left_shift);
+ const int32_t scaled_input1_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input1_val, params.input1_multiplier, params.input1_shift);
+
+ for (; i < size; ++i)
+ {
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t shifted_input2_val = input2_val * (1 << params.left_shift);
+ const int32_t scaled_input2_val = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ shifted_input2_val, params.input2_multiplier, params.input2_shift);
+ const int32_t raw_sum = scaled_input1_val + scaled_input2_val;
+ const int32_t raw_output = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ raw_sum, params.output_multiplier, params.output_shift) +
+ params.output_offset;
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, raw_output));
+ output_data[i] = static_cast<int8_t>(clamped_output);
+ }
+ }
+}
+
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+BroadcastAddDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
+{
+ if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
+ {
+ const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn =
+ [](const BinaryArithmeticOpParam &params, const T &a, const T &b) {
+ return static_cast<T>(quant8_sum(params, a, b));
+ };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
+ return;
}
+
+ BinaryBroadcastFiveFold(
+ params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>(
+ AddElementwise),
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>(
+ AddScalarBroadcast));
}
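
In summary (a descriptive note, not new behavior):

// kGenericBroadcast takes the reference slow path with a per-element
// quant8_sum lambda; both *BroadcastsFast categories take
// BinaryBroadcastFiveFold, which now swaps operands internally (via
// switched_params) instead of taking a switch_inputs flag.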
inline void BroadcastAddDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
@@ -592,8 +891,9 @@ inline void BroadcastSubDispatch(const BinaryArithmeticOpParam &params, const Sh
}
}
-inline int32_t quant8_mul(const BinaryArithmeticOpParam &params, const uint8_t input1_data,
- const uint8_t input2_data)
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value, int32_t>
+quant8_mul(const BinaryArithmeticOpParam &params, const T input1_data, const T input2_data)
{
const int32_t input1_val = params.input1_offset + input1_data;
const int32_t input2_val = params.input2_offset + input2_data;
@@ -607,9 +907,9 @@ inline int32_t quant8_mul(const BinaryArithmeticOpParam &params, const uint8_t i
return clamped_output;
}
-inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam &params,
- const uint8_t *input1_data, const uint8_t *input2_data,
- uint8_t *output_data)
+inline void MulElementwise(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t *input1_data, const uint8_t *input2_data,
+ uint8_t *output_data)
{
int i = 0;
@@ -671,12 +971,102 @@ inline void MulElementwiseQuant8(int size, const BinaryArithmeticOpParam &params
}
}
-inline void MulQuant8(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
- const uint8_t *input1_data, const Shape &input2_shape,
- const uint8_t *input2_data, const Shape &output_shape, uint8_t *output_data)
+inline void MulElementwise(int size, const BinaryArithmeticOpParam &params,
+ const int8_t *input1_data, const int8_t *input2_data,
+ int8_t *output_data)
+{
+ int i = 0;
+#ifdef USE_NEON
+ const int16x8_t input1_offset_vector = vdupq_n_s16(params.input1_offset);
+ const int16x8_t input2_offset_vector = vdupq_n_s16(params.input2_offset);
+ const int16x8_t output_offset_vector = vdupq_n_s16(params.output_offset);
+ const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
+ const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
+ const int left_shift = std::max(0, params.output_shift);
+ const int right_shift = std::max(0, -params.output_shift);
+ const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
+ for (; i <= size - 16; i += 16)
+ {
+ // We load / store 16 at a time, multiplying as four sets of 4 int32s.
+ const int8x16_t input1_val_original = vld1q_s8(input1_data + i);
+ const int8x16_t input2_val_original = vld1q_s8(input2_data + i);
+
+ const int16x8_t input1_val_s16_high = vmovl_s8(vget_high_s8(input1_val_original));
+ const int16x8_t input1_val_s16_low = vmovl_s8(vget_low_s8(input1_val_original));
+
+ const int16x8_t input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
+ const int16x8_t input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
+ const int16x8_t input1_val_high = vaddq_s16(input1_val_s16_high, input1_offset_vector);
+ const int16x8_t input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector);
+ const int16x8_t input1_val_low = vaddq_s16(input1_val_s16_low, input1_offset_vector);
+ const int16x8_t input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector);
+ const int16x4_t input1_val_high_high = vget_high_s16(input1_val_high);
+ const int16x4_t input1_val_high_low = vget_low_s16(input1_val_high);
+ const int16x4_t input1_val_low_high = vget_high_s16(input1_val_low);
+ const int16x4_t input1_val_low_low = vget_low_s16(input1_val_low);
+ const int16x4_t input2_val_high_high = vget_high_s16(input2_val_high);
+ const int16x4_t input2_val_high_low = vget_low_s16(input2_val_high);
+ const int16x4_t input2_val_low_high = vget_high_s16(input2_val_low);
+ const int16x4_t input2_val_low_low = vget_low_s16(input2_val_low);
+
+ auto p1 = vmull_s16(input2_val_high_high, input1_val_high_high);
+ auto p2 = vmull_s16(input2_val_high_low, input1_val_high_low);
+ auto p3 = vmull_s16(input2_val_low_high, input1_val_low_high);
+ auto p4 = vmull_s16(input2_val_low_low, input1_val_low_low);
+
+ p1 = vshlq_s32(p1, left_shift_vec);
+ p2 = vshlq_s32(p2, left_shift_vec);
+ p3 = vshlq_s32(p3, left_shift_vec);
+ p4 = vshlq_s32(p4, left_shift_vec);
+
+ p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+ p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+ p3 = vqrdmulhq_n_s32(p3, params.output_multiplier);
+ p4 = vqrdmulhq_n_s32(p4, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ p1 = RoundingDivideByPOT(p1, right_shift);
+ p2 = RoundingDivideByPOT(p2, right_shift);
+ p3 = RoundingDivideByPOT(p3, right_shift);
+ p4 = RoundingDivideByPOT(p4, right_shift);
+
+ const auto p1_narrowed = vqmovn_s32(p1);
+ const auto p2_narrowed = vqmovn_s32(p2);
+ const auto p3_narrowed = vqmovn_s32(p3);
+ const auto p4_narrowed = vqmovn_s32(p4);
+
+ const int16x8_t p_part1 =
+ vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector);
+ const int16x8_t p_part2 =
+ vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector);
+ const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1));
+
+ const auto clamped =
+ vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p));
+ vst1q_s8(output_data + i, clamped);
+ }
+#endif // NEON
+
+ for (; i < size; ++i)
+ {
+ const int32_t input1_val = params.input1_offset + input1_data[i];
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t unclamped_result =
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
+ output_data[i] = static_cast<int8_t>(clamped_output);
+ }
+}
+
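The scalar tail above reduces to the following per-element formula (shown for reference; off1 and off2 name the input offset fields):

// out = clamp(output_offset
//             + MultiplyByQuantizedMultiplier((in1 + off1) * (in2 + off2),
//                                             output_multiplier, output_shift))
// Unlike the add path, no per-input multipliers are needed: the product of
// the two input scales is folded into the single output multiplier.
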
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data)
{
const int flat_size = MatchingElementsSize(input1_shape, input2_shape, output_shape);
- MulElementwiseQuant8(flat_size, params, input1_data, input2_data, output_data);
+ MulElementwise(flat_size, params, input1_data, input2_data, output_data);
}
inline void Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
@@ -688,9 +1078,9 @@ inline void Mul(const BinaryArithmeticOpParam &params, const Shape &input1_shape
(*implFuncs.first)(flat_size, params, input1_data, input2_data, output_data);
}
-inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &params,
- const uint8_t broadcast_value, const uint8_t *input2_data,
- uint8_t *output_data)
+inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam &params,
+ const uint8_t broadcast_value, const uint8_t *input2_data,
+ uint8_t *output_data)
{
int i = 0;
int32_t clamped_output;
@@ -701,29 +1091,109 @@ inline void MulSimpleBroadcastQuant8(int size, const BinaryArithmeticOpParam &pa
}
}
-inline void BroadcastMulDispatchQuant8(const BinaryArithmeticOpParam &params,
- const Shape &input1_shape, const uint8_t *input1_data,
- const Shape &input2_shape, const uint8_t *input2_data,
- const Shape &output_shape, uint8_t *output_data)
+// Broadcast mul that can often be used for inner loop of broadcast Mul.
+inline void MulSimpleBroadcast(int size, const BinaryArithmeticOpParam &params,
+ const int8_t broadcast_value, const int8_t *input2_data,
+ int8_t *output_data)
+{
+ const int16_t input1_val = params.input1_offset + broadcast_value;
+
+ int i = 0;
+#ifdef USE_NEON
+ const auto input2_offset_vector = vdupq_n_s16(params.input2_offset);
+ const auto output_offset_vector = vdupq_n_s16(params.output_offset);
+ const auto output_activation_min_vector = vdupq_n_s8(params.quantized_activation_min);
+ const auto output_activation_max_vector = vdupq_n_s8(params.quantized_activation_max);
+ const int left_shift = std::max(0, params.output_shift);
+ const int right_shift = std::max(0, -params.output_shift);
+ const int32x4_t left_shift_vec = vdupq_n_s32(left_shift);
+ for (; i <= size - 16; i += 16)
+ {
+ // We load / store 16 at a time, multiplying as four sets of 4 int32s.
+ const auto input2_val_original = vld1q_s8(input2_data + i);
+ const auto input2_val_s16_high = vmovl_s8(vget_high_s8(input2_val_original));
+ const auto input2_val_s16_low = vmovl_s8(vget_low_s8(input2_val_original));
+
+ const auto input2_val_high = vaddq_s16(input2_val_s16_high, input2_offset_vector);
+ const auto input2_val_low = vaddq_s16(input2_val_s16_low, input2_offset_vector);
+
+ const auto input2_val_low_low = vget_low_s16(input2_val_low);
+ const auto input2_val_low_high = vget_high_s16(input2_val_low);
+ const auto input2_val_high_low = vget_low_s16(input2_val_high);
+ const auto input2_val_high_high = vget_high_s16(input2_val_high);
+
+ auto p1 = vmull_n_s16(input2_val_high_high, input1_val);
+ auto p2 = vmull_n_s16(input2_val_high_low, input1_val);
+ auto p3 = vmull_n_s16(input2_val_low_high, input1_val);
+ auto p4 = vmull_n_s16(input2_val_low_low, input1_val);
+
+ p1 = vshlq_s32(p1, left_shift_vec);
+ p2 = vshlq_s32(p2, left_shift_vec);
+ p3 = vshlq_s32(p3, left_shift_vec);
+ p4 = vshlq_s32(p4, left_shift_vec);
+
+ p1 = vqrdmulhq_n_s32(p1, params.output_multiplier);
+ p2 = vqrdmulhq_n_s32(p2, params.output_multiplier);
+ p3 = vqrdmulhq_n_s32(p3, params.output_multiplier);
+ p4 = vqrdmulhq_n_s32(p4, params.output_multiplier);
+ using gemmlowp::RoundingDivideByPOT;
+ p1 = RoundingDivideByPOT(p1, right_shift);
+ p2 = RoundingDivideByPOT(p2, right_shift);
+ p3 = RoundingDivideByPOT(p3, right_shift);
+ p4 = RoundingDivideByPOT(p4, right_shift);
+
+ const auto p1_narrowed = vqmovn_s32(p1);
+ const auto p2_narrowed = vqmovn_s32(p2);
+ const auto p3_narrowed = vqmovn_s32(p3);
+ const auto p4_narrowed = vqmovn_s32(p4);
+
+ const int16x8_t p_part1 =
+ vaddq_s16(vcombine_s16(p2_narrowed, p1_narrowed), output_offset_vector);
+ const int16x8_t p_part2 =
+ vaddq_s16(vcombine_s16(p4_narrowed, p3_narrowed), output_offset_vector);
+ const int8x16_t p = vcombine_s8(vqmovn_s16(p_part2), vqmovn_s16(p_part1));
+
+ const auto clamped =
+ vmaxq_s8(output_activation_min_vector, vminq_s8(output_activation_max_vector, p));
+ vst1q_s8(output_data + i, clamped);
+ }
+#endif // NEON
+
+ for (; i < size; ++i)
+ {
+ const int32_t input2_val = params.input2_offset + input2_data[i];
+ const int32_t unclamped_result =
+ params.output_offset + MultiplyByQuantizedMultiplier(input1_val * input2_val,
+ params.output_multiplier,
+ params.output_shift);
+ const int32_t clamped_output = std::min(
+ params.quantized_activation_max, std::max(params.quantized_activation_min, unclamped_result));
+ output_data[i] = static_cast<int8_t>(clamped_output);
+ }
+}
+
+template <typename T>
+inline typename std::enable_if_t<is_quant8<T>::value>
+BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
+ const T *input1_data, const Shape &input2_shape, const T *input2_data,
+ const Shape &output_shape, T *output_data)
{
if (params.broadcast_category == BroadcastableOpCategory::kGenericBroadcast)
{
- const std::function<uint8_t(const BinaryArithmeticOpParam &, const uint8_t &, const uint8_t &)>
- fn =
- [](const BinaryArithmeticOpParam &params, const uint8_t &a, const uint8_t &b) -> uint8_t {
- return static_cast<uint8_t>(quant8_mul(params, a, b));
- };
- reference::BroadcastBinaryArithmeticOpSlowQuant8(
- params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data, fn);
+ const std::function<T(const BinaryArithmeticOpParam &, const T &, const T &)> fn =
+ [](const BinaryArithmeticOpParam &params, const T &a, const T &b) {
+ return static_cast<T>(quant8_mul(params, a, b));
+ };
+ reference::BroadcastBinaryArithmeticOpSlow(params, input1_shape, input1_data, input2_shape,
+ input2_data, output_shape, output_data, fn);
return;
}
BinaryBroadcastFiveFold(
- params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
- input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, const uint8_t *, const uint8_t *,
- uint8_t *)>(MulElementwiseQuant8),
- static_cast<void (*)(int, const BinaryArithmeticOpParam &, uint8_t, const uint8_t *,
- uint8_t *)>(MulSimpleBroadcastQuant8));
+ params, input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, const T *, const T *, T *)>(
+ MulElementwise),
+ static_cast<void (*)(int, const BinaryArithmeticOpParam &, T, const T *, T *)>(
+ MulSimpleBroadcast));
}
inline void BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
@@ -741,10 +1211,8 @@ inline void BroadcastMulDispatch(const BinaryArithmeticOpParam &params, const Sh
return;
}
auto implFuncs = getBinaryOpWithActivationImplFloat<BinaryOpFuncMulFloat>(params);
- BinaryBroadcastFiveFold(
- params, params.broadcast_category == BroadcastableOpCategory::kSecondInputBroadcastsFast,
- input1_shape, input1_data, input2_shape, input2_data, output_shape, output_data,
- implFuncs.first, implFuncs.second);
+ BinaryBroadcastFiveFold(params, input1_shape, input1_data, input2_shape, input2_data,
+ output_shape, output_data, implFuncs.first, implFuncs.second);
}
inline void Div(const BinaryArithmeticOpParam &params, const Shape &input1_shape,
diff --git a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h
index d4397933a..17b2fc7a2 100644
--- a/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h
+++ b/compute/cker/include/cker/operation/optimized/DepthwiseConvFloat.h
@@ -1243,8 +1243,8 @@ inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const Shape &in
}
}
-} // nnfw
-} // cker
-} // optimized
+} // namespace optimized
+} // namespace cker
+} // namespace nnfw
#endif
diff --git a/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h
new file mode 100644
index 000000000..bd8497920
--- /dev/null
+++ b/compute/cker/include/cker/operation/optimized/integer_ops/DepthwiseConvInt8.h
@@ -0,0 +1,2138 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__
+#define __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__
+
+#include "cker/CpuBackendThreadpool.h"
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include "cker/neon/neon_check.h"
+#include "cker/operation/Quantize.h"
+
+#include <fixedpoint/fixedpoint.h>
+#include <public/gemmlowp.h>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace optimized_integer_ops
+{
+
+// Category of depthwise convolution output rounding.
+enum class DepthwiseConvOutputRounding
+{
+ kNone = 0, // Invalid: specific method must be specified.
+ kAwayFromZero, // Original method: exact halves rounded away from zero.
+ kUpward, // Halves towards +infinity: adds 0.5 before truncate.
+ // This is where a future kNearestEven would be placed.
+};
+
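An illustrative contrast between the two modes on exact halves (hypothetical values):

// kAwayFromZero:  2.5 ->  3,  -2.5 -> -3  (round half away from zero)
// kUpward:        2.5 ->  3,  -2.5 -> -2  (add 0.5, then truncate toward
//                                          -infinity in the shift-based path)
// The two modes agree on all non-half values.
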
+// Category of depthwise convolution depth multiplication.
+enum class DepthwiseConvDepthMultiplication
+{
+ kNoMultiplication = 0, // Depth multiplier = 1.
+ kUnitInputDepth, // Input depth = 1, output depth = depth multiplier.
+};
+
+namespace depthwise_conv
+{
+
+// Implementation of quantized DepthwiseConv
+
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+struct QuantizedDepthwiseConvKernel
+{
+};
+
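A note on the specialization scheme (descriptive, based on the kernels below): each QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier> handles one (input depth, depth multiplier) combination; e.g. <true, 8, 2> below consumes 8 input channels and writes 16 accumulators per pixel while honoring input_ptr_increment for strided access. Because the primary template has no Run member, an unsupported combination fails at compile time.

// Hypothetical dispatch sketch: a caller selects the specialization from
// runtime depths, e.g.
//   if (input_depth == 8 && depth_multiplier == 2)
//     QuantizedDepthwiseConvKernel<true, 8, 2>::Run(...);
// and falls back to a generic path when no specialization matches.
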
+#ifdef USE_NEON
+template <> struct QuantizedDepthwiseConvKernel<true, 8, 2>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8x2_t filter_s8;
+ filter_s8.val[0] = vld1_s8(filter_ptr);
+ filter_s8.val[1] = vld1_s8(filter_ptr + 8);
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8.val[i]);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ }
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[0].val[i] =
+ vmlal_s16(acc[0].val[i], vget_low_s16(filter[i]), vget_low_s16(input_dup2.val[i]));
+ acc[1].val[i] =
+ vmlal_s16(acc[1].val[i], vget_high_s16(filter[i]), vget_high_s16(input_dup2.val[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 8, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+ const int16x8_t filter = vmovl_s8(filter_s8);
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_s8[i] = vld1_s8(input_ptr + 8 * i);
+ }
+ input_ptr += 16;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vmovl_s8(input_s8[i]);
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input[0]));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input[0]));
+ acc[2] = vmlal_s16(acc[2], vget_low_s16(filter), vget_low_s16(input[1]));
+ acc[3] = vmlal_s16(acc[3], vget_high_s16(filter), vget_high_s16(input[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[2];
+ acc[0] = vld1q_s32(acc_buffer_ptr);
+ acc[1] = vld1q_s32(acc_buffer_ptr + 4);
+
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), vget_low_s16(input));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), vget_high_s16(input));
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc[0]);
+ vst1q_s32(acc_buffer_ptr + 4, acc[1]);
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 2>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+ const int16x8_t filter = vmovl_s8(filter_s8);
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] =
+ vmlal_s16(acc[2 * i + 0], vget_low_s16(filter), vget_low_s16(input_dup2.val[i]));
+ acc[2 * i + 1] =
+ vmlal_s16(acc[2 * i + 1], vget_high_s16(filter), vget_high_s16(input_dup2.val[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x4x2_t input_dup2 = vzip_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(filter), input_dup2.val[0]);
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(filter), input_dup2.val[1]);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 8>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i);
+ filter[i] = vmovl_s8(filter_s8);
+ }
+ int outp = 0;
+ // Handle two output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate.
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+ acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), input, 2);
+ acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), input, 2);
+ acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), input, 3);
+ acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), input, 3);
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 0);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 1);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 1);
+
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 2>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x4_t input_dup2 = vzip_s16(input, input).val[0];
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input_dup2);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 2, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_s8[i] = vld1_s8(input_ptr + 8 * i);
+ }
+ input_ptr += 16;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vmovl_s8(input_s8[i]);
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input[1]));
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input));
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_ptr += 2;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 1, 2>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Duplicate the input values, 2-fold
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], filter, vget_low_s16(input_dup2.val[0]));
+ acc[1] = vmlal_s16(acc[1], filter, vget_high_s16(input_dup2.val[0]));
+ acc[2] = vmlal_s16(acc[2], filter, vget_low_s16(input_dup2.val[1]));
+ acc[3] = vmlal_s16(acc[3], filter, vget_high_s16(input_dup2.val[1]));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ const uint32_t input = *input_ptr++ + input_offset;
+
+ // Multiply-accumulate
+ acc = vget_low_s32(vmlal_n_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 1, 4>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 8 output pixels at a time.
+ for (; outp <= num_output_pixels - 8; outp += 8)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], filter, vget_low_s16(input), 0);
+ acc[1] = vmlal_lane_s16(acc[1], filter, vget_low_s16(input), 1);
+ acc[2] = vmlal_lane_s16(acc[2], filter, vget_low_s16(input), 2);
+ acc[3] = vmlal_lane_s16(acc[3], filter, vget_low_s16(input), 3);
+ acc[4] = vmlal_lane_s16(acc[4], filter, vget_high_s16(input), 0);
+ acc[5] = vmlal_lane_s16(acc[5], filter, vget_high_s16(input), 1);
+ acc[6] = vmlal_lane_s16(acc[6], filter, vget_high_s16(input), 2);
+ acc[7] = vmlal_lane_s16(acc[7], filter, vget_high_s16(input), 3);
+
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], filter, input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], filter, input, 1);
+ acc[2] = vmlal_lane_s16(acc[2], filter, input, 2);
+ acc[3] = vmlal_lane_s16(acc[3], filter, input, 3);
+
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ const uint32_t input = *input_ptr++ + input_offset;
+
+ // Multiply-accumulate
+ acc = vmlal_n_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+ // Handle 4 output pixels at a time.
+ for (; outp <= num_output_pixels - 4; outp += 4)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Load the inputs, add input_offset.
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const int8x8_t input_s8 = vld1_s8(input_ptr + 8 * i);
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ input[i] = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ }
+ input_ptr += 16;
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], filter, vget_low_s16(input[i]));
+ acc[2 * i + 1] = vmlal_s16(acc[2 * i + 1], filter, vget_high_s16(input[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 4, 4>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int /* input_ptr_increment */,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr + 8 * i);
+ filter[i] = vmovl_s8(filter_s8);
+ }
+
+ int outp = 0;
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[8];
+ for (int i = 0; i < 8; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), vget_low_s16(input), 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), vget_low_s16(input), 1);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), vget_low_s16(input), 2);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), vget_low_s16(input), 3);
+ acc[4] = vmlal_lane_s16(acc[4], vget_low_s16(filter[0]), vget_high_s16(input), 0);
+ acc[5] = vmlal_lane_s16(acc[5], vget_high_s16(filter[0]), vget_high_s16(input), 1);
+ acc[6] = vmlal_lane_s16(acc[6], vget_low_s16(filter[1]), vget_high_s16(input), 2);
+ acc[7] = vmlal_lane_s16(acc[7], vget_high_s16(filter[1]), vget_high_s16(input), 3);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 8; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 32;
+ }
+ // Handle one output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ input_ptr += 4;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate
+ acc[0] = vmlal_lane_s16(acc[0], vget_low_s16(filter[0]), input, 0);
+ acc[1] = vmlal_lane_s16(acc[1], vget_high_s16(filter[0]), input, 1);
+ acc[2] = vmlal_lane_s16(acc[2], vget_low_s16(filter[1]), input, 2);
+ acc[3] = vmlal_lane_s16(acc[3], vget_high_s16(filter[1]), input, 3);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 3>
+{
+ static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // We will have to duplicate bytes in a NEON register, 3-fold.
+ // We will do that by register-level table-look-up using VTBL instructions.
+ // Here we prepare the registers containing the table-lookup indices.
+ static const int8_t dup3_indices_array[3][8] = {
+ {0, 0, 0, 1, 1, 1, 2, 2}, {2, 3, 3, 3, 4, 4, 4, 5}, {5, 5, 6, 6, 6, 7, 7, 7}};
+ int8x8_t dup3_indices[3];
+ for (int i = 0; i < 3; i++)
+ {
+ dup3_indices[i] = vld1_s8(dup3_indices_array[i]);
+ }
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const int8_t *local_filter_ptr = filter_ptr;
+ const int8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters.
+ int16x8_t filter[3];
+ int8x8x3_t filter_s8;
+ filter_s8.val[0] = vld1_s8(local_filter_ptr);
+ filter_s8.val[1] = vld1_s8(local_filter_ptr + 8);
+ filter_s8.val[2] = vld1_s8(local_filter_ptr + 16);
+ local_filter_ptr += 24;
+ for (int i = 0; i < 3; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8.val[i]);
+ }
+ // Load the inputs, duplicate 3-fold, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr);
+ local_input_ptr += 8;
+
+ int8x8_t input_s8_dup3[3];
+ for (int i = 0; i < 3; i++)
+ {
+ input_s8_dup3[i] = vtbl1_s8(input_s8, dup3_indices[i]);
+ }
+ int16x8_t input_dup3[3];
+ for (int i = 0; i < 3; i++)
+ {
+ const int16x8_t input_s16_dup3 = vmovl_s8(input_s8_dup3[i]);
+ input_dup3[i] = vaddq_s16(input_s16_dup3, vdupq_n_s16(input_offset));
+ }
+ // Load the accumulators from acc_buffer
+ int32x4x3_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ acc[i].val[2] = vld1q_s32(acc_buffer_ptr + 4 * i + 16);
+ }
+ // Multiply-accumulate
+ for (int j = 0; j < 3; j++)
+ {
+ acc[0].val[j] =
+ vmlal_s16(acc[0].val[j], vget_low_s16(input_dup3[j]), vget_low_s16(filter[j]));
+ acc[1].val[j] =
+ vmlal_s16(acc[1].val[j], vget_high_s16(input_dup3[j]), vget_high_s16(filter[j]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 16, acc[i].val[2]);
+ }
+ acc_buffer_ptr += 24;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ const int16_t input_val = *local_input_ptr++ + input_offset;
+ for (int i = 0; i < 3; i++)
+ {
+ *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val;
+ }
+ local_filter_ptr += 3;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
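+
+// A note on the VTBL-based 3-fold duplication above: vtbl1_s8(x, idx)
+// selects x[idx[i]] for each output lane, so the three dup3_indices vectors
+// together expand 8 input bytes into 24 bytes in which every input byte
+// appears three times consecutively. An illustrative scalar equivalent,
+// kept disabled and not part of the kernel:
+#if 0
+inline void Dup3Scalar(const int8_t x[8], int8_t out[24])
+{
+ // Mirrors vtbl1_s8 applied with dup3_indices_array:
+ // out = {x0, x0, x0, x1, x1, x1, ..., x7, x7, x7}.
+ for (int i = 0; i < 8; i++)
+ {
+ out[3 * i + 0] = x[i];
+ out[3 * i + 1] = x[i];
+ out[3 * i + 2] = x[i];
+ }
+}
+#endif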
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 2>
+{
+ static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const int8_t *local_filter_ptr = filter_ptr;
+ const int8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters.
+ int16x8_t filter[2];
+ int8x8x2_t filter_s8;
+ filter_s8.val[0] = vld1_s8(local_filter_ptr);
+ filter_s8.val[1] = vld1_s8(local_filter_ptr + 8);
+ local_filter_ptr += 16;
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8.val[i]);
+ }
+ // Load the inputs, add input_offset, duplicate 2-fold.
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr);
+ local_input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ const int16x8x2_t input_dup2 = vzipq_s16(input, input);
+ // Load the accumulators from acc_buffer.
+ int32x4x2_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i].val[0] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ acc[i].val[1] = vld1q_s32(acc_buffer_ptr + 4 * i + 8);
+ }
+ // Multiply-accumulate.
+ for (int j = 0; j < 2; j++)
+ {
+ acc[0].val[j] =
+ vmlal_s16(acc[0].val[j], vget_low_s16(filter[j]), vget_low_s16(input_dup2.val[j]));
+ acc[1].val[j] =
+ vmlal_s16(acc[1].val[j], vget_high_s16(filter[j]), vget_high_s16(input_dup2.val[j]));
+ }
+ // Store the accumulators back to acc_buffer.
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i].val[0]);
+ vst1q_s32(acc_buffer_ptr + 4 * i + 8, acc[i].val[1]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ // Load the inputs.
+ const int16_t input_val = *local_input_ptr++ + input_offset;
+ for (int i = 0; i < 2; i++)
+ {
+ *acc_buffer_ptr++ += static_cast<int32_t>(local_filter_ptr[i]) * input_val;
+ }
+ local_filter_ptr += 2;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 0, 1>
+{
+ static void Run(int num_output_pixels, int input_depth, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ const int8_t *local_filter_ptr = filter_ptr;
+ const int8_t *local_input_ptr = input_ptr;
+ int ic = 0;
+ // Handle 16 input channels at a time.
+ for (; ic <= input_depth - 16; ic += 16)
+ {
+ // Load the filters.
+ int8x8_t filter_s8_0 = vld1_s8(local_filter_ptr + 8 * 0);
+ int8x8_t filter_s8_1 = vld1_s8(local_filter_ptr + 8 * 1);
+ local_filter_ptr += 16;
+ int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+ int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8_0 = vld1_s8(local_input_ptr + 8 * 0);
+ int8x8_t input_s8_1 = vld1_s8(local_input_ptr + 8 * 1);
+ local_input_ptr += 16;
+ int16x8_t input_0 = vmovl_s8(input_s8_0);
+ int16x8_t input_1 = vmovl_s8(input_s8_1);
+ input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+ input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), vget_low_s16(filter_0));
+ acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), vget_high_s16(filter_0));
+ acc_2 = vmlal_s16(acc_2, vget_low_s16(input_1), vget_low_s16(filter_1));
+ acc_3 = vmlal_s16(acc_3, vget_high_s16(input_1), vget_high_s16(filter_1));
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ acc_buffer_ptr += 16;
+ }
+ // Handle 8 input channels at a time.
+ for (; ic <= input_depth - 8; ic += 8)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(local_filter_ptr);
+ local_filter_ptr += 8;
+ const int16x8_t filter = vmovl_s8(filter_s8);
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(local_input_ptr);
+ local_input_ptr += 8;
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ // Handle one input channel at a time.
+ for (; ic < input_depth; ic++)
+ {
+ const int16_t input_val = *local_input_ptr++ + input_offset;
+ const int16_t filter_val = *local_filter_ptr++;
+ *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
+ }
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 16, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter_s8[i] = vld1_s8(filter_ptr + 8 * i);
+ }
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8[i]);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input_s8[i] = vld1_s8(input_ptr + 8 * i);
+ }
+ input_ptr += input_ptr_increment;
+ int16x8_t input[2];
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vmovl_s8(input_s8[i]);
+ }
+ for (int i = 0; i < 2; i++)
+ {
+ input[i] = vaddq_s16(input[i], vdupq_n_s16(input_offset));
+ }
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_s16(acc[2 * i + 0], vget_low_s16(input[i]), vget_low_s16(filter[i]));
+ acc[2 * i + 1] =
+ vmlal_s16(acc[2 * i + 1], vget_high_s16(input[i]), vget_high_s16(filter[i]));
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 8, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+ const int16x8_t filter = vmovl_s8(filter_s8);
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ const int8x8_t input_s8 = vld1_s8(input_ptr);
+ const int16x8_t input_s16 = vmovl_s8(input_s8);
+ const int16x8_t input = vaddq_s16(input_s16, vdupq_n_s16(input_offset));
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_s16(acc[0], vget_low_s16(input), vget_low_s16(filter));
+ acc[1] = vmlal_s16(acc[1], vget_high_s16(input), vget_high_s16(filter));
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ input_ptr += input_ptr_increment;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 16>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter_s8[i] = vld1_s8(filter_ptr + 8 * i);
+ }
+ int16x8_t filter[2];
+ for (int i = 0; i < 2; i++)
+ {
+ filter[i] = vmovl_s8(filter_s8[i]);
+ }
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ int8_t input_s8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[4];
+ for (int i = 0; i < 4; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ for (int i = 0; i < 2; i++)
+ {
+ acc[2 * i + 0] = vmlal_n_s16(acc[2 * i + 0], vget_low_s16(filter[i]), input);
+ acc[2 * i + 1] = vmlal_n_s16(acc[2 * i + 1], vget_high_s16(filter[i]), input);
+ }
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 4; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 16;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 32>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
+ int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
+ int8x8_t filter_s8_2 = vld1_s8(filter_ptr + 8 * 2);
+ int8x8_t filter_s8_3 = vld1_s8(filter_ptr + 8 * 3);
+ int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+ int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+ int16x8_t filter_2 = vmovl_s8(filter_s8_2);
+ int16x8_t filter_3 = vmovl_s8(filter_s8_3);
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ int8_t input_s8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ int32x4_t acc_5 = vld1q_s32(acc_buffer_ptr + 4 * 5);
+ int32x4_t acc_6 = vld1q_s32(acc_buffer_ptr + 4 * 6);
+ int32x4_t acc_7 = vld1q_s32(acc_buffer_ptr + 4 * 7);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+ acc_4 = vmlal_n_s16(acc_4, vget_low_s16(filter_2), input);
+ acc_5 = vmlal_n_s16(acc_5, vget_high_s16(filter_2), input);
+ acc_6 = vmlal_n_s16(acc_6, vget_low_s16(filter_3), input);
+ acc_7 = vmlal_n_s16(acc_7, vget_high_s16(filter_3), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ vst1q_s32(acc_buffer_ptr + 4 * 5, acc_5);
+ vst1q_s32(acc_buffer_ptr + 4 * 6, acc_6);
+ vst1q_s32(acc_buffer_ptr + 4 * 7, acc_7);
+ acc_buffer_ptr += 32;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 20>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ // NEON wants to load 8 bytes at a time, but 20 is not divisible by 8.
+ // We load the first 16 bytes into filter_s8_{0,1} as usual.
+ // Then we load the 8 last bytes into filter_s8_x (x for 'extra').
+ // This is redundant: the first 4 bytes of filter_s8_x are the same
+ // as the last 4 bytes of filter_s8_1.
+ int8x8_t filter_s8_0 = vld1_s8(filter_ptr + 8 * 0);
+ int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 8 * 1);
+ int8x8_t filter_s8_x = vld1_s8(filter_ptr + 8 * 1 + 4);
+ int16x8_t filter_0 = vmovl_s8(filter_s8_0);
+ int16x8_t filter_1 = vmovl_s8(filter_s8_1);
+ int16x8_t filter_x = vmovl_s8(filter_s8_x);
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ int8_t input_s8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+ int32x4_t acc_3 = vld1q_s32(acc_buffer_ptr + 4 * 3);
+ int32x4_t acc_4 = vld1q_s32(acc_buffer_ptr + 4 * 4);
+ // Multiply-accumulate
+ acc_0 = vmlal_n_s16(acc_0, vget_low_s16(filter_0), input);
+ acc_1 = vmlal_n_s16(acc_1, vget_high_s16(filter_0), input);
+ acc_2 = vmlal_n_s16(acc_2, vget_low_s16(filter_1), input);
+ acc_3 = vmlal_n_s16(acc_3, vget_high_s16(filter_1), input);
+ acc_4 = vmlal_n_s16(acc_4, vget_high_s16(filter_x), input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+ vst1q_s32(acc_buffer_ptr + 4 * 3, acc_3);
+ vst1q_s32(acc_buffer_ptr + 4 * 4, acc_4);
+ acc_buffer_ptr += 20;
+ }
+ }
+};
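+
+// Byte coverage of the 20-wide filter loads above (derived from the loads
+// in this kernel): filter_s8_0 covers filter bytes [0, 8), filter_s8_1
+// covers [8, 16), and filter_s8_x covers [12, 20). acc_0..acc_3 consume
+// bytes [0, 16) through filter_0/filter_1, while acc_4 consumes bytes
+// [16, 20) via vget_high_s16(filter_x); the overlapping bytes [12, 16) of
+// filter_x are loaded but never used.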
+
+template <> struct QuantizedDepthwiseConvKernel<true, 1, 8>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ const int8x8_t filter_s8 = vld1_s8(filter_ptr);
+ const int16x8_t filter = vmovl_s8(filter_s8);
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ int8_t input_s8 = *input_ptr;
+ input_ptr += input_ptr_increment;
+ int16_t input = static_cast<int16_t>(input_s8 + input_offset);
+ // Load the accumulators from acc_buffer
+ int32x4_t acc[2];
+ for (int i = 0; i < 2; i++)
+ {
+ acc[i] = vld1q_s32(acc_buffer_ptr + 4 * i);
+ }
+ // Multiply-accumulate
+ acc[0] = vmlal_n_s16(acc[0], vget_low_s16(filter), input);
+ acc[1] = vmlal_n_s16(acc[1], vget_high_s16(filter), input);
+ // Store the accumulators back to acc_buffer
+ for (int i = 0; i < 2; i++)
+ {
+ vst1q_s32(acc_buffer_ptr + 4 * i, acc[i]);
+ }
+ acc_buffer_ptr += 8;
+ }
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<true, 2, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+
+ // Handle 2 output pixels at a time.
+ for (; outp <= num_output_pixels - 2; outp += 2)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ int16x4_t input_s16 = vdup_n_s16(0);
+ input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 0);
+ input_ptr += input_ptr_increment;
+ input_s16 = vset_lane_s16((reinterpret_cast<const int16_t *>(input_ptr))[0], input_s16, 1);
+ input_ptr += input_ptr_increment;
+ input_s16 = vget_low_s16(vmovl_s8(vreinterpret_s8_s16(input_s16)));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer.
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle 1 output pixel at a time.
+ for (; outp < num_output_pixels; outp++)
+ {
+ // Load the accumulators from acc_buffer.
+ int32x2_t acc = vld1_s32(acc_buffer_ptr);
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+
+ // Multiply-accumulate.
+ acc = vget_low_s32(vmlal_s16(vcombine_s32(acc, acc), filter, input));
+ // Store the accumulators back to acc_buffer.
+ vst1_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 2;
+ }
+ }
+};
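+
+// A note on the reinterpret_cast trick in the 2-pixel loop above: each
+// 16-bit load fetches one pixel's two adjacent int8 channel values in a
+// single access; the lanes are then reinterpreted back to int8 and widened
+// with vmovl_s8, so the int16 view is purely a load-width trick, not an
+// arithmetic one. This assumes input_ptr may be read at 16-bit granularity.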
+
+template <> struct QuantizedDepthwiseConvKernel<true, 4, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ if (num_output_pixels <= 0)
+ {
+ return;
+ }
+
+ // Load the filters.
+ int8x8_t filter_s8 = vdup_n_s8(0);
+ filter_s8 = vset_lane_s8(filter_ptr[0], filter_s8, 0);
+ filter_s8 = vset_lane_s8(filter_ptr[1], filter_s8, 1);
+ filter_s8 = vset_lane_s8(filter_ptr[2], filter_s8, 2);
+ filter_s8 = vset_lane_s8(filter_ptr[3], filter_s8, 3);
+ const int16x4_t filter = vget_low_s16(vmovl_s8(filter_s8));
+
+ int outp = 0;
+
+ // Handle one output pixel at a time until the second-to-last pixel. We
+ // stop there because the vector load reads eight input bytes while only
+ // four are consumed, so the last pixel must avoid reading past the end of
+ // the input.
+ for (; outp < num_output_pixels - 1; outp++)
+ {
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vld1_s8(input_ptr);
+ input_ptr += input_ptr_increment;
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ acc_buffer_ptr += 4;
+ }
+
+ // Handle the last output pixel.
+ // Load the accumulators from acc_buffer
+ int32x4_t acc = vld1q_s32(acc_buffer_ptr);
+
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8 = vdup_n_s8(0);
+ input_s8 = vset_lane_s8(input_ptr[0], input_s8, 0);
+ input_s8 = vset_lane_s8(input_ptr[1], input_s8, 1);
+ input_s8 = vset_lane_s8(input_ptr[2], input_s8, 2);
+ input_s8 = vset_lane_s8(input_ptr[3], input_s8, 3);
+ const int16x4_t input_s16 = vget_low_s16(vmovl_s8(input_s8));
+ const int16x4_t input = vadd_s16(input_s16, vdup_n_s16(input_offset));
+ // Multiply-accumulate
+ acc = vmlal_s16(acc, filter, input);
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr, acc);
+ }
+};
+
+template <> struct QuantizedDepthwiseConvKernel<false, 12, 1>
+{
+ static void Run(int num_output_pixels, int /* input_depth */, int /* depth_multiplier */,
+ const int8_t *input_ptr, int16_t input_offset, int input_ptr_increment,
+ const int8_t *filter_ptr, int32_t *acc_buffer_ptr)
+ {
+ // Load the filters.
+ int8x8_t filter_s8_0 = vld1_s8(filter_ptr);
+ int8x8_t filter_s8_1 = vld1_s8(filter_ptr + 4);
+ int16x8_t filter_s16_0 = vmovl_s8(filter_s8_0);
+ int16x8_t filter_s16_1 = vmovl_s8(filter_s8_1);
+ int16x4_t filter_0 = vget_low_s16(filter_s16_0);
+ int16x4_t filter_1 = vget_high_s16(filter_s16_0);
+ int16x4_t filter_2 = vget_high_s16(filter_s16_1);
+
+ // Handle one output pixel at a time.
+ for (int outp = 0; outp < num_output_pixels; outp++)
+ {
+ // Load the inputs, add input_offset.
+ int8x8_t input_s8_0 = vld1_s8(input_ptr);
+ int8x8_t input_s8_1 = vld1_s8(input_ptr + 4);
+ input_ptr += input_ptr_increment;
+ int16x8_t input_0 = vmovl_s8(input_s8_0);
+ int16x8_t input_1 = vmovl_s8(input_s8_1);
+ input_0 = vaddq_s16(input_0, vdupq_n_s16(input_offset));
+ input_1 = vaddq_s16(input_1, vdupq_n_s16(input_offset));
+
+ // Load the accumulators from acc_buffer
+ int32x4_t acc_0 = vld1q_s32(acc_buffer_ptr + 4 * 0);
+ int32x4_t acc_1 = vld1q_s32(acc_buffer_ptr + 4 * 1);
+ int32x4_t acc_2 = vld1q_s32(acc_buffer_ptr + 4 * 2);
+
+ // Multiply-accumulate
+ acc_0 = vmlal_s16(acc_0, vget_low_s16(input_0), filter_0);
+ acc_1 = vmlal_s16(acc_1, vget_high_s16(input_0), filter_1);
+ acc_2 = vmlal_s16(acc_2, vget_high_s16(input_1), filter_2);
+
+ // Store the accumulators back to acc_buffer
+ vst1q_s32(acc_buffer_ptr + 4 * 0, acc_0);
+ vst1q_s32(acc_buffer_ptr + 4 * 1, acc_1);
+ vst1q_s32(acc_buffer_ptr + 4 * 2, acc_2);
+
+ acc_buffer_ptr += 12;
+ }
+ }
+};
+#endif
+
+// Accumulates the effect of one row of the filter on a segment of one row
+// of the output, reading the corresponding row of the input.
+template <bool kAllowStrided, int kFixedInputDepth, int kFixedDepthMultiplier>
+void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor, int input_depth,
+ int input_width, const int8_t *input_data, int16_t input_offset,
+ int pad_width, int depth_multiplier, int filter_width,
+ const int8_t *filter_data, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth, int32_t *acc_buffer)
+{
+ // Consistency-check the parameters. This matters in particular for keeping
+ // the number of template instantiations minimal, so we don't increase
+ // binary size unnecessarily.
+ static_assert(kFixedDepthMultiplier || !kFixedInputDepth, "");
+ static_assert(kFixedInputDepth || kAllowStrided, "");
+ assert(stride == 1 || kAllowStrided);
+ if (kFixedInputDepth)
+ {
+ assert(input_depth == kFixedInputDepth);
+ }
+ if (kFixedDepthMultiplier)
+ {
+ assert(depth_multiplier == kFixedDepthMultiplier);
+ }
+ assert(output_depth == input_depth * depth_multiplier);
+ const int input_ptr_increment = stride * input_depth;
+ const int8_t *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ // For the current (filter_x, filter_y) point in the filter,
+ // compute the boundaries of the corresponding output row segment.
+ int out_x_loop_start_unclamped = 0;
+ int out_x_loop_end_unclamped = 0;
+ if (kAllowStrided)
+ {
+ if (stride == 2)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 1) / 2;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
+ }
+ else if (stride == 4)
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + 3) / 4;
+ out_x_loop_end_unclamped = (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
+ }
+ else
+ {
+ out_x_loop_start_unclamped = (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+ out_x_loop_end_unclamped =
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride;
+ }
+ }
+ else
+ {
+ out_x_loop_start_unclamped = pad_width - dilation_factor * filter_x;
+ out_x_loop_end_unclamped = pad_width + input_width - dilation_factor * filter_x;
+ }
+ // The kernel will have to iterate on the segment of the
+ // output row that starts at out_x_loop_start and ends at out_x_loop_end.
+ const int out_x_loop_start = std::max(out_x_buffer_start, out_x_loop_start_unclamped);
+ const int out_x_loop_end = std::min(out_x_buffer_end, out_x_loop_end_unclamped);
+
+ int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const int8_t *input_ptr = input_data + in_x_origin * input_depth;
+ const int num_output_pixels = out_x_loop_end - out_x_loop_start;
+ QuantizedDepthwiseConvKernel<kAllowStrided, kFixedInputDepth, kFixedDepthMultiplier>::Run(
+ num_output_pixels, input_depth, depth_multiplier, input_ptr, input_offset,
+ input_ptr_increment, filter_base_ptr, acc_buffer_ptr);
+ filter_base_ptr += output_depth;
+ }
+}
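+
+// A worked example of the loop-boundary computation above, using
+// hypothetical parameters: with stride == 2, dilation_factor == 1,
+// pad_width == 1, input_width == 5 and filter_x == 0, the unclamped bounds
+// come out as start = (1 + 1) / 2 = 1 and end = (1 + 5 + 1) / 2 = 3; output
+// columns 1 and 2 are exactly those whose in_x = out_x * 2 - 1 lands inside
+// [0, 5).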
+
+// Generic fallback of QuantizedDepthwiseConvAccumRow: portable, non-templatized.
+inline void QuantizedDepthwiseConvAccumRowGeneric(int stride, int dilation_factor, int input_depth,
+ int input_width, const int8_t *input_data,
+ int16_t input_offset, int pad_width,
+ int depth_multiplier, int filter_width,
+ const int8_t *filter_data, int out_x_buffer_start,
+ int out_x_buffer_end, int output_depth,
+ int32_t *acc_buffer)
+{
+ const int8_t *filter_base_ptr = filter_data;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int out_x_loop_start =
+ std::max(out_x_buffer_start, (pad_width - dilation_factor * filter_x + stride - 1) / stride);
+ const int out_x_loop_end =
+ std::min(out_x_buffer_end,
+ (pad_width + input_width - dilation_factor * filter_x + stride - 1) / stride);
+
+ int32_t *acc_buffer_ptr = acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
+ const int in_x_origin = (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
+ const int8_t *input_ptr = input_data + in_x_origin * input_depth;
+ const int input_ptr_increment = (stride - 1) * input_depth;
+ for (int out_x = out_x_loop_start; out_x < out_x_loop_end; out_x++)
+ {
+ const int8_t *filter_ptr = filter_base_ptr;
+ for (int ic = 0; ic < input_depth; ++ic)
+ {
+ const int16_t input_val = *input_ptr++ + input_offset;
+ for (int m = 0; m < depth_multiplier; m++)
+ {
+ const int16_t filter_val = *filter_ptr++;
+ *acc_buffer_ptr++ += static_cast<int32_t>(filter_val) * input_val;
+ }
+ }
+ input_ptr += input_ptr_increment;
+ }
+ filter_base_ptr += output_depth;
+ }
+}
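+
+// Channel layout handled by the generic loop above: output channel
+// ic * depth_multiplier + m accumulates input channel ic against filter
+// value filter[ic * depth_multiplier + m]. For example, with
+// input_depth == 2 and depth_multiplier == 3, one pixel's six accumulators
+// update as acc[ic * 3 + m] += filter[ic * 3 + m] * (in[ic] + input_offset).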
+
+// Initializes the accumulator buffer with bias values.
+inline void DepthwiseConvInitAccBuffer(int num_output_pixels, int output_depth,
+ const int32_t *bias_data, int32_t *acc_buffer)
+{
+ int i = 0;
+#ifdef USE_NEON
+ if (output_depth == 1)
+ {
+ const int32x4_t b = vdupq_n_s32(bias_data[0]);
+ for (; i <= num_output_pixels - 16; i += 16)
+ {
+ vst1q_s32(acc_buffer + i + 0, b);
+ vst1q_s32(acc_buffer + i + 4, b);
+ vst1q_s32(acc_buffer + i + 8, b);
+ vst1q_s32(acc_buffer + i + 12, b);
+ }
+ for (; i <= num_output_pixels - 4; i += 4)
+ {
+ vst1q_s32(acc_buffer + i, b);
+ }
+ }
+ else if (output_depth == 2)
+ {
+ int32x4_t b = vdupq_n_s32(bias_data[0]);
+ b = vsetq_lane_s32(bias_data[1], b, 1);
+ b = vsetq_lane_s32(bias_data[1], b, 3);
+ for (; i <= num_output_pixels - 8; i += 8)
+ {
+ vst1q_s32(acc_buffer + 2 * i + 0, b);
+ vst1q_s32(acc_buffer + 2 * i + 4, b);
+ vst1q_s32(acc_buffer + 2 * i + 8, b);
+ vst1q_s32(acc_buffer + 2 * i + 12, b);
+ }
+ for (; i <= num_output_pixels - 2; i += 2)
+ {
+ vst1q_s32(acc_buffer + 2 * i, b);
+ }
+ }
+ else if (output_depth == 4)
+ {
+ const int32x4_t b = vld1q_s32(bias_data);
+ for (; i <= num_output_pixels - 4; i += 4)
+ {
+ vst1q_s32(acc_buffer + 4 * i + 0, b);
+ vst1q_s32(acc_buffer + 4 * i + 4, b);
+ vst1q_s32(acc_buffer + 4 * i + 8, b);
+ vst1q_s32(acc_buffer + 4 * i + 12, b);
+ }
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 4 * i, b);
+ }
+ }
+ else if (output_depth == 8)
+ {
+ const int32x4_t b0 = vld1q_s32(bias_data);
+ const int32x4_t b1 = vld1q_s32(bias_data + 4);
+ for (; i <= num_output_pixels - 2; i += 2)
+ {
+ vst1q_s32(acc_buffer + 8 * i + 0, b0);
+ vst1q_s32(acc_buffer + 8 * i + 4, b1);
+ vst1q_s32(acc_buffer + 8 * i + 8, b0);
+ vst1q_s32(acc_buffer + 8 * i + 12, b1);
+ }
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 8 * i + 0, b0);
+ vst1q_s32(acc_buffer + 8 * i + 4, b1);
+ }
+ }
+ else if (output_depth == 16)
+ {
+ const int32x4_t b0 = vld1q_s32(bias_data);
+ const int32x4_t b1 = vld1q_s32(bias_data + 4);
+ const int32x4_t b2 = vld1q_s32(bias_data + 8);
+ const int32x4_t b3 = vld1q_s32(bias_data + 12);
+ for (; i < num_output_pixels; i++)
+ {
+ vst1q_s32(acc_buffer + 16 * i + 0, b0);
+ vst1q_s32(acc_buffer + 16 * i + 4, b1);
+ vst1q_s32(acc_buffer + 16 * i + 8, b2);
+ vst1q_s32(acc_buffer + 16 * i + 12, b3);
+ }
+ }
+#endif
+ for (; i < num_output_pixels; i++)
+ {
+ memcpy(acc_buffer + i * output_depth, bias_data, sizeof(acc_buffer[0]) * output_depth);
+ }
+}
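+
+// All the NEON specializations above implement the same semantics as the
+// final memcpy fallback: acc_buffer[i * output_depth + d] = bias_data[d]
+// for every output pixel i. An equivalent scalar sketch, kept disabled and
+// shown only for reference:
+#if 0
+inline void InitAccBufferScalar(int num_output_pixels, int output_depth,
+ const int32_t *bias_data, int32_t *acc_buffer)
+{
+ for (int i = 0; i < num_output_pixels; i++)
+ {
+ for (int d = 0; d < output_depth; d++)
+ {
+ acc_buffer[i * output_depth + d] = bias_data[d];
+ }
+ }
+}
+#endif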
+
+inline void DepthwiseConvGeneral(const DepthwiseConvParams &params,
+ const int32_t *output_multiplier, const int32_t *output_shift,
+ const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data,
+ const Shape & /* bias_shape */, const int32_t *bias_data,
+ const Shape &output_shape, int8_t *output_data, int thread_start,
+ int thread_end, int thread_dim)
+{
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int depth_multiplier = params.depth_multiplier;
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+ const int32_t input_offset = params.input_offset;
+ const int32_t output_offset = params.output_offset;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int input_depth = input_shape.Dims(3);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_rows = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
+
+ static const int kAccBufferMaxSize = 2048;
+ int32_t acc_buffer[kAccBufferMaxSize];
+ assert(kAccBufferMaxSize >= output_depth);
+ const int kOutputPixelsInAccBuffer = kAccBufferMaxSize / output_depth;
+ const int kAccBufferActualSize = kOutputPixelsInAccBuffer * output_depth;
+ UNUSED_RELEASE(kAccBufferActualSize);
+ assert(kOutputPixelsInAccBuffer * output_depth <= kAccBufferActualSize);
+ assert(kAccBufferActualSize <= kAccBufferMaxSize);
+ assert(kOutputPixelsInAccBuffer >= 1);
+ assert(thread_dim == 0 || thread_dim == 1);
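+
+ // A worked example of the sizing above (illustrative): with
+ // output_depth == 24, kOutputPixelsInAccBuffer == 2048 / 24 == 85 and
+ // kAccBufferActualSize == 85 * 24 == 2040, so each accumulation pass
+ // covers up to 85 output pixels within the 2048-entry buffer.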
+
+ // row_accum_func will point to the core accumulation function to be used
+ // for this DepthwiseConv op.
+ using row_accum_func_t = decltype(&QuantizedDepthwiseConvAccumRowGeneric);
+ row_accum_func_t row_accum_func = nullptr;
+
+#define TFMINI_USE_DEPTHWISECONV_KERNEL(ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER) \
+ if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) && \
+ (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) && \
+ depth_multiplier == FIXED_DEPTH_MULTIPLIER) \
+ { \
+ row_accum_func = \
+ QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH, FIXED_DEPTH_MULTIPLIER>; \
+ }
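+
+// Illustrative expansion of the macro above for one instantiation,
+// TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1), kept disabled and shown
+// only to make the dispatch condition concrete:
+#if 0
+ if (!row_accum_func && (stride_width == 1 || false) &&
+ (input_depth == 8 || 8 == 0) && depth_multiplier == 1)
+ {
+ row_accum_func = QuantizedDepthwiseConvAccumRow<false, 8, 1>;
+ }
+#endif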
+
+#ifdef USE_NEON
+ // We go over our list of kernels in decreasing order of preference
+ // for the cases where multiple kernels could apply.
+
+ // Start with the fastest kernels: AllowStrided=false, fixed input depth.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 1, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 4, 4)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(false, 12, 1)
+
+ // Next come the strided kernels: AllowStrided=true, fixed input depth.
+ // They are a bit less efficient, but allow stride!=1.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 16, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 16)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 20)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 32)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 1, 8)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 8, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 2, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 4, 1)
+
+ // Finally, the kernels allowing a variable input depth;
+ // these are the least efficient but the most general kernels.
+
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 1)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 2)
+ TFMINI_USE_DEPTHWISECONV_KERNEL(true, 0, 3)
+#endif // USE_NEON
+
+ // No matching fast kernel found; use the slow generic fallback.
+ if (!row_accum_func)
+ {
+ row_accum_func = QuantizedDepthwiseConvAccumRowGeneric;
+ }
+
+#undef TFMINI_USE_DEPTHWISECONV_KERNEL
+
+ const int input_height_stride = input_shape.Dims(3) * input_shape.Dims(2);
+ const int input_batch_stride = input_height_stride * input_shape.Dims(1);
+ const int filter_height_stride = filter_shape.Dims(3) * filter_shape.Dims(2);
+
+ // Now that we have determined row_accum_func, we can start work.
+ int batch_start = 0;
+ int batch_end = batches;
+ int row_start = 0;
+ int row_end = output_rows;
+ int output_ptr_offset = 0;
+
+ switch (thread_dim)
+ {
+ case 0:
+ assert(thread_start >= 0);
+ assert(thread_end <= batches);
+ batch_start = thread_start;
+ batch_end = thread_end;
+ output_ptr_offset = batch_start * FlatSizeSkipDim(output_shape, 0);
+ break;
+ case 1:
+ assert(thread_start >= 0);
+ assert(thread_end <= output_rows);
+ row_start = thread_start;
+ row_end = thread_end;
+ output_ptr_offset = row_start * output_width * output_depth;
+ break;
+ }
+
+ int8_t *output_ptr = output_data + output_ptr_offset;
+ int batch_step = (output_rows + row_start - row_end) * output_width * output_depth;
+ for (int b = batch_start; b < batch_end; ++b)
+ {
+ for (int out_y = row_start; out_y < row_end; ++out_y)
+ {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ const int filter_y_start =
+ std::max(0, (-in_y_origin + dilation_height_factor - 1) / dilation_height_factor);
+ const int filter_y_end =
+ std::min(filter_height, (input_height - in_y_origin + dilation_height_factor - 1) /
+ dilation_height_factor);
+ for (int out_x_buffer_start = 0; out_x_buffer_start < output_width;
+ out_x_buffer_start += kOutputPixelsInAccBuffer)
+ {
+ const int out_x_buffer_end =
+ std::min(output_width, out_x_buffer_start + kOutputPixelsInAccBuffer);
+ // We call a 'pixel' a group of activations that share all but the
+ // 'depth'/'channel' coordinate. num_output_pixels is the number of
+ // output pixels that we will accumulate in this loop iteration.
+ const int num_output_pixels = out_x_buffer_end - out_x_buffer_start;
+ // Initialize our local accumulator with the bias values, so we don't
+ // have to add them later.
+ DepthwiseConvInitAccBuffer(num_output_pixels, output_depth, bias_data, acc_buffer);
+ // Accumulation loop. Most of the time should be spent in here.
+ for (int filter_y = filter_y_start; filter_y < filter_y_end; ++filter_y)
+ {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ row_accum_func(stride_width, dilation_width_factor, input_depth, input_width,
+ input_data + in_y * input_height_stride + b * input_batch_stride,
+ input_offset, pad_width, depth_multiplier, filter_width,
+ filter_data + filter_y * filter_height_stride, out_x_buffer_start,
+ out_x_buffer_end, output_depth, acc_buffer);
+ }
+ // Finished accumulating int32_t values. Now we need to convert them to
+ // the final 8-bit form and store them.
+ const int num_output_values = output_depth * num_output_pixels;
+
+ Quantize(output_multiplier, output_shift, output_depth, num_output_values, output_offset,
+ output_activation_min, output_activation_max, acc_buffer, output_ptr);
+
+ output_ptr += num_output_values;
+ }
+ }
+ output_ptr += batch_step;
+ }
+}
+
+} // namespace depthwise_conv
+
+template <DepthwiseConvOutputRounding kOutputRounding>
+inline void DepthwiseConvWithRounding(const DepthwiseConvParams &params,
+ const int32_t *output_multiplier, const int32_t *output_shift,
+ const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data,
+ const Shape &bias_shape, const int32_t *bias_data,
+ const Shape &output_shape, int8_t *output_data,
+ int thread_start, int thread_end, int thread_dim)
+{
+ const int depth_multiplier = params.depth_multiplier;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ UNUSED_RELEASE(depth_multiplier);
+ UNUSED_RELEASE(dilation_width_factor);
+ UNUSED_RELEASE(dilation_height_factor);
+ assert(dilation_width_factor >= 1);
+ assert(dilation_height_factor >= 1);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int output_depth = MatchingDim(filter_shape, 3, output_shape, 3);
+ const int input_depth = input_shape.Dims(3);
+ UNUSED_RELEASE(output_depth);
+ UNUSED_RELEASE(input_depth);
+ assert(output_depth == input_depth * depth_multiplier);
+ assert(bias_shape.FlatSize() == output_depth);
+
+// TODO Use the code below
+#if 0
+// Enable for arm64, except for the Nvidia Linux 4 Tegra (L4T) running on
+// Jetson TX-2, whose compiler does not support the offsetof() macro.
+#if defined(__aarch64__) && !defined(GOOGLE_L4T)
+#if defined(__ANDROID__) && defined(__clang__)
+ CpuFlags cpu_flags;
+ GetCpuFlags(&cpu_flags);
+ const bool has_dot_product_instructions = cpu_flags.neon_dotprod;
+
+ // Dispatch to dot-product 3x3 kernels when supported.
+ if (has_dot_product_instructions)
+ {
+ using optimized_ops::depthwise_conv::DotProduct3x3KernelType;
+ DotProduct3x3KernelType kernel_type = optimized_ops::depthwise_conv::CategorizeDotProductKernel<
+ optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
+ input_shape, filter_shape, output_shape, params, output_shift);
+ if (kernel_type != DotProduct3x3KernelType::kNone)
+ {
+ DepthwiseConvParams params_copy = params;
+ params_copy.output_shift_per_channel = output_shift;
+ params_copy.output_multiplier_per_channel = output_multiplier;
+ optimized_ops::depthwise_conv::DepthwiseConvDotProduct3x3PerChannel<
+ DepthwiseConvImplementation::kUseNeon3x3DotProduct>(
+ params_copy, input_shape, input_data, filter_shape, filter_data, bias_shape, bias_data,
+ output_shape, output_data, thread_start, thread_end, thread_dim);
+ return;
+ }
+ }
+
+#endif
+ // Dispatch to non-dot-product 3x3 kernels when supported.
+
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+
+ // Call the kernel optimized for depthwise convolutions using 3x3 filters
+ // if the parameters are supported.
+ if (optimized_ops::depthwise_conv::Fast3x3FilterKernelSupported<
+ optimized_ops::depthwise_conv::QuantizationType::kPerChannelInt8>(
+ input_shape, filter_shape, stride_width, stride_height, dilation_width_factor,
+ dilation_height_factor, pad_width, pad_height, depth_multiplier, output_shape, 0,
+ output_shift))
+ {
+ optimized_ops::depthwise_conv::DepthwiseConv3x3FilterPerChannel<
+ DepthwiseConvOutputRounding::kUpward>(
+ params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
+ return;
+ }
+#endif
+
+#endif /* end of if 0 */
+
+ depthwise_conv::DepthwiseConvGeneral(
+ params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
+}
+
+inline void DepthwiseConvImpl(const DepthwiseConvParams &params, const int32_t *output_multiplier,
+ const int32_t *output_shift, const Shape &input_shape,
+ const int8_t *input_data, const Shape &filter_shape,
+ const int8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape,
+ int8_t *output_data, int thread_start, int thread_end, int thread_dim)
+{
+ return DepthwiseConvWithRounding<DepthwiseConvOutputRounding::kAwayFromZero>(
+ params, output_multiplier, output_shift, input_shape, input_data, filter_shape, filter_data,
+ bias_shape, bias_data, output_shape, output_data, thread_start, thread_end, thread_dim);
+}
+
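+// Captures one thread's slice of a depthwise convolution so that it can run
+// on cpu_backend_threadpool; Run() forwards the stored
+// [thread_start, thread_end) range to DepthwiseConvImpl.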
+template <typename T, typename TS> struct DepthwiseConvWorkerTask : cpu_backend_threadpool::Task
+{
+ DepthwiseConvWorkerTask(const DepthwiseConvParams &params, const int32_t *output_multiplier,
+ const int32_t *output_shift, const Shape &input_shape,
+ const T *input_data, const Shape &filter_shape, const T *filter_data,
+ const Shape &bias_shape, const TS *bias_data, const Shape &output_shape,
+ T *output_data, int thread_start, int thread_end, int thread_dim)
+ : params_(params), output_multiplier_(output_multiplier), output_shift_(output_shift),
+ input_shape_(input_shape), input_data_(input_data), filter_shape_(filter_shape),
+ filter_data_(filter_data), bias_shape_(bias_shape), bias_data_(bias_data),
+ output_shape_(output_shape), output_data_(output_data), thread_start_(thread_start),
+ thread_end_(thread_end), thread_dim_(thread_dim)
+ {
+ }
+
+ void Run() override
+ {
+ DepthwiseConvImpl(params_, output_multiplier_, output_shift_, input_shape_, input_data_,
+ filter_shape_, filter_data_, bias_shape_, bias_data_, output_shape_,
+ output_data_, thread_start_, thread_end_, thread_dim_);
+ }
+
+private:
+ const DepthwiseConvParams &params_;
+ const int32_t *output_multiplier_;
+ const int32_t *output_shift_;
+ const Shape &input_shape_;
+ const T *input_data_;
+ const Shape &filter_shape_;
+ const T *filter_data_;
+ const Shape &bias_shape_;
+ const TS *bias_data_;
+ const Shape &output_shape_;
+ T *output_data_;
+ int thread_start_;
+ int thread_end_;
+ int thread_dim_;
+};
+
+inline int HowManyConvThreads(const Shape &output_shape, const Shape &filter_shape, int thread_dim)
+{
+ constexpr int kMinMulPerThread = 8;
+ const int output_units = output_shape.Dims(thread_dim);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int num_mul_per_unit =
+ FlatSizeSkipDim(output_shape, thread_dim) * filter_height * filter_width;
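+ // Heuristic: only split the threaded dimension finely enough that each
+ // thread still performs at least kMinMulPerThread multiplications.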
+ const int min_units_per_thread = kMinMulPerThread / num_mul_per_unit + 1;
+ int thread_count = output_units / min_units_per_thread;
+ return thread_count;
+}
+
+inline void DepthwiseConvPerChannel(const DepthwiseConvParams &params,
+ const int32_t *output_multiplier, const int32_t *output_shift,
+ const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data,
+ const Shape &bias_shape, const int32_t *bias_data,
+ const Shape &output_shape, int8_t *output_data,
+ ruy::Context *ruy_context)
+{
+ UNUSED_ALL(params, output_multiplier, output_shift, input_shape, input_data, filter_shape,
+ filter_data, bias_shape, bias_data, output_shape, output_data, ruy_context);
+
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+
+ const int output_batches = output_shape.Dims(0);
+ const int output_rows = output_shape.Dims(1);
+ int thread_count_batch = HowManyConvThreads(output_shape, filter_shape, 0);
+ int thread_count_row = HowManyConvThreads(output_shape, filter_shape, 1);
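+ // Thread over whichever output dimension (batch or row) admits more
+ // parallelism under the per-thread workload heuristic above.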
+ int thread_dim, thread_count, thread_dim_size;
+ if (thread_count_batch > thread_count_row)
+ {
+ thread_dim = 0;
+ thread_dim_size = output_batches;
+ thread_count = thread_count_batch;
+ }
+ else
+ {
+ thread_dim = 1;
+ thread_dim_size = output_rows;
+ thread_count = thread_count_row;
+ }
+
+ // NOTE Borrow the ruy::Context to get the max_num_threads setting
+ // TODO Define and use max_num_threads for CPU backend
+ const int max_threads = ruy_context->max_num_threads();
+ thread_count = std::max(1, std::min(thread_count, max_threads));
+
+ if (thread_count == 1)
+ {
+ DepthwiseConvImpl(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape, output_data,
+ /*thread_start=*/0,
+ /*thread_end=*/output_rows, /*thread_dim=*/1);
+ }
+ else
+ {
+ std::vector<DepthwiseConvWorkerTask<int8_t, int32_t>> tasks;
+ // TODO(b/131746020) don't create new heap allocations every time.
+ // At least we make it a single heap allocation by using reserve().
+ tasks.reserve(thread_count);
+ int thread_start = 0;
+ for (int i = 0; i < thread_count; ++i)
+ {
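+ // Evenly split the remaining range over the remaining tasks; e.g. for
+ // thread_dim_size == 7 and thread_count == 3 this yields the ranges
+ // [0, 2), [2, 4) and [4, 7).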
+ int thread_end = thread_start + (thread_dim_size - thread_start) / (thread_count - i);
+ tasks.emplace_back(params, output_multiplier, output_shift, input_shape, input_data,
+ filter_shape, filter_data, bias_shape, bias_data, output_shape,
+ output_data, thread_start, thread_end, thread_dim);
+ thread_start = thread_end;
+ }
+ cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), ruy_context);
+ }
+}
+
+} // namespace optimized_integer_ops
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_OPTIMIZED_DEPTHWISE_CONV_INT8_H__
diff --git a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
index 93cb21e0b..96e1d9127 100644
--- a/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
+++ b/compute/cker/include/cker/operation/reference/BinaryArithmeticOps.h
@@ -62,7 +62,7 @@ inline void BinaryArithmeticOp(const BinaryArithmeticOpParam &params, const Shap
}
template <typename T>
-inline void BroadcastBinaryArithmeticOpSlowQuant8(
+inline typename std::enable_if_t<is_quant8<T>::value> BroadcastBinaryArithmeticOpSlow(
const BinaryArithmeticOpParam &params, const Shape &input1_shape, const T *input1_data,
const Shape &input2_shape, const T *input2_data, const Shape &output_shape, T *output_data,
const std::function<T(const BinaryArithmeticOpParam &params, const T &, const T &)> &fn)
@@ -72,11 +72,6 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8(
NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
const Shape extended_output_shape = Shape::ExtendedShape(4, output_shape);
- if ((params.quantized_activation_min < 0) && (params.quantized_activation_max > 255))
- {
- throw std::runtime_error{"Support only for Quant8."};
- }
-
// Comment from tensorflow lite:
//
// In Tensorflow, the dimensions are canonically named (batch_number, row,
@@ -98,11 +93,10 @@ inline void BroadcastBinaryArithmeticOpSlowQuant8(
{
for (int c = 0; c < extended_output_shape.Dims(3); ++c)
{
- output_data[Offset(extended_output_shape, b, y, x, c)] =
- ActivationFunctionWithMinMax<uint8_t>(
- fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)],
- input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
- params.quantized_activation_min, params.quantized_activation_max);
+ output_data[Offset(extended_output_shape, b, y, x, c)] = ActivationFunctionWithMinMax<T>(
+ fn(params, input1_data[SubscriptToIndex(desc1, b, y, x, c)],
+ input2_data[SubscriptToIndex(desc2, b, y, x, c)]),
+ params.quantized_activation_min, params.quantized_activation_max);
}
}
}
diff --git a/compute/cker/include/cker/operation/reference/Conv.h b/compute/cker/include/cker/operation/reference/Conv.h
index 43a5bf256..4474754af 100644
--- a/compute/cker/include/cker/operation/reference/Conv.h
+++ b/compute/cker/include/cker/operation/reference/Conv.h
@@ -190,6 +190,116 @@ inline void Conv(const ConvParams &params, const Shape &input_shape, const uint8
}
}
+inline void Conv(const ConvParams &params, const int32_t *output_multiplier,
+ const int32_t *output_shift, const Shape &input_shape, const int8_t *input_data,
+ const Shape &filter_shape, const int8_t *filter_data, const Shape &bias_shape,
+ const int32_t *bias_data, const Shape &output_shape, int8_t *output_data)
+{
+ UNUSED_RELEASE(bias_shape);
+ // Get parameters.
+ const int32_t input_offset = params.input_offset; // r = s(q - Z)
+ const int stride_width = params.stride_width;
+ const int stride_height = params.stride_height;
+ const int dilation_width_factor = params.dilation_width_factor;
+ const int dilation_height_factor = params.dilation_height_factor;
+ const int pad_width = params.padding_values.width;
+ const int pad_height = params.padding_values.height;
+ const int32_t output_offset = params.output_offset;
+
+ // Set min and max value of the output.
+ const int32_t output_activation_min = params.quantized_activation_min;
+ const int32_t output_activation_max = params.quantized_activation_max;
+
+ // Consistency check.
+ assert(output_activation_min < output_activation_max);
+ assert(input_shape.DimensionsCount() == 4);
+ assert(filter_shape.DimensionsCount() == 4);
+ assert(output_shape.DimensionsCount() == 4);
+ const int batches = MatchingDim(input_shape, 0, output_shape, 0);
+ const int input_depth = MatchingDim(input_shape, 3, filter_shape, 3);
+ const int output_depth = MatchingDim(filter_shape, 0, output_shape, 3);
+ if (bias_data)
+ {
+ assert(bias_shape.FlatSize() == output_depth);
+ }
+
+ // Check dimensions of the tensors.
+ const int input_height = input_shape.Dims(1);
+ const int input_width = input_shape.Dims(2);
+ const int filter_height = filter_shape.Dims(1);
+ const int filter_width = filter_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+ const int output_width = output_shape.Dims(2);
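+ // Layouts: input is NHWC [batch, in_y, in_x, in_channel], filter is
+ // [out_channel, filter_y, filter_x, in_channel], output is
+ // [batch, out_y, out_x, out_channel].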
+ for (int batch = 0; batch < batches; ++batch)
+ {
+ for (int out_y = 0; out_y < output_height; ++out_y)
+ {
+ const int in_y_origin = (out_y * stride_height) - pad_height;
+ for (int out_x = 0; out_x < output_width; ++out_x)
+ {
+ const int in_x_origin = (out_x * stride_width) - pad_width;
+ for (int out_channel = 0; out_channel < output_depth; ++out_channel)
+ {
+ int32_t acc = 0;
+ for (int filter_y = 0; filter_y < filter_height; ++filter_y)
+ {
+ const int in_y = in_y_origin + dilation_height_factor * filter_y;
+ for (int filter_x = 0; filter_x < filter_width; ++filter_x)
+ {
+ const int in_x = in_x_origin + dilation_width_factor * filter_x;
+
+ // Zero padding by omitting the areas outside the image.
+ const bool is_point_inside_image =
+ (in_x >= 0) && (in_x < input_width) && (in_y >= 0) && (in_y < input_height);
+
+ if (!is_point_inside_image)
+ {
+ continue;
+ }
+
+ for (int in_channel = 0; in_channel < input_depth; ++in_channel)
+ {
+ int32_t input_val = input_data[Offset(input_shape, batch, in_y, in_x, in_channel)];
+ int32_t filter_val =
+ filter_data[Offset(filter_shape, out_channel, filter_y, filter_x, in_channel)];
+ // Accumulate with 32 bits accumulator.
+ // In the nudging process during model quantization, we force
+ // real value of 0.0 be represented by a quantized value. This
+ // guarantees that the input_offset is an int8_t, even though
+ // it is represented using int32_t. int32_t += int8_t *
+ // (int8_t - int8_t) so the highest value we can get from each
+ // accumulation is [-127, 127] * ([-128, 127] -
+ // [-128, 127]), which is [-32512, 32512]. log2(32512)
+ // = 14.98, which means we can accumulate at least 2^16
+ // multiplications without overflow. The accumulator is
+ // applied to a filter so the accumulation logic will hold as
+ // long as the filter size (filter_y * filter_x * in_channel)
+ // does not exceed 2^16, which is the case in all the models
+ // we have seen so far.
+ // TODO(jianlijianli): Add a check to make sure the
+ // accumulator depth is smaller than 2^16.
+ acc += filter_val * (input_val + input_offset);
+ }
+ }
+ }
+
+ if (bias_data)
+ {
+ acc += bias_data[out_channel];
+ }
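+ // Requantize: scale the int32 accumulator to int8 with the per-channel
+ // fixed-point multiplier/shift, then add the output zero point and clamp
+ // to the activation range.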
+ acc = MultiplyByQuantizedMultiplier(acc, output_multiplier[out_channel],
+ output_shift[out_channel]);
+ acc += output_offset;
+ acc = std::max(acc, output_activation_min);
+ acc = std::min(acc, output_activation_max);
+ output_data[Offset(output_shape, batch, out_y, out_x, out_channel)] =
+ static_cast<int8_t>(acc);
+ }
+ }
+ }
+ }
+}
+
} // namespace reference
} // namespace cker
} // namespace nnfw
diff --git a/docs/conf.py b/docs/conf.py
index 68b7d0628..cb5309565 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -21,7 +21,7 @@ copyright = '2020, Samsung Research & contributors'
author = 'Samsung Research & contributors'
# The full version, including alpha/beta/rc tags
-release = '1.12.0'
+release = '1.15.0'
# -- General configuration ---------------------------------------------------
diff --git a/docs/howto/how-to-build-runtime-tizen-gbs-rpi4.md b/docs/howto/how-to-build-runtime-tizen-gbs-rpi4.md
index fd5f1349f..1f8c0c289 100644
--- a/docs/howto/how-to-build-runtime-tizen-gbs-rpi4.md
+++ b/docs/howto/how-to-build-runtime-tizen-gbs-rpi4.md
@@ -249,7 +249,7 @@ In your host, maybe with another terminal, download packages from
http://download.tizen.org/releases/daily/tizen/unified/latest/repos/standard/packages/aarch64/
```
-$ wget http://download.tizen.org/releases/daily/tizen/unified/latest/repos/standard/packages/aarch64/libarmcl-v20.05-17.5.aarch64.rpm
+$ wget http://download.tizen.org/releases/daily/tizen/unified/latest/repos/standard/packages/aarch64/libarmcl-v21.02-17.5.aarch64.rpm
$ wget http://download.tizen.org/releases/daily/tizen/unified/latest/repos/standard/packages/aarch64/libhdf5-101-1.10.1-3.85.aarch64.rpm
@@ -258,7 +258,7 @@ $ wget http://download.tizen.org/releases/daily/tizen/unified/latest/repos/stand
(4) Copy to device
```
-$ sdb push libarmcl-v20.05-17.5.aarch64.rpm /opt/usr/home/owner/share/tmp/
+$ sdb push libarmcl-v21.02-17.5.aarch64.rpm /opt/usr/home/owner/share/tmp/
$ sdb push libhdf5-101-1.10.1-3.85.aarch64.rpm /opt/usr/home/owner/share/tmp/
$ sdb push libhdf5_cpp101-1.10.1-3.85.aarch64.rpm /opt/usr/home/owner/share/tmp/
```
@@ -275,7 +275,7 @@ Within Tizen/RPi4 shell
```
sh-3.2# cd /opt/usr/home/owner/share/tmp/
-sh-3.2# rpm -i libarmcl-v20.05-17.5.aarch64.rpm
+sh-3.2# rpm -i libarmcl-v21.02-17.5.aarch64.rpm
sh-3.2# rpm -i libhdf5-101-1.10.1-3.85.aarch64.rpm
sh-3.2# rpm -i libhdf5_cpp101-1.10.1-3.85.aarch64.rpm
```
diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md
index 9181a6138..02ab47537 100644
--- a/docs/howto/how-to-build-runtime.md
+++ b/docs/howto/how-to-build-runtime.md
@@ -100,7 +100,7 @@ $ tree -L 3 ./Product/out
├── bin
│   ├── nnapi_test
│   ├── nnpackage_run
-│   ├── tflite_loader_test_tool
+│   ├── tflite_comparator
│   └── tflite_run
├── include
│   ├── nnfw
diff --git a/docs/howto/how-to-cross-build-runtime-for-aarch64.md b/docs/howto/how-to-cross-build-runtime-for-aarch64.md
index b30ce348a..4c8fe1d27 100644
--- a/docs/howto/how-to-cross-build-runtime-for-aarch64.md
+++ b/docs/howto/how-to-cross-build-runtime-for-aarch64.md
@@ -16,7 +16,7 @@ Use `install_rootfs.sh` script to prepare Root File System. You should have `sud
$ sudo ./tools/cross/install_rootfs.sh aarch64
```
- supports `arm`(default) and `aarch64` architecture for now
-- supports `xenial`(default), `trusty` and `bionic` release
+- supports `bionic`(default) and `focal` release
To see the options,
```
@@ -27,7 +27,7 @@ RootFS will be prepared at `tools/cross/rootfs/aarch64` folder.
***\* CAUTION: The OS version of the rootfs must match the OS version of the execution target device. Also, you need to match the Ubuntu version of the development PC with the Ubuntu version of the rootfs used for cross-building. Otherwise, unexpected build errors may occur.***
-If you are using Ubuntu 16.04 LTS, select `xenial`, if you are using Ubuntu 18.04 LTS, select `bionic`. You can check your Ubuntu code name in the following way.
+If you are using Ubuntu 18.04 LTS, select `bionic`; if you are using Ubuntu 20.04 LTS, select `focal`. You can check your Ubuntu code name in the following way.
```
$ cat /etc/lsb-release
@@ -44,7 +44,7 @@ If a build error occurs because the version of the development system and the ta
Set `ROOTFS_DIR` to a full path to prepare the rootfs at an alternative path.
```
-$ ROOTFS_DIR=/home/user/rootfs/aarch64-xenial sudo -E ./tools/cross/install_rootfs.sh aarch64
+$ ROOTFS_DIR=/home/user/rootfs/aarch64-bionic sudo -E ./tools/cross/install_rootfs.sh aarch64
```
### Using proxy
diff --git a/docs/howto/how-to-cross-build-runtime-for-arm.md b/docs/howto/how-to-cross-build-runtime-for-arm.md
index d9c1bfe75..32c64f85e 100644
--- a/docs/howto/how-to-cross-build-runtime-for-arm.md
+++ b/docs/howto/how-to-cross-build-runtime-for-arm.md
@@ -14,7 +14,7 @@ Use `install_rootfs.sh` script to prepare Root File System. You should have `sud
$ sudo ./tools/cross/install_rootfs.sh arm
```
- supports `arm`(default) and `aarch` architecture for now
-- supports `bionic`(default), `trusty`, `xenial` and `focal` release
+- supports `bionic`(default) and `focal` release
To see the options,
```
@@ -25,7 +25,7 @@ RootFS will be prepared at `tools/cross/rootfs/arm` folder.
***\* CAUTION: The OS version of the rootfs must match the OS version of the execution target device. Also, you need to match the Ubuntu version of the development PC with the Ubuntu version of the rootfs used for cross-building. Otherwise, unexpected build errors may occur.***
-If you are using Ubuntu 16.04 LTS, select `xenial`, if you are using Ubuntu 18.04 LTS, select `bionic`. You can check your Ubuntu code name in the following way.
+If you are using Ubuntu 18.04 LTS, select `bionic`; if you are using Ubuntu 20.04 LTS, select `focal`. You can check your Ubuntu code name in the following way.
```
$ cat /etc/lsb-release
@@ -42,7 +42,7 @@ If a build error occurs because the version of the development system and the ta
Set `ROOTFS_DIR` to a full path to prepare the rootfs at an alternative path.
```
-$ ROOTFS_DIR=/home/user/rootfs/arm-xenial sudo -E ./tools/cross/install_rootfs.sh arm
+$ ROOTFS_DIR=/home/user/rootfs/arm-bionic sudo -E ./tools/cross/install_rootfs.sh arm
```
### Using proxy
@@ -81,32 +81,20 @@ $ update-alternatives --install /usr/bin/arm-linux-gnueabihf-gcc arm-linux-gnuea
--slave /usr/bin/arm-linux-gnueabihf-gcov arm-linux-gnueabihf-gcov /usr/bin/arm-linux-gnueabihf-gcov-8
```
-### Ubuntu 16.04 LTS
+### Ubuntu 20.04 LTS
-On Ubuntu 16.04 or older, follow the next steps:
+Same as Ubuntu 18.04 LTS, except for the g++ version.
-```
-$ cd ~/your/path
-$ wget https://releases.linaro.org/components/toolchain/binaries/7.2-2017.11/arm-linux-gnueabihf/gcc-linaro-7.2.1-2017.11-x86_64_arm-linux-gnueabihf.tar.xz
-$ tar xvf gcc-linaro-7.2.1-2017.11-x86_64_arm-linux-gnueabihf.tar.xz
-$ echo 'export PATH=~/your/path/gcc-linaro-7.2.1-2017.11-x86_64_arm-linux-gnueabihf/bin:$PATH' >> ~/.bashrc
-```
+## Build and install ARM Compute Library
-Make sure you get `libstdc++.so` updated on your target with your new toolchain's corresponding one.
+Mostly, you only need to build ACL (ARM Compute Library) once.
-For example, if you installed gcc-linaro-7.2.1-2017.11 above, do
+To build ACL, you need to install scons:
```
-$ wget https://releases.linaro.org/components/toolchain/binaries/7.2-2017.11/arm-linux-gnueabihf/runtime-gcc-linaro-7.2.1-2017.11-arm-linux-gnueabihf.tar.xz
-$ tar xvf runtime-gcc-linaro-7.2.1-2017.11-arm-linux-gnueabihf.tar.xz
+$ sudo apt-get install scons
```
-Then, copy `libstdc++.so.6.0.24` into `/usr/lib/arm-linux-gnueabihf`, and update symbolic links on your device.
-
-## Build and install ARM Compute Library
-
-Mostly you only need once of ACL build.
-
ACL will be automatically installed in `externals/acl` when you build runtime without any changes.
You can check ACL source information in `infra/cmake/packages/ARMComputeSourceConfig.cmake`
diff --git a/docs/release/1.13/index.rst b/docs/release/1.13/index.rst
new file mode 100644
index 000000000..04aa2b76b
--- /dev/null
+++ b/docs/release/1.13/index.rst
@@ -0,0 +1,13 @@
+.. ONE documentation master file, created by
+ sphinx-quickstart on Wed Jan 14 16:48:12 2021.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+1.0
+===
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ ./release-note-1.13.0.md
diff --git a/docs/release/1.13/release-note-1.13.0.md b/docs/release/1.13/release-note-1.13.0.md
new file mode 100644
index 000000000..31e3a0d77
--- /dev/null
+++ b/docs/release/1.13/release-note-1.13.0.md
@@ -0,0 +1,12 @@
+# Release Note 1.13.0
+
+## ONE Compiler
+
+### Compiler Frontend
+
+- Add optimization passes: ConvertNCHWToNHWC, FoldSparseToDensePass, FuseBatchNormWithConvPass, ForwardReshapeToUnaryOpPass, RemoveUnnecessarySlicePass, RemoveUnnecessarySplitPass, RemoveUnnecessaryReshapePass, RemoveRedundantReshape, SubstituteTransposeToReshapePass and SubstituteSqueezeToReshapePass
+- Support more operators: FAKE_QUANT
+- Enhancements: Support auto-generated random input for record-minmax (for better quantization testing)
+- Changes: `--all` option renamed to `--O1` in circle2circle (and one-optimize)
+- Fixes: `tf2tfliteV2` accepts input shapes with the `--v2` option; lots of fixes to increase test coverage
+- Experimental: Compile ONNX models to circle
diff --git a/docs/release/1.14/index.rst b/docs/release/1.14/index.rst
new file mode 100644
index 000000000..c3d10bf33
--- /dev/null
+++ b/docs/release/1.14/index.rst
@@ -0,0 +1,13 @@
+.. ONE documentation master file, created by
+ sphinx-quickstart on Thu Mar 18 16:47:12 2021.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+1.0
+===
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ ./release-note-1.14.0.md
diff --git a/docs/release/1.14/release-note-1.14.0.md b/docs/release/1.14/release-note-1.14.0.md
new file mode 100644
index 000000000..7c567b0fe
--- /dev/null
+++ b/docs/release/1.14/release-note-1.14.0.md
@@ -0,0 +1,10 @@
+# Release Note 1.14.0
+
+## ONE Compiler
+
+### Compiler Frontend
+
+- `one-codegen` interface now distinguishes its own arguments from the backend's.
+- Adds `RemoveUnnecessaryStridedSlice` optimization pass.
+- Introduces experimental support for generating profile data.
+ - Adds `--generate_profile_data` option to `one-optimize`, `one-quantize`.
diff --git a/docs/release/1.15/index.rst b/docs/release/1.15/index.rst
new file mode 100644
index 000000000..1680aaf1f
--- /dev/null
+++ b/docs/release/1.15/index.rst
@@ -0,0 +1,13 @@
+.. ONE documentation master file, created by
+ sphinx-quickstart on Thu Mar 18 16:47:12 2021.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+1.0
+===
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ ./release-note-1.15.0.md
diff --git a/docs/release/1.15/release-note-1.15.0.md b/docs/release/1.15/release-note-1.15.0.md
new file mode 100644
index 000000000..106cefd42
--- /dev/null
+++ b/docs/release/1.15/release-note-1.15.0.md
@@ -0,0 +1,42 @@
+# Release Note 1.15.0
+
+## ONE Compiler
+
+### Compiler Frontend
+
+- Support more Ops for quantization
+- Fix `record-minmax` tool for bool type, NaN values
+- Fix `one-cmds` test scripts
+- Remove `stdex` module
+- `arser` supports short options
+
+
+## ONE Runtime
+
+### Runtime backend supports more operations and types
+
+- CPU backend
+ - Add: int8
+ - AvgPool2d: int8
+ - Conv2D: int8
+ - DepthwiseConv2D: int8
+ - Div: uint8
+ - Elu: float
+ - ExpandDims: int8
+ - LogicalAnd: boolean
+ - Maximum: uint8
+ - MaxPool2D: int8
+ - Minimum: uint8
+ - Mul: int8
+ - Pad: int8
+ - PadV2: int8
+ - Quantize: uint8, int8
+ - Reshape: int8
+ - ResizeBilinear: int8
+ - Softmax: int8
+ - Squeeze: int8
+ - Sub: int8
+
+### ARM Compute Library Update
+
+- ONERT uses Compute Library v21.02
diff --git a/docs/release/index.rst b/docs/release/index.rst
index 1a5a780c0..31cd0d792 100644
--- a/docs/release/index.rst
+++ b/docs/release/index.rst
@@ -23,3 +23,4 @@ Release
./1.10/index
./1.11/index
./1.12/index
+ ./1.13/index
diff --git a/docs/runtime/backend-api.md b/docs/runtime/backend-api.md
index b32690a00..54ef87090 100644
--- a/docs/runtime/backend-api.md
+++ b/docs/runtime/backend-api.md
@@ -28,24 +28,25 @@ C API above is just an entrypoint and it delegates core stuff to C++ API.
The major classes are described below. One must implement these classes (and some more) to create a backend.
- `Backend` : Responsible to create a backend context which is a set of backend components
-- `IConfig` : Configurations and miscellaneous stuff
+- `BackendContext` : Holds data for the current session and is also responsible for creating tensor objects and kernels
+ - `BackendContext::genTensors` : Create tensor objects
+ - `BackendContext::genKernels` : Create kernels
+- `IConfig` : Configurations and miscellaneous stuff (not session based, global)
- `ITensorRegistry` : A set of tensor(`ITensor`) objects that are used by the current backend
-- `ITensorBuilder` : Make tensor object and register it to `ITensorRegistry` and static tensors
-- `IKernelGenerator` : Generates operation kernels
Please refer to each class document for details. You may refer to [Bundle Backends](#bundle-backends) for actual implementation samples.
## Provided Backend Implementations
-We provide some backends along with the runtime. There is the special backend `controlflow` which is part of runtime core, and some bundle backends which are baseline backends and samples of backend implementation.
+We provide some backends along with the runtime. There is a special backend, `builtin`, which is part of the runtime core, and some bundle backends, which are baseline backends and samples of backend implementation.
-## `controlflow` Backend
+## `builtin` Backend
-`controlflow` is a special backend that is always loaded(statically linked, part of runtime core). It is implemented just like other backends, but there are some things that it does exclusively.
+`builtin` is a special backend that is always loaded (statically linked, part of the runtime core). It is implemented just like other backends, but there are some things that it does exclusively.
- Has kernels for If, While and Permute operations (kernels from other backends are never used)
-- The runtime core directly creates `controlflow`'s tensor objects to accept user-given input and output buffers
-- The runtime core gives the executor context to `controlflow` backend which allows control flow ops can change execution flow properly
+- The runtime core directly creates `builtin`'s tensor objects to accept user-given input and output buffers
+- The runtime core gives the executor context to the `builtin` backend, which allows control flow ops to change the execution flow properly
## Bundle Backends
diff --git a/docs/runtime/supported-operations-backend.md b/docs/runtime/supported-operations-backend.md
index 04ece9765..b5b5c6268 100644
--- a/docs/runtime/supported-operations-backend.md
+++ b/docs/runtime/supported-operations-backend.md
@@ -1,6 +1,6 @@
# Supported Operations and backend
-As of 2020-12-07
+As of 2021-03-08
### Raw-data format (float32, int32, boolean, etc)
@@ -14,6 +14,7 @@ ArgMin | O | O | O
AvgPool2D | O | O | O
BatchMatmul | O | |
BatchToSpaceND | O | O | O
+BroadcastTo | O | |
Cast | O | O | O
Concat | O | O | O
Conv2D | O | O | O
@@ -22,13 +23,16 @@ Custom | O | |
DepthToSpace | O | O | O
DepthwiseConv2D | O | O | O
Div | O | O | O
+Einsum | O | |
+Elu | O | |
EmbeddingLookup | | O | O
Equal | O | O | O
Exp | O | O | O
-ExpandDims | O | |
+ExpandDims | O | O | O
Fill | O | |
Floor | O | O | O
FullyConnected | O | O | O
+FusedBatchNorm | O | |
Gather | O | O | O
Greater | O | O | O
GreaterEqual | O | O | O
@@ -42,13 +46,13 @@ Less | O | O | O
LessEqual | O | O | O
LocalResponseNormalize | | O | O
Log | O | |
-LogicalAnd | | O | O
+LogicalAnd | O | O | O
LogicalNot | O | O | O
LogicalOr | O | O | O
Logistic | O | O | O
LogSoftmax | O | |
-LSHProjection | | |
LSTM | | O | O
+MatrixBandPart | O | |
Maximum | O | O | O
MaxPool2D | O | O | O
Mean | O | O | O
@@ -65,16 +69,18 @@ PReLU | | O | O
Quantize | O | |
Range | O | |
Rank | O | |
+ReduceAny(All) | O | |
ReduceAny(Any) | O | |
ReduceMax(Max) | O | O | O
ReduceMin(Min) | O | O | O
ReduceProd | O | |
ReduceSum(Sum) | O | O | O
ReLU | O | O | O
-ReLU6 | | O | O
+ReLU6 | O | O | O
Reshape | O | O | O
ResizeBilinear | O | O | O
-ReverseV2 | O | | O
+ResizeNearestNeighbor | | O | O
+ReverseV2 | O | O | O
RNN | | O | O
Round | O | |
Rsqrt | O | O | O
@@ -87,14 +93,13 @@ Softmax | O | O | O
SpaceToBatchND | O | O | O
SpaceToDepth | O | O | O
Split | O | O | O
-SplitV | O | |
+SplitV | O | O |
Sqrt | O | O | O
Square | O | | |
SquaredDifference | O | O | O
Squeeze | O | O | O
StridedSlice | O | O | O
Sub | O | O | O
-Svdf | | |
Tanh | O | O | O
Tile | O | |
TopKV2 | | | O
@@ -121,9 +126,11 @@ Custom | O | |
DepthToSpace | O | O | O
DepthwiseConv2D | O | O | O
Dequantize | O | O | O
+Div | O | |
EmbeddingLookup | | O | O
Equal | O | O | O
-ExpandDims | O | |
+Erf | O | |
+ExpandDims | O | O | O
FullyConnected | O | O | O
Gather | O | O | O
Greater | O | O | O
@@ -134,17 +141,17 @@ Less | O | O | O
LessEqual | O | O | O
Logistic | O | O | O
LogSoftmax | O | |
-Maximum | | O | O
+Maximum | O | O | O
MaxPool2D | O | O | O
Mean | O | O | O
-Minimum | | O | O
+Minimum | O | O | O
Mul | O | O |
NotEqual | O | O | O
-OneHot | | O |
Pack | | O | O
Pad | O | O | O
PadV2 | O | O | O
PReLU | | O | O
+Quantize | O | |
Rank | O | |
ReduceMax(Max) | | O |
ReduceMin(Min) | | O |
@@ -152,15 +159,17 @@ ReduceSum(Sum) | O | O |
ReLU | | O | O
ReLU6 | | O | O
Reshape | O | O | O
-ResizeBilinear | O | | O
+ResizeBilinear | O | O | O
+ResizeNearestNeighbor | | O | O
Shape | O | |
Slice | O | O | O
Softmax | O | O | O
SpaceToBatchND | O | O | O
SpaceToDepth | O | O | O
Split | O | O | O
-SplitV | O | |
+SplitV | O | O |
Squeeze | O | O | O
+StatelessRandomUniform | O | |
StridedSlice | | O | O
Sub | O | O | O
Tanh | O | O | O
@@ -173,10 +182,27 @@ Unpack(Unstack) | | O | O
Operation | CPU | ACL-CL | ACL-NEON
-- | -- | -- | --
+Add | O | O | O
ArgMax | O | O | O
ArgMin | O | O | O
-Concat | O | |
+AvgPool2D | O | |
+Concat | O | O | O
+Conv2D | O | |
DepthToSpace | O | |
-Dequantize | O | |
+DepthwiseConv2D | O | |
+Dequantize | O | O | O
+ExpandDims | O | O | O
+MaxPool2D | O | |
+Mul | O | O | O
+Pad | O | O | O
+PadV2 | O | |
+PReLU | | O | O
+Quantize | O | |
Rank | O | |
+Reshape | O | O | O
+ResizeBilinear | O | O | O
+ResizeNearestNeighbor | | O | O
Shape | O | |
+Softmax | O | O | O
+Squeeze | O | O | O
+Sub | O | O | O
diff --git a/infra/cmake/modules/IdentifyPlatform.cmake b/infra/cmake/modules/IdentifyPlatform.cmake
index 69fe48cad..cf56dd086 100644
--- a/infra/cmake/modules/IdentifyPlatform.cmake
+++ b/infra/cmake/modules/IdentifyPlatform.cmake
@@ -39,6 +39,8 @@ elseif("${HOST_ARCH}" STREQUAL "armv7l")
set(HOST_ARCH_BASE "arm")
elseif("${HOST_ARCH}" STREQUAL "aarch64")
set(HOST_ARCH_BASE "aarch64")
+elseif("${HOST_ARCH}" STREQUAL "i686")
+ set(HOST_ARCH_BASE "i686")
else()
message(FATAL_ERROR "'${HOST_ARCH}' architecture is not supported")
endif()
@@ -49,6 +51,8 @@ elseif("${TARGET_ARCH}" STREQUAL "armv7l")
set(TARGET_ARCH_BASE "arm")
elseif("${TARGET_ARCH}" STREQUAL "aarch64")
set(TARGET_ARCH_BASE "aarch64")
+elseif("${TARGET_ARCH}" STREQUAL "i686")
+ set(TARGET_ARCH_BASE "i686")
else()
message(FATAL_ERROR "'${TARGET_ARCH}' architecture is not supported")
endif()
diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake
index 0ffa0cd35..16e12bbca 100644
--- a/infra/cmake/packages/ARMComputeSourceConfig.cmake
+++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake
@@ -8,7 +8,7 @@ function(_ARMComputeSource_import)
nnas_include(OptionTools)
envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
- set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v20.05.tar.gz)
+ set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v21.02.tar.gz)
ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL})
set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE)
diff --git a/infra/cmake/packages/AbseilSourceConfig.cmake b/infra/cmake/packages/AbseilSourceConfig.cmake
index 8be732660..8aeb86db3 100644
--- a/infra/cmake/packages/AbseilSourceConfig.cmake
+++ b/infra/cmake/packages/AbseilSourceConfig.cmake
@@ -7,19 +7,14 @@ function(_AbseilSource_import)
nnas_include(ExternalSourceTools)
nnas_include(OptionTools)
- # NOTE TensorFlow 1.12 downloads abseil from the following URL
- # - https://github.com/abseil/abseil-cpp/archive/48cd2c3f351ff188bc85684b84a91b6e6d17d896.tar.gz
- #
- # The last change of "48cd2c3f351" was commited on 2018.09.27
- #
- # Let's use the latest released version (2020-02 release patch 2)
+ # NOTE TensorFlow 2.3 downloads abseil from the following URL
envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
- envoption(ABSEIL_URL ${EXTERNAL_DOWNLOAD_SERVER}/abseil/abseil-cpp/archive/20200225.2.tar.gz)
+ envoption(ABSEIL_URL ${EXTERNAL_DOWNLOAD_SERVER}/abseil/abseil-cpp/archive/df3ea785d8c30a9503321a3d35ee7d35808f190d.tar.gz)
ExternalSource_Download(ABSEIL
DIRNAME ABSEIL
URL ${ABSEIL_URL}
- CHECKSUM MD5=73f2b6e72f1599a9139170c29482ddc4)
+ CHECKSUM MD5=4d9aa7e757adf48fef171c85f0d88552)
set(AbseilSource_DIR ${ABSEIL_SOURCE_DIR} PARENT_SCOPE)
set(AbseilSource_FOUND TRUE PARENT_SCOPE)
diff --git a/infra/cmake/packages/OouraFFTSourceConfig.cmake b/infra/cmake/packages/OouraFFTSourceConfig.cmake
new file mode 100644
index 000000000..be551fbe4
--- /dev/null
+++ b/infra/cmake/packages/OouraFFTSourceConfig.cmake
@@ -0,0 +1,19 @@
+function(_OouraFFTSource_import)
+ if(NOT DOWNLOAD_OOURAFFT)
+ set(OouraFFTSource_FOUND FALSE PARENT_SCOPE)
+ return()
+ endif(NOT DOWNLOAD_OOURAFFT)
+
+ nnas_include(ExternalSourceTools)
+ nnas_include(OptionTools)
+
+ # NOTE TensorFlow 2.3 downloads OOURAFFT from the following URL
+ envoption(OOURAFFT_URL https://github.com/petewarden/OouraFFT/archive/v1.0.tar.gz)
+
+ ExternalSource_Download(OOURAFFT ${OOURAFFT_URL})
+
+ set(OouraFFTSource_DIR ${OOURAFFT_SOURCE_DIR} PARENT_SCOPE)
+ set(OouraFFTSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_OouraFFTSource_import)
+
+_OouraFFTSource_import()
diff --git a/infra/command/build-docker-image b/infra/command/build-docker-image
index f05266b58..9a044385c 100644
--- a/infra/command/build-docker-image
+++ b/infra/command/build-docker-image
@@ -14,7 +14,7 @@ DOCKER_FILE_RPATH_BASE="infra/docker"
DOCKER_BUILD_ARGS=()
# Default setting
-UBUNTU_CODENAME="xenial"
+UBUNTU_CODENAME="bionic"
DOCKER_TAG="latest"
while [[ $# -gt 0 ]]
diff --git a/infra/command/format b/infra/command/format
index c57e6dc8f..dc006639d 100644
--- a/infra/command/format
+++ b/infra/command/format
@@ -131,7 +131,7 @@ function check_cpp_files() {
return
fi
- CLANG_FORMAT_CANDIDATES+=("clang-format-3.9")
+ CLANG_FORMAT_CANDIDATES+=("clang-format-8")
for CLANG_FORMAT_CANDIDATE in ${CLANG_FORMAT_CANDIDATES[@]}; do
if command_exists ${CLANG_FORMAT_CANDIDATE} ; then
CLANG_FORMAT="${CLANG_FORMAT_CANDIDATE}"
@@ -140,29 +140,14 @@ function check_cpp_files() {
done
if [[ -z ${CLANG_FORMAT} ]]; then
- echo "[ERROR] clang-format-3.9 is unavailable"
- echo
- echo " Please install clang-format-3.9 before running format check"
- exit 1
- fi
-
- # Migration to clang-format-8
- # TODO Remove this after migration to clang-format-8
- CLANG_FORMAT_8="clang-format-8"
- if ! command_exists $CLANG_FORMAT_8_CANDIDATE; then
echo "[ERROR] clang-format-8 is unavailable"
echo
echo " Please install clang-format-8 before running format check"
- echo " (or use latest docker image if you are using docker for format check)"
exit 1
fi
- for DIR_CLANG_FORMAT_8 in $(git ls-files -co --exclude-standard '*/.clang-format'); do
- DIRECTORIES_USE_CLANG_FORMAT_8+=($(dirname "${DIR_CLANG_FORMAT_8}"))
- done
# Check c++ files
FILES_TO_CHECK_CPP=()
- FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8=()
for f in ${FILES_TO_CHECK[@]}; do
# Manually ignore style checking
if [[ ${f} == +(*/NeuralNetworks.h|*/NeuralNetworksExtensions.h) ]]; then
@@ -171,21 +156,7 @@ function check_cpp_files() {
# File extension to check
if [[ ${f} == +(*.h|*.hpp|*.cpp|*.cc|*.c|*.cl) ]]; then
-
- # Check clang-format-8 target files first
- # TODO Remove this after migration to clang-format-8
- FOUND_CLANG_8=0
- for USE_CLANG_FORMAT_8 in ${DIRECTORIES_USE_CLANG_FORMAT_8[@]}; do
- if [[ $f = $USE_CLANG_FORMAT_8* ]]; then
- FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8+=("$f")
- FOUND_CLANG_8=1
- break
- fi
- done
-
- if [[ $FOUND_CLANG_8 -ne 1 ]]; then
- FILES_TO_CHECK_CPP+=("${f}")
- fi
+ FILES_TO_CHECK_CPP+=("${f}")
fi
done
@@ -202,16 +173,6 @@ function check_cpp_files() {
INVALID_EXIT=${EXIT_CODE}
fi
fi
-
- # Check by clang-format-8
- # TODO Remove this after migration to clang-format-8
- if [[ ${#FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8} -ne 0 ]]; then
- ${CLANG_FORMAT_8} -i ${FILES_TO_CHECK_CPP_BY_CLANG_FORMAT_8[@]}
- EXIT_CODE=$?
- if [[ ${EXIT_CODE} -ne 0 ]]; then
- INVALID_EXIT=${EXIT_CODE}
- fi
- fi
}
function check_python_files() {
diff --git a/infra/command/gen-coverage-report b/infra/command/gen-coverage-report
index c3a8202e7..bf65b1bfb 100644
--- a/infra/command/gen-coverage-report
+++ b/infra/command/gen-coverage-report
@@ -67,12 +67,9 @@ done
"${CANDIDATES[@]}"
# Exclude *.test.cpp files from coverage report
-"${LCOV_PATH}" -r "${EXTRACTED_COVERAGE_INFO_PATH}" -o "${EXCLUDED_COVERAGE_INFO_PATH}" \
- '*.test.cpp'
-
# Exclude flatbuffer generated files from coverage report
"${LCOV_PATH}" -r "${EXTRACTED_COVERAGE_INFO_PATH}" -o "${EXCLUDED_COVERAGE_INFO_PATH}" \
- '*_schema_generated.h'
+ '*.test.cpp' '*_schema_generated.h'
# Final coverage data
cp -v ${EXCLUDED_COVERAGE_INFO_PATH} ${COVERAGE_INFO_PATH}
diff --git a/infra/docker/xenial/Dockerfile b/infra/docker/xenial/Dockerfile
deleted file mode 100644
index ae3c46401..000000000
--- a/infra/docker/xenial/Dockerfile
+++ /dev/null
@@ -1,67 +0,0 @@
-FROM ubuntu:16.04
-
-ARG UBUNTU_MIRROR
-
-RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi
-RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi
-RUN if [ -n "$UBUNTU_MIRROR" ] ; then sed "s/archive.ubuntu.com/${UBUNTU_MIRROR}/g" -i /etc/apt/sources.list ; fi
-
-# Install 'add-apt-repository'
-RUN apt-get update && apt-get -qqy install software-properties-common
-
-# Build tool
-RUN apt-get update && apt-get -qqy install build-essential cmake scons git lcov
-
-# Install extra dependencies (Caffe, nnkit)
-RUN apt-get update && apt-get -qqy install libboost-all-dev libgflags-dev libgoogle-glog-dev libatlas-base-dev libhdf5-dev
-
-# Install protocol buffer
-RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler
-
-# Additonal tools
-RUN apt-get update && \
- apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 clang-format-8 python3 python3-pip python3-venv hdf5-tools pylint curl
-RUN pip3 install --upgrade pip
-RUN pip3 install yapf==0.22.0 numpy
-
-# Install google test (source)
-RUN apt-get update && apt-get -qqy install libgtest-dev
-
-###
-### NOTE: Don't add new package install using apt-get or pip below this line
-###
-
-# Install native build tool gcc version 6.x
-RUN add-apt-repository ppa:ubuntu-toolchain-r/test && apt-get update && apt-get -qqy install gcc-6 g++-6
-RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-6 60 --slave /usr/bin/g++ g++ /usr/bin/g++-6 && update-alternatives --config gcc
-
-# Install cross build tool gcc version 6.x
-RUN wget https://releases.linaro.org/components/toolchain/binaries/6.3-2017.02/arm-linux-gnueabihf/gcc-linaro-6.3.1-2017.02-x86_64_arm-linux-gnueabihf.tar.xz -O gcc-hardfp.tar.xz -nv
-RUN wget https://releases.linaro.org/components/toolchain/binaries/6.2-2016.11/arm-linux-gnueabi/gcc-linaro-6.2.1-2016.11-x86_64_arm-linux-gnueabi.tar.xz -O gcc-softfp.tar.xz -nv
-RUN wget https://releases.linaro.org/components/toolchain/binaries/6.2-2016.11/aarch64-linux-gnu/gcc-linaro-6.2.1-2016.11-x86_64_aarch64-linux-gnu.tar.xz -O gcc-aarch64.tar.xz -nv
-RUN tar -xf gcc-hardfp.tar.xz -C /opt/ && rm -rf gcc-hardfp.tar.xz
-RUN tar -xf gcc-softfp.tar.xz -C /opt/ && rm -rf gcc-softfp.tar.xz
-RUN tar -xf gcc-aarch64.tar.xz -C /opt/ && rm -rf gcc-aarch64.tar.xz
-ENV PATH "/opt/gcc-linaro-6.2.1-2016.11-x86_64_arm-linux-gnueabi/bin:/opt/gcc-linaro-6.3.1-2017.02-x86_64_arm-linux-gnueabihf/bin:/opt/gcc-linaro-6.2.1-2016.11-x86_64_aarch64-linux-gnu/bin:$PATH"
-
-###
-### NOTE: Don't add build & install process using installed buildtool above this line
-###
-
-# Build and install google test static libraries
-WORKDIR /root/gtest
-RUN cmake /usr/src/gtest
-RUN make
-RUN mv *.a /usr/lib
-WORKDIR /root
-RUN rm -rf gtest
-
-# Install gbs & sdb
-RUN echo 'deb [trusted=yes] http://download.tizen.org/tools/latest-release/Ubuntu_16.04/ /' | cat >> /etc/apt/sources.list
-RUN apt-get update && apt-get -qqy install gbs
-RUN wget http://download.tizen.org/sdk/tizenstudio/official/binary/sdb_3.1.4_ubuntu-64.zip -O sdb.zip
-RUN unzip -d tmp sdb.zip && rm sdb.zip
-RUN cp tmp/data/tools/sdb /usr/bin/. && rm -rf tmp
-
-# Clean archives (to reduce image size)
-RUN apt-get clean -y
diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount
index d06c5c9de..64aaace9b 100644
--- a/infra/nncc/command/utcount
+++ b/infra/nncc/command/utcount
@@ -9,15 +9,17 @@ if [[ ! -d "${BUILD_WORKSPACE_PATH}" ]]; then
exit 255
fi
-BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp stdex \
+BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp \
oops pepper-assert \
hermes hermes-std \
loco locop locomotiv logo-core logo \
-foder souschef arser vconone \
+foder souschef arser vconone crew \
safemain mio-circle mio-tflite \
tflite2circle \
luci \
luci-interpreter \
+luci-eval-driver \
+luci-pass-value-test \
luci-value-test \
record-minmax \
circle2circle circle-quantizer"
diff --git a/infra/nnfw/cmake/CfgOptionFlags.cmake b/infra/nnfw/cmake/CfgOptionFlags.cmake
index f6ad0cada..87c2c86f6 100644
--- a/infra/nnfw/cmake/CfgOptionFlags.cmake
+++ b/infra/nnfw/cmake/CfgOptionFlags.cmake
@@ -24,7 +24,7 @@ option(BUILD_NNAPI_TEST "Build nnapi_test" ON)
option(BUILD_NNPACKAGE_RUN "Build nnpackge_run" ON)
option(BUILD_TFLITE_LOADER "Build TensorFlow Lite loader" ON)
option(BUILD_CIRCLE_LOADER "Build circle loader" ON)
-option(BUILD_TFLITE_LOADER_TEST_TOOL "Build tflite loader testing tool" ON)
+option(BUILD_TFLITE_COMPARATOR_TEST_TOOL "Build tflite loader testing tool" ON)
option(BUILD_WITH_HDF5 "Build test tool with HDF5 library" ON)
option(GENERATE_RUNTIME_NNAPI_TESTS "Generate NNAPI operation gtest" ON)
option(ENVVAR_ONERT_CONFIG "Use environment variable for onert configuration" ON)
@@ -64,6 +64,7 @@ option(DOWNLOAD_NONIUS "Download nonius source" ON)
option(DOWNLOAD_BOOST "Download boost source" OFF)
option(DOWNLOAD_RUY "Download ruy source" ON)
option(DOWNLOAD_CPUINFO "Download cpuinfo source" ON)
+option(DOWNLOAD_OOURAFFT "Download Ooura FFT source" ON)
option(DOWNLOAD_GTEST "Download Google Test source and build Google Test" ON)
option(BUILD_BOOST "Build boost source" OFF)
option(BUILD_TENSORFLOW_LITE "Build TensorFlow Lite from the downloaded source" ON)
diff --git a/infra/nnfw/cmake/buildtool/config/config_i686-tizen.cmake b/infra/nnfw/cmake/buildtool/config/config_i686-tizen.cmake
new file mode 100644
index 000000000..3929e07fd
--- /dev/null
+++ b/infra/nnfw/cmake/buildtool/config/config_i686-tizen.cmake
@@ -0,0 +1,17 @@
+#
+# i686 tizen compile options
+#
+
+message(STATUS "Building for i686 Tizen")
+
+# Build flag for tizen
+set(CMAKE_C_FLAGS_DEBUG "-O -g -DDEBUG")
+set(CMAKE_CXX_FLAGS_DEBUG "-O -g -DDEBUG")
+
+# TODO : add and use option_tizen if something uncommon comes up
+# include linux common
+include("cmake/buildtool/config/config_linux.cmake")
+
+# addition for i686-tizen
+set(FLAGS_COMMON ${FLAGS_COMMON}
+ )
diff --git a/infra/nnfw/cmake/options/options_i686-tizen.cmake b/infra/nnfw/cmake/options/options_i686-tizen.cmake
new file mode 100644
index 000000000..7a425f068
--- /dev/null
+++ b/infra/nnfw/cmake/options/options_i686-tizen.cmake
@@ -0,0 +1,12 @@
+#
+# i686 tizen cmake options
+#
+option(BUILD_ARMCOMPUTE "Build ARM Compute from the downloaded source" OFF)
+option(BUILD_TENSORFLOW_LITE "Build TensorFlow Lite from the downloaded source" OFF)
+option(DOWNLOAD_ARMCOMPUTE "Download ARM Compute source" OFF)
+
+option(BUILD_LOGGING "Build logging runtime" OFF)
+option(GENERATE_RUNTIME_NNAPI_TESTS "Generate NNAPI operation gtest" OFF)
+option(ENVVAR_ONERT_CONFIG "Use environment variable for onert configuration" OFF)
+
+option(BUILD_XNNPACK "Build XNNPACK" OFF)
diff --git a/infra/nnfw/cmake/packages/ARMComputeConfig.cmake b/infra/nnfw/cmake/packages/ARMComputeConfig.cmake
index 1b5a32ef6..4761e848c 100644
--- a/infra/nnfw/cmake/packages/ARMComputeConfig.cmake
+++ b/infra/nnfw/cmake/packages/ARMComputeConfig.cmake
@@ -65,7 +65,7 @@ endfunction(_ARMCompute_Import)
# Let's build and install ARMCompute libraries
function(_ARMCompute_Build ARMComputeInstall_DIR)
set(PKG_NAME "ARMCOMPUTE")
- set(PKG_IDENTIFIER "20.05")
+ set(PKG_IDENTIFIER "21.02")
set(INSTALL_STAMP_PATH "${ARMComputeInstall_DIR}/${PKG_NAME}.stamp")
set(ARMComputeBuild_DIR "${CMAKE_BINARY_DIR}/externals/armcompute")
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake b/infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake
index e15239805..e4fbc3ad3 100644
--- a/infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-1.13.1/TensorFlowLiteConfig.cmake
@@ -13,8 +13,8 @@ if(BUILD_TENSORFLOW_LITE)
endmacro(return_unless)
# Required packages
- nnas_find_package(AbseilSource QUIET)
- return_unless(AbseilSource_FOUND)
+ nnas_find_package(Abseil QUIET)
+ return_unless(Abseil_FOUND)
nnfw_find_package(TensorFlowEigen EXACT 1.13.1 QUIET)
return_unless(TensorFlowEigen_1_13_1_FOUND)
nnas_find_package(FarmhashSource QUIET)
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLite/CMakeLists.txt
index 616f8ff8e..afee6e1cc 100644
--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLite/CMakeLists.txt
@@ -2,7 +2,7 @@
#
# Tensorflow Lite library 2.3.0
#
-set(TENSORFLOW_LITE_BASE ${TFLiteVanillaTensorFlowSource_DIR}/tensorflow/lite)
+set(TENSORFLOW_LITE_BASE ${TensorFlowSource_DIR}/tensorflow/lite)
file(GLOB TFLITE_CORE_SRCS "${TENSORFLOW_LITE_BASE}/*.c"
"${TENSORFLOW_LITE_BASE}/*.cc"
@@ -31,34 +31,8 @@ list(APPEND TFLITE_SRCS ${TFLITE_EXPERIMENTAL_SRCS})
list(APPEND TFLITE_SRCS ${TFLITE_SPARSITY_SRCS})
# externals
-list(APPEND TFLITE_SRCS "${TFLiteVanillaFarmhashSource_DIR}/src/farmhash.cc")
-list(APPEND TFLITE_SRCS "${TFLiteVanillaFFT2DSource_DIR}/fftsg.c")
-list(APPEND TFLITE_SRCS "${TFLiteVanillaFFT2DSource_DIR}/fftsg2d.c")
-list(APPEND TFLITE_SRCS "${TFLiteVanillaFlatBuffersSource_DIR}/src/util.cpp")
-
-# externals - absl
-file(GLOB_RECURSE ABSL_SRCS "${TFLiteVanillaAbslSource_DIR}/absl/*.cc")
-file(GLOB_RECURSE ABSL_EXCLS "${TFLiteVanillaAbslSource_DIR}/absl/*test*.cc"
- "${TFLiteVanillaAbslSource_DIR}/absl/*benchmark*.cc"
- "${TFLiteVanillaAbslSource_DIR}/absl/synchronization/*.cc"
- "${TFLiteVanillaAbslSource_DIR}/absl/debugging/*.cc"
- "${TFLiteVanillaAbslSource_DIR}/absl/hash/*.cc"
- "${TFLiteVanillaAbslSource_DIR}/absl/flags/*.cc"
- "${TFLiteVanillaAbslSource_DIR}/absl/random/*.cc")
-list(REMOVE_ITEM ABSL_SRCS ${ABSL_EXCLS})
-list(APPEND TFLITE_SRCS ${ABSL_SRCS})
-
-# externals - ruy
-file(GLOB RUY_SRCS "${TFLiteVanillaRuySource_DIR}/ruy/*.cc")
-file(GLOB_RECURSE RUY_EXCLS "${TFLiteVanillaRuySource_DIR}/ruy/*test*.cc"
- "${TFLiteVanillaRuySource_DIR}/ruy/*benchmark*.cc"
- "${TFLiteVanillaRuySource_DIR}/ruy/*example*.cc")
-list(REMOVE_ITEM RUY_SRCS ${RUY_EXCLS})
-# Temporary fix for ruy compilation error.
-# TODO(b/158800055): Remove this hack once the ruy version is correctly bumped.
-list(REMOVE_ITEM RUY_SRCS "${TFLiteVanillaRuySource_DIR}/ruy/prepare_packed_matrices.cc")
-list(APPEND TFLITE_SRCS ${RUY_SRCS})
-
+list(APPEND TFLITE_SRCS "${OouraFFTSource_DIR}/fftsg.c")
+list(APPEND TFLITE_SRCS "${OouraFFTSource_DIR}/fftsg2d.c")
# Build with mmap? true
# caution: v2.3.0's Makefile has wrong code on this part. This is fixed on master branch.
@@ -98,22 +72,20 @@ file(GLOB_RECURSE TFLITE_EXCLS "${TENSORFLOW_LITE_BASE}/*test*.cc"
list(REMOVE_ITEM TFLITE_SRCS ${TFLITE_EXCLS})
# include headers
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaTensorFlowSource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaEigenSource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaAbslSource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaGEMMLowpSource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaNEON2SSESource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaFarmhashSource_DIR}/src")
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaFlatBuffersSource_DIR}/include")
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaFP16Source_DIR}/include")
-list(APPEND TFLITE_INCLUDES "${TFLiteVanillaRuySource_DIR}")
+list(APPEND TFLITE_INCLUDES "${TensorFlowSource_DIR}")
+list(APPEND TFLITE_INCLUDES "${TensorFlowGEMMLowpSource_DIR}")
+list(APPEND TFLITE_INCLUDES "${Fp16Source_DIR}/include")
+
+if(NEON2SSESource_FOUND)
+ list(APPEND TFLITE_INCLUDES "${NEON2SSESource_DIR}")
+endif(NEON2SSESource_FOUND)
add_library(tensorflow-lite-2.3.0 STATIC ${TFLITE_SRCS})
target_include_directories(tensorflow-lite-2.3.0 SYSTEM PUBLIC ${TFLITE_INCLUDES})
target_include_directories(tensorflow-lite-2.3.0 PRIVATE ${CpuInfoSource_DIR})
target_compile_definitions(tensorflow-lite-2.3.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV -DRUY_HAVE_CPUINFO")
set_property(TARGET tensorflow-lite-2.3.0 PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(tensorflow-lite-2.3.0 eigen ${LIB_PTHREAD} dl cpuinfo)
+target_link_libraries(tensorflow-lite-2.3.0 eigen flatbuffers::flatbuffers ruy abseil farmhash ${LIB_PTHREAD} dl)
if(NOT ANDROID AND ${BUILD_WITH_NNAPI})
target_link_libraries(tensorflow-lite-2.3.0 rt)
endif()
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLiteConfig.cmake b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLiteConfig.cmake
new file mode 100644
index 000000000..c81958cf4
--- /dev/null
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLiteConfig.cmake
@@ -0,0 +1,44 @@
+if(BUILD_TENSORFLOW_LITE_2_3_0)
+ macro(return_unless VAR)
+ if(NOT ${VAR})
+ message("TFLiteVanillaRun: ${VAR} NOT TRUE")
+ set(TensorFlowLite_2_3_0_FOUND FALSE PARENT_SCOPE)
+ return()
+ endif(NOT ${VAR})
+ endmacro(return_unless)
+
+ nnas_include(ExternalSourceTools)
+ nnas_include(OptionTools)
+
+ nnas_find_package(TensorFlowSource EXACT 2.3.0 QUIET)
+ return_unless(TensorFlowSource_FOUND)
+
+ # Below urls come from https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/tensorflow/workspace.bzl
+ nnas_find_package(AbseilSource QUIET)
+ return_unless(AbseilSource_FOUND)
+ nnfw_find_package(Eigen QUIET)
+ return_unless(Eigen_FOUND)
+ nnas_find_package(Farmhash QUIET)
+ return_unless(Farmhash_FOUND)
+ nnfw_find_package(FlatBuffers QUIET)
+ return_unless(FlatBuffers_FOUND)
+ nnas_find_package(TensorFlowGEMMLowpSource EXACT 2.3.0 QUIET)
+ return_unless(TensorFlowGEMMLowpSource_FOUND)
+ nnas_find_package(OouraFFTSource QUIET)
+ return_unless(OouraFFTSource_FOUND)
+ nnfw_find_package(Ruy QUIET)
+ return_unless(Ruy_FOUND)
+
+ # TensorFlow Lite requires FP16 library's header only
+ nnas_find_package(Fp16Source QUIET)
+ return_unless(Fp16Source_FOUND)
+
+ # Optional packages
+ nnas_find_package(NEON2SSESource QUIET)
+
+ nnas_include(ExternalProjectTools)
+ add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/TensorFlowLite" tflite-2.3.0)
+
+ set(TensorFlowLite_2_3_0_FOUND TRUE)
+ return()
+endif()
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLiteConfigVersion.cmake b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLiteConfigVersion.cmake
new file mode 100644
index 000000000..08e637421
--- /dev/null
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/TensorFlowLiteConfigVersion.cmake
@@ -0,0 +1,9 @@
+set(PACKAGE_VERSION "2.3.0")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+ set(PACKAGE_VERSION_EXACT TRUE)
+ set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
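+# NOTE PACKAGE_VERSION_COMPATIBLE stays FALSE, so only a request for
+# exactly 2.3.0 will accept this package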
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake
deleted file mode 100644
index 9671dc4af..000000000
--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake
+++ /dev/null
@@ -1,107 +0,0 @@
-if(BUILD_TENSORFLOW_LITE_2_3_0)
- macro(return_unless VAR)
- if(NOT ${VAR})
- message("${VAR} NOT TRUE")
- set(TensorFlowLite_2_3_0_FOUND PARENT_SCOPE)
- return()
- endif(NOT ${VAR})
- endmacro(return_unless)
-
- nnas_include(ExternalSourceTools)
- nnas_include(OptionTools)
-
- # Below urls come from https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/lite/tools/make/Makefile
-
- set(absl_url "https://github.com/abseil/abseil-cpp/archive/df3ea785d8c30a9503321a3d35ee7d35808f190d.tar.gz")
- ExternalSource_Download("TFLiteVanilla_Absl" ${absl_url})
- set(TFLiteVanillaAbslSource_DIR "${TFLiteVanilla_Absl_SOURCE_DIR}")
- if (NOT TFLiteVanillaAbslSource_DIR STREQUAL "")
- set(TFLiteVanillaAbslSource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaAbslSource_FOUND)
-
- set(eigen_url "https://gitlab.com/libeigen/eigen/-/archive/386d809bde475c65b7940f290efe80e6a05878c4/eigen-386d809bde475c65b7940f290efe80e6a05878c4.tar.gz")
- ExternalSource_Download("TFLiteVanilla_Eigen" ${eigen_url})
- set(TFLiteVanillaEigenSource_DIR "${TFLiteVanilla_Eigen_SOURCE_DIR}")
- if (NOT TFLiteVanillaEigenSource_DIR STREQUAL "")
- set(TFLiteVanillaEigenSource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaEigenSource_FOUND)
-
- set(farmhash_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz")
- ExternalSource_Download("TFLiteVanilla_Farmhash" ${farmhash_url})
- set(TFLiteVanillaFarmhashSource_DIR "${TFLiteVanilla_Farmhash_SOURCE_DIR}")
- if (NOT TFLiteVanillaFarmhashSource_DIR STREQUAL "")
- set(TFLiteVanillaFarmhashSource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaFarmhashSource_FOUND)
-
- set(fft2d_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/petewarden/OouraFFT/archive/v1.0.tar.gz")
- ExternalSource_Download("TFLiteVanilla_FFT2D" ${fft2d_url})
- set(TFLiteVanillaFFT2DSource_DIR "${TFLiteVanilla_FFT2D_SOURCE_DIR}")
- if (NOT TFLiteVanillaFFT2DSource_DIR STREQUAL "")
- set(TFLiteVanillaFFT2DSource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaFFT2DSource_FOUND)
-
- set(flatbuffers_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.12.0.tar.gz")
- ExternalSource_Download("TFLiteVanilla_FlatBuffers" ${flatbuffers_url})
- set(TFLiteVanillaFlatBuffersSource_DIR "${TFLiteVanilla_FlatBuffers_SOURCE_DIR}")
- if (NOT TFLiteVanillaFlatBuffersSource_DIR STREQUAL "")
- set(TFLiteVanillaFlatBuffersSource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaFlatBuffersSource_FOUND)
-
- set(fp16_url "https://github.com/Maratyszcza/FP16/archive/4dfe081cf6bcd15db339cf2680b9281b8451eeb3.zip")
- ExternalSource_Download("TFLiteVanilla_FP16" ${fp16_url})
- set(TFLiteVanillaFP16Source_DIR "${TFLiteVanilla_FP16_SOURCE_DIR}")
- if (NOT TFLiteVanillaFP16Source_DIR STREQUAL "")
- set(TFLiteVanillaFP16Source_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaFP16Source_FOUND)
-
- set(gemmlowp_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip")
- ExternalSource_Download("TFLiteVanilla_GEMMLowp" ${gemmlowp_url})
- set(TFLiteVanillaGEMMLowpSource_DIR "${TFLiteVanilla_GEMMLowp_SOURCE_DIR}")
- if (NOT TFLiteVanillaGEMMLowpSource_DIR STREQUAL "")
- set(TFLiteVanillaGEMMLowpSource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaGEMMLowpSource_FOUND)
-
- set(neon2sse_url "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz")
- ExternalSource_Download("TFLiteVanilla_NEON2SSE" ${neon2sse_url})
- set(TFLiteVanillaNEON2SSESource_DIR "${TFLiteVanilla_NEON2SSE_SOURCE_DIR}")
- if (NOT TFLiteVanillaNEON2SSESource_DIR STREQUAL "")
- set(TFLiteVanillaNEON2SSESource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaNEON2SSESource_FOUND)
-
- set(tensorflow_url "https://github.com/tensorflow/tensorflow/archive/v2.3.0.tar.gz")
- ExternalSource_Download("TFLiteVanilla_TensorFlow" ${tensorflow_url})
- set(TFLiteVanillaTensorFlowSource_DIR "${TFLiteVanilla_TensorFlow_SOURCE_DIR}")
- if (NOT TFLiteVanillaTensorFlowSource_DIR STREQUAL "")
- set(TFLiteVanillaTensorFlowSource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaTensorFlowSource_FOUND)
-
- set(ruy_url "https://github.com/google/ruy/archive/34ea9f4993955fa1ff4eb58e504421806b7f2e8f.zip")
- ExternalSource_Download("TFLiteVanilla_Ruy" ${ruy_url})
- set(TFLiteVanillaRuySource_DIR "${TFLiteVanilla_Ruy_SOURCE_DIR}")
- if (NOT TFLiteVanillaRuySource_DIR STREQUAL "")
- set(TFLiteVanillaRuySource_FOUND TRUE)
- endif()
- return_unless(TFLiteVanillaRuySource_FOUND)
-
- nnfw_find_package(CpuInfo QUIET)
- if (NOT CpuInfo_FOUND)
- message(STATUS "TFLiteVanillaRun: CPUINFO not found")
- set(TensorFlowLite_2_3_0_FOUND FALSE PARENT_SCOPE)
- return()
- endif(NOT CpuInfo_FOUND)
-
- nnas_include(ExternalProjectTools)
- add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/TensorFlowLite-2.3.0" tflite-2.3.0)
-
- set(TensorFlowLite_2_3_0_FOUND TRUE)
- return()
-endif()
diff --git a/infra/nnfw/cmake/packages/XnnpackConfig.cmake b/infra/nnfw/cmake/packages/XnnpackConfig.cmake
index 191a28f0e..101d757ec 100644
--- a/infra/nnfw/cmake/packages/XnnpackConfig.cmake
+++ b/infra/nnfw/cmake/packages/XnnpackConfig.cmake
@@ -31,6 +31,9 @@ function(_Xnnpack_Build)
set(Xnnpack_FOUND TRUE PARENT_SCOPE)
endfunction(_Xnnpack_Build)
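+# Remove '-flto' from the inherited C/C++ flags before building XNNPACK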
+string(REGEX REPLACE "-flto" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+string(REGEX REPLACE "-flto" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
if(BUILD_XNNPACK)
_Xnnpack_Build()
else(BUILD_XNNPACK)
diff --git a/infra/packaging/build b/infra/packaging/build
index e941a724b..667857f53 100644
--- a/infra/packaging/build
+++ b/infra/packaging/build
@@ -8,7 +8,7 @@ if [[ -z "${NNAS_PROJECT_PATH}" ]]; then
fi
# The default preset
-PRESET="20200630"
+PRESET="20210406"
EXTRA_OPTIONS=()
while [ "$#" -ne 0 ]; do
diff --git a/infra/packaging/preset/20200630 b/infra/packaging/preset/20200630
index 506b9f8db..a1721d941 100644
--- a/infra/packaging/preset/20200630
+++ b/infra/packaging/preset/20200630
@@ -9,7 +9,7 @@ function preset_configure()
{
REQUIRED_UNITS=()
# Common Libraries
- REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex")
+ REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp")
REQUIRED_UNITS+=("oops" "pepper-assert" "foder")
REQUIRED_UNITS+=("souschef")
REQUIRED_UNITS+=("safemain")
diff --git a/infra/packaging/preset/20200731_windows b/infra/packaging/preset/20200731_windows
index 763487a47..078c7db47 100644
--- a/infra/packaging/preset/20200731_windows
+++ b/infra/packaging/preset/20200731_windows
@@ -4,7 +4,7 @@ function preset_configure()
{
REQUIRED_UNITS=()
# Common Libraries
- REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex")
+ REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp")
REQUIRED_UNITS+=("oops" "pepper-assert" "foder")
REQUIRED_UNITS+=("souschef")
REQUIRED_UNITS+=("safemain")
diff --git a/infra/packaging/preset/20210406 b/infra/packaging/preset/20210406
new file mode 100644
index 000000000..3da09705c
--- /dev/null
+++ b/infra/packaging/preset/20210406
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# NOTE The purpose of this file is static analysis only.
+# A new official preset will be added when the new programs are ready.
+
+PRESET="20210406"
+
+function preset_configure()
+{
+ REQUIRED_UNITS=()
+ # Common Libraries
+ REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp")
+ REQUIRED_UNITS+=("oops" "pepper-assert" "foder" "crew")
+ REQUIRED_UNITS+=("souschef")
+ REQUIRED_UNITS+=("safemain")
+ REQUIRED_UNITS+=("arser")
+ REQUIRED_UNITS+=("vconone")
+ # Hermes Logging Framework
+ REQUIRED_UNITS+=("hermes" "hermes-std")
+ # loco IR and related utilities
+ REQUIRED_UNITS+=("loco" "locop" "locomotiv" "logo-core" "logo")
+ # Flatbuffer I/O
+ REQUIRED_UNITS+=("mio-tflite" "mio-circle")
+ # Circle compiler library (.circle -> .circle)
+ REQUIRED_UNITS+=("luci")
+ # Tools
+ REQUIRED_UNITS+=("tflite2circle" "circle2circle" "tflchef" "circlechef")
+ REQUIRED_UNITS+=("tf2tfliteV2" "luci-interpreter" "circle-verify")
+ REQUIRED_UNITS+=("record-minmax" "circle-quantizer" "rawdata2hdf5")
+ REQUIRED_UNITS+=("circle-partitioner")
+ REQUIRED_UNITS+=("one-cmds")
+ REQUIRED_UNITS+=("bcq-tools")
+
+ NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)}
+
+ # TODO Use "nncc configure" and "nncc build"
+ cmake \
+ -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \
+ -DCMAKE_BUILD_TYPE=release \
+ -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \
+ -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \
+ ${EXTRA_OPTIONS[@]} \
+ "${NNAS_PROJECT_PATH}/infra/nncc"
+}
+
+function preset_install()
+{
+ install -t "${NNPKG_INSTALL_PREFIX}/bin" -D \
+ "${NNAS_PROJECT_PATH}/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh"
+
+ # Install tf2nnpkg
+ install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg"
+}
diff --git a/infra/packaging/preset/20210406_windows b/infra/packaging/preset/20210406_windows
new file mode 100644
index 000000000..5a250ca6c
--- /dev/null
+++ b/infra/packaging/preset/20210406_windows
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+function preset_configure()
+{
+ REQUIRED_UNITS=()
+ # Common Libraries
+ REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp")
+ REQUIRED_UNITS+=("oops" "pepper-assert" "foder" "crew")
+ REQUIRED_UNITS+=("souschef")
+ REQUIRED_UNITS+=("safemain")
+ REQUIRED_UNITS+=("arser")
+ REQUIRED_UNITS+=("vconone")
+ # Hermes Logging Framework
+ REQUIRED_UNITS+=("hermes" "hermes-std")
+ # loco IR and related utilities
+ REQUIRED_UNITS+=("loco" "locop" "locomotiv" "logo-core" "logo")
+ # Flatbuffer I/O
+ REQUIRED_UNITS+=("mio-tflite" "mio-circle")
+ # Circle compiler library (.circle -> .circle)
+ REQUIRED_UNITS+=("luci")
+ # Tools
+ REQUIRED_UNITS+=("tflite2circle" "circle2circle" "tflchef" "circlechef")
+ REQUIRED_UNITS+=("tf2tfliteV2" "luci-interpreter" "circle-verify")
+ REQUIRED_UNITS+=("record-minmax" "circle-quantizer" "rawdata2hdf5")
+ REQUIRED_UNITS+=("circle-partitioner")
+ REQUIRED_UNITS+=("one-cmds")
+ REQUIRED_UNITS+=("bcq-tools")
+
+ NPROC=$(cat /proc/cpuinfo | grep -c processor)
+
+ # TODO Use "nncc configure" and "nncc build"
+ cmake \
+ -G "MSYS Makefiles" \
+ -DUSE_PROTOBUF_LEGACY_IMPORT=ON \
+ -DCMAKE_EXE_LINKER_FLAGS="-Wl,--allow-multiple-definition" \
+ -DCMAKE_SHARED_LINKER_FLAGS="-Wl,--allow-multiple-definition" \
+ -DENABLE_TEST=OFF \
+ -DDOWNLOAD_GTEST=OFF \
+ -DBUILD_GTEST=OFF \
+ -DCMAKE_C_COMPILER=gcc \
+ -DCMAKE_CXX_COMPILER=g++ \
+ -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \
+ -DCMAKE_BUILD_TYPE=release \
+ -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \
+ -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \
+ ${EXTRA_OPTIONS[@]} \
+ "${NNAS_PROJECT_PATH}/infra/nncc"
+}
+
+function preset_install()
+{
+ # Install libraries to bin/ for Windows release
+ mv ${NNCC_INSTALL_PREFIX}/lib/*.dll ${NNCC_INSTALL_PREFIX}/bin
+ rm -rf ${NNCC_INSTALL_PREFIX}/lib
+
+ install -t "${NNPKG_INSTALL_PREFIX}/bin" -D \
+ "${NNAS_PROJECT_PATH}/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh"
+
+ # Install tf2nnpkg
+ install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.20210406" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg"
+
+ # Although TensorFlow is required to run 'tf2tfliteV2', it cannot be
+ # installed under MinGW. Instead, install TensorFlow from a native Windows
+ # CMD (run as administrator) inside a Python virtual environment, and then
+ # copy that environment to "${NNAS_INSTALL_PREFIX}/bin/venv".
+}
diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630
index db7053a7b..b7091541a 100644
--- a/infra/packaging/res/tf2nnpkg.20200630
+++ b/infra/packaging/res/tf2nnpkg.20200630
@@ -125,6 +125,6 @@ ${TF2TFLITE_CONVERT_SCRIPT}
"${ROOT}/bin/tflite2circle" "${TMPDIR}/${MODEL_NAME}.tflite" "${TMPDIR}/${MODEL_NAME}.tmp.circle"
# optimize
-"${ROOT}/bin/circle2circle" --all "${TMPDIR}/${MODEL_NAME}.tmp.circle" "${TMPDIR}/${MODEL_NAME}.circle"
+"${ROOT}/bin/circle2circle" --O1 "${TMPDIR}/${MODEL_NAME}.tmp.circle" "${TMPDIR}/${MODEL_NAME}.circle"
"${ROOT}/bin/model2nnpkg.sh" -o "${OUTPUT_DIR}" "${TMPDIR}/${MODEL_NAME}.circle"
diff --git a/infra/packaging/res/tf2nnpkg.20210406 b/infra/packaging/res/tf2nnpkg.20210406
new file mode 100644
index 000000000..b7091541a
--- /dev/null
+++ b/infra/packaging/res/tf2nnpkg.20210406
@@ -0,0 +1,130 @@
+#!/bin/bash
+
+set -e
+
+ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+command_exists() {
+ if [ "$#" -le 0 ]; then
+ return 1
+ fi
+ command -v "$@" > /dev/null 2>&1
+}
+
+usage()
+{
+ echo "Convert TensorFlow model to nnpackage."
+ echo "Usage: tf2nnpkg"
+ echo " --info <path/to/info>"
+ echo " --graphdef <path/to/pb>"
+ echo " -o <path/to/nnpkg/directory>"
+ echo " --v2 (optional) Use TF 2.x interface"
+ exit 255
+}
+
+TF_INTERFACE="--v1"
+
+# Parse command-line arguments
+#
+while [ "$#" -ne 0 ]; do
+ CUR="$1"
+
+ case $CUR in
+ '--help')
+ usage
+ ;;
+ '--info')
+ export INFO_FILE="$2"
+ shift 2
+ ;;
+ '--graphdef')
+ export GRAPHDEF_FILE="$2"
+ shift 2
+ ;;
+ '-o')
+ export OUTPUT_DIR="$2"
+ shift 2
+ ;;
+ '--v2')
+ TF_INTERFACE="--v2"
+ shift
+ ;;
+ *)
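+ # unknown arguments are echoed and ignored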
+ echo "${CUR}"
+ shift
+ ;;
+ esac
+done
+
+if [ -z ${GRAPHDEF_FILE} ] || [ ! -e ${GRAPHDEF_FILE} ]; then
+ echo "pb is not found. Please check --graphdef is correct."
+ exit 2
+fi
+
+if [ -z ${INFO_FILE} ] || [ ! -e ${INFO_FILE} ]; then
+ echo "info is not found. Please check --info is correct."
+ exit 2
+fi
+
+if [ -z ${OUTPUT_DIR} ]; then
+ echo "output directory is not specifed. Please check -o is correct.."
+ exit 2
+fi
+
+FILE_BASE=$(basename ${GRAPHDEF_FILE})
+MODEL_NAME="${FILE_BASE%.*}"
+TMPDIR=$(mktemp -d)
+trap "{ rm -rf $TMPDIR; }" EXIT
+
+# activate python virtual environment
+VIRTUALENV_LINUX="${ROOT}/bin/venv/bin/activate"
+VIRTUALENV_WINDOWS="${ROOT}/bin/venv/Scripts/activate"
+
+if [ -e ${VIRTUALENV_LINUX} ]; then
+ source ${VIRTUALENV_LINUX}
+elif [ -e ${VIRTUALENV_WINDOWS} ]; then
+ source ${VIRTUALENV_WINDOWS}
+fi
+
+# parse inputs, outputs from info file
+INPUT=$(awk -F, '/^input/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' ' | paste -d, -s)
+OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' ' | paste -d, -s)
+
+INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':')
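+# INPUT/OUTPUT hold comma-joined tensor names (the text before ':'),
+# and INPUT_SHAPES holds the bracketed input shapes joined with ':'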
+
+# Generate BCQ information metadata
+# If the model has no BCQ information, or the information is invalid, the pb file is left unchanged.
+"${ROOT}/bin/generate_bcq_metadata" \
+--input_path "${GRAPHDEF_FILE}" \
+--output_path "${TMPDIR}/${MODEL_NAME}_withmeta.pb" \
+--output_arrays "${OUTPUT}"
+
+# Generate BCQ information nodes as output_arrays
+# If the model has no BCQ information, output_arrays will be empty.
+"${ROOT}/bin/generate_bcq_output_arrays" \
+--input_path "${TMPDIR}/${MODEL_NAME}_withmeta.pb" \
+--metadata_path "${TMPDIR}/${MODEL_NAME}_metadata_arrays.txt" \
+--output_arrays_path "${TMPDIR}/${MODEL_NAME}_output_arrays.txt"
+
+# generate tflite file
+TF2TFLITE_CONVERT_SCRIPT="python ${ROOT}/bin/tf2tfliteV2.py ${TF_INTERFACE} "
+TF2TFLITE_CONVERT_SCRIPT+="--input_path ${TMPDIR}/${MODEL_NAME}_withmeta.pb "
+TF2TFLITE_CONVERT_SCRIPT+="--input_arrays ${INPUT} "
+TF2TFLITE_CONVERT_SCRIPT+="--output_path ${TMPDIR}/${MODEL_NAME}.tflite "
+TF2TFLITE_CONVERT_SCRIPT+="--output_arrays "
+TF2TFLITE_CONVERT_SCRIPT+="$(cat ${TMPDIR}/${MODEL_NAME}_metadata_arrays.txt)"
+TF2TFLITE_CONVERT_SCRIPT+="${OUTPUT}"
+TF2TFLITE_CONVERT_SCRIPT+="$(cat ${TMPDIR}/${MODEL_NAME}_output_arrays.txt) "
+if [ ! -z ${INPUT_SHAPES} ]; then
+ TF2TFLITE_CONVERT_SCRIPT+="--input_shapes ${INPUT_SHAPES} "
+fi
+
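+# intentionally unquoted so the assembled command word-splits into options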
+${TF2TFLITE_CONVERT_SCRIPT}
+
+# convert .tflite to .circle
+"${ROOT}/bin/tflite2circle" "${TMPDIR}/${MODEL_NAME}.tflite" "${TMPDIR}/${MODEL_NAME}.tmp.circle"
+
+# optimize
+"${ROOT}/bin/circle2circle" --O1 "${TMPDIR}/${MODEL_NAME}.tmp.circle" "${TMPDIR}/${MODEL_NAME}.circle"
+
+"${ROOT}/bin/model2nnpkg.sh" -o "${OUTPUT_DIR}" "${TMPDIR}/${MODEL_NAME}.circle"
diff --git a/infra/scripts/build_android_runtime_release.sh b/infra/scripts/build_android_runtime_release.sh
index c9a3b1ba2..a131fbe40 100755
--- a/infra/scripts/build_android_runtime_release.sh
+++ b/infra/scripts/build_android_runtime_release.sh
@@ -5,10 +5,12 @@ ROOT_PATH="$CURRENT_PATH/../../"
# prepare pre-built armcompute library
# android build requires pre-built armcompute library
-if [ ! -n "$EXT_ACL_FOLDER" ]; then
- echo "Please set EXT_ACL_FOLDER to use pre-built armcompute library"
- exit 1
-fi
+# if [ ! -n "$EXT_ACL_FOLDER" ]; then
+# echo "Please set EXT_ACL_FOLDER to use pre-built armcompute library"
+# exit 1
+# fi
+
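+# EXT_ACL_FOLDER is intentionally ignored; a pre-built armcompute library
+# is no longer required for this build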
+unset EXT_ACL_FOLDER
# prepare ndk
if [ ! -n "$NDK_DIR" ]; then
diff --git a/infra/scripts/common.sh b/infra/scripts/common.sh
index 818957a21..4a1385d03 100755
--- a/infra/scripts/common.sh
+++ b/infra/scripts/common.sh
@@ -50,10 +50,10 @@ function TFLiteModelVerification()
export BACKENDS=$1
if [[ "$2" == "" ]]; then
- $INSTALL_PATH/test/onert-test verify-tflite --api=nnapi \
+ $INSTALL_PATH/test/onert-test verify-tflite --api=loader \
--reportdir=$ROOT_PATH/$3
else
- $INSTALL_PATH/test/onert-test verify-tflite --api=nnapi \
+ $INSTALL_PATH/test/onert-test verify-tflite --api=loader \
--list=$2 \
--reportdir=$ROOT_PATH/$3
fi
@@ -134,18 +134,18 @@ function NNPackageTest()
# $2: (required) test list file relative path from nnfw root directory
# pass empty string if there is no skiplist
# $3: (required) relative path to report from nnfw root directory
-function TFLiteLoaderTest()
+function NNAPIFrontendTest()
{
- [[ $# -ne 3 ]] && echo "TFLiteLoaderTest: Invalid function argument setting" && exit 1
+ [[ $# -ne 3 ]] && echo "NNAPIFrontendTest: Invalid function argument setting" && exit 1
pushd ${ROOT_PATH} > /dev/null
export BACKENDS=$1
if [[ "$2" == "" ]]; then
- $INSTALL_PATH/test/onert-test verify-tflite --api=loader \
+ $INSTALL_PATH/test/onert-test verify-tflite --api=nnapi \
--reportdir=$ROOT_PATH/$3
else
- $INSTALL_PATH/test/onert-test verify-tflite --api=loader \
+ $INSTALL_PATH/test/onert-test verify-tflite --api=nnapi \
--list=$2 \
--reportdir=$ROOT_PATH/$3
fi
diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh
index a0323e0a0..133af3f69 100644
--- a/infra/scripts/compiler_modules.sh
+++ b/infra/scripts/compiler_modules.sh
@@ -3,21 +3,25 @@
# Don't run this script
[[ "${BASH_SOURCE[0]}" == "${0}" ]] && echo "Please don't execute ${BASH_SOURCE[0]}, source it" && return
-DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp;stdex"
+DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp"
DEBUG_BUILD_ITEMS+=";oops;pepper-assert"
DEBUG_BUILD_ITEMS+=";hermes;hermes-std"
DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo"
-DEBUG_BUILD_ITEMS+=";foder;souschef;arser;vconone"
+DEBUG_BUILD_ITEMS+=";foder;crew;souschef;arser;vconone"
DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite"
DEBUG_BUILD_ITEMS+=";tflite2circle"
DEBUG_BUILD_ITEMS+=";luci"
DEBUG_BUILD_ITEMS+=";luci-interpreter"
-DEBUG_BUILD_ITEMS+=";luci-value-test"
+DEBUG_BUILD_ITEMS+=";luci-eval-driver;luci-pass-value-test;luci-value-test"
DEBUG_BUILD_ITEMS+=";circle2circle;record-minmax;circle-quantizer"
+DEBUG_BUILD_ITEMS+=";circle-partitioner;circle-part-driver"
DEBUG_BUILD_ITEMS+=";circle-verify"
+DEBUG_BUILD_ITEMS+=";circle-tensordump"
DEBUG_BUILD_ITEMS+=";tflchef;circlechef"
DEBUG_BUILD_ITEMS+=";common-artifacts"
DEBUG_BUILD_ITEMS+=";circle2circle-dredd-recipe-test"
DEBUG_BUILD_ITEMS+=";record-minmax-conversion-test"
DEBUG_BUILD_ITEMS+=";tf2tfliteV2;tf2tfliteV2-conversion-test"
DEBUG_BUILD_ITEMS+=";tflite2circle-conversion-test"
+DEBUG_BUILD_ITEMS+=";pota-quantization-value-test"
+DEBUG_BUILD_ITEMS+=";circle-part-value-test"
diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh
index 607526bc8..f73894fdf 100755
--- a/infra/scripts/docker_build_cross_aarch64_runtime.sh
+++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh
@@ -22,8 +22,8 @@ else
fi
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh
index 07b5ca4b5..17d75ded3 100755
--- a/infra/scripts/docker_build_cross_arm_runtime.sh
+++ b/infra/scripts/docker_build_cross_arm_runtime.sh
@@ -22,8 +22,8 @@ else
fi
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh
index 8d0443802..377bc3e23 100755
--- a/infra/scripts/docker_build_cross_arm_runtime_release.sh
+++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh
@@ -22,8 +22,8 @@ else
fi
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh
index e03ea7571..454bf276d 100755
--- a/infra/scripts/docker_build_cross_coverage.sh
+++ b/infra/scripts/docker_build_cross_coverage.sh
@@ -22,8 +22,8 @@ else
fi
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh
index e65feb527..96a00fa05 100755
--- a/infra/scripts/docker_build_nncc.sh
+++ b/infra/scripts/docker_build_nncc.sh
@@ -35,8 +35,8 @@ if [ -d $ONNXRUNTIME_PREFIX ]; then
fi
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
@@ -57,7 +57,7 @@ mkdir -p ${NNCC_INSTALL_PREFIX}
./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}"
mkdir -p ${ARCHIVE_PATH}
-tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} --exclude test ./
+tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} --exclude test --exclude tflchef* ./
tar -zcf ${ARCHIVE_PATH}/nncc-test-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./test
popd > /dev/null
diff --git a/infra/scripts/docker_build_test_x64.sh b/infra/scripts/docker_build_test_x64.sh
index 0d2395bc0..9f3966af7 100755
--- a/infra/scripts/docker_build_test_x64.sh
+++ b/infra/scripts/docker_build_test_x64.sh
@@ -14,8 +14,8 @@ else
fi
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
diff --git a/infra/scripts/docker_build_tizen_cross.sh b/infra/scripts/docker_build_tizen_cross.sh
index 9a8378f05..42e79a703 100755
--- a/infra/scripts/docker_build_tizen_cross.sh
+++ b/infra/scripts/docker_build_tizen_cross.sh
@@ -22,8 +22,8 @@ else
fi
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh
index ef6212a50..5608c7800 100755
--- a/infra/scripts/docker_collect_nnpkg_resources.sh
+++ b/infra/scripts/docker_collect_nnpkg_resources.sh
@@ -40,8 +40,8 @@ if [ -d $ONNXRUNTIME_PREFIX ]; then
fi
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
@@ -61,8 +61,8 @@ pushd $ROOT_PATH > /dev/null
REQUIRED_UNITS=()
# Common Libraries
-REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex")
-REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "vconone")
+REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp")
+REQUIRED_UNITS+=("oops" "safemain" "foder" "crew" "arser" "vconone")
# Hermes Logging Framework
REQUIRED_UNITS+=("hermes" "hermes-std")
# loco IR and related utilities
diff --git a/infra/scripts/docker_coverage_report.sh b/infra/scripts/docker_coverage_report.sh
index f0de1de5f..2c3ee303e 100755
--- a/infra/scripts/docker_coverage_report.sh
+++ b/infra/scripts/docker_coverage_report.sh
@@ -8,8 +8,8 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# docker image name
-# - for xenial, use DOCKER_IMAGE_NAME="nnfw/one-devtools:xenial"
# - for bionic, use DOCKER_IMAGE_NAME="nnfw/one-devtools:bionic"
+# - for focal, use DOCKER_IMAGE_NAME="nnfw/one-devtools:focal"
if [[ -z $DOCKER_IMAGE_NAME ]]; then
echo "It will use default docker image name"
fi
diff --git a/infra/scripts/test_coverage.sh b/infra/scripts/test_coverage.sh
index 369e53239..6cb4bb7dd 100755
--- a/infra/scripts/test_coverage.sh
+++ b/infra/scripts/test_coverage.sh
@@ -24,15 +24,15 @@ if [[ ! -e $ROOT_PATH/tests/scripts/build_path_depth.txt ]]; then
fi
export GCOV_PREFIX_STRIP=`cat $ROOT_PATH/tests/scripts/build_path_depth.txt`
-./infra/scripts/test_ubuntu_runtime.sh --backend acl_cl --tflite-loader
+TENSOR_LOGGING=trace_log.txt ./infra/scripts/test_ubuntu_runtime.sh --backend acl_cl --nnapi-frontend
./infra/scripts/test_ubuntu_runtime.sh --backend acl_neon
./infra/scripts/test_ubuntu_runtime.sh --backend cpu
# Enable all logs (mixed backend)
-TENSOR_LOGGING=trace_log.txt ONERT_LOG_ENABLE=1 GRAPH_DOT_DUMP=1 ./infra/scripts/test_ubuntu_runtime_mixed.sh
+ONERT_LOG_ENABLE=1 GRAPH_DOT_DUMP=1 ./infra/scripts/test_ubuntu_runtime_mixed.sh
# Enable trace event (acl_cl default backend)
export TRACE_FILEPATH=trace.json
-TFLiteModelVerification "acl_cl" "Product/out/test/list/frameworktest_list.armv7l.acl_cl.txt" "report/acl_cl/trace"
+TFLiteModelVerification "acl_cl" "Product/out/test/list/tflite_comparator.armv7l.acl_cl.list" "report/acl_cl/trace"
unset TRACE_FILEPATH
# Interpreter
diff --git a/infra/scripts/test_ubuntu_runtime.sh b/infra/scripts/test_ubuntu_runtime.sh
index db70580f8..17bdf6e99 100755
--- a/infra/scripts/test_ubuntu_runtime.sh
+++ b/infra/scripts/test_ubuntu_runtime.sh
@@ -7,9 +7,10 @@ source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
BACKEND="cpu"
TEST_OS="linux"
TEST_PLATFORM="$TEST_ARCH-$TEST_OS"
-TFLITE_LOADER="0"
+TFLITE_LOADER="1"
LINEAR_ONLY="0"
RUN_INTERP="0"
+NNAPI_FRONTEND="0"
function Usage()
{
@@ -17,7 +18,7 @@ function Usage()
echo ""
echo "Options:"
echo " --backend <BACKEND> Runtime backend to test (default: ${BACKEND})"
- echo " --tflite-loader Enable TFLite Loader test"
+ echo " --nnapi-frontend NNAPI Frontend test"
echo " --linear-only Use Linear executor only"
}
@@ -39,6 +40,12 @@ do
;;
--tflite-loader)
TFLITE_LOADER="1"
+ NNAPI_FRONTEND="1" # For CI test
+ echo "[INFO] \"--tflite-loader\" argument is deprecated"
+ shift
+ ;;
+ --nnapi-frontend)
+ NNAPI_FRONTEND="1"
shift
;;
--linear-only)
@@ -68,7 +75,7 @@ else
fi
UNITTEST_SKIPLIST="Product/out/unittest/nnapi_gtest.skip.${TEST_PLATFORM}.${BACKEND}"
-FRAMEWORK_TESTLIST="Product/out/test/list/frameworktest_list.${TEST_ARCH}.${BACKEND}.txt"
+TFLITE_TESTLIST="Product/out/test/list/tflite_comparator.${TEST_ARCH}.${BACKEND}.list"
REPORT_BASE="report/${BACKEND}"
EXECUTORS=("Linear" "Dataflow" "Parallel")
@@ -92,7 +99,7 @@ do
fi
NNAPIGTest "${BACKEND}" "${UNITTEST_SKIPLIST}" "${REPORT_PATH}"
- TFLiteModelVerification "${BACKEND}" "${FRAMEWORK_TESTLIST}" "${REPORT_PATH}"
+ TFLiteModelVerification "${BACKEND}" "${TFLITE_TESTLIST}" "${REPORT_PATH}"
if [ $EXECUTOR = "Interpreter" ]; then
unset DISABLE_COMPILE
@@ -101,9 +108,8 @@ do
fi
done
-# Current support acl_cl backend testlist only
# TODO Support more backends
-TFLITE_LOADER_TESTLIST="Product/out/test/list/tflite_loader_list.${TEST_ARCH}.txt"
-if [[ $TFLITE_LOADER = "1" ]]; then
- TFLiteLoaderTest "${BACKEND}" "${TFLITE_LOADER_TESTLIST}" "${REPORT_BASE}/loader/${EXECUTOR}"
+NNAPI_FRONTEND_TESTLIST="Product/out/test/list/nnapi_test.${TEST_ARCH}.list"
+if [[ $NNAPI_FRONTEND = "1" ]]; then
+ NNAPIFrontendTest "${BACKEND}" "${NNAPI_FRONTEND_TESTLIST}" "${REPORT_BASE}/nnapi/${EXECUTOR}"
fi
diff --git a/infra/scripts/test_ubuntu_runtime_mixed.sh b/infra/scripts/test_ubuntu_runtime_mixed.sh
index 6eab90cd3..697fed897 100755
--- a/infra/scripts/test_ubuntu_runtime_mixed.sh
+++ b/infra/scripts/test_ubuntu_runtime_mixed.sh
@@ -32,14 +32,14 @@ popd > /dev/null
BACKENDS=(acl_cl acl_neon cpu)
# Get the intersect of framework test list files
-TESTLIST_PREFIX="Product/out/test/list/frameworktest_list.${TEST_ARCH}"
+TESTLIST_PREFIX="Product/out/test/list/tflite_comparator.${TEST_ARCH}"
SKIPLIST_PREFIX="Product/out/unittest/nnapi_gtest.skip.${TEST_ARCH}-${TEST_OS}"
-sort $TESTLIST_PREFIX.${BACKENDS[0]}.txt > $TESTLIST_PREFIX.intersect.txt
+sort $TESTLIST_PREFIX.${BACKENDS[0]}.list > $TESTLIST_PREFIX.intersect.list
sort $SKIPLIST_PREFIX.${BACKENDS[0]} > $SKIPLIST_PREFIX.union
for BACKEND in "${BACKENDS[@]:1}"; do
- comm -12 <(sort $TESTLIST_PREFIX.intersect.txt) <(sort $TESTLIST_PREFIX.$BACKEND.txt) > $TESTLIST_PREFIX.intersect.next.txt
+ comm -12 <(sort $TESTLIST_PREFIX.intersect.list) <(sort $TESTLIST_PREFIX.$BACKEND.list) > $TESTLIST_PREFIX.intersect.next.list
comm <(sort $SKIPLIST_PREFIX.union) <(sort $SKIPLIST_PREFIX.$BACKEND) | tr -d "[:blank:]" > $SKIPLIST_PREFIX.union.next
- mv $TESTLIST_PREFIX.intersect.next.txt $TESTLIST_PREFIX.intersect.txt
+ mv $TESTLIST_PREFIX.intersect.next.list $TESTLIST_PREFIX.intersect.list
mv $SKIPLIST_PREFIX.union.next $SKIPLIST_PREFIX.union
done
popd > /dev/null
@@ -60,4 +60,4 @@ export OP_BACKEND_AvgPool2D="acl_neon"
export ACL_LAYOUT="NCHW"
export RUY_THREADS=4
NNAPIGTest "acl_cl;acl_neon;cpu" "Product/out/unittest/nnapi_gtest.skip.${TEST_ARCH}-${TEST_OS}.union" "report/mixed"
-TFLiteModelVerification "acl_cl;acl_neon;cpu" "${TESTLIST_PREFIX}.intersect.txt" "report/mixed"
+TFLiteModelVerification "acl_cl;acl_neon;cpu" "${TESTLIST_PREFIX}.intersect.list" "report/mixed"
diff --git a/infra/scripts/tizen_xu4_test.sh b/infra/scripts/tizen_xu4_test.sh
index f412e7f7a..37576ac2e 100755
--- a/infra/scripts/tizen_xu4_test.sh
+++ b/infra/scripts/tizen_xu4_test.sh
@@ -34,17 +34,6 @@ function install_model()
$SDB_CMD push cache.tar.gz $TEST_ROOT/.
rm -rf cache.tar.gz
$SDB_CMD shell tar -zxf $TEST_ROOT/cache.tar.gz -C $TEST_ROOT/Product/out/test/models
-
- # download api test model file for nnfw_api_gtest
- MODEL_CACHE_DIR=$(mktemp -d)
- tests/scripts/models/run_test.sh --download=on --run=off \
- --configdir=tests/scripts/models/nnfw_api_gtest \
- --cachedir=$MODEL_CACHE_DIR
- tar -zcf $MODEL_CACHE_DIR/api_model_test.tar.gz -C $MODEL_CACHE_DIR .
- $SDB_CMD push $MODEL_CACHE_DIR/api_model_test.tar.gz $TEST_ROOT/Product/out/unittest_standalone/nnfw_api_gtest_models/
- $SDB_CMD shell tar -zxf $TEST_ROOT/Product/out/unittest_standalone/nnfw_api_gtest_models/api_model_test.tar.gz \
- -C $TEST_ROOT/Product/out/unittest_standalone/nnfw_api_gtest_models/
- rm -rf $MODEL_CACHE_DIR
popd
}
diff --git a/nnpackage/examples/README.md b/nnpackage/examples/README.md
new file mode 100644
index 000000000..fb0bae35e
--- /dev/null
+++ b/nnpackage/examples/README.md
@@ -0,0 +1,32 @@
+# NNPackage examples
+
+## Package version 1.1.0
+
+### one_op_in_tflite
+
+- Model file: TensorFlow Lite model
+- Only one `ADD` operation
+
+## Package version 1.0.0
+
+### add
+
+- Model file: TensorFlow Lite model
+- Only one `ADD` operation
+
+### add_invalid_manifest
+
+- Model file: TensorFlow Lite model
+- Only one `ADD` operation
+- Invalid manifest: invalid JSON format
+
+### if_dynamic
+
+- Model file: TensorFlow Lite model
+- `IF` operation example with sample input and output
+
+### while_dynamic
+
+- Model file: TensorFlow Lite model
+- `WHILE` operation example with input and output example
+
diff --git a/nnpackage/examples/one_op_in_tflite/add.tflite b/nnpackage/examples/v1.0.0/add/add.tflite
index e748b6843..e748b6843 100644
--- a/nnpackage/examples/one_op_in_tflite/add.tflite
+++ b/nnpackage/examples/v1.0.0/add/add.tflite
Binary files differ
diff --git a/nnpackage/examples/v1.0.0/add/metadata/MANIFEST b/nnpackage/examples/v1.0.0/add/metadata/MANIFEST
new file mode 100644
index 000000000..1d96cce1b
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/add/metadata/MANIFEST
@@ -0,0 +1,7 @@
+{
+ "major-version" : "1",
+ "minor-version" : "0",
+ "patch-version" : "0",
+ "models" : [ "add.tflite" ],
+ "model-types" : [ "tflite" ]
+}
diff --git a/nnpackage/examples/v1.0.0/add_invalid_manifest/add.tflite b/nnpackage/examples/v1.0.0/add_invalid_manifest/add.tflite
new file mode 100644
index 000000000..e748b6843
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/add_invalid_manifest/add.tflite
Binary files differ
diff --git a/nnpackage/examples/v1.0.0/add_invalid_manifest/metadata/MANIFEST b/nnpackage/examples/v1.0.0/add_invalid_manifest/metadata/MANIFEST
new file mode 100644
index 000000000..8b18e4edd
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/add_invalid_manifest/metadata/MANIFEST
@@ -0,0 +1,7 @@
+{
+ "major-version" : "1"
+ "minor-version" : "0"
+ "patch-version" : "0"
+ "models" : [ "add.tflite" ]
+ "model-types" : [ "tflite" ]
+}
diff --git a/nnpackage/examples/v1.0.0/if_dynamic/if_dynamic.tflite b/nnpackage/examples/v1.0.0/if_dynamic/if_dynamic.tflite
new file mode 100644
index 000000000..680a8b17e
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/if_dynamic/if_dynamic.tflite
Binary files differ
diff --git a/nnpackage/examples/v1.0.0/if_dynamic/metadata/MANIFEST b/nnpackage/examples/v1.0.0/if_dynamic/metadata/MANIFEST
new file mode 100644
index 000000000..0fea9800f
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/if_dynamic/metadata/MANIFEST
@@ -0,0 +1,7 @@
+{
+ "major-version" : "1",
+ "minor-version" : "0",
+ "patch-version" : "0",
+ "models" : [ "if_dynamic.tflite" ],
+ "model-types" : [ "tflite" ]
+}
diff --git a/nnpackage/examples/v1.0.0/if_dynamic/metadata/tc/expected.h5 b/nnpackage/examples/v1.0.0/if_dynamic/metadata/tc/expected.h5
new file mode 100644
index 000000000..d1a47b9e3
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/if_dynamic/metadata/tc/expected.h5
Binary files differ
diff --git a/nnpackage/examples/v1.0.0/if_dynamic/metadata/tc/input.h5 b/nnpackage/examples/v1.0.0/if_dynamic/metadata/tc/input.h5
new file mode 100644
index 000000000..1309ed51a
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/if_dynamic/metadata/tc/input.h5
Binary files differ
diff --git a/nnpackage/examples/v1.0.0/while_dynamic/metadata/MANIFEST b/nnpackage/examples/v1.0.0/while_dynamic/metadata/MANIFEST
new file mode 100644
index 000000000..cfe19ad2c
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/while_dynamic/metadata/MANIFEST
@@ -0,0 +1,7 @@
+{
+ "major-version" : "1",
+ "minor-version" : "0",
+ "patch-version" : "0",
+ "models" : [ "while_dynamic.tflite" ],
+ "model-types" : [ "tflite" ]
+}
diff --git a/nnpackage/examples/v1.0.0/while_dynamic/metadata/tc/expected.h5 b/nnpackage/examples/v1.0.0/while_dynamic/metadata/tc/expected.h5
new file mode 100644
index 000000000..5d5eec6f8
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/while_dynamic/metadata/tc/expected.h5
Binary files differ
diff --git a/nnpackage/examples/v1.0.0/while_dynamic/metadata/tc/input.h5 b/nnpackage/examples/v1.0.0/while_dynamic/metadata/tc/input.h5
new file mode 100644
index 000000000..75f09095c
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/while_dynamic/metadata/tc/input.h5
Binary files differ
diff --git a/nnpackage/examples/v1.0.0/while_dynamic/while_dynamic.tflite b/nnpackage/examples/v1.0.0/while_dynamic/while_dynamic.tflite
new file mode 100644
index 000000000..6f201d504
--- /dev/null
+++ b/nnpackage/examples/v1.0.0/while_dynamic/while_dynamic.tflite
Binary files differ
diff --git a/nnpackage/examples/v1.1.0/one_op_in_tflite/add.tflite b/nnpackage/examples/v1.1.0/one_op_in_tflite/add.tflite
new file mode 100644
index 000000000..e748b6843
--- /dev/null
+++ b/nnpackage/examples/v1.1.0/one_op_in_tflite/add.tflite
Binary files differ
diff --git a/nnpackage/examples/one_op_in_tflite/metadata/MANIFEST b/nnpackage/examples/v1.1.0/one_op_in_tflite/metadata/MANIFEST
index 3ed12f99d..3ed12f99d 100644
--- a/nnpackage/examples/one_op_in_tflite/metadata/MANIFEST
+++ b/nnpackage/examples/v1.1.0/one_op_in_tflite/metadata/MANIFEST
diff --git a/nnpackage/examples/one_op_in_tflite/metadata/config.cfg b/nnpackage/examples/v1.1.0/one_op_in_tflite/metadata/config.cfg
index 776fa7024..776fa7024 100644
--- a/nnpackage/examples/one_op_in_tflite/metadata/config.cfg
+++ b/nnpackage/examples/v1.1.0/one_op_in_tflite/metadata/config.cfg
diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec
index 028d88b61..2eba073fd 100644
--- a/packaging/nnfw.spec
+++ b/packaging/nnfw.spec
@@ -1,6 +1,6 @@
Name: nnfw
Summary: nnfw
-Version: 1.12.0
+Version: 1.15.0
Release: 1
Group: Development
License: Apache-2.0 and MIT and BSD-2-Clause
@@ -91,6 +91,9 @@ NNFW test rpm. It does not depends on nnfw rpm since it contains nnfw runtime.
%ifarch aarch64
%define target_arch aarch64
%endif
+%ifarch %ix86
+%define target_arch i686
+%endif
%define install_dir %{_prefix}
%define install_path %{buildroot}%{install_dir}
@@ -128,7 +131,7 @@ tar -xf %{SOURCE1010} -C ./externals
tar -xf %{SOURCE1011} -C ./externals
%build
-%ifarch arm armv7l aarch64 x86_64
+%ifarch arm armv7l aarch64 x86_64 %ix86
# runtime build
%{build_env} ./nnfw configure %{build_options} %{extra_option}
%{build_env} ./nnfw build -j4
@@ -149,7 +152,7 @@ tar -zcf test-suite.tar.gz infra/scripts
%endif # arm armv7l aarch64
%install
-%ifarch arm armv7l aarch64 x86_64
+%ifarch arm armv7l aarch64 x86_64 %ix86
mkdir -p %{buildroot}%{_libdir}
mkdir -p %{buildroot}%{_bindir}
@@ -193,14 +196,14 @@ install -m 0644 ./tests/scripts/build_path.txt %{buildroot}%{test_install_dir}/t
%files
%manifest %{name}.manifest
%defattr(-,root,root,-)
-%ifarch arm armv7l aarch64 x86_64
+%ifarch arm armv7l aarch64 x86_64 %ix86
%{_libdir}/*.so
%endif
%files devel
%manifest %{name}.manifest
%defattr(-,root,root,-)
-%ifarch arm armv7l aarch64 x86_64
+%ifarch arm armv7l aarch64 x86_64 %ix86
%dir %{_includedir}/nnfw
%{_includedir}/nnfw/*
%{_libdir}/pkgconfig/nnfw.pc
@@ -209,13 +212,13 @@ install -m 0644 ./tests/scripts/build_path.txt %{buildroot}%{test_install_dir}/t
%files plugin-devel
%manifest %{name}.manifest
%defattr(-,root,root,-)
-%ifarch arm armv7l aarch64 x86_64
+%ifarch arm armv7l aarch64 x86_64 %ix86
%dir %{_includedir}/onert
%{_includedir}/onert/*
%{_libdir}/pkgconfig/nnfw-plugin.pc
%endif
-%ifarch arm armv7l aarch64 x86_64
+%ifarch arm armv7l aarch64 x86_64 %ix86
%files minimal-app
%manifest %{name}.manifest
%defattr(-,root,root,-)
diff --git a/res/TensorFlowLiteRecipes/BroadcastTo_000/test.recipe b/res/TensorFlowLiteRecipes/BroadcastTo_000/test.recipe
new file mode 100644
index 000000000..015e40bc4
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/BroadcastTo_000/test.recipe
@@ -0,0 +1,24 @@
+operand {
+ name: "bc_input"
+ type: FLOAT32
+ shape { dim: 2 dim: 3 }
+}
+operand {
+ name: "bc_shape"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "1" arg: "2" arg: "3" }
+}
+operand {
+ name: "bc_ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 2 dim: 3 }
+}
+operation {
+ type: "BroadcastTo"
+ input: "bc_input"
+ input: "bc_shape"
+ output: "bc_ofm"
+}
+input: "bc_input"
+output: "bc_ofm"
diff --git a/res/TensorFlowLiteRecipes/ExpandDims_004/test.recipe b/res/TensorFlowLiteRecipes/ExpandDims_004/test.recipe
new file mode 100644
index 000000000..20e6555f7
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/ExpandDims_004/test.recipe
@@ -0,0 +1,30 @@
+operand {
+ name: "ifm1"
+ type: FLOAT32
+ shape { dim: 3 dim: 3 }
+}
+
+operand {
+ name: "ifm2"
+ type: INT32
+ shape { }
+ filler {
+ tag: "constant"
+ arg: "-1"
+ }
+}
+
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 3 dim: 3 dim: 1 }
+}
+
+operation {
+ type: "ExpandDims"
+ input: "ifm1"
+ input: "ifm2"
+ output: "ofm"
+}
+input: "ifm1"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/FakeQuant_000/test.recipe b/res/TensorFlowLiteRecipes/FakeQuant_000/test.recipe
new file mode 100644
index 000000000..c96466f83
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/FakeQuant_000/test.recipe
@@ -0,0 +1,25 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+
+operation {
+ type: "FakeQuant"
+ fakequant_options {
+ min: 0.0
+ max: 1.0
+ num_bits: 8
+ narrow_range: false
+ }
+ input: "ifm"
+ output: "ofm"
+}
+
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/FakeQuant_000/test.reverse b/res/TensorFlowLiteRecipes/FakeQuant_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/FakeQuant_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_000/test.recipe b/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_000/test.recipe
new file mode 100644
index 000000000..5069aac09
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_000/test.recipe
@@ -0,0 +1,63 @@
+operand {
+ name: "bc_input"
+ type: FLOAT32
+ shape { dim: 2 dim: 3 }
+}
+operand {
+ name: "bc_shape"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "1" arg: "2" arg: "3" }
+}
+operand {
+ name: "bc_ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 2 dim: 3 }
+}
+operation {
+ type: "BroadcastTo"
+ input: "bc_input"
+ input: "bc_shape"
+ output: "bc_ofm"
+}
+operand {
+ name: "reshape_data"
+ type: FLOAT32
+ shape { dim: 2 dim: 3 }
+}
+operand {
+ name: "reshape_shape"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "1" arg: "2" arg: "3" }
+}
+operand {
+ name: "reshape_ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 2 dim: 3 }
+}
+operation {
+ type: "Reshape"
+ reshape_options {
+ new_shape: 1
+ new_shape: 2
+ new_shape: 3
+ }
+ input: "reshape_data"
+ input: "reshape_shape"
+ output: "reshape_ofm"
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 2 dim: 3 }
+}
+operation {
+ type: "AddV2"
+ input: "bc_ofm"
+ input: "reshape_ofm"
+ output: "ofm"
+}
+input: "bc_input"
+input: "reshape_data"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_000/test.rule b/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_000/test.rule
new file mode 100644
index 000000000..fdaa7904a
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_000/test.rule
@@ -0,0 +1,7 @@
+# To check if BroadcastTo and AddV2 are fused to Add op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "ADD_EXIST" $(op_count ADD) '=' 1
+RULE "NO_BroadcastTo" $(op_count 'CUSTOM(BroadcastTo)') '=' 0
+RULE "NO_AddV2" $(op_count 'CUSTOM(AddV2)') '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_001/test.recipe b/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_001/test.recipe
new file mode 100644
index 000000000..ca0ad8e03
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_001/test.recipe
@@ -0,0 +1,63 @@
+operand {
+ name: "bc_input"
+ type: INT64
+ shape { dim: 2 dim: 3 }
+}
+operand {
+ name: "bc_shape"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "1" arg: "2" arg: "3" }
+}
+operand {
+ name: "bc_ofm"
+ type: INT64
+ shape { dim: 1 dim: 2 dim: 3 }
+}
+operation {
+ type: "BroadcastTo"
+ input: "bc_input"
+ input: "bc_shape"
+ output: "bc_ofm"
+}
+operand {
+ name: "reshape_data"
+ type: INT64
+ shape { dim: 2 dim: 3 }
+}
+operand {
+ name: "reshape_shape"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "1" arg: "2" arg: "3" }
+}
+operand {
+ name: "reshape_ofm"
+ type: INT64
+ shape { dim: 1 dim: 2 dim: 3 }
+}
+operation {
+ type: "Reshape"
+ reshape_options {
+ new_shape: 1
+ new_shape: 2
+ new_shape: 3
+ }
+ input: "reshape_data"
+ input: "reshape_shape"
+ output: "reshape_ofm"
+}
+operand {
+ name: "ofm"
+ type: INT64
+ shape { dim: 1 dim: 2 dim: 3 }
+}
+operation {
+ type: "AddV2"
+ input: "bc_ofm"
+ input: "reshape_ofm"
+ output: "ofm"
+}
+input: "bc_input"
+input: "reshape_data"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_001/test.rule b/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_001/test.rule
new file mode 100644
index 000000000..d34458999
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_BroadcastTo_AddV2_001/test.rule
@@ -0,0 +1,7 @@
+# To check if BroadcastTo and AddV2 are not fused to Add op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "BroadcastTo_EXIST" $(op_count 'CUSTOM(BroadcastTo)') '=' 1
+RULE "AddV2_EXIST" $(op_count 'CUSTOM(AddV2)') '=' 1
+RULE "NO_ADD" $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_000/test.recipe
new file mode 100644
index 000000000..5ee07b456
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_000/test.recipe
@@ -0,0 +1,92 @@
+operand {
+ name: "ifm_conv"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 32 }
+}
+operand {
+ name: "filter"
+ type: FLOAT32
+ shape { dim: 64 dim: 1 dim: 1 dim: 32 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "ofm_conv"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operand {
+ name: "mul_const"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "add_const"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "ofm_mul"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operand {
+ name: "ofm_add"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operation {
+ type: "Conv2D"
+ conv2d_options {
+ padding: VALID
+ stride_w: 2
+ stride_h: 2
+ }
+ input: "ifm_conv"
+ input: "filter"
+ input: "bias"
+ output: "ofm_conv"
+}
+operation {
+ type: "Mul"
+ input: "ofm_conv"
+ input: "mul_const"
+ output: "ofm_mul"
+ mul_options {
+ activation: NONE
+ }
+}
+operation {
+ type: "Add"
+ input: "ofm_mul"
+ input: "add_const"
+ output: "ofm_add"
+ add_options {
+ activation: NONE
+ }
+}
+input: "ifm_conv"
+output: "ofm_add"
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_000/test.rule b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_000/test.rule
new file mode 100644
index 000000000..00a25dfd6
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_000/test.rule
@@ -0,0 +1,7 @@
+# To check if Add and Mul are fused to Convolution op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "CONV_EXIST" $(op_count CONV_2D) '=' 1
+RULE "NO_MUL" $(op_count MUL) '=' 0
+RULE "NO_ADD" $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_001/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_001/test.recipe
new file mode 100644
index 000000000..04bdd5ae0
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_001/test.recipe
@@ -0,0 +1,92 @@
+operand {
+ name: "ifm_conv"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 32 }
+}
+operand {
+ name: "filter"
+ type: FLOAT32
+ shape { dim: 64 dim: 1 dim: 1 dim: 32 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "ofm_conv"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operand {
+ name: "mul_const"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "add_const"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "ofm_mul"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operand {
+ name: "ofm_add"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operation {
+ type: "Conv2D"
+ conv2d_options {
+ padding: VALID
+ stride_w: 2
+ stride_h: 2
+ }
+ input: "ifm_conv"
+ input: "filter"
+ input: "bias"
+ output: "ofm_conv"
+}
+operation {
+ type: "Mul"
+ input: "ofm_conv"
+ input: "mul_const"
+ output: "ofm_mul"
+ mul_options {
+ activation: NONE
+ }
+}
+operation {
+ type: "Add"
+ input: "ofm_mul"
+ input: "add_const"
+ output: "ofm_add"
+ add_options {
+ activation: RELU
+ }
+}
+input: "ifm_conv"
+output: "ofm_add"
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_001/test.rule b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_001/test.rule
new file mode 100644
index 000000000..7f3511a35
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_001/test.rule
@@ -0,0 +1,7 @@
+# To check if Add(with RELU) and Mul are fused to Convolution op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "CONV_EXIST" $(op_count CONV_2D) '=' 1
+RULE "NO_MUL" $(op_count MUL) '=' 0
+RULE "NO_ADD" $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_002/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_002/test.recipe
new file mode 100644
index 000000000..e3fe1e315
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_002/test.recipe
@@ -0,0 +1,92 @@
+operand {
+ name: "ifm_conv"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 32 }
+}
+operand {
+ name: "filter"
+ type: FLOAT32
+ shape { dim: 64 dim: 1 dim: 1 dim: 32 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "ofm_conv"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operand {
+ name: "mul_const"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "add_const"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "ofm_mul"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operand {
+ name: "ofm_add"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operation {
+ type: "Conv2D"
+ conv2d_options {
+ padding: VALID
+ stride_w: 2
+ stride_h: 2
+ }
+ input: "ifm_conv"
+ input: "filter"
+ input: "bias"
+ output: "ofm_conv"
+}
+operation {
+ type: "Mul"
+ input: "mul_const"
+ input: "ofm_conv"
+ output: "ofm_mul"
+ mul_options {
+ activation: NONE
+ }
+}
+operation {
+ type: "Add"
+ input: "add_const"
+ input: "ofm_mul"
+ output: "ofm_add"
+ add_options {
+ activation: NONE
+ }
+}
+input: "ifm_conv"
+output: "ofm_add"
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_002/test.rule b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_002/test.rule
new file mode 100644
index 000000000..329d1752c
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_002/test.rule
@@ -0,0 +1,7 @@
+# To check if Add and Mul with reversed input order are fused to Convolution op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "CONV_EXIST" $(op_count CONV_2D) '=' 1
+RULE "NO_MUL" $(op_count MUL) '=' 0
+RULE "NO_ADD" $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_003/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_003/test.recipe
new file mode 100644
index 000000000..d7673169e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_003/test.recipe
@@ -0,0 +1,92 @@
+operand {
+ name: "ifm_conv"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 32 }
+}
+operand {
+ name: "filter"
+ type: FLOAT32
+ shape { dim: 64 dim: 1 dim: 1 dim: 32 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "ofm_conv"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operand {
+ name: "mul_const"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "add_const"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 64 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "ofm_mul"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operand {
+ name: "ofm_add"
+ type: FLOAT32
+ shape { dim: 1 dim: 32 dim: 32 dim: 64 }
+}
+operation {
+ type: "Conv2D"
+ conv2d_options {
+ padding: VALID
+ stride_w: 2
+ stride_h: 2
+ }
+ input: "ifm_conv"
+ input: "filter"
+ input: "bias"
+ output: "ofm_conv"
+}
+operation {
+ type: "Mul"
+ input: "ofm_conv"
+ input: "mul_const"
+ output: "ofm_mul"
+ mul_options {
+ activation: RELU
+ }
+}
+operation {
+ type: "Add"
+ input: "ofm_mul"
+ input: "add_const"
+ output: "ofm_add"
+ add_options {
+ activation: NONE
+ }
+}
+input: "ifm_conv"
+output: "ofm_add"
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_003/test.rule b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_003/test.rule
new file mode 100644
index 000000000..9e158e3d6
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Add_Mul_003/test.rule
@@ -0,0 +1,7 @@
+# To check if Add and Mul are not fused to Convolution op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "CONV_EXIST" $(op_count CONV_2D) '=' 1
+RULE "MUL_EXIST" $(op_count MUL) '=' 1
+RULE "ADD_EXIST" $(op_count ADD) '=' 1
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Min_Max_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Min_Max_000/test.recipe
new file mode 100644
index 000000000..6d166f0bf
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Min_Max_000/test.recipe
@@ -0,0 +1,121 @@
+operand {
+ name: "Placeholder"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "Const_4"
+ type: FLOAT32
+ shape { }
+ filler { tag: "explicit" arg: "6" }
+}
+operand {
+ name: "Const_5"
+ type: FLOAT32
+ shape { }
+ filler { tag: "explicit" arg: "0" }
+}
+operand {
+ name: "Conv2D_1"
+ type: FLOAT32
+ shape { dim: 3 dim: 3 dim: 3 dim: 3 }
+ filler { tag: "gaussian" arg: "0.0" arg: "0.1" }
+}
+operand {
+ name: "Conv2D_2"
+ type: FLOAT32
+ shape { dim: 3 }
+ filler { tag: "gaussian" arg: "0.0" arg: "0.1" }
+}
+operand {
+ name: "Conv2D_21"
+ type: FLOAT32
+ shape { dim: 3 dim: 3 dim: 3 dim: 3 }
+ filler { tag: "gaussian" arg: "0.0" arg: "0.1" }
+}
+operand {
+ name: "Conv2D_11"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "Minimum"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "Maximum"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "Conv2D_22"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "Minimum_1"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "Maximum_1"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operation {
+ type: "Conv2D"
+ input: "Placeholder"
+ input: "Conv2D_1"
+ input: "Conv2D_2"
+ output: "Conv2D_11"
+ conv2d_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ activation: NONE
+ dilation_w_factor: 1
+ dilation_h_factor: 1
+ }
+}
+operation {
+ type: "Minimum"
+ input: "Conv2D_11"
+ input: "Const_4"
+ output: "Minimum"
+}
+operation {
+ type: "Maximum"
+ input: "Minimum"
+ input: "Const_5"
+ output: "Maximum"
+}
+operation {
+ type: "Conv2D"
+ input: "Maximum"
+ input: "Conv2D_21"
+ input: "Conv2D_2"
+ output: "Conv2D_22"
+ conv2d_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ activation: NONE
+ dilation_w_factor: 1
+ dilation_h_factor: 1
+ }
+}
+operation {
+ type: "Minimum"
+ input: "Conv2D_22"
+ input: "Const_4"
+ output: "Minimum_1"
+}
+operation {
+ type: "Maximum"
+ input: "Minimum_1"
+ input: "Const_5"
+ output: "Maximum_1"
+}
+input: "Placeholder"
+output: "Maximum_1"
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Min_Max_000/test.rule b/res/TensorFlowLiteRecipes/Net_Conv_Min_Max_000/test.rule
new file mode 100644
index 000000000..a67530afd
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Min_Max_000/test.rule
@@ -0,0 +1,8 @@
+# To check if Minimum and Maximum are converted to Relu6 op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "CONV_EXIST" $(op_count CONV_2D) '=' 2
+RULE "RELU6_EXIST" $(op_count RELU6) '=' 2
+RULE "MIN_NOT_EXIST" $(op_count MINUMUM) '=' 0
+RULE "MAX_NOT_EXIST" $(op_count MAXIMUM) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Relu6_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Conv_Relu6_000/test.recipe
new file mode 100644
index 000000000..f6be63f84
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Relu6_000/test.recipe
@@ -0,0 +1,85 @@
+operand {
+ name: "Placeholder"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "Conv2D_1"
+ type: FLOAT32
+ shape { dim: 3 dim: 3 dim: 3 dim: 3 }
+ filler { tag: "gaussian" arg: "0.0" arg: "0.1" }
+}
+operand {
+ name: "Conv2D_2"
+ type: FLOAT32
+ shape { dim: 3 }
+ filler { tag: "gaussian" arg: "0.0" arg: "0.1" }
+}
+operand {
+ name: "Conv2D_21"
+ type: FLOAT32
+ shape { dim: 3 dim: 3 dim: 3 dim: 3 }
+ filler { tag: "gaussian" arg: "0.0" arg: "0.1" }
+}
+operand {
+ name: "Conv2D_11"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "ReLU6"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "Conv2D_22"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operand {
+ name: "ReLU6_1"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+}
+operation {
+ type: "Conv2D"
+ input: "Placeholder"
+ input: "Conv2D_1"
+ input: "Conv2D_2"
+ output: "Conv2D_11"
+ conv2d_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ activation: NONE
+ dilation_w_factor: 1
+ dilation_h_factor: 1
+ }
+}
+operation {
+ type: "ReLU6"
+ input: "Conv2D_11"
+ output: "ReLU6"
+}
+operation {
+ type: "Conv2D"
+ input: "ReLU6"
+ input: "Conv2D_21"
+ input: "Conv2D_2"
+ output: "Conv2D_22"
+ conv2d_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ activation: NONE
+ dilation_w_factor: 1
+ dilation_h_factor: 1
+ }
+}
+operation {
+ type: "ReLU6"
+ input: "Conv2D_22"
+ output: "ReLU6_1"
+}
+input: "Placeholder"
+output: "ReLU6_1"
diff --git a/res/TensorFlowLiteRecipes/Net_Conv_Relu6_000/test.rule b/res/TensorFlowLiteRecipes/Net_Conv_Relu6_000/test.rule
new file mode 100644
index 000000000..34d5d663d
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Conv_Relu6_000/test.rule
@@ -0,0 +1,6 @@
+# To check if ReLU6 is fused to Convolution op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "CONV_EXIST" $(op_count CONV_2D) '=' 2
+RULE "RELU6_NOT_EXIST" $(op_count RELU6) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_DwConv_BN_000/test.recipe b/res/TensorFlowLiteRecipes/Net_DwConv_BN_000/test.recipe
new file mode 100644
index 000000000..f9769273f
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_DwConv_BN_000/test.recipe
@@ -0,0 +1,91 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 8 }
+}
+operand {
+ name: "filter"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 8 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 8 }
+ filler {
+ tag: "constant"
+ arg: "1.1"
+ }
+}
+operand {
+ name: "scale"
+ type: FLOAT32
+ shape { dim: 8 }
+ filler {
+ tag: "constant"
+ arg: "1.1"
+ }
+}
+operand {
+ name: "shift"
+ type: FLOAT32
+ shape { dim: 8 }
+ filler {
+ tag: "constant"
+ arg: "1.1"
+ }
+}
+operand {
+ name: "dwout"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 8 }
+}
+operand {
+ name: "mulout"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 8 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 8 }
+}
+operation {
+ type: "DepthwiseConv2D"
+ depthwiseconv2d_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ depth_multiplier: 1
+ activation: NONE
+ }
+ input: "ifm"
+ input: "filter"
+ input: "bias"
+ output: "dwout"
+}
+operation {
+ type: "Mul"
+ input: "dwout"
+ input: "scale"
+ output: "mulout"
+ mul_options {
+ activation: NONE
+ }
+}
+operation {
+ type: "Add"
+ input: "mulout"
+ input: "shift"
+ output: "ofm"
+ add_options {
+ activation: RELU6
+ }
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Net_DwConv_BN_000/test.rule b/res/TensorFlowLiteRecipes/Net_DwConv_BN_000/test.rule
new file mode 100644
index 000000000..eb0cba835
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_DwConv_BN_000/test.rule
@@ -0,0 +1,7 @@
+# To check if BatchNorm op (mul + add) is fused to Depthwise Convolution op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "DWCONV_EXIST" $(op_count DEPTHWISE_CONV_2D) '=' 1
+RULE "NO_MUL" $(op_count MUL) '=' 0
+RULE "NO_ADD" $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_DwConv_BN_001/test.recipe b/res/TensorFlowLiteRecipes/Net_DwConv_BN_001/test.recipe
new file mode 100644
index 000000000..4bbfd841c
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_DwConv_BN_001/test.recipe
@@ -0,0 +1,91 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 8 }
+}
+operand {
+ name: "filter"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 8 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 8 }
+ filler {
+ tag: "constant"
+ arg: "1.1"
+ }
+}
+operand {
+ name: "scale"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 8 }
+ filler {
+ tag: "constant"
+ arg: "1.1"
+ }
+}
+operand {
+ name: "shift"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 1 dim: 8 }
+ filler {
+ tag: "constant"
+ arg: "1.1"
+ }
+}
+operand {
+ name: "dwout"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 8 }
+}
+operand {
+ name: "mulout"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 8 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 64 dim: 64 dim: 8 }
+}
+operation {
+ type: "DepthwiseConv2D"
+ depthwiseconv2d_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ depth_multiplier: 1
+ activation: NONE
+ }
+ input: "ifm"
+ input: "filter"
+ input: "bias"
+ output: "dwout"
+}
+operation {
+ type: "Mul"
+ input: "dwout"
+ input: "scale"
+ output: "mulout"
+ mul_options {
+ activation: NONE
+ }
+}
+operation {
+ type: "Add"
+ input: "mulout"
+ input: "shift"
+ output: "ofm"
+ add_options {
+ activation: RELU6
+ }
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Net_DwConv_BN_001/test.rule b/res/TensorFlowLiteRecipes/Net_DwConv_BN_001/test.rule
new file mode 100644
index 000000000..eb0cba835
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_DwConv_BN_001/test.rule
@@ -0,0 +1,7 @@
+# To check if BatchNorm op (mul + add) is fused to Depthwise Convolution op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "DWCONV_EXIST" $(op_count DEPTHWISE_CONV_2D) '=' 1
+RULE "NO_MUL" $(op_count MUL) '=' 0
+RULE "NO_ADD" $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.recipe b/res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.recipe
index 92087829c..a79517484 100644
--- a/res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.recipe
+++ b/res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.recipe
@@ -18,7 +18,7 @@ operand {
name: "sequential/instance_normalization/stack"
type: INT32
shape {
- dim: 5
+ dim: 4
}
filler {
tag: "explicit"
@@ -26,7 +26,6 @@ operand {
arg: "32"
arg: "32"
arg: "8"
- arg: "1"
}
}
operand {
@@ -51,7 +50,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
filler {
tag: "explicit"
@@ -73,7 +71,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
filler {
tag: "explicit"
@@ -101,13 +98,12 @@ operand {
name: "sequential/instance_normalization/moments/variance/reduction_indices"
type: INT32
shape {
- dim: 3
+ dim: 2
}
filler {
tag: "explicit"
arg: "1"
arg: "2"
- arg: "4"
}
}
operand {
@@ -118,7 +114,6 @@ operand {
dim: 32
dim: 32
dim: 8
- dim: 1
}
}
operand {
@@ -129,7 +124,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
}
operand {
@@ -140,7 +134,6 @@ operand {
dim: 32
dim: 32
dim: 8
- dim: 1
}
}
operand {
@@ -151,7 +144,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
}
operand {
@@ -162,7 +154,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
}
operand {
@@ -173,7 +164,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
}
operand {
@@ -184,7 +174,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
}
operand {
@@ -195,7 +184,6 @@ operand {
dim: 32
dim: 32
dim: 8
- dim: 1
}
}
operand {
@@ -206,7 +194,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
}
operand {
@@ -217,7 +204,6 @@ operand {
dim: 1
dim: 1
dim: 8
- dim: 1
}
}
operand {
@@ -228,7 +214,6 @@ operand {
dim: 32
dim: 32
dim: 8
- dim: 1
}
}
operand {
@@ -242,14 +227,8 @@ operand {
}
}
operation {
- type: "Reshape"
- input: "input_layer"
- input: "sequential/instance_normalization/stack"
- output: "sequential/instance_normalization/Reshape"
-}
-operation {
type: "Mean"
- input: "sequential/instance_normalization/Reshape"
+ input: "input_layer"
input: "sequential/instance_normalization/moments/variance/reduction_indices"
output: "sequential/instance_normalization/moments/mean"
mean_options {
@@ -258,7 +237,7 @@ operation {
}
operation {
type: "SquaredDifference"
- input: "sequential/instance_normalization/Reshape"
+ input: "input_layer"
input: "sequential/instance_normalization/moments/mean"
output: "sequential/instance_normalization/moments/SquaredDifference"
}
@@ -296,7 +275,7 @@ operation {
}
operation {
type: "Mul"
- input: "sequential/instance_normalization/Reshape"
+ input: "input_layer"
input: "sequential/instance_normalization/batchnorm/mul"
output: "sequential/instance_normalization/batchnorm/mul_1"
mul_options {
@@ -330,11 +309,5 @@ operation {
activation: NONE
}
}
-operation {
- type: "Reshape"
- input: "sequential/instance_normalization/batchnorm/add_1"
- input: "sequential/instance_normalization/Shape"
- output: "Identity"
-}
input: "input_layer"
-output: "Identity"
+output: "sequential/instance_normalization/batchnorm/add_1"
diff --git a/res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.rule b/res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.rule
index 650827f4e..d6e47712f 100644
--- a/res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.rule
+++ b/res/TensorFlowLiteRecipes/Net_InstanceNorm_002/test.rule
@@ -3,6 +3,6 @@
RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
RULE "INSTANCE_NORM_EXIST" $(op_count INSTANCE_NORM) '=' 1
-RULE "RESHAPE_EXIST" $(op_count RESHAPE) '=' 3
+RULE "RESHAPE_EXIST" $(op_count RESHAPE) '<=' 3
RULE "NO_ADD" $(op_count ADD) '=' 0
RULE "NO_MUL" $(op_count MUL) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_Maximum_Minimum_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Maximum_Minimum_000/test.recipe
new file mode 100644
index 000000000..e1d3c0a09
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Maximum_Minimum_000/test.recipe
@@ -0,0 +1,86 @@
+operand {
+ name: "Const"
+ type: FLOAT32
+ shape {
+ }
+ filler {
+ tag: "explicit"
+ arg: "6"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "Const_1"
+ type: FLOAT32
+ shape {
+ }
+ filler {
+ tag: "explicit"
+ arg: "0"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "Hole"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 3
+ dim: 3
+ dim: 4
+ }
+ quant {
+ min: 0
+ max: 255
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "Maximum"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 3
+ dim: 3
+ dim: 4
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "Minimum"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 3
+ dim: 3
+ dim: 4
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operation {
+ type: "Minimum"
+ input: "Hole"
+ input: "Const"
+ output: "Minimum"
+}
+operation {
+ type: "Maximum"
+ input: "Minimum"
+ input: "Const_1"
+ output: "Maximum"
+}
+input: "Hole"
+output: "Maximum"
diff --git a/res/TensorFlowLiteRecipes/Net_Maximum_Minimum_000/test.rule b/res/TensorFlowLiteRecipes/Net_Maximum_Minimum_000/test.rule
new file mode 100644
index 000000000..9d6340727
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Maximum_Minimum_000/test.rule
@@ -0,0 +1,7 @@
+# To check if Maximum and Minimum are fused to Relu6.
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "RELU6_EXIST" $(op_count RELU6) '=' 1
+RULE "NO_MAXIMUM" $(op_count MAXIMUM) '=' 0
+RULE "NO_MINIMUM" $(op_count MINIMUM) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Net_Preactivation_BN_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Preactivation_BN_000/test.recipe
index c12ce9d64..3658a2bff 100644
--- a/res/TensorFlowLiteRecipes/Net_Preactivation_BN_000/test.recipe
+++ b/res/TensorFlowLiteRecipes/Net_Preactivation_BN_000/test.recipe
@@ -7,11 +7,6 @@ operand {
dim: 4
dim: 16
}
- filler {
- tag: "gaussian"
- arg: "0.0"
- arg: "0.1"
- }
}
operand {
name: "Weights1"
diff --git a/res/TensorFlowLiteRecipes/Net_Reshape_Neg_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Reshape_Neg_000/test.recipe
new file mode 100644
index 000000000..51cf3b4ca
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Reshape_Neg_000/test.recipe
@@ -0,0 +1,35 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 2 dim: 3 dim: 6 }
+}
+operand {
+ name: "shape1"
+ type: INT32
+ shape { dim: 2 }
+ filler { tag: "explicit" arg: "6" arg: "6" }
+}
+operand {
+ name: "reshape_out"
+ type: FLOAT32
+ shape { dim: 6 dim: 6 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 6 dim: 6 }
+}
+operation {
+ type: "Reshape"
+ input: "ifm"
+ input: "shape1"
+ output: "reshape_out"
+}
+operation {
+ type: "Neg"
+ input: "reshape_out"
+ output: "ofm"
+}
+
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Net_Reshape_Reshape_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Reshape_Reshape_000/test.recipe
new file mode 100644
index 000000000..2acb2e71b
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Reshape_Reshape_000/test.recipe
@@ -0,0 +1,42 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 2 dim: 3 dim: 6 }
+}
+operand {
+ name: "shape1"
+ type: INT32
+ shape { dim: 2 }
+ filler { tag: "explicit" arg: "6" arg: "6" }
+}
+operand {
+ name: "shape2"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "6" arg: "2" arg: "3" }
+}
+operand {
+ name: "reshape_out"
+ type: FLOAT32
+ shape { dim: 6 dim: 6 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 6 dim: 2 dim: 3 }
+}
+operation {
+ type: "Reshape"
+ input: "ifm"
+ input: "shape1"
+ output: "reshape_out"
+}
+operation {
+ type: "Reshape"
+ input: "reshape_out"
+ input: "shape2"
+ output: "ofm"
+}
+
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Net_Reshape_Reshape_000/test.rule b/res/TensorFlowLiteRecipes/Net_Reshape_Reshape_000/test.rule
new file mode 100644
index 000000000..9a70601c8
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Reshape_Reshape_000/test.rule
@@ -0,0 +1,5 @@
+# To check if redundant Reshape is removed.
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "RESHAPE_EXIST" $(op_count RESHAPE) '=' 1
diff --git a/res/TensorFlowLiteRecipes/Net_Squeeze_Squeeze_000/test.recipe b/res/TensorFlowLiteRecipes/Net_Squeeze_Squeeze_000/test.recipe
new file mode 100644
index 000000000..b84058b0e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Squeeze_Squeeze_000/test.recipe
@@ -0,0 +1,29 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 1 dim: 1 }
+}
+operand {
+ name: "t1"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 dim: 1 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 }
+}
+operation {
+ type: "Squeeze"
+ squeeze_options { squeeze_dim: 3 }
+ input: "ifm"
+ output: "t1"
+}
+operation {
+ type: "Squeeze"
+ squeeze_options { squeeze_dim: 2 }
+ input: "t1"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Net_Squeeze_Squeeze_000/test.rule b/res/TensorFlowLiteRecipes/Net_Squeeze_Squeeze_000/test.rule
new file mode 100644
index 000000000..66a105a73
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_Squeeze_Squeeze_000/test.rule
@@ -0,0 +1,6 @@
+# To check if Squeeze is substituted with Reshape op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "SQUEEZE_COUNT" $(op_count SQUEEZE) '=' 0
+RULE "RESHAPE_COUNT" $(op_count RESHAPE) '=' 2
diff --git a/res/TensorFlowLiteRecipes/Net_StridedSlice_StridedSlice_000/test.recipe b/res/TensorFlowLiteRecipes/Net_StridedSlice_StridedSlice_000/test.recipe
new file mode 100644
index 000000000..04c0e9084
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_StridedSlice_StridedSlice_000/test.recipe
@@ -0,0 +1,77 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 2 dim: 4 }
+}
+operand {
+ name: "begin"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "0" arg: "0" arg: "0" }
+}
+operand {
+ name: "end"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "1" arg: "2" arg: "4" }
+}
+operand {
+ name: "strides"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "1" arg: "1" arg: "1" }
+}
+operand {
+ name: "output_1"
+ type: FLOAT32
+ shape { dim: 1 dim: 2 dim: 4 }
+}
+operation {
+ type: "StridedSlice"
+ strided_slice_options {
+ begin_mask: 0
+ end_mask: 0
+ ellipsis_mask: 0
+ new_axis_mask: 0
+ shrink_axis_mask: 0
+ }
+ input: "ifm"
+ input: "begin"
+ input: "end"
+ input: "strides"
+ output: "output_1"
+}
+operand {
+ name: "begin_2"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "0" arg: "0" arg: "0" }
+}
+operand {
+ name: "end_2"
+ type: INT32
+ shape { dim: 3 }
+ filler { tag: "explicit" arg: "0" arg: "1" arg: "0" }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 4 }
+}
+operation {
+ type: "StridedSlice"
+ strided_slice_options {
+ begin_mask: 5
+ end_mask: 5
+ ellipsis_mask: 0
+ new_axis_mask: 0
+ shrink_axis_mask: 2
+ }
+ input: "output_1"
+ input: "begin_2"
+ input: "end_2"
+ input: "strides"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Net_StridedSlice_StridedSlice_000/test.rule b/res/TensorFlowLiteRecipes/Net_StridedSlice_StridedSlice_000/test.rule
new file mode 100644
index 000000000..f1a660d19
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_StridedSlice_StridedSlice_000/test.rule
@@ -0,0 +1,5 @@
+# To check if unnecessary StridedSlice is removed.
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "STRIDEDSLICE_EXIST" $(op_count STRIDEDSLICE) '=' 1
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_BN_002/test.recipe b/res/TensorFlowLiteRecipes/Net_TConv_BN_002/test.recipe
new file mode 100644
index 000000000..e40fe4f59
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_BN_002/test.recipe
@@ -0,0 +1,156 @@
+# Tconv with asymmetric filter + BN + Relu6
+operand {
+ name: "Hole"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 1
+ dim: 1
+ dim: 2
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "conv2d_transpose/input_sizes"
+ type: INT32
+ shape {
+ dim: 4
+ }
+ filler {
+ tag: "explicit"
+ arg: "1"
+ arg: "5"
+ arg: "1"
+ arg: "2"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "FusedBatchNormV3"
+ type: FLOAT32
+ shape {
+ dim: 2
+ }
+ filler {
+ tag: "explicit"
+ arg: "-2.04724"
+ arg: "-7.80109"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "FusedBatchNormV3;conv2d_transpose;conv2d_transpose/input_sizes"
+ type: FLOAT32
+ shape {
+ dim: 2
+ dim: 5
+ dim: 1
+ dim: 2
+ }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "0.1"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "FusedBatchNormV3;conv2d_transpose;conv2d_transpose/input_sizes2"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 5
+ dim: 1
+ dim: 2
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operand {
+ name: "FusedBatchNormV3_mul_0"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 5
+ dim: 1
+ dim: 2
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "FusedBatchNormV3_mul_0_param"
+ type: FLOAT32
+ shape {
+ dim: 2
+ }
+ filler {
+ tag: "explicit"
+ arg: "2.00834"
+ arg: "1.00344"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "Relu6"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 5
+ dim: 1
+ dim: 2
+ }
+ quant {
+ quantized_dimension: 0
+ }
+ is_variable: false
+}
+operation {
+ type: "TransposeConv"
+ input: "conv2d_transpose/input_sizes"
+ input: "FusedBatchNormV3;conv2d_transpose;conv2d_transpose/input_sizes"
+ input: "Hole"
+ output: "FusedBatchNormV3;conv2d_transpose;conv2d_transpose/input_sizes2"
+ transpose_conv_options {
+ padding: VALID
+ stride_w: 1
+ stride_h: 1
+ }
+}
+operation {
+ type: "Mul"
+ input: "FusedBatchNormV3;conv2d_transpose;conv2d_transpose/input_sizes2"
+ input: "FusedBatchNormV3_mul_0_param"
+ output: "FusedBatchNormV3_mul_0"
+ mul_options {
+ activation: NONE
+ }
+}
+operation {
+ type: "Add"
+ input: "FusedBatchNormV3_mul_0"
+ input: "FusedBatchNormV3"
+ output: "Relu6"
+ add_options {
+ activation: RELU6
+ }
+}
+input: "Hole"
+output: "Relu6"
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_BN_002/test.rule b/res/TensorFlowLiteRecipes/Net_TConv_BN_002/test.rule
new file mode 100644
index 000000000..dfc392758
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_BN_002/test.rule
@@ -0,0 +1,8 @@
+# To check if BatchNorm op (mul + add) is fused to Transposed Convolution op
+
+RULE "VERIFY_FILE_FORMAT" $(verify_file_format) '=' 1
+
+RULE "TCONV_EXIST" $(op_count TRANSPOSE_CONV) '=' 1
+RULE "RELU6_EXIST" $(op_count RELU6) '=' 1
+RULE "NO_MUL" $(op_count MUL) '=' 0
+RULE "NO_ADD" $(op_count ADD) '=' 0
diff --git a/res/TensorFlowLiteRecipes/Part_Add_Sqrt_000/test.recipe b/res/TensorFlowLiteRecipes/Part_Add_Sqrt_000/test.recipe
new file mode 100644
index 000000000..1125246d1
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Add_Sqrt_000/test.recipe
@@ -0,0 +1,48 @@
+operand {
+ name: "ifm1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ifm2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "add"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "ifm1"
+ input: "ifm2"
+ output: "add"
+}
+operation {
+ type: "Sqrt"
+ input: "add"
+ output: "ofm1"
+}
+operation {
+ type: "Sqrt"
+ input: "add"
+ output: "ofm2"
+}
+input: "ifm1"
+input: "ifm2"
+output: "ofm1"
+output: "ofm2"
diff --git a/res/TensorFlowLiteRecipes/Part_Add_Sqrt_Rsqrt_000/test.recipe b/res/TensorFlowLiteRecipes/Part_Add_Sqrt_Rsqrt_000/test.recipe
new file mode 100644
index 000000000..c9cee9960
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Add_Sqrt_Rsqrt_000/test.recipe
@@ -0,0 +1,68 @@
+operand {
+ name: "ifm1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ifm2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "add"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "ifm1"
+ input: "ifm2"
+ output: "add"
+}
+operation {
+ type: "Sqrt"
+ input: "add"
+ output: "sqrt1"
+}
+operation {
+ type: "Sqrt"
+ input: "add"
+ output: "sqrt2"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt1"
+ output: "ofm1"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt2"
+ output: "ofm2"
+}
+input: "ifm1"
+input: "ifm2"
+output: "ofm1"
+output: "ofm2"
diff --git a/res/TensorFlowLiteRecipes/Part_Add_Sub_000/test.recipe b/res/TensorFlowLiteRecipes/Part_Add_Sub_000/test.recipe
new file mode 100644
index 000000000..8cd878ac3
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Add_Sub_000/test.recipe
@@ -0,0 +1,67 @@
+operand {
+ name: "ifm1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ifm2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ifm3"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ifm4"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "add1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "add2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "ifm1"
+ input: "ifm2"
+ output: "add1"
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "add1"
+ input: "ifm3"
+ output: "add2"
+}
+operation {
+ type: "Sub"
+ sub_options {
+ activation: NONE
+ }
+ input: "add2"
+ input: "ifm4"
+ output: "ofm"
+}
+input: "ifm1"
+input: "ifm2"
+input: "ifm3"
+input: "ifm4"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_000/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_000/test.recipe
new file mode 100644
index 000000000..e0a6fe2aa
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_000/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Sqrt"
+ input: "ifm"
+ output: "sqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_001/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_001/test.recipe
new file mode 100644
index 000000000..89f74772e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_001/test.recipe
@@ -0,0 +1,47 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Sqrt"
+ input: "ifm"
+ output: "sqrt"
+}
+operation {
+ type: "Sqrt"
+ input: "sqrt"
+ output: "sqrt2"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt2"
+ output: "rsqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "rsqrt"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_002/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_002/test.recipe
new file mode 100644
index 000000000..2e7e13240
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_002/test.recipe
@@ -0,0 +1,47 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Sqrt"
+ input: "ifm"
+ output: "sqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt"
+ output: "rsqrt"
+}
+operation {
+ type: "Sqrt"
+ input: "rsqrt"
+ output: "sqrt2"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt2"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_003/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_003/test.recipe
new file mode 100644
index 000000000..1cd57ae12
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_003/test.recipe
@@ -0,0 +1,47 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Sqrt"
+ input: "ifm"
+ output: "sqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt"
+ output: "rsqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "rsqrt"
+ output: "rsqrt2"
+}
+operation {
+ type: "Sqrt"
+ input: "rsqrt2"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_004/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_004/test.recipe
new file mode 100644
index 000000000..3b4458480
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_004/test.recipe
@@ -0,0 +1,38 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Sqrt"
+ input: "ifm"
+ output: "sqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt"
+ output: "ofm1"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt"
+ output: "ofm2"
+}
+input: "ifm"
+output: "ofm1"
+output: "ofm2"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_000/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_000/test.recipe
new file mode 100644
index 000000000..6618fff22
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_000/test.recipe
@@ -0,0 +1,56 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "add"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Rsqrt"
+ input: "ifm"
+ output: "rsqrt"
+}
+operation {
+ type: "Sqrt"
+ input: "rsqrt"
+ output: "sqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "rsqrt"
+ output: "rsqrt2"
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "sqrt"
+ input: "rsqrt2"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_001/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_001/test.recipe
new file mode 100644
index 000000000..dd3f69bea
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_001/test.recipe
@@ -0,0 +1,61 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt3"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Rsqrt"
+ input: "ifm"
+ output: "rsqrt"
+}
+operation {
+ type: "Sqrt"
+ input: "rsqrt"
+ output: "sqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "rsqrt"
+ output: "rsqrt2"
+}
+operation {
+ type: "Rsqrt"
+ input: "rsqrt2"
+ output: "rsqrt3"
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "sqrt"
+ input: "rsqrt3"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_002/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_002/test.recipe
new file mode 100644
index 000000000..23b7458c9
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_002/test.recipe
@@ -0,0 +1,71 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt3"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt4"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Rsqrt"
+ input: "ifm"
+ output: "rsqrt"
+}
+operation {
+ type: "Sqrt"
+ input: "rsqrt"
+ output: "sqrt"
+}
+operation {
+ type: "Rsqrt"
+ input: "rsqrt"
+ output: "rsqrt2"
+}
+operation {
+ type: "Rsqrt"
+ input: "sqrt"
+ output: "rsqrt3"
+}
+operation {
+ type: "Rsqrt"
+ input: "rsqrt2"
+ output: "rsqrt4"
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "rsqrt3"
+ input: "rsqrt4"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_003/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_003/test.recipe
new file mode 100644
index 000000000..c2dae2e86
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_003/test.recipe
@@ -0,0 +1,47 @@
+operand {
+ name: "ifm1"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ifm2"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Rsqrt"
+ input: "ifm1"
+ output: "rsqrt"
+}
+operation {
+ type: "Sqrt"
+ input: "ifm2"
+ output: "sqrt"
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "rsqrt"
+ input: "sqrt"
+ output: "ofm"
+}
+input: "ifm1"
+input: "ifm2"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_004/test.recipe b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_004/test.recipe
new file mode 100644
index 000000000..c1693f72e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Part_Sqrt_Rsqrt_Add_004/test.recipe
@@ -0,0 +1,41 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "rsqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "sqrt"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+}
+operation {
+ type: "Rsqrt"
+ input: "ifm"
+ output: "rsqrt"
+}
+operation {
+ type: "Sqrt"
+ input: "rsqrt"
+ output: "sqrt"
+}
+operation {
+ type: "Add"
+ add_options {
+ activation: NONE
+ }
+ input: "rsqrt"
+ input: "sqrt"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Slice_001/test.recipe b/res/TensorFlowLiteRecipes/Slice_001/test.recipe
new file mode 100644
index 000000000..20f1baab3
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Slice_001/test.recipe
@@ -0,0 +1,37 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 3 dim: 2 dim: 3 }
+}
+operand {
+ name: "begin"
+ type: INT32
+ shape { dim: 3 }
+ filler {
+ tag: "explicit"
+ arg: "-1" arg: "0" arg: "0"
+ }
+}
+operand {
+ name: "size"
+ type: INT32
+ shape { dim: 3 }
+ filler {
+ tag: "explicit"
+ arg: "1" arg: "1" arg: "3"
+ }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 3 }
+}
+operation {
+ type: "Slice"
+ input: "ifm"
+ input: "begin"
+ input: "size"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Slice_001/test.reverse b/res/TensorFlowLiteRecipes/Slice_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Slice_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Squeeze_001/test.recipe b/res/TensorFlowLiteRecipes/Squeeze_001/test.recipe
new file mode 100644
index 000000000..9ac441574
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Squeeze_001/test.recipe
@@ -0,0 +1,18 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 4 dim: 5 dim: 1 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 4 dim: 5 }
+}
+operation {
+ type: "Squeeze"
+ squeeze_options { }
+ input: "ifm"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Squeeze_001/test.reverse b/res/TensorFlowLiteRecipes/Squeeze_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Squeeze_001/test.reverse
diff --git a/res/TensorFlowPythonExamples/examples/Bidirectional_LSTM/__init__.py b/res/TensorFlowPythonExamples/examples/Bidirectional_LSTM/__init__.py
new file mode 100644
index 000000000..d28034bf9
--- /dev/null
+++ b/res/TensorFlowPythonExamples/examples/Bidirectional_LSTM/__init__.py
@@ -0,0 +1,6 @@
+import tensorflow as tf
+
+in_ = tf.compat.v1.placeholder(dtype=tf.float32, shape=[28, 28, 3], name="Hole")
+
+op_uni_ = tf.compat.v1.keras.layers.LSTM(1, time_major=False, return_sequences=True)
+op_bidi_ = tf.compat.v1.keras.layers.Bidirectional(op_uni_)(in_)
diff --git a/res/TensorFlowPythonExamples/examples/fake_quant_with_min_max_vars/__init__.py b/res/TensorFlowPythonExamples/examples/fake_quant_with_min_max_vars/__init__.py
new file mode 100644
index 000000000..c4c928466
--- /dev/null
+++ b/res/TensorFlowPythonExamples/examples/fake_quant_with_min_max_vars/__init__.py
@@ -0,0 +1,27 @@
+import tensorflow as tf
+import numpy as np
+
+tf.compat.v1.disable_eager_execution()
+
+in_ = tf.compat.v1.placeholder(tf.float32, shape=(1, 32, 32, 3), name="Hole")
+
+filters = np.random.uniform(low=-1., high=1, size=[5, 5, 3, 32]).astype(np.float32)
+strides = (1, 2, 2, 1)
+cv_ = tf.compat.v1.nn.conv2d(in_, filters, strides, "VALID", data_format="NHWC")
+
+op_ = tf.compat.v1.fake_quant_with_min_max_vars(cv_, 0.0, 1.0, 8, False)
+'''
+NOTE:
+'fake_quant_with_min_max_vars' is converted to QUANTIZE-DEQUANTIZE in tflite.
+To produce tflite with FAKE_QUANT Op, you need to change tf2tfliteV2.py with
+
+converter.experimental_new_converter = False
+
+and then run
+
+python3 ../../compiler/tf2tfliteV2/tf2tfliteV2.py --v2 --graph_def \
+-i ./fake_quant_with_min_max_vars.pbtxt \
+-o ./fake_quant_with_min_max_vars.tflite \
+-I Hole \
+-O FakeQuantWithMinMaxVars
+'''
diff --git a/res/TensorFlowPythonModels/examples/minimum-maximum/__init__.py b/res/TensorFlowPythonModels/examples/minimum-maximum/__init__.py
new file mode 100644
index 000000000..fe074b49c
--- /dev/null
+++ b/res/TensorFlowPythonModels/examples/minimum-maximum/__init__.py
@@ -0,0 +1,15 @@
+import tensorflow as tf
+
+in_ = tf.compat.v1.placeholder(dtype=tf.float32, shape=(1, 16, 160, 160), name="Hole")
+
+upper_ = tf.compat.v1.constant(6.)
+lower_ = tf.compat.v1.constant(0.)
+
+min_ = tf.compat.v1.minimum(in_, upper_)
+max_ = tf.compat.v1.maximum(min_, lower_)
+'''
+python ../../compiler/tf2tfliteV2/tf2tfliteV2.py --v1 \
+-i minimum-maximum.pbtxt \
+-o minimum-maximum.tflite \
+-I Hole -O Maximum
+'''
diff --git a/res/TensorFlowPythonModels/tfpem.py b/res/TensorFlowPythonModels/tfpem.py
index 01627eb99..542085bb6 100644..100755
--- a/res/TensorFlowPythonModels/tfpem.py
+++ b/res/TensorFlowPythonModels/tfpem.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+
# TensorFlow Python Example Manager
import tensorflow as tf
diff --git a/runtime/contrib/.clang-format b/runtime/contrib/.clang-format
deleted file mode 120000
index f761fe4ae..000000000
--- a/runtime/contrib/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../../.clang-format.8
\ No newline at end of file
diff --git a/runtime/contrib/android/api/Prebuilt.mk b/runtime/contrib/android/api/Prebuilt.mk
index c00c7d3da..63cf2bc7e 100644
--- a/runtime/contrib/android/api/Prebuilt.mk
+++ b/runtime/contrib/android/api/Prebuilt.mk
@@ -5,22 +5,6 @@ ifndef ONERT_PREBUILT_LIB_DIR
$(error ONERT_PREBUILT_LIB_DIR is not set)
endif
-# libcircle_loader
-include $(CLEAR_VARS)
-LOCAL_MODULE := circle_loader
-PREBUILT_LIB += circle_loader
-LOCAL_SRC_FILES := \
- $(ONERT_PREBUILT_LIB_DIR)/libcircle_loader.so
-include $(PREBUILT_SHARED_LIBRARY)
-
-# libtflite_loader
-include $(CLEAR_VARS)
-LOCAL_MODULE := tflite_loader
-PREBUILT_LIB += tflite_loader
-LOCAL_SRC_FILES := \
- $(ONERT_PREBUILT_LIB_DIR)/libtflite_loader.so
-include $(PREBUILT_SHARED_LIBRARY)
-
# libnnfw
include $(CLEAR_VARS)
LOCAL_MODULE := nnfw-dev
diff --git a/runtime/contrib/android/api/build.gradle b/runtime/contrib/android/api/build.gradle
index 6bb7a5631..8f0d61c47 100644
--- a/runtime/contrib/android/api/build.gradle
+++ b/runtime/contrib/android/api/build.gradle
@@ -8,7 +8,7 @@ android {
minSdkVersion 26
targetSdkVersion 29
versionCode 1
- versionName "1.12.0"
+ versionName "1.15.0"
externalNativeBuild {
ndkBuild {
diff --git a/runtime/contrib/style_transfer_app/CMakeLists.txt b/runtime/contrib/style_transfer_app/CMakeLists.txt
index b137231ea..9ffbeaec7 100644
--- a/runtime/contrib/style_transfer_app/CMakeLists.txt
+++ b/runtime/contrib/style_transfer_app/CMakeLists.txt
@@ -32,7 +32,7 @@ endif(JPEG_FOUND)
target_link_libraries(style_transfer_app onert_core onert tflite_loader)
target_link_libraries(style_transfer_app tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite)
target_link_libraries(style_transfer_app nnfw-dev)
-target_link_libraries(tflite_loader_test_tool ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY})
+target_link_libraries(tflite_comparator ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY})
if(JPEG_FOUND)
target_link_libraries(style_transfer_app ${JPEG_LIBRARIES})
endif(JPEG_FOUND)
diff --git a/runtime/libs/.clang-format b/runtime/libs/.clang-format
deleted file mode 120000
index f761fe4ae..000000000
--- a/runtime/libs/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../../.clang-format.8
\ No newline at end of file
diff --git a/runtime/libs/benchmark/src/Phases.cpp b/runtime/libs/benchmark/src/Phases.cpp
index 897b943d3..76993f266 100644
--- a/runtime/libs/benchmark/src/Phases.cpp
+++ b/runtime/libs/benchmark/src/Phases.cpp
@@ -42,7 +42,7 @@ void SleepForMicros(uint64_t micros)
sleep_time.tv_nsec = micros * 1e3;
nanosleep(&sleep_time, nullptr);
}
-}
+} // namespace
namespace benchmark
{
diff --git a/runtime/libs/misc/include/misc/RandomGenerator.h b/runtime/libs/misc/include/misc/RandomGenerator.h
index 8d26b8c74..8da4f7f20 100644
--- a/runtime/libs/misc/include/misc/RandomGenerator.h
+++ b/runtime/libs/misc/include/misc/RandomGenerator.h
@@ -76,6 +76,7 @@ private:
std::normal_distribution<float> _dist;
};
+template <> int8_t RandomGenerator::generate<int8_t>(void);
template <> uint8_t RandomGenerator::generate<uint8_t>(void);
template <> bool RandomGenerator::generate<bool>(void);
template <> int32_t RandomGenerator::generate<int32_t>(void);
diff --git a/runtime/libs/misc/src/RandomGenerator.cpp b/runtime/libs/misc/src/RandomGenerator.cpp
index e7fbc10ca..af072326b 100644
--- a/runtime/libs/misc/src/RandomGenerator.cpp
+++ b/runtime/libs/misc/src/RandomGenerator.cpp
@@ -21,6 +21,34 @@ namespace nnfw
namespace misc
{
+template <> int8_t RandomGenerator::generate<int8_t>(void)
+{
+ // The value of type_range is 255.
+ float type_range = static_cast<float>(std::numeric_limits<int8_t>::max()) -
+ static_cast<float>(std::numeric_limits<int8_t>::min());
+ // Most _dist values range from -5.0 to 5.0.
+ float min_range = -5.0f;
+ float max_range = 5.0f;
+ // NOTE shifted_relative_val follows a Gaussian distribution whose original mean was 0 and
+ // standard deviation was 2. After shifting and scaling, its mean becomes 127.5 and its
+ // range is about [0, 255].
+ float shifted_relative_val = (_dist(_rand) - min_range) * type_range / (max_range - min_range);
+
+ // If shifted_relative_val falls outside the representable range, it is clamped to the
+ // end points of the range.
+ if (shifted_relative_val < -128.0f)
+ {
+ return -128;
+ }
+ else if (shifted_relative_val > type_range)
+ {
+ return 127;
+ }
+
+ // Convert shifted_relative_val from float to int8
+ return static_cast<int8_t>(shifted_relative_val);
+}
+
template <> uint8_t RandomGenerator::generate<uint8_t>(void)
{
// The value of type_range is 255.
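
As a numeric sanity check of the int8 mapping above (illustrative Python mirroring the C++ constants): a sample v ~ N(0, 2) is transformed as (v - (-5)) * 255 / 10, which centers the result at 127.5 with values mostly inside [0, 255], as the NOTE comment describes.

import numpy as np

rng = np.random.default_rng(0)
v = rng.normal(0.0, 2.0, 100_000)
shifted = (v + 5.0) * 255.0 / 10.0
print(shifted.mean())  # ~127.5
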
diff --git a/runtime/libs/profiling/CMakeLists.txt b/runtime/libs/profiling/CMakeLists.txt
index e0398ce93..b115cc1c6 100644
--- a/runtime/libs/profiling/CMakeLists.txt
+++ b/runtime/libs/profiling/CMakeLists.txt
@@ -4,4 +4,3 @@ add_library(nnfw_lib_profiling STATIC ${SOURCES})
set_property(TARGET nnfw_lib_profiling PROPERTY POSITION_INDEPENDENT_CODE ON)
target_include_directories(nnfw_lib_profiling PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_link_libraries(nnfw_lib_profiling PRIVATE nnfw_common)
-target_link_libraries(nnfw_lib_profiling PRIVATE nnfw_coverage)
diff --git a/runtime/libs/rua/anchor/CMakeLists.txt b/runtime/libs/rua/anchor/CMakeLists.txt
index 6e65641f4..fb41c47ea 100644
--- a/runtime/libs/rua/anchor/CMakeLists.txt
+++ b/runtime/libs/rua/anchor/CMakeLists.txt
@@ -6,4 +6,3 @@ target_include_directories(nnfw_lib_rua_anchor PUBLIC include)
target_link_libraries(nnfw_lib_rua_anchor PUBLIC nnfw_lib_rua_core)
target_link_libraries(nnfw_lib_rua_anchor PRIVATE nnfw_lib_rua_dyn)
target_link_libraries(nnfw_lib_rua_anchor PRIVATE nnfw_common)
-target_link_libraries(nnfw_lib_rua_anchor PRIVATE nnfw_coverage)
diff --git a/runtime/libs/rua/dyn/CMakeLists.txt b/runtime/libs/rua/dyn/CMakeLists.txt
index 3f9ac8928..01d8a7c02 100644
--- a/runtime/libs/rua/dyn/CMakeLists.txt
+++ b/runtime/libs/rua/dyn/CMakeLists.txt
@@ -5,4 +5,3 @@ set_target_properties(nnfw_lib_rua_dyn PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(nnfw_lib_rua_dyn PUBLIC include)
target_link_libraries(nnfw_lib_rua_dyn PUBLIC nnfw_lib_rua_core)
target_link_libraries(nnfw_lib_rua_dyn PRIVATE nnfw_common)
-target_link_libraries(nnfw_lib_rua_dyn PRIVATE nnfw_coverage)
diff --git a/runtime/libs/tflite/CMakeLists.txt b/runtime/libs/tflite/CMakeLists.txt
index 93a3c9789..f02c93aa6 100644
--- a/runtime/libs/tflite/CMakeLists.txt
+++ b/runtime/libs/tflite/CMakeLists.txt
@@ -17,7 +17,6 @@ target_link_libraries(nnfw_lib_tflite PUBLIC tensorflow-lite-ex)
target_link_libraries(nnfw_lib_tflite PUBLIC nnfw_lib_misc)
target_link_libraries(nnfw_lib_tflite PRIVATE ${LIB_PTHREAD} dl)
target_link_libraries(nnfw_lib_tflite PRIVATE nnfw_common)
-target_link_libraries(nnfw_lib_tflite PRIVATE nnfw_coverage)
if(NOT ENABLE_TEST)
return()
diff --git a/runtime/libs/tflite/include/tflite/CopyInputInitializer.h b/runtime/libs/tflite/include/tflite/CopyInputInitializer.h
new file mode 100644
index 000000000..866af0598
--- /dev/null
+++ b/runtime/libs/tflite/include/tflite/CopyInputInitializer.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_TFLITE_COPY_INPUT_INITIALIZER_H__
+#define __NNFW_TFLITE_COPY_INPUT_INITIALIZER_H__
+
+#include <tensorflow/lite/interpreter.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+class CopyInputInitializer
+{
+public:
+ CopyInputInitializer(::tflite::Interpreter &from) : _from{from}
+ {
+ // DO NOTHING
+ }
+
+ void run(::tflite::Interpreter &interp);
+
+private:
+ template <typename T> void setValue(::tflite::Interpreter &interp, int tensor_idx);
+
+private:
+ ::tflite::Interpreter &_from;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_COPY_INPUT_INITIALIZER_H__
diff --git a/runtime/libs/tflite/include/tflite/OutputResetter.h b/runtime/libs/tflite/include/tflite/OutputResetter.h
new file mode 100644
index 000000000..424068d88
--- /dev/null
+++ b/runtime/libs/tflite/include/tflite/OutputResetter.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_TFLITE_OUTPUT_RESETTER_H__
+#define __NNFW_TFLITE_OUTPUT_RESETTER_H__
+
+#include <tensorflow/lite/interpreter.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+class OutputResetter
+{
+public:
+ OutputResetter()
+ {
+ // DO NOTHING
+ }
+
+ void run(::tflite::Interpreter &interp);
+
+private:
+ template <typename T> void resetValue(::tflite::Interpreter &interp, int tensor_idx);
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_OUTPUT_RESETTER_H__
diff --git a/runtime/libs/tflite/include/tflite/RandomInputInitializer.h b/runtime/libs/tflite/include/tflite/RandomInputInitializer.h
new file mode 100644
index 000000000..3c241a85e
--- /dev/null
+++ b/runtime/libs/tflite/include/tflite/RandomInputInitializer.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_TFLITE_RANDOM_INPUT_INITIALIZER_H__
+#define __NNFW_TFLITE_RANDOM_INPUT_INITIALIZER_H__
+
+#include <misc/RandomGenerator.h>
+
+#include <tensorflow/lite/interpreter.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+class RandomInputInitializer
+{
+public:
+ RandomInputInitializer(misc::RandomGenerator &randgen) : _randgen{randgen}
+ {
+ // DO NOTHING
+ }
+
+ void run(::tflite::Interpreter &interp);
+
+private:
+ template <typename T> void setValue(::tflite::Interpreter &interp, int tensor_idx);
+
+private:
+ nnfw::misc::RandomGenerator &_randgen;
+};
+
+} // namespace tflite
+} // namespace nnfw
+
+#endif // __NNFW_TFLITE_RANDOM_INPUT_INITIALIZER_H__
diff --git a/runtime/libs/tflite/include/tflite/TensorShapeUtils.h b/runtime/libs/tflite/include/tflite/TensorShapeUtils.h
deleted file mode 100644
index ba8687413..000000000
--- a/runtime/libs/tflite/include/tflite/TensorShapeUtils.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file TensorShapeUtils.h
- * @brief This file contains utilities function of tensor shape
- * @ingroup COM_AI_RUNTIME
- */
-
-#ifndef __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__
-#define __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__
-
-#include "misc/tensor/Shape.h"
-
-#include <vector>
-
-namespace nnfw
-{
-namespace tflite
-{
-
-/**
- * @brief Converts tensor::Shape into a vector
- * @param[in] shape The tensor shape to be converted
- * @return vector value of given shape object
- */
-static inline std::vector<int32_t> as_dims(const nnfw::misc::tensor::Shape &shape)
-{
- std::vector<int32_t> dims;
-
- for (uint32_t axis = 0; axis < shape.rank(); ++axis)
- {
- dims.emplace_back(shape.dim(axis));
- }
-
- return dims;
-}
-
-/**
- * @brief Broadcasts between two given shapes
- * @param[in] lhs_shape The left hand side shape
- * @param[in] rhs_shape The right hand side shape
- * @return The broadcasted shape
- */
-nnfw::misc::tensor::Shape broadcast(const nnfw::misc::tensor::Shape &lhs_shape,
- const nnfw::misc::tensor::Shape &rhs_shape);
-
-} // namespace tflite
-} // namespace nnfw
-
-#endif // __NNFW_TFLITE_TENSOR_SHAPE_UTILS_H__
diff --git a/runtime/libs/tflite/src/CopyInputInitializer.cpp b/runtime/libs/tflite/src/CopyInputInitializer.cpp
new file mode 100644
index 000000000..1950dad21
--- /dev/null
+++ b/runtime/libs/tflite/src/CopyInputInitializer.cpp
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tflite/CopyInputInitializer.h"
+#include "tflite/TensorView.h"
+
+#include <misc/tensor/IndexIterator.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+void CopyInputInitializer::run(::tflite::Interpreter &interp)
+{
+ for (const auto &tensor_idx : interp.inputs())
+ {
+ TfLiteTensor *tensor = interp.tensor(tensor_idx);
+ switch (tensor->type)
+ {
+ case kTfLiteInt32:
+ setValue<int32_t>(interp, tensor_idx);
+ break;
+ case kTfLiteUInt8:
+ setValue<uint8_t>(interp, tensor_idx);
+ break;
+ case kTfLiteInt8:
+ setValue<int8_t>(interp, tensor_idx);
+ break;
+ case kTfLiteBool:
+ setValue<bool>(interp, tensor_idx);
+ break;
+ case kTfLiteFloat32:
+ setValue<float>(interp, tensor_idx);
+ break;
+ default:
+ throw std::runtime_error{"Not supported input type"};
+ }
+ }
+}
+
+template <typename T>
+void CopyInputInitializer::setValue(::tflite::Interpreter &interp, int tensor_idx)
+{
+ auto tensor_from_view = nnfw::tflite::TensorView<T>::make(_from, tensor_idx);
+ auto tensor_to_view = nnfw::tflite::TensorView<T>::make(interp, tensor_idx);
+
+ nnfw::misc::tensor::iterate(tensor_from_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) {
+ tensor_to_view.at(ind) = tensor_from_view.at(ind);
+ };
+}
+
+} // namespace tflite
+} // namespace nnfw
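Note: CopyInputInitializer mirrors every input tensor of one interpreter into another, element by element, through TensorView. A minimal usage sketch (ref_interp and test_interp are hypothetical, already-built interpreters with tensors allocated):

    // Copy the inputs of ref_interp into the matching inputs of test_interp.
    nnfw::tflite::CopyInputInitializer copy_init{ref_interp};
    copy_init.run(test_interp);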
diff --git a/runtime/libs/tflite/src/OutputResetter.cpp b/runtime/libs/tflite/src/OutputResetter.cpp
new file mode 100644
index 000000000..486bb4035
--- /dev/null
+++ b/runtime/libs/tflite/src/OutputResetter.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tflite/OutputResetter.h"
+#include "tflite/TensorView.h"
+
+#include <misc/tensor/IndexIterator.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+void OutputResetter::run(::tflite::Interpreter &interp)
+{
+ for (const auto &tensor_idx : interp.outputs())
+ {
+ TfLiteTensor *tensor = interp.tensor(tensor_idx);
+ switch (tensor->type)
+ {
+ case kTfLiteInt32:
+ resetValue<int32_t>(interp, tensor_idx);
+ break;
+ case kTfLiteUInt8:
+ resetValue<uint8_t>(interp, tensor_idx);
+ break;
+ case kTfLiteInt8:
+ resetValue<int8_t>(interp, tensor_idx);
+ break;
+ case kTfLiteBool:
+ resetValue<bool>(interp, tensor_idx);
+ break;
+ case kTfLiteFloat32:
+ resetValue<float>(interp, tensor_idx);
+ break;
+ default:
+ throw std::runtime_error{"Not supported output type"};
+ }
+ }
+}
+
+template <typename T> void OutputResetter::resetValue(::tflite::Interpreter &interp, int tensor_idx)
+{
+ auto tensor_view = nnfw::tflite::TensorView<T>::make(interp, tensor_idx);
+
+ nnfw::misc::tensor::iterate(tensor_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) { tensor_view.at(ind) = 0; };
+}
+
+} // namespace tflite
+} // namespace nnfw
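Note: zeroing the output buffers before each invocation ensures a later output comparison reflects that run rather than stale values from a previous one. A minimal sketch against an already-allocated interpreter:

    nnfw::tflite::OutputResetter resetter;
    resetter.run(interp); // every output element becomes 0 (false for bool tensors)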
diff --git a/runtime/libs/tflite/src/RandomInputInitializer.cpp b/runtime/libs/tflite/src/RandomInputInitializer.cpp
new file mode 100644
index 000000000..57dd7f66c
--- /dev/null
+++ b/runtime/libs/tflite/src/RandomInputInitializer.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tflite/RandomInputInitializer.h"
+#include "tflite/TensorView.h"
+
+#include <misc/tensor/IndexIterator.h>
+
+namespace nnfw
+{
+namespace tflite
+{
+
+void RandomInputInitializer::run(::tflite::Interpreter &interp)
+{
+ for (const auto &tensor_idx : interp.inputs())
+ {
+ TfLiteTensor *tensor = interp.tensor(tensor_idx);
+ switch (tensor->type)
+ {
+ case kTfLiteFloat32:
+ setValue<float>(interp, tensor_idx);
+ break;
+ case kTfLiteInt32:
+ setValue<int32_t>(interp, tensor_idx);
+ break;
+ case kTfLiteUInt8:
+ setValue<uint8_t>(interp, tensor_idx);
+ break;
+ case kTfLiteBool:
+ setValue<bool>(interp, tensor_idx);
+ break;
+ case kTfLiteInt8:
+ setValue<int8_t>(interp, tensor_idx);
+ break;
+ default:
+ throw std::runtime_error{"Not supported input type"};
+ }
+ }
+}
+
+template <typename T>
+void RandomInputInitializer::setValue(::tflite::Interpreter &interp, int tensor_idx)
+{
+ auto tensor_view = nnfw::tflite::TensorView<T>::make(interp, tensor_idx);
+
+ nnfw::misc::tensor::iterate(tensor_view.shape())
+ << [&](const nnfw::misc::tensor::Index &ind) { tensor_view.at(ind) = _randgen.generate<T>(); };
+}
+
+} // namespace tflite
+} // namespace nnfw
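Note: all three helpers share one shape: switch on the runtime TfLiteType, then dispatch to a method templated on the element type. For random initialization the caller supplies the generator. A minimal sketch; the RandomGenerator constructor arguments (seed, mean, standard deviation) are an assumption, not shown in this change:

    nnfw::misc::RandomGenerator randgen{0 /* seed */, 0.0f /* mean */, 2.0f /* stddev */};
    nnfw::tflite::RandomInputInitializer init{randgen};
    init.run(interp); // draws randgen.generate<T>() for every input element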
diff --git a/runtime/libs/tflite/src/RandomTestRunner.cpp b/runtime/libs/tflite/src/RandomTestRunner.cpp
index 3fa9a973f..ae834e79e 100644
--- a/runtime/libs/tflite/src/RandomTestRunner.cpp
+++ b/runtime/libs/tflite/src/RandomTestRunner.cpp
@@ -14,6 +14,9 @@
* limitations under the License.
*/
+#include "tflite/CopyInputInitializer.h"
+#include "tflite/OutputResetter.h"
+#include "tflite/RandomInputInitializer.h"
#include "tflite/RandomTestRunner.h"
#include "tflite/Diff.h"
#include "tflite/TensorLogger.h"
@@ -42,247 +45,24 @@ void RandomTestRunner::compile(const nnfw::tflite::Builder &builder)
_nnapi = builder.build();
_tfl_interp->UseNNAPI(false);
+ _nnapi->UseNNAPI(true);
// Allocate Tensors
_tfl_interp->AllocateTensors();
_nnapi->AllocateTensors();
+}
+int RandomTestRunner::run(size_t running_count)
+{
assert(_tfl_interp->inputs() == _nnapi->inputs());
+ assert(_tfl_interp->outputs() == _nnapi->outputs());
- using ::tflite::Interpreter;
- using Initializer = std::function<void(int id, Interpreter *, Interpreter *)>;
-
- std::map<TfLiteType, Initializer> initializers;
- std::map<TfLiteType, Initializer> reseters;
-
- // Generate singed 32-bit integer (s32) input
- initializers[kTfLiteInt32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
- assert(_tfl_interp->tensor(id)->type == kTfLiteInt32);
- assert(_nnapi->tensor(id)->type == kTfLiteInt32);
-
- auto tfl_interp_view = nnfw::tflite::TensorView<int32_t>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::tflite::TensorView<int32_t>::make(*nnapi, id);
-
- assert(tfl_interp_view.shape() == nnapi_view.shape());
-
- int32_t value = 0;
-
- nnfw::misc::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- // TODO Generate random values
- tfl_interp_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- ++value;
- };
- };
-
- // Generate singed 32-bit integer (s32) input
- reseters[kTfLiteInt32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
- assert(_tfl_interp->tensor(id)->type == kTfLiteInt32);
- assert(_nnapi->tensor(id)->type == kTfLiteInt32);
-
- auto tfl_interp_view = nnfw::tflite::TensorView<int32_t>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::tflite::TensorView<int32_t>::make(*nnapi, id);
-
- assert(tfl_interp_view.shape() == nnapi_view.shape());
-
- int32_t value = 0;
-
- nnfw::misc::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- // TODO Generate random values
- tfl_interp_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- };
- };
-
- initializers[kTfLiteUInt8] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
- assert(_tfl_interp->tensor(id)->type == kTfLiteUInt8);
- assert(_nnapi->tensor(id)->type == kTfLiteUInt8);
-
- auto tfl_interp_view = nnfw::tflite::TensorView<uint8_t>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::tflite::TensorView<uint8_t>::make(*nnapi, id);
-
- assert(tfl_interp_view.shape() == nnapi_view.shape());
-
- auto fp = static_cast<uint8_t (nnfw::misc::RandomGenerator::*)(
- const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
- &nnfw::misc::RandomGenerator::generate<uint8_t>);
- const nnfw::misc::tensor::Object<uint8_t> data(tfl_interp_view.shape(),
- std::bind(fp, _randgen, _1, _2));
- assert(tfl_interp_view.shape() == data.shape());
-
- nnfw::misc::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- const auto value = data.at(ind);
-
- tfl_interp_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- };
- };
-
- reseters[kTfLiteUInt8] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
- assert(_tfl_interp->tensor(id)->type == kTfLiteUInt8);
- assert(_nnapi->tensor(id)->type == kTfLiteUInt8);
-
- auto tfl_interp_view = nnfw::tflite::TensorView<uint8_t>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::tflite::TensorView<uint8_t>::make(*nnapi, id);
-
- assert(tfl_interp_view.shape() == nnapi_view.shape());
-
- auto fp = static_cast<uint8_t (nnfw::misc::RandomGenerator::*)(
- const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
- &nnfw::misc::RandomGenerator::generate<uint8_t>);
- const nnfw::misc::tensor::Object<uint8_t> data(tfl_interp_view.shape(),
- std::bind(fp, _randgen, _1, _2));
- assert(tfl_interp_view.shape() == data.shape());
-
- uint8_t value = 0;
-
- nnfw::misc::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- tfl_interp_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- };
- };
-
- initializers[kTfLiteFloat32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
- assert(_tfl_interp->tensor(id)->type == kTfLiteFloat32);
- assert(_nnapi->tensor(id)->type == kTfLiteFloat32);
-
- auto tfl_interp_view = nnfw::tflite::TensorView<float>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::tflite::TensorView<float>::make(*nnapi, id);
-
- assert(tfl_interp_view.shape() == nnapi_view.shape());
-
- auto fp = static_cast<float (nnfw::misc::RandomGenerator::*)(
- const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
- &nnfw::misc::RandomGenerator::generate<float>);
- const nnfw::misc::tensor::Object<float> data(tfl_interp_view.shape(),
- std::bind(fp, _randgen, _1, _2));
-
- assert(tfl_interp_view.shape() == data.shape());
-
- nnfw::misc::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- const auto value = data.at(ind);
-
- tfl_interp_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- };
- };
-
- reseters[kTfLiteFloat32] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
- assert(_tfl_interp->tensor(id)->type == kTfLiteFloat32);
- assert(_nnapi->tensor(id)->type == kTfLiteFloat32);
-
- auto tfl_interp_view = nnfw::tflite::TensorView<float>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::tflite::TensorView<float>::make(*nnapi, id);
-
- assert(tfl_interp_view.shape() == nnapi_view.shape());
-
- auto fp = static_cast<float (nnfw::misc::RandomGenerator::*)(
- const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
- &nnfw::misc::RandomGenerator::generate<float>);
- const nnfw::misc::tensor::Object<float> data(tfl_interp_view.shape(),
- std::bind(fp, _randgen, _1, _2));
+ nnfw::tflite::OutputResetter resetter;
+ resetter.run(*(_tfl_interp.get()));
- assert(tfl_interp_view.shape() == data.shape());
+ RandomInputInitializer initializer{_randgen};
+ initializer.run(*(_tfl_interp.get()));
- float value = 0;
-
- nnfw::misc::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- tfl_interp_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- };
- };
-
- initializers[kTfLiteBool] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
- assert(_tfl_interp->tensor(id)->type == kTfLiteBool);
- assert(_nnapi->tensor(id)->type == kTfLiteBool);
-
- auto tfl_interp_view = nnfw::tflite::TensorView<bool>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::tflite::TensorView<bool>::make(*nnapi, id);
-
- assert(tfl_interp_view.shape() == nnapi_view.shape());
-
- auto fp = static_cast<bool (nnfw::misc::RandomGenerator::*)(
- const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
- &nnfw::misc::RandomGenerator::generate<bool>);
- const nnfw::misc::tensor::Object<bool> data(tfl_interp_view.shape(),
- std::bind(fp, _randgen, _1, _2));
-
- assert(tfl_interp_view.shape() == data.shape());
-
- nnfw::misc::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- const auto value = data.at(ind);
-
- tfl_interp_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- };
- };
-
- reseters[kTfLiteBool] = [&](int id, Interpreter *tfl_interp, Interpreter *nnapi) {
- assert(_tfl_interp->tensor(id)->type == kTfLiteBool);
- assert(_nnapi->tensor(id)->type == kTfLiteBool);
-
- auto tfl_interp_view = nnfw::tflite::TensorView<bool>::make(*tfl_interp, id);
- auto nnapi_view = nnfw::tflite::TensorView<bool>::make(*nnapi, id);
-
- assert(tfl_interp_view.shape() == nnapi_view.shape());
-
- auto fp = static_cast<bool (nnfw::misc::RandomGenerator::*)(
- const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
- &nnfw::misc::RandomGenerator::generate<bool>);
- const nnfw::misc::tensor::Object<bool> data(tfl_interp_view.shape(),
- std::bind(fp, _randgen, _1, _2));
-
- assert(tfl_interp_view.shape() == data.shape());
-
- bool value = false;
-
- nnfw::misc::tensor::iterate(tfl_interp_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- tfl_interp_view.at(ind) = value;
- nnapi_view.at(ind) = value;
- };
- };
-
- // Fill IFM with random numbers
- for (const auto id : _tfl_interp->inputs())
- {
- assert(_tfl_interp->tensor(id)->type == _nnapi->tensor(id)->type);
-
- auto it = initializers.find(_tfl_interp->tensor(id)->type);
-
- if (it == initializers.end())
- {
- throw std::runtime_error{"Not supported input type"};
- }
-
- it->second(id, _tfl_interp.get(), _nnapi.get());
- }
-
- // Fill OFM with 0
- for (const auto id : _tfl_interp->outputs())
- {
- assert(_tfl_interp->tensor(id)->type == _nnapi->tensor(id)->type);
-
- auto it = reseters.find(_tfl_interp->tensor(id)->type);
-
- if (it == reseters.end())
- {
- throw std::runtime_error{"Not supported input type"};
- }
-
- it->second(id, _tfl_interp.get(), _nnapi.get());
- }
-}
-
-int RandomTestRunner::run(size_t running_count)
-{
std::cout << "[NNAPI TEST] Run T/F Lite Interpreter without NNAPI" << std::endl;
_tfl_interp->Invoke();
@@ -290,13 +70,17 @@ int RandomTestRunner::run(size_t running_count)
for (size_t i = 1; i <= running_count; ++i)
{
+ resetter.run(*(_nnapi.get()));
+
+ CopyInputInitializer copy_initializer{*(_tfl_interp.get())};
+ copy_initializer.run(*(_nnapi.get()));
+
std::cout << "[NNAPI TEST #" << i << "] Run T/F Lite Interpreter with NNAPI" << std::endl;
char *env = getenv("UPSTREAM_DELEGATE");
if (env && !std::string(env).compare("1"))
{
- _nnapi->UseNNAPI(true);
_nnapi->Invoke();
}
else
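Note: with the per-type lambda tables deleted, the refactored flow of run() condenses to the following paraphrase of the hunks above (not verbatim source):

    OutputResetter resetter;
    resetter.run(*_tfl_interp);                      // zero reference outputs
    RandomInputInitializer initializer{_randgen};
    initializer.run(*_tfl_interp);                   // randomize reference inputs
    _tfl_interp->Invoke();                           // reference run without NNAPI
    for (size_t i = 1; i <= running_count; ++i)
    {
      resetter.run(*_nnapi);                         // zero NNAPI-side outputs
      CopyInputInitializer copy_initializer{*_tfl_interp};
      copy_initializer.run(*_nnapi);                 // reuse the same inputs
      _nnapi->Invoke();                              // then compare the outputs
    }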
diff --git a/runtime/libs/tflite/src/TensorShapeUtils.cpp b/runtime/libs/tflite/src/TensorShapeUtils.cpp
deleted file mode 100644
index 689b6151b..000000000
--- a/runtime/libs/tflite/src/TensorShapeUtils.cpp
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the License);
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "tflite/TensorShapeUtils.h"
-
-namespace nnfw
-{
-namespace tflite
-{
-
-nnfw::misc::tensor::Shape broadcast(const nnfw::misc::tensor::Shape &lhs_shape,
- const nnfw::misc::tensor::Shape &rhs_shape)
-{
- const uint32_t lhs_rank = lhs_shape.rank();
- const uint32_t rhs_rank = rhs_shape.rank();
- const uint32_t out_rank = std::max(lhs_rank, rhs_rank);
- const uint32_t lhs_rank_diff = out_rank - lhs_rank;
- const uint32_t rhs_rank_diff = out_rank - rhs_rank;
-
- nnfw::misc::tensor::Shape out_shape(out_rank);
-
- for (uint32_t axis = 0; axis < out_rank; ++axis)
- {
- out_shape.dim(axis) = std::max(axis < lhs_rank_diff ? 1 : lhs_shape.dim(axis - lhs_rank_diff),
- axis < rhs_rank_diff ? 1 : rhs_shape.dim(axis - rhs_rank_diff));
- }
-
- return out_shape;
-}
-
-} // namespace tflite
-} // namespace nnfw
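Note: the removed broadcast() implemented numpy-style shape broadcasting: align ranks from the right, treat missing leading axes as 1, and take the per-axis maximum (without checking that mismatched dimensions are actually 1). A worked example of the rule it encoded:

    // lhs = {3, 1, 5}, rhs = {4, 1}  ->  out_rank = 3, rhs padded to {1, 4, 1}
    // axis 0: max(3, 1) = 3
    // axis 1: max(1, 4) = 4
    // axis 2: max(5, 1) = 5
    // broadcast(lhs, rhs) == {3, 4, 5}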
diff --git a/runtime/nnapi-header/include/NeuralNetworksEx.h b/runtime/nnapi-header/include/NeuralNetworksEx.h
index d15262e17..f0387995d 100644
--- a/runtime/nnapi-header/include/NeuralNetworksEx.h
+++ b/runtime/nnapi-header/include/NeuralNetworksEx.h
@@ -31,7 +31,8 @@ __BEGIN_DECLS
/**
* @brief Extended operation types
*/
-typedef enum {
+typedef enum
+{
/** extends operation. */
/**
diff --git a/runtime/onert/api/.clang-format b/runtime/onert/api/.clang-format
deleted file mode 120000
index 83185fee3..000000000
--- a/runtime/onert/api/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../../../.clang-format.8 \ No newline at end of file
diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h
index 28703c0eb..1210e274f 100644
--- a/runtime/onert/api/include/nnfw_version.h
+++ b/runtime/onert/api/include/nnfw_version.h
@@ -21,6 +21,6 @@
* NNFW_VERSION is a uint32 value representing nnfw runtime version
* in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch
*/
-#define NNFW_VERSION 0x01000c00
+#define NNFW_VERSION 0x01000f00
#endif // __NNFW_VERSION_H__
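Note: per the 0xMMmmmmPP packing documented above, 0x01000f00 decodes to version 1.15.0 (major 0x01 = 1, minor 0x000f = 15, patch 0x00 = 0); the previous 0x01000c00 was 1.12.0. A sketch of the decode:

    uint32_t v = NNFW_VERSION;
    uint32_t major = (v >> 24) & 0xff;  // 0x01   -> 1
    uint32_t minor = (v >> 8) & 0xffff; // 0x000f -> 15
    uint32_t patch = v & 0xff;          // 0x00   -> 0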
diff --git a/runtime/onert/api/src/CustomKernel.cc b/runtime/onert/api/src/CustomKernel.cc
index 56525feff..f094047fe 100644
--- a/runtime/onert/api/src/CustomKernel.cc
+++ b/runtime/onert/api/src/CustomKernel.cc
@@ -18,9 +18,7 @@
namespace onert
{
-namespace frontend
-{
-namespace custom
+namespace api
{
using namespace backend::custom;
@@ -64,12 +62,12 @@ public:
}
};
-Kernel::Kernel(const nnfw_custom_eval evalFunction)
+CustomKernel::CustomKernel(const nnfw_custom_eval evalFunction)
: _in_params(), _userdata(nullptr), _userdata_size(0), _evalFunction(evalFunction)
{
}
-void Kernel::configure(CustomKernelConfigParams &&inParams)
+void CustomKernel::configure(CustomKernelConfigParams &&inParams)
{
_userdata = inParams.userdata;
_userdata_size = inParams.userdata_size;
@@ -77,7 +75,7 @@ void Kernel::configure(CustomKernelConfigParams &&inParams)
_in_params = std::move(inParams);
}
-void Kernel::run()
+void CustomKernel::run()
{
nnfw_custom_kernel_params params;
@@ -109,6 +107,5 @@ void Kernel::run()
delete[] params.outputs;
}
-} // namespace custom
-} // namespace frontend
+} // namespace api
} // namespace onert
diff --git a/runtime/onert/api/src/CustomKernel.h b/runtime/onert/api/src/CustomKernel.h
index a42f7a639..4c41dd9ba 100644
--- a/runtime/onert/api/src/CustomKernel.h
+++ b/runtime/onert/api/src/CustomKernel.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CUSTOM_KERNEL_H__
-#define __ONERT_BACKEND_CUSTOM_KERNEL_H__
+#ifndef __ONERT_API_CUSTOM_KERNEL_H__
+#define __ONERT_API_CUSTOM_KERNEL_H__
#include "nnfw_experimental.h"
@@ -26,15 +26,13 @@
namespace onert
{
-namespace frontend
-{
-namespace custom
+namespace api
{
-class Kernel : public ::onert::exec::IFunction
+class CustomKernel : public ::onert::exec::IFunction
{
public:
- explicit Kernel(nnfw_custom_eval evalFunction);
+ explicit CustomKernel(nnfw_custom_eval evalFunction);
backend::custom::CustomKernelConfigParams _in_params;
@@ -53,8 +51,7 @@ public:
void run() override;
};
-} // namespace custom
-} // namespace frontend
+} // namespace api
} // namespace onert
-#endif // __ONERT_BACKEND_CUSTOM_KERNEL_H__
+#endif // __ONERT_API_CUSTOM_KERNEL_H__
diff --git a/runtime/onert/api/src/CustomKernelRegistry.cc b/runtime/onert/api/src/CustomKernelRegistry.cc
index 7812609d1..d97f1bb06 100644
--- a/runtime/onert/api/src/CustomKernelRegistry.cc
+++ b/runtime/onert/api/src/CustomKernelRegistry.cc
@@ -20,22 +20,39 @@
namespace onert
{
-namespace frontend
+namespace api
{
-namespace custom
+
+class KernelBuilder : public backend::custom::IKernelBuilder
{
+public:
+ KernelBuilder(CustomKernelRegistry *registry) : _registry(registry) {}
+
+ std::unique_ptr<exec::IFunction>
+ buildKernel(const std::string &id,
+ backend::custom::CustomKernelConfigParams &&params) const override
+ {
+ auto kernel = _registry->buildKernelForOp(id);
+ kernel->configure(std::move(params));
+
+ return kernel;
+ }
+
+private:
+ CustomKernelRegistry *_registry;
+};
-void KernelRegistry::registerKernel(const std::string &id, nnfw_custom_eval evalFunction)
+void CustomKernelRegistry::registerKernel(const std::string &id, nnfw_custom_eval evalFunction)
{
_storage.emplace(id, evalFunction);
}
-std::shared_ptr<backend::custom::IKernelBuilder> KernelRegistry::getBuilder()
+std::shared_ptr<backend::custom::IKernelBuilder> CustomKernelRegistry::getBuilder()
{
return std::make_unique<KernelBuilder>(this);
}
-std::unique_ptr<Kernel> KernelRegistry::buildKernelForOp(const std::string &id)
+std::unique_ptr<CustomKernel> CustomKernelRegistry::buildKernelForOp(const std::string &id)
{
auto it = _storage.find(id);
if (it == _storage.end())
@@ -43,22 +60,8 @@ std::unique_ptr<Kernel> KernelRegistry::buildKernelForOp(const std::string &id)
throw std::runtime_error("Unable to find associated kernel for op");
}
- return std::make_unique<Kernel>(it->second);
+ return std::make_unique<CustomKernel>(it->second);
}
-// Kernel builder
-std::unique_ptr<exec::IFunction>
-KernelBuilder::buildKernel(const std::string &id,
- backend::custom::CustomKernelConfigParams &&params) const
-{
- auto kernel = _registry->buildKernelForOp(id);
- kernel->configure(std::move(params));
-
- return kernel;
-}
-
-KernelBuilder::KernelBuilder(KernelRegistry *registry) : _registry(registry) {}
-
-} // namespace custom
-} // namespace frontend
+} // namespace api
} // namespace onert
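Note: the KernelBuilder is now a private detail of the registry's translation unit. A minimal round-trip sketch (my_eval is a hypothetical nnfw_custom_eval callback, not part of this change):

    onert::api::CustomKernelRegistry registry;
    registry.registerKernel("MyOp", my_eval);
    auto builder = registry.getBuilder();
    // buildKernel() constructs a CustomKernel for "MyOp", configures it with
    // the given params, and returns it as an exec::IFunction:
    // auto fn = builder->buildKernel("MyOp", std::move(params));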
diff --git a/runtime/onert/api/src/CustomKernelRegistry.h b/runtime/onert/api/src/CustomKernelRegistry.h
index fe60d5bcc..d39f11ad6 100644
--- a/runtime/onert/api/src/CustomKernelRegistry.h
+++ b/runtime/onert/api/src/CustomKernelRegistry.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CUSTOM_KERNEL_REGISTRY_H__
-#define __ONERT_BACKEND_CUSTOM_KERNEL_REGISTRY_H__
+#ifndef __ONERT_API_CUSTOM_KERNEL_REGISTRY_H__
+#define __ONERT_API_CUSTOM_KERNEL_REGISTRY_H__
#include "CustomKernel.h"
@@ -27,38 +27,22 @@
namespace onert
{
-namespace frontend
-{
-namespace custom
+namespace api
{
-class KernelRegistry
+class CustomKernelRegistry
{
public:
void registerKernel(const std::string &id, nnfw_custom_eval evalFunction);
std::shared_ptr<backend::custom::IKernelBuilder> getBuilder();
- std::unique_ptr<Kernel> buildKernelForOp(const std::string &id);
+ std::unique_ptr<CustomKernel> buildKernelForOp(const std::string &id);
private:
std::unordered_map<std::string, nnfw_custom_eval> _storage;
};
-class KernelBuilder : public backend::custom::IKernelBuilder
-{
-public:
- KernelBuilder(KernelRegistry *registry);
-
- std::unique_ptr<exec::IFunction>
- buildKernel(const std::string &id,
- backend::custom::CustomKernelConfigParams &&params) const override;
-
-private:
- KernelRegistry *_registry;
-};
-
-} // namespace custom
-} // namespace frontend
+} // namespace api
} // namespace onert
-#endif // __ONERT_BACKEND_CUSTOM_KERNEL_REGISTRY_H__
+#endif // __ONERT_API_CUSTOM_KERNEL_REGISTRY_H__
diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc
index c3fdb131b..316bafb52 100644
--- a/runtime/onert/api/src/nnfw_api_internal.cc
+++ b/runtime/onert/api/src/nnfw_api_internal.cc
@@ -159,8 +159,7 @@ void setConfigKeyValues(const CfgKeyValues &keyValues)
nnfw_session::nnfw_session()
: _subgraphs{nullptr}, _execution{nullptr},
- _kernel_registry{std::make_shared<onert::frontend::custom::KernelRegistry>()}, _tracing_ctx{
- nullptr}
+ _kernel_registry{std::make_shared<onert::api::CustomKernelRegistry>()}, _tracing_ctx{nullptr}
{
// DO NOTHING
}
@@ -274,8 +273,8 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir)
try
{
- std::string manifest_file_name(package_dir);
- manifest_file_name += "/metadata/MANIFEST";
+ std::string package_path(package_dir);
+ std::string manifest_file_name = package_path + "/metadata/MANIFEST";
std::ifstream mfs(manifest_file_name);
// extract the filename of the first(index 0) model
@@ -288,7 +287,7 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir)
if (!configs.empty() && !configs[0].empty())
{
- auto filepath = package_dir + std::string("/metadata/") + configs[0].asCString();
+ auto filepath = package_path + std::string("/metadata/") + configs[0].asString();
CfgKeyValues keyValues;
if (loadConfigure(filepath, keyValues))
@@ -297,15 +296,15 @@ NNFW_STATUS nnfw_session::load_model_from_nnpackage(const char *package_dir)
}
}
- auto model_file_path = package_dir + std::string("/") + models[0].asString(); // first model
+ auto model_file_path = package_path + std::string("/") + models[0].asString(); // first model
auto model_type = model_types[0].asString(); // first model's type
if (model_type == "tflite")
{
- _subgraphs = onert::tflite_loader::loadModel(model_file_path.c_str());
+ _subgraphs = onert::tflite_loader::loadModel(model_file_path);
}
else if (model_type == "circle")
{
- _subgraphs = onert::circle_loader::loadModel(model_file_path.c_str());
+ _subgraphs = onert::circle_loader::loadModel(model_file_path);
}
else
{
@@ -346,13 +345,6 @@ NNFW_STATUS nnfw_session::prepare()
return NNFW_STATUS_INVALID_STATE;
}
- if (!_subgraphs || !primary_subgraph() || primary_subgraph()->isBuildingPhase())
- {
- std::cerr << "Error during model prepare : "
- << "prepare should be run after load_model" << std::endl;
- return NNFW_STATUS_ERROR;
- }
-
try
{
_subgraphs.reset();
@@ -632,8 +624,9 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti)
{
// In this case, if we apply input shape in primary_subgraph, it will propagate after
// compilation and excution
- auto ind = primary_subgraph()->getInputs().at(index);
- auto &input = primary_subgraph()->operands().at(ind);
+ auto primary_subgraph = _subgraphs->primary();
+ auto ind = primary_subgraph->getInputs().at(index);
+ auto &input = primary_subgraph->operands().at(ind);
// overwrite input shape with the shape from ti
input.info().shape(new_shape);
@@ -840,10 +833,6 @@ NNFW_STATUS nnfw_session::set_config(const char *key, const char *value)
{
options.graph_dump_level = toInt(value);
}
- else if (skey == config::OP_SEQ_MAX_NODE)
- {
- options.op_seq_max_node = toInt(value);
- }
else if (skey == config::EXECUTOR)
{
options.executor = value;
@@ -871,7 +860,7 @@ NNFW_STATUS nnfw_session::set_config(const char *key, const char *value)
return NNFW_STATUS_NO_ERROR;
}
-onert::ir::Graph *nnfw_session::primary_subgraph()
+const onert::ir::Graph *nnfw_session::primary_subgraph()
{
if (_subgraphs)
{
@@ -883,7 +872,7 @@ onert::ir::Graph *nnfw_session::primary_subgraph()
assert(_execution);
// TODO Remove const_cast
// We assumed the graph will not change after compilation, but shape could change
- return const_cast<onert::ir::Graph *>(&_execution->primary_subgraph());
+ return &_execution->primary_subgraph();
}
}
@@ -957,7 +946,6 @@ bool nnfw_session::isStateModelLoaded()
assert(_subgraphs);
assert(_compiler);
assert(!_execution);
- assert(!primary_subgraph()->isBuildingPhase());
return true;
}
else
@@ -973,7 +961,6 @@ bool nnfw_session::isStatePrepared()
assert(!_subgraphs);
assert(_compiler);
assert(_execution);
- assert(!primary_subgraph()->isBuildingPhase());
return true;
}
else
@@ -989,7 +976,6 @@ bool nnfw_session::isStateRunning()
assert(!_subgraphs);
assert(_compiler);
assert(_execution);
- assert(!primary_subgraph()->isBuildingPhase());
return true;
}
return false;
@@ -1002,7 +988,6 @@ bool nnfw_session::isStateFinishedRun()
assert(!_subgraphs);
assert(_compiler);
assert(_execution);
- assert(!primary_subgraph()->isBuildingPhase());
return true;
}
else
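Note: an illustrative nnpackage layout as consumed by load_model_from_nnpackage() (file names are examples, not taken from this change):

    // <package_dir>/metadata/MANIFEST      JSON with "models", "model-types",
    //                                      and an optional "configs" array
    // <package_dir>/<models[0]>            first model, e.g. a .tflite or .circle file
    // <package_dir>/metadata/<configs[0]>  optional key=value backend configuration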
diff --git a/runtime/onert/api/src/nnfw_api_internal.h b/runtime/onert/api/src/nnfw_api_internal.h
index a50ac72d3..b13962907 100644
--- a/runtime/onert/api/src/nnfw_api_internal.h
+++ b/runtime/onert/api/src/nnfw_api_internal.h
@@ -28,13 +28,10 @@
namespace onert
{
-namespace frontend
+namespace api
{
-namespace custom
-{
-class KernelRegistry;
-}
-} // namespace frontend
+class CustomKernelRegistry;
+} // namespace api
namespace exec
{
class Execution;
@@ -144,7 +141,7 @@ public:
NNFW_STATUS output_tensorindex(const char *tensorname, uint32_t *index);
private:
- onert::ir::Graph *primary_subgraph();
+ const onert::ir::Graph *primary_subgraph();
bool isStateInitialized();
bool isStateModelLoaded();
bool isStatePrepared();
@@ -157,7 +154,7 @@ private:
std::shared_ptr<onert::ir::Subgraphs> _subgraphs;
std::unique_ptr<onert::compiler::Compiler> _compiler;
std::unique_ptr<onert::exec::Execution> _execution;
- std::shared_ptr<onert::frontend::custom::KernelRegistry> _kernel_registry;
+ std::shared_ptr<onert::api::CustomKernelRegistry> _kernel_registry;
std::unique_ptr<onert::util::TracingCtx> _tracing_ctx;
};
diff --git a/runtime/onert/backend/acl_cl/Backend.h b/runtime/onert/backend/acl_cl/Backend.h
index 4f48314c1..945ad83bb 100644
--- a/runtime/onert/backend/acl_cl/Backend.h
+++ b/runtime/onert/backend/acl_cl/Backend.h
@@ -42,20 +42,18 @@ public:
std::shared_ptr<IConfig> config() const override { return _config; }
- std::unique_ptr<backend::BackendContext>
- newContext(const ir::Graph &graph, const std::shared_ptr<custom::IKernelBuilder> &,
- bool is_linear_executor) const override
+ std::unique_ptr<backend::BackendContext> newContext(ContextData &&data) const override
{
- const auto &operands = graph.operands();
- const auto &operations = graph.operations();
- auto context = std::make_unique<acl_cl::BackendContext>(this, &graph);
- auto tm = createTensorManager(is_linear_executor);
+ const auto &graph = *data.graph;
+ const auto &operands = data.graph->operands();
+ auto context = std::make_unique<acl_cl::BackendContext>(this, std::move(data));
+ auto tm = createTensorManager(data.is_linear_executor);
auto tr = std::make_shared<acl_common::AclTensorRegistry<TensorManager>>(tm);
auto tb = std::make_shared<TensorBuilder>(operands, tm);
context->tensor_registry = tr;
context->tensor_builder = tb;
context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tr);
- context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, tr);
+ context->kernel_gen = std::make_shared<KernelGenerator>(graph, tb, tr);
context->optimizer = std::make_shared<Optimizer>(context.get());
return context;
}
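Note: newContext() now takes a single ContextData bundle instead of separate graph/kernel-builder/executor arguments. The fields below are inferred from their uses in this diff, not the full definition:

    // data.graph               -- the (sub)graph this context serves
    // data.is_linear_executor  -- replaces the old bool parameter
    // data.external_operands   -- operands owned by other backends, to be skipped
    // data.operand_layouts     -- per-operand backend layout map
    // data.op_order            -- operation indices in execution order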
diff --git a/runtime/onert/backend/acl_cl/BackendContext.cc b/runtime/onert/backend/acl_cl/BackendContext.cc
index a6f228a4f..5595043ca 100644
--- a/runtime/onert/backend/acl_cl/BackendContext.cc
+++ b/runtime/onert/backend/acl_cl/BackendContext.cc
@@ -33,46 +33,34 @@ namespace acl_cl
void BackendContext::initConsts()
{
- for (auto &op : operation_list())
- {
- constant_initializer->setLayout(op.layout);
- graph()->operations().at(op.index).accept(*constant_initializer);
- }
+ _data.graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) {
+ constant_initializer->setLayout(graph()->layout());
+ op.accept(*constant_initializer);
+ });
- for (auto ind : operand_list())
- {
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
+ if (_data.external_operands.contains(ind) || !operand.isConstant())
+ return;
const auto &obj = graph()->operands().at(ind);
if (obj.isConstant() && !constant_initializer->exist(ind))
{
constant_initializer->registerDefaultInitializer(ind, obj);
}
- }
+ });
constant_initializer->run();
}
-void BackendContext::planTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info)
+void BackendContext::planTensors()
{
ir::OperandIndexMap<uint32_t> uses_map;
ir::OperandIndexMap<uint32_t> def_map;
ir::OperandIndexSequence constants;
// Prepare scanning
- for (auto ind : operand_list())
- {
- const auto &obj = graph()->operands().at(ind);
- const auto &li = lower_info.operand.at(ind);
- if (li->def_factors().getOnlyElement().backend() != backend())
- continue;
-
- // Ignore unused tensor
- if (li->def_factors().size() == 0 && li->use_factors().size() == 0)
- {
- VERBOSE(planTensors) << "Operand #" << ind.value() << " will not be used. no more process."
- << std::endl;
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ if (_data.external_operands.contains(ind))
return;
- }
uses_map[ind] = obj.getUses().size();
def_map[ind] = obj.getDef().valid() ? 1 : 0;
@@ -80,16 +68,15 @@ void BackendContext::planTensors(const std::vector<onert::ir::OpSequenceIndex> &
if (obj.isConstant())
constants.append(ind);
- auto factor = li->def_factors().getOnlyElement();
if (!tensor_builder->isRegistered(ind))
{
- // These tensors do not exist in any op_seq (No use and def)
+ // These tensors do not exist in any operation (no use and no def)
const auto info = obj.info();
- const auto backend_layout = factor.layout();
+ const auto layout = _data.operand_layouts.at(ind);
// TODO Change tensor info to have permuted shape
- tensor_builder->registerTensorInfo(ind, info, backend_layout);
+ tensor_builder->registerTensorInfo(ind, info, layout);
}
- }
+ });
// Start scanning to do notify{First|Last}Use for each tensor
@@ -107,64 +94,65 @@ void BackendContext::planTensors(const std::vector<onert::ir::OpSequenceIndex> &
// 1. Scan DEF of outputs. If the DEF, allocate it
// 2. Scan DEF of inputs. If variable tensor, allocate it
// 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0
- for (const auto op_seq_ind : order)
+ for (const auto op_ind : _data.op_order)
{
- const auto &op_seq = op_seqs.at(op_seq_ind);
- for (const auto &op_idx : op_seq.operations())
- {
- auto &op = graph()->operations().at(op_idx);
- auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
- auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+ const auto &op = graph()->operations().at(op_ind);
+ auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+ auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
- // Define outputs
- for (const auto &ind : op_outputs)
+ // Define outputs
+ for (const auto &ind : op_outputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ assert(def_map.find(ind) != def_map.end());
+ if (def_map[ind])
{
- if (!tensor_builder->isRegistered(ind))
- continue;
- assert(def_map.find(ind) != def_map.end());
- if (def_map[ind])
- {
- def_map[ind] = 0;
- tensor_builder->notifyFirstUse(ind);
- }
+ def_map[ind] = 0;
+ tensor_builder->notifyFirstUse(ind);
}
+ }
- // Scan variable tensors
- // This tensor has features like constant. But OperandInfo and LowerInfo treat them as
- // non-constant because of less memory usage by memory planning in here
- for (const auto &ind : op_inputs)
+ // Scan variable tensors
+ // These tensors behave like constants, but OperandInfo and LowerInfo treat them as
+ // non-constant to reduce memory usage during memory planning here
+ for (const auto &ind : op_inputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ const auto &operand = graph()->operands().at(ind);
+ if (operand.info().isVariable())
{
- if (!tensor_builder->isRegistered(ind))
- continue;
- const auto &operand = graph()->operands().at(ind);
- if (operand.info().isVariable())
- {
- // The variable tensor with buffer is not supported yet
- assert(operand.data() == nullptr);
- assert(operand.getUses().size() == 1 && !operand.getDef().valid());
- assert(lower_info.operand.at(ind)->def_factors().size() == 1 &&
- lower_info.operand.at(ind)->use_factors().size() == 1);
- assert(uses_map[ind] == 1 && def_map[ind] == 0);
- tensor_builder->notifyFirstUse(ind);
- }
+ // The variable tensor with buffer is not supported yet
+ assert(operand.data() == nullptr);
+ assert(operand.getUses().size() == 1 && !operand.getDef().valid());
+ assert(uses_map[ind] == 1 && def_map[ind] == 0);
+ tensor_builder->notifyFirstUse(ind);
}
+ }
- for (const auto &ind : op_inputs)
+ for (const auto &ind : op_inputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ assert(uses_map.find(ind) != uses_map.end());
+ assert(uses_map[ind] > 0);
+ uses_map[ind]--;
+ if (uses_map[ind] == 0)
{
- if (!tensor_builder->isRegistered(ind))
- continue;
- assert(uses_map.find(ind) != uses_map.end());
- assert(uses_map[ind] > 0);
- uses_map[ind]--;
- if (uses_map[ind] == 0)
- {
- // plan for deallocation of static tensornode
- tensor_builder->notifyLastUse(ind);
- }
+ // plan for deallocation of static tensor node
+ tensor_builder->notifyLastUse(ind);
}
}
}
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
+ if (uses_map[ind] == 0)
+ {
+ tensor_builder->notifyLastUse(ind);
+ }
+ });
+
// Dispose and validate
for (const auto &ind : constants)
{
@@ -176,77 +164,42 @@ void BackendContext::planTensors(const std::vector<onert::ir::OpSequenceIndex> &
}
assert(
- std::all_of(uses_map.begin(), uses_map.end(),
- [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+ std::all_of(uses_map.begin(), uses_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
assert(
- std::all_of(def_map.begin(), def_map.end(),
- [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+ std::all_of(def_map.begin(), def_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
}
-ITensorRegistry *BackendContext::genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info)
+ITensorRegistry *BackendContext::genTensors()
{
optimizer->optimize();
- for (const auto op_seq_ind : order)
- {
- const auto &op_seq = op_seqs.at(op_seq_ind);
- auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED |
- ir::Remove::DUPLICATED;
- for (const auto op_ind : op_seq)
- {
- bool op_assigned = [&]() {
- for (auto &op_info : operation_list())
- if (op_info.index == op_ind)
- return true;
- return false;
- }();
- if (!op_assigned)
- continue;
+ graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ if (external_operands().contains(ind))
+ return;
- const auto &op = graph()->operations().at(op_ind);
- for (const auto &index : (op.getInputs() + op.getOutputs()) | ir::Remove::UNDEFINED)
- {
- if (!tensor_builder->isRegistered(index) && !model_io.contains(index) &&
- find(operand_list().begin(), operand_list().end(), index) != operand_list().end())
- {
- const auto &operand_lower_info =
- lower_info.operand.at(index)->def_factors().getOnlyElement();
-
- // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl)
- // op.getOutputs() of permute (CPU) returns tensor A
- // but tensor A belongs to the backend of acl_cl.
- // So, we have to make this tensor NOT registered for CPU.
- if (operand_lower_info.backend() != backend())
- continue;
-
- const auto &obj = graph()->operands().at(index);
- const auto frontend_layout = op_seq.getLayout();
- const auto backend_layout = operand_lower_info.layout();
- ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
- obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
- tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
- }
- }
- }
- }
+ const auto frontend_layout = graph()->layout();
+ const auto backend_layout = operand_layouts().at(ind);
+ ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
+ obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
+ tensor_builder->registerTensorInfo(ind, backend_info, backend_layout);
+ });
// TODO Get compiler options from compiler, and use it rather than getting it from Env
if (util::getConfigString(util::config::EXECUTOR) == "Linear")
{
- planTensors(order, op_seqs, lower_info);
+ planTensors();
}
else
{
// For the executors that does not have fixed linear execution order:
// To make tensors never be deallocated, this is a workaround to use static memory planner
- for (auto ind : operand_list())
- {
+ graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
if (tensor_builder->isRegistered(ind))
tensor_builder->notifyFirstUse(ind);
- }
+ });
}
tensor_builder->prepare();
@@ -254,36 +207,23 @@ ITensorRegistry *BackendContext::genTensors(const std::vector<onert::ir::OpSeque
return tensor_registry.get();
}
-FunctionMap BackendContext::genKernels(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs)
+FunctionMap BackendContext::genKernels()
{
FunctionMap ret;
- for (auto op_seq_ind : order)
+ for (auto op_ind : _data.op_order)
{
- const auto &op_seq = op_seqs.at(op_seq_ind);
- bool assigned = [&]() {
- for (auto op_info : operation_list())
- if (op_seq.exist(op_info.index))
- return true;
- return false;
- }();
- if (!assigned)
- continue;
- auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind));
- ret.emplace_back(op_seq_ind, std::move(fn_seq));
+ auto fn_seq = kernel_gen->generate(op_ind);
+ ret.emplace_back(op_ind, std::move(fn_seq));
}
tensor_builder->allocate();
initConsts();
// NOTE For memory optimization, we want to free some operand data
- for (auto ind : operand_list())
- {
- // TODO Remove const_cast
- auto &obj = const_cast<ir::Graph *>(graph())->operands().at(ind);
- obj.releaseData();
- }
+ const_cast<ir::Graph &>(*_data.graph)
+ .operands()
+ .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
for (auto &it : ret)
{
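Note: planTensors() is a reference-counting liveness scan: count each operand's uses up front, allocate at its first definition, and free when its use count drains to zero. A miniature model under simplified types (plain int operand ids; alloc/free stand in for notifyFirstUse/notifyLastUse):

    #include <map>
    #include <utility>
    #include <vector>

    // 'order' lists each operation's (inputs, outputs) in execution order;
    // 'uses_map' is pre-filled with every operand's total use count.
    void planLifetimes(const std::vector<std::pair<std::vector<int>, std::vector<int>>> &order,
                       std::map<int, int> uses_map)
    {
      std::map<int, bool> defined;
      for (const auto &op : order)
      {
        for (int t : op.second) // outputs
          if (!defined[t]) { defined[t] = true; /* notifyFirstUse(t): allocate */ }
        for (int t : op.first)  // inputs
          if (--uses_map.at(t) == 0) { /* notifyLastUse(t): deallocate */ }
      }
    }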
diff --git a/runtime/onert/backend/acl_cl/BackendContext.h b/runtime/onert/backend/acl_cl/BackendContext.h
index 662d767d0..2638046ca 100644
--- a/runtime/onert/backend/acl_cl/BackendContext.h
+++ b/runtime/onert/backend/acl_cl/BackendContext.h
@@ -34,27 +34,23 @@ class Optimizer;
class BackendContext : public onert::backend::BackendContext
{
public:
- BackendContext(const Backend *backend, const ir::Graph *graph,
+ BackendContext(const Backend *backend, ContextData &&data,
std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
std::shared_ptr<TensorBuilder> tensor_builder = nullptr,
std::shared_ptr<ConstantInitializer> constant_initializer = nullptr,
std::shared_ptr<KernelGenerator> kernel_gen = nullptr)
- : onert::backend::BackendContext(backend, graph, tensor_registry),
- tensor_builder{tensor_builder}, constant_initializer{constant_initializer},
- kernel_gen{kernel_gen}
+ : onert::backend::BackendContext(backend, std::move(data), tensor_registry),
+ tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, kernel_gen{
+ kernel_gen}
{
}
- ITensorRegistry *genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info) override;
- FunctionMap genKernels(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs) override;
+ ITensorRegistry *genTensors() override;
+ FunctionMap genKernels() override;
private:
void initConsts();
- void planTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info);
+ void planTensors();
public:
std::shared_ptr<TensorBuilder> tensor_builder;
diff --git a/runtime/onert/backend/acl_cl/CLTimer.h b/runtime/onert/backend/acl_cl/CLTimer.h
index 722dc68ef..a9158e1af 100644
--- a/runtime/onert/backend/acl_cl/CLTimer.h
+++ b/runtime/onert/backend/acl_cl/CLTimer.h
@@ -53,8 +53,8 @@ public:
const cl_event *event_wait_list, cl_event *usr_event) {
cl_event event;
cl_int enqueue_res =
- this->_origin_enqueue_function(command_queue, kernel, work_dim, gwo, gws, lws,
- num_events_in_wait_list, event_wait_list, &event);
+ this->_origin_enqueue_function(command_queue, kernel, work_dim, gwo, gws, lws,
+ num_events_in_wait_list, event_wait_list, &event);
this->_measured_events.emplace_back(event);
// According to spec, if NULL was provided in usr_event - event shouldn't be returned
@@ -73,7 +73,7 @@ public:
if ((props & CL_QUEUE_PROFILING_ENABLE) == 0)
{
cl_scheduler.set_queue(
- cl::CommandQueue(cl_scheduler.context(), props | CL_QUEUE_PROFILING_ENABLE));
+ cl::CommandQueue(cl_scheduler.context(), props | CL_QUEUE_PROFILING_ENABLE));
}
};
diff --git a/runtime/onert/backend/acl_cl/Config.cc b/runtime/onert/backend/acl_cl/Config.cc
index 8017bdb0b..c10fdc1fe 100644
--- a/runtime/onert/backend/acl_cl/Config.cc
+++ b/runtime/onert/backend/acl_cl/Config.cc
@@ -42,7 +42,7 @@ bool Config::initialize()
// NOTE CLKernelLibraryEx must use the same context as CLScheduler
// It did not check whether another device is available.
arm_compute::CLKernelLibraryEx::get().init(
- "./cl_kernels/", arm_compute::CLScheduler::get().context(), cl::Device::getDefault());
+ "./cl_kernels/", arm_compute::CLScheduler::get().context(), cl::Device::getDefault());
return true;
}
diff --git a/runtime/onert/backend/acl_cl/ConstantInitializer.cc b/runtime/onert/backend/acl_cl/ConstantInitializer.cc
index 413a7ccc3..54b2a7a08 100644
--- a/runtime/onert/backend/acl_cl/ConstantInitializer.cc
+++ b/runtime/onert/backend/acl_cl/ConstantInitializer.cc
@@ -30,7 +30,7 @@ namespace acl_cl
ConstantInitializer::ConstantInitializer(const ir::Operands &operands,
const std::shared_ptr<ITensorRegistry> &tensor_reg)
- : acl_common::AclConstantInitializer{operands, tensor_reg}
+ : acl_common::AclConstantInitializer{operands, tensor_reg}
{
// DO NOTHING
}
@@ -84,7 +84,7 @@ void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node)
const auto &shape = model_obj.shape();
const auto base = reinterpret_cast<const int32_t *>(model_obj.data()->base());
assert(model_obj.shape().rank() == 2);
- assert(obj.dimension(0) == 2);
+ assert(obj.getShape().dim(0) == 2);
obj.access([&](ITensor &tensor) {
for (auto i = 0; i < shape.dim(0); ++i)
{
@@ -92,7 +92,7 @@ void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node)
{
const int32_t value = base[i * 2 + j];
int32_t *into = reinterpret_cast<int32_t *>(
- tensor.buffer() + tensor.calcOffset({shape.dim(0) - i - 1, j}));
+ tensor.buffer() + tensor.calcOffset({shape.dim(0) - i - 1, j}));
*into = value;
}
}
@@ -131,7 +131,7 @@ void ConstantInitializer::visit(const ir::operation::Reverse &node)
}
auto axis =
- acl_common::ToARMComputeAxis(ifm_rank, axis_tmp, frontend_layout, backend_layout).value();
+ acl_common::ToARMComputeAxis(ifm_rank, axis_tmp, frontend_layout, backend_layout).value();
obj.access([&](ITensor &tensor) {
int32_t *into = reinterpret_cast<int32_t *>(tensor.buffer());
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc
index 3a5ea5a0f..e709286df 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc
@@ -42,33 +42,27 @@ namespace acl_cl
using ::onert::backend::acl_common::asAclFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
- ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;
+ ::arm_compute::ICLTensor, ::arm_compute::CLActivationLayer, acl_common::AclFunction>;
KernelGenerator::KernelGenerator(
- const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
- : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
- _tensor_reg(tensor_reg), _current_layout(ir::Layout::UNKNOWN)
+ const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
+ : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()),
+ _operations_ctx(graph.operations()), _current_layout{graph.layout()},
+ _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
{
// DO NOTHING
}
-void KernelGenerator::visit(const ir::OpSequence &op_seq)
+std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
- // TODO Move this to IKernelGenerator
- // (all derivatives have the same implementation for this)
- assert(!_return_fn_seq);
- _return_fn_seq = std::make_unique<exec::FunctionSequence>();
- _return_fn_seq->enableDynamicShapeInferer(false);
-
- _current_layout = op_seq.getLayout();
- for (const auto &operation_idx : op_seq.operations())
- {
- const auto &node = _operations_ctx.at(operation_idx);
- node.accept(*this);
- _return_fn_seq->append(releaseFunction());
- }
+ auto ret = std::make_unique<exec::FunctionSequence>();
+ ret->enableDynamicShapeInferer(false);
+
+ const auto &op = _graph.operations().at(ind);
+ op.accept(*this);
+ ret->append(releaseFunction());
+ return ret;
}
void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
@@ -76,7 +70,7 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
const auto block_size_index{
- node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
+ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
const auto NNApiInputs = 2;
if (node.getInputs().size() != NNApiInputs)
@@ -104,7 +98,7 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
assert(_ctx.at(block_size_index).data());
auto fn = acl_common::generateLayer<arm_compute::CLBatchToSpaceLayer>(
- ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
+ ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -129,29 +123,29 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
{
fn = acl_common::generateLayer<arm_compute::CLArithmeticAddition>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
- arm_compute::ConvertPolicy::SATURATE, act_info);
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
+ arm_compute::ConvertPolicy::SATURATE, act_info);
break;
}
case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
{
fn = acl_common::generateLayer<arm_compute::CLArithmeticSubtraction>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
- arm_compute::ConvertPolicy::SATURATE, act_info);
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
+ arm_compute::ConvertPolicy::SATURATE, act_info);
break;
}
case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
{
fn = acl_common::generateLayer<arm_compute::CLPixelWiseMultiplication>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
- arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
- act_info);
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
+ arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN,
+ act_info);
break;
}
case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
{
fn = acl_common::generateLayer<arm_compute::CLArithmeticDivision>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), act_info);
break;
}
default:
@@ -179,8 +173,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_width = ker_shape.dim(2);
const auto stride = node.param().stride;
- const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
- ker_width, ker_height);
+ const auto padding =
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
const auto activation = node.param().activation;
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
@@ -192,9 +186,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto act_info = acl_common::asActivationLayerInfo(activation);
auto fn = acl_common::generateLayer<arm_compute::CLConvolutionLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
- ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
- ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
+ ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
+ ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -218,8 +212,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto stride = node.param().stride;
const auto dilation = node.param().dilation;
const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
- ker_height, dilation.width_factor, dilation.height_factor);
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
+ dilation.width_factor, dilation.height_factor);
const auto multiplier = node.param().multiplier;
const auto activation = node.param().activation;
@@ -233,8 +227,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
auto fn = acl_common::generateLayer<arm_compute::CLDepthwiseConvolutionLayer>(
- ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
- conv_info, multiplier, act_info, dilation_info);
+ ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
+ conv_info, multiplier, act_info, dilation_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -261,15 +255,17 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
}
auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
- std::vector<::arm_compute::ICLTensor *> input_tensors;
+ std::vector<const ::arm_compute::ICLTensor *> input_tensors;
for (auto &ifm_ind : input_indexes)
input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
std::unique_ptr<::arm_compute::IFunction> fn;
if (input_indexes.size() < 2)
{
- fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensors.at(0),
- output_tensor->handle());
+ ::arm_compute::ICLTensor *input_tensor =
+ _tensor_reg->getAclTensor(input_indexes.at(0))->handle();
+
+ fn = acl_common::generateLayer<arm_compute::CLCopy>(input_tensor, output_tensor->handle());
}
else
{
@@ -277,9 +273,9 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
const auto frontend_layout = _current_layout;
const auto backend_layout = output_tensor->layout();
const auto fixed_axis =
- acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
+ acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
fn = acl_common::generateLayer<::arm_compute::CLConcatenateLayer>(
- input_tensors, output_tensor->handle(), fixed_axis);
+ input_tensors, output_tensor->handle(), fixed_axis);
}
_return_fn = asAclFunction(std::move(fn));
@@ -292,13 +288,13 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node)
const auto activation = node.param().activation;
if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
throw std::runtime_error(
- "KernelGenerator(acl_cl): FullyConnected 16x1Float32 weights is not supported.");
+ "KernelGenerator(acl_cl): FullyConnected 16x1Float32 weights is not supported.");
auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ICLTensor,
::arm_compute::CLFullyConnectedReshapingLayer>(
- node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
+ node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
_return_fn = std::make_unique<exec::FunctionSequence>(
- std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
+ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Reduce &node)
@@ -322,7 +318,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
{
const auto acl_axes =
- acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
+ acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
fn = acl_common::generateLayer<arm_compute::CLReduceMean>(input_tensor->handle(), acl_axes,
keep_dims, output_tensor->handle());
}
@@ -331,8 +327,8 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
fn = acl_common::generateLayer<arm_compute::CLReduceOperation>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
- output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
+ output_tensor->handle(), acl_axes, keep_dims, acl_common::convertReduceType(reduce_type));
}
_return_fn = asAclFunction(std::move(fn));
@@ -392,8 +388,8 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
auto input_tensor = _tensor_reg->getAclTensor(input_index);
auto fn = acl_common::generateLayer<arm_compute::CLSoftmaxLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
- output_tensor->handle(), beta);
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
+ output_tensor->handle(), beta);
_return_fn = asAclFunction(std::move(fn));
}
@@ -439,7 +435,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
{
auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
backend_layout)
- .value();
+ .value();
int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
starts[axis] = begin_value;
@@ -459,7 +455,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
}
auto fn = acl_common::generateLayer<arm_compute::CLSlice>(
- inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
+ inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
_return_fn = asAclFunction(std::move(fn));
}
@@ -514,7 +510,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
backend_layout)
- .value();
+ .value();
int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
starts[axis] = start_value;
@@ -533,7 +529,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank,
frontend_layout, backend_layout);
const auto shrink_axis_mask = acl_common::ReorderBits<int32_t>(
- node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
+ node.param().shrink_axis_mask, input_rank, frontend_layout, backend_layout);
::arm_compute::Coordinates starts_set;
::arm_compute::Coordinates ends_set;
@@ -554,8 +550,8 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
}
auto fn = acl_common::generateLayer<arm_compute::CLStridedSlice>(
- inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
- begin_mask, end_mask, shrink_axis_mask);
+ inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
+ begin_mask, end_mask, shrink_axis_mask);
// Revert disabling applied dim_correction
if (inputData_tensor->dimension(0) == 1)
@@ -606,7 +602,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
else
{
auto backend_pv =
- acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
+ acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
fn = acl_common::generateLayer<arm_compute::CLPermute>(ifm_tensor->handle(),
ofm_tensor->handle(), backend_pv);
@@ -623,11 +619,11 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
- const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
- node.param().op_type, node.param().alpha, node.param().beta);
+ const ::arm_compute::ActivationLayerInfo act_info =
+ acl_common::asActivationLayerInfo(node.param().op_type, node.param().alpha, node.param().beta);
auto fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), act_info);
+ ifm_tensor->handle(), ofm_tensor->handle(), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -648,26 +644,26 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
{
fn = acl_common::generateLayer<arm_compute::CLBinaryLogicalOp>(
- lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
- arm_compute::BinaryLogicalOperation::AND);
+ lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle(),
+ arm_compute::BinaryLogicalOperation::AND);
break;
}
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
{
fn = acl_common::generateLayer<arm_compute::CLBitwiseOr>(
- lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
break;
}
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
{
fn = acl_common::generateLayer<arm_compute::CLElementwiseMax>(
- lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
break;
}
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
{
fn = acl_common::generateLayer<arm_compute::CLElementwiseMin>(
- lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
break;
}
default:
@@ -696,10 +692,10 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
case ir::operation::ElementwiseUnary::Type::ABS:
{
const ::arm_compute::ActivationLayerInfo act_info{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
- input_tensor->handle(), output_tensor->handle(), act_info);
+ input_tensor->handle(), output_tensor->handle(), act_info);
break;
}
case ir::operation::ElementwiseUnary::Type::CAST:
@@ -718,7 +714,7 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
{
// TODO Support converting float to int32 as round down
fn = acl_common::generateLayer<arm_compute::CLCast>(
- input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
+ input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
}
break;
}
@@ -761,10 +757,10 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
case ir::operation::ElementwiseUnary::Type::SQRT:
{
const ::arm_compute::ActivationLayerInfo act_info{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
fn = acl_common::generateLayer<arm_compute::CLActivationLayer>(
- input_tensor->handle(), output_tensor->handle(), act_info);
+ input_tensor->handle(), output_tensor->handle(), act_info);
break;
}
default:
@@ -808,11 +804,11 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
auto activation = node.param().activation;
auto fn = acl_common::generateLayer<arm_compute::CLInstanceNormalizationLayerEx>(
- ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
- epsilon);
+ ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
+ epsilon);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::LSTM &node)
@@ -834,8 +830,8 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
auto fn = acl_common::generateLayer<arm_compute::CLComparison>(
- input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
- (arm_compute::ComparisonOperation)comparison_type);
+ input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
+ (arm_compute::ComparisonOperation)comparison_type);
_return_fn = asAclFunction(std::move(fn));
}
@@ -871,15 +867,15 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
if (offvalue.isConstant())
{
fn = acl_common::generateLayer<arm_compute::CLOneHot>(
- indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
- acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis);
+ indices_tensor->handle(), onvalue_tensor->handle(), output_tensor->handle(),
+ acl_common::asPixelValue(offvalue), static_cast<uint32_t>(depth), axis);
}
else
{
auto offvalue_tensor = _tensor_reg->getAclTensor(offvalue_idx);
fn = acl_common::generateLayer<arm_compute::CLOneHot>(
- indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
- output_tensor->handle(), static_cast<uint32_t>(depth), axis);
+ indices_tensor->handle(), onvalue_tensor->handle(), offvalue_tensor->handle(),
+ output_tensor->handle(), static_cast<uint32_t>(depth), axis);
}
if (output_tensor->dimension(0) == 1)
@@ -942,14 +938,14 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
- node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
+ node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
const auto ofm_index{node.getOutputs().at(0)};
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
const auto activation = node.param().activation;
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(raw_fn)),
- ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ asAclFunction(std::move(raw_fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Permute &node)
@@ -997,9 +993,10 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
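+ // Newer ACL packs the scale options into a single ScaleKernelInfo aggregate instead of loose arguments.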
auto fn = acl_common::generateLayer<arm_compute::CLScale>(
- ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
- ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
- ::arm_compute::SamplingPolicy::TOP_LEFT);
+ ifm_tensor->handle(), ofm_tensor->handle(),
+ ::arm_compute::ScaleKernelInfo{
+ ::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
+ ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT});
_return_fn = asAclFunction(std::move(fn));
}
@@ -1013,9 +1010,10 @@ void KernelGenerator::visit(const ir::operation::ResizeNearestNeighbor &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
auto fn = acl_common::generateLayer<arm_compute::CLScale>(
- ifm_tensor->handle(), ofm_tensor->handle(),
+ ifm_tensor->handle(), ofm_tensor->handle(),
+ ::arm_compute::ScaleKernelInfo{
::arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, ::arm_compute::BorderMode::REPLICATE,
- ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
+ ::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT});
_return_fn = asAclFunction(std::move(fn));
}
@@ -1024,12 +1022,12 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
{
const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
const auto hidden_state_out_index{
- node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
+ node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
const auto recurrent_weights_index{
- node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
+ node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
@@ -1046,13 +1044,13 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
auto copy_layer = acl_common::generateLayer<arm_compute::CLCopy>(
- hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
+ hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
_return_fn = asAclFunction(std::move(copy_layer));
auto fn = acl_common::generateLayer<arm_compute::CLRNNLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
- weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
- hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
+ weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
+ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1061,7 +1059,7 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
const auto block_size_index{
- node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
+ node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
@@ -1073,8 +1071,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
assert(_ctx.at(paddings_index).data());
auto fn = acl_common::generateLayer<arm_compute::CLSpaceToBatchLayer>(
- ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
- ofm_tensor->handle());
+ ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
+ ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1090,7 +1088,7 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
auto fn = acl_common::generateLayer<arm_compute::CLSpaceToDepthLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), block_size);
+ ifm_tensor->handle(), ofm_tensor->handle(), block_size);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1106,7 +1104,7 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
auto values_tensor = _tensor_reg->getAclTensor(values_index);
auto fn = acl_common::generateLayer<arm_compute::CLEmbeddingLookup>(
- values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
+ values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1125,10 +1123,10 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
// TODO Support optional constant dimension that normalization would be performed on
const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
int32_t radius =
- 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
- float alpha = 1.0f; // In the implementation to make alpha_ become 1
- float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
- float bias = 0.0f; // Don't offset the reduction.
+ 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
+ float alpha = 1.0f; // In the implementation to make alpha_ become 1
+ float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
+ float bias = 0.0f; // Don't offset the reduction.
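+ // i.e. with bias 0 and beta 0.5, the normalization computes x / sqrt(sum(x^2)), which is L2 normalization.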
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
@@ -1137,7 +1135,7 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
radius, alpha, beta, bias, false);
auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
+ ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1159,8 +1157,8 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
auto values_tensor = _tensor_reg->getAclTensor(values_index);
auto fn = acl_common::generateLayer<arm_compute::CLHashtableLookup>(
- lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
- output_tensor->handle(), hits_tensor->handle());
+ lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
+ output_tensor->handle(), hits_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1176,7 +1174,7 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
auto fn = acl_common::generateLayer<arm_compute::CLPReluLayer>(
- ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
+ ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1202,7 +1200,7 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
if (node.param().padding.type == ir::PaddingType::VALID)
{
invalid_horizontal =
- ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
+ ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
}
@@ -1213,9 +1211,9 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
auto fn = acl_common::generateLayer<arm_compute::CLTransposeConvLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
- ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
- invalid_vertical);
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
+ ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info, invalid_horizontal,
+ invalid_vertical);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1231,7 +1229,7 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
auto fn = acl_common::generateLayer<arm_compute::CLElementwiseSquaredDiff>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1240,7 +1238,7 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node)
{
const auto outputValues_index{node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_VALUES)};
const auto outputIndices_index{
- node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};
+ node.getOutputs().at(ir::operation::TopKV2::Output::OUTPUT_INDICES)};
const auto inputData_index{node.getInputs().at(ir::operation::TopKV2::Input::INPUT)};
@@ -1255,7 +1253,7 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node)
auto input_tensor = _tensor_reg->getAclTensor(inputData_index);
auto fn = acl_common::generateLayer<arm_compute::CLTopKV2>(
- input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
+ input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1309,7 +1307,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
}
auto fn = acl_common::generateLayer<arm_compute::CLGatherEx>(
- ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
+ ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
// Revert disabling applied dim_correction
if (ifm_tensor->dimension(0) == 1)
@@ -1348,11 +1346,11 @@ void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
}
auto acl_axis =
- acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
+ acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
: ::arm_compute::ReductionOperation::ARG_IDX_MIN;
auto fn = acl_common::generateLayer<arm_compute::CLArgMinMaxLayerEx>(
- ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type);
+ ifm_tensor->handle(), acl_axis, ofm_tensor->handle(), reduce_type);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1361,7 +1359,7 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
{
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{
- node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
+ node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
auto radius = node.param().radius;
auto alpha = node.param().alpha;
@@ -1372,10 +1370,10 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
const auto norm_info = ::arm_compute::NormalizationLayerInfo(
- ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
+ ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
auto fn = acl_common::generateLayer<arm_compute::CLNormalizationLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
+ ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1392,7 +1390,7 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
auto input_tensor = _tensor_reg->getAclTensor(input_index);
auto fn = acl_common::generateLayer<arm_compute::CLDepthToSpaceLayer>(
- input_tensor->handle(), output_tensor->handle(), block_size);
+ input_tensor->handle(), output_tensor->handle(), block_size);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1426,7 +1424,7 @@ void KernelGenerator::visit(const ir::operation::Split &node)
axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
auto fn =
- acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
+ acl_common::generateLayer<arm_compute::CLSplit>(ifm_tensor->handle(), output_tensors, axis);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1467,8 +1465,8 @@ void KernelGenerator::visit(const ir::operation::SplitV &node)
}
split_dim_revised =
- acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout)
- .value();
+ acl_common::ToARMComputeAxis(ifm_rank, split_dim_revised, frontend_layout, backend_layout)
+ .value();
fn->configure(ifm_tensor->handle(), size_split_tensor->handle(), split_dim_revised,
output_tensors, node.param().num_splits);
@@ -1515,7 +1513,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
}
auto fn =
- acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
+ acl_common::generateLayer<arm_compute::CLUnstack>(input_tensor->handle(), outputs, axis);
// Revert disabling applied dim_correction
if (input_tensor->dimension(0) == 1)
@@ -1538,7 +1536,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
auto input_type = _ctx.at(input_index).typeInfo();
auto data_type = acl_common::asDataType(input_type.type());
- auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset());
+ auto quant_info = ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point());
const auto pixel_value = ::arm_compute::PixelValue(0, data_type, quant_info);
auto input = _tensor_reg->getAclTensor(input_index)->handle();
@@ -1554,7 +1552,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
const int32_t *from = reinterpret_cast<const int32_t *>(pad_base) + (n * 2);
const auto axis =
- acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
+ acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
}
@@ -1567,7 +1565,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
}
auto fn =
- acl_common::generateLayer<arm_compute::CLPadLayer>(input, output, padding_list, pixel_value);
+ acl_common::generateLayer<arm_compute::CLPadLayerEx>(input, output, padding_list, pixel_value);
// NOTE Do not revert disabling applied dim_correction for 4D.
// It would produce a mismatch of result by incorrect offset_first_element in
@@ -1592,7 +1590,7 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
+ ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1606,7 +1604,7 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
auto fn = acl_common::generateLayer<arm_compute::CLDepthConvertLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
+ ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE, 0);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1630,7 +1628,7 @@ void KernelGenerator::visit(const ir::operation::Reverse &node)
}
auto fn = acl_common::generateLayer<arm_compute::CLReverse>(
- ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
+ ifm_tensor->handle(), ofm_tensor->handle(), axis_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.h b/runtime/onert/backend/acl_cl/KernelGenerator.h
index 22a7c18a3..dc7285349 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.h
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.h
@@ -17,9 +17,8 @@
#ifndef __ONERT_BACKEND_ACL_CL_KERNEL_GENERATOR_H__
#define __ONERT_BACKEND_ACL_CL_KERNEL_GENERATOR_H__
-#include <backend/cpu_common/KernelGeneratorBase.h>
+#include <backend/basic/KernelGeneratorBase.h>
-#include "ir/Operands.h"
#include "TensorBuilder.h"
#include "AclTensorRegistry.h"
#include "TensorManager.h"
@@ -31,15 +30,15 @@ namespace backend
namespace acl_cl
{
-class KernelGenerator : public cpu_common::KernelGeneratorBase
+class KernelGenerator : public basic::KernelGeneratorBase
{
public:
- KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
+ KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &_tensor_reg);
- void visit(const ir::OpSequence &) override;
+ std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) override;
+private:
void visit(const ir::operation::ArgMinMax &) override;
void visit(const ir::operation::BatchToSpaceND &) override;
void visit(const ir::operation::BinaryArithmetic &) override;
@@ -91,9 +90,9 @@ public:
private:
const ir::Operands &_ctx;
const ir::Operations &_operations_ctx;
+ const ir::Layout _current_layout;
std::shared_ptr<TensorBuilder> _tensor_builder;
std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> _tensor_reg;
- ir::Layout _current_layout;
};
} // namespace acl_cl
diff --git a/runtime/onert/backend/acl_cl/Optimizer.cc b/runtime/onert/backend/acl_cl/Optimizer.cc
index 7290c5688..12e805ee5 100644
--- a/runtime/onert/backend/acl_cl/Optimizer.cc
+++ b/runtime/onert/backend/acl_cl/Optimizer.cc
@@ -31,8 +31,8 @@ namespace acl_cl
{
Optimizer::Optimizer(BackendContext *context)
- : _context{context},
- _tensor_builder{std::dynamic_pointer_cast<TensorBuilder>(context->tensor_builder)}
+ : _context{context}, _tensor_builder{
+ std::dynamic_pointer_cast<TensorBuilder>(context->tensor_builder)}
{
assert(context);
}
@@ -43,12 +43,11 @@ void Optimizer::optimize()
{
acl_common::AclSubTensorAnalyzer sa{*_context->graph()};
sa.setUsePadding();
- for (auto op_info : _context->operation_list())
- {
- auto &op = _context->graph()->operations().at(op_info.index);
- sa.setLayout(op_info.layout);
- op.accept(sa);
- }
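+ // The per-operation layout list is gone; visit every operation with the graph-wide layout.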
+ _context->graph()->operations().iterate(
+ [&](const ir::OperationIndex &, const ir::Operation &op) {
+ sa.setLayout(_context->graph()->layout());
+ op.accept(sa);
+ });
_tensor_builder->parent_map(sa.releaseParentMap());
}
diff --git a/runtime/onert/backend/acl_cl/TensorBuilder.h b/runtime/onert/backend/acl_cl/TensorBuilder.h
index 91502d39a..5492929fe 100644
--- a/runtime/onert/backend/acl_cl/TensorBuilder.h
+++ b/runtime/onert/backend/acl_cl/TensorBuilder.h
@@ -30,7 +30,7 @@ namespace acl_cl
{
using TensorBuilder =
- acl_common::AclTensorBuilder<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>;
+ acl_common::AclTensorBuilder<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>;
} // namespace acl_cl
} // namespace backend
diff --git a/runtime/onert/backend/acl_cl/TensorManager.h b/runtime/onert/backend/acl_cl/TensorManager.h
index ab295dbec..2860f51f3 100644
--- a/runtime/onert/backend/acl_cl/TensorManager.h
+++ b/runtime/onert/backend/acl_cl/TensorManager.h
@@ -41,20 +41,20 @@ namespace acl_cl
{
using MemoryManager =
- acl_common::AclMemoryManager<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>;
+ acl_common::AclMemoryManager<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>;
-using LinearMemoryManager = acl_common::AclLinearMemoryManager<
- operand::ICLTensor, operand::CLTensor, operand::CLSubTensor,
- ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
- ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator,
- ::arm_compute::MemoryGroup>;
+using LinearMemoryManager =
+ acl_common::AclLinearMemoryManager<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor,
+ ::arm_compute::MemoryManagerOnDemand,
+ ::arm_compute::PoolManager, ::arm_compute::BlobLifetimeManager,
+ ::arm_compute::CLBufferAllocator, ::arm_compute::MemoryGroup>;
using InternalBufferManager = acl_common::AclInternalBufferManager<
- ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
- ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator>;
+ ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
+ ::arm_compute::BlobLifetimeManager, ::arm_compute::CLBufferAllocator>;
using TensorManager =
- acl_common::AclTensorManager<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>;
+ acl_common::AclTensorManager<operand::ICLTensor, operand::CLTensor, operand::CLSubTensor>;
inline TensorManager *createTensorManager(bool is_linear_executor)
{
diff --git a/runtime/onert/backend/acl_cl/operand/CLSubTensor.cc b/runtime/onert/backend/acl_cl/operand/CLSubTensor.cc
index 234229787..2c4357349 100644
--- a/runtime/onert/backend/acl_cl/operand/CLSubTensor.cc
+++ b/runtime/onert/backend/acl_cl/operand/CLSubTensor.cc
@@ -27,9 +27,8 @@ namespace operand
CLSubTensor::CLSubTensor(ICLTensor *parent, const arm_compute::TensorShape &tensor_shape,
const arm_compute::Coordinates &coords, size_t rank, bool extend_parent)
- : _cl_sub_tensor(std::make_shared<arm_compute::CLSubTensor>(parent->handle(), tensor_shape,
- coords, extend_parent)),
- _rank{rank}
+ : ICLTensor{rank}, _cl_sub_tensor(std::make_shared<arm_compute::CLSubTensor>(
+ parent->handle(), tensor_shape, coords, extend_parent))
{
// DO NOTHING
}
diff --git a/runtime/onert/backend/acl_cl/operand/CLSubTensor.h b/runtime/onert/backend/acl_cl/operand/CLSubTensor.h
index 91f74f3d5..0a26e4822 100644
--- a/runtime/onert/backend/acl_cl/operand/CLSubTensor.h
+++ b/runtime/onert/backend/acl_cl/operand/CLSubTensor.h
@@ -39,9 +39,6 @@ public:
const arm_compute::Coordinates &coords, size_t rank, bool extend_parent = false);
public:
- size_t num_dimensions() const final { return _rank; }
-
-public:
const arm_compute::CLSubTensor *handle() const override;
arm_compute::CLSubTensor *handle() override;
@@ -52,7 +49,6 @@ public:
private:
std::shared_ptr<arm_compute::CLSubTensor> _cl_sub_tensor;
- size_t _rank;
};
} // namespace operand
diff --git a/runtime/onert/backend/acl_cl/operand/CLTensor.cc b/runtime/onert/backend/acl_cl/operand/CLTensor.cc
index f37edff51..38ce4647f 100644
--- a/runtime/onert/backend/acl_cl/operand/CLTensor.cc
+++ b/runtime/onert/backend/acl_cl/operand/CLTensor.cc
@@ -32,7 +32,7 @@ namespace operand
{
CLTensor::CLTensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses)
- : _cl_tensor(std::make_shared<arm_compute::CLTensor>()), _rank{rank}, _num_uses{num_uses}
+ : ICLTensor{rank}, _cl_tensor(std::make_shared<arm_compute::CLTensor>()), _num_uses{num_uses}
{
allocator()->init(info);
}
diff --git a/runtime/onert/backend/acl_cl/operand/CLTensor.h b/runtime/onert/backend/acl_cl/operand/CLTensor.h
index c92208803..487d04662 100644
--- a/runtime/onert/backend/acl_cl/operand/CLTensor.h
+++ b/runtime/onert/backend/acl_cl/operand/CLTensor.h
@@ -41,9 +41,6 @@ public:
CLTensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses);
public:
- size_t num_dimensions() const final { return _rank; }
-
-public:
const arm_compute::CLTensor *handle() const override;
arm_compute::CLTensor *handle() override;
size_t num_uses() const { return _num_uses; }
@@ -61,7 +58,6 @@ public:
private:
std::shared_ptr<arm_compute::CLTensor> _cl_tensor;
- size_t _rank;
size_t _num_uses;
};
diff --git a/runtime/onert/backend/acl_cl/operand/ICLTensor.h b/runtime/onert/backend/acl_cl/operand/ICLTensor.h
index e6e20a8bf..51152a318 100644
--- a/runtime/onert/backend/acl_cl/operand/ICLTensor.h
+++ b/runtime/onert/backend/acl_cl/operand/ICLTensor.h
@@ -33,6 +33,7 @@ namespace operand
class ICLTensor : public acl_common::IACLTensor
{
public:
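+ // Rank is now held by the IACLTensor base; concrete tensors no longer store their own _rank.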
+ ICLTensor(size_t rank) : IACLTensor{rank} {}
const arm_compute::ICLTensor *handle() const override = 0;
arm_compute::ICLTensor *handle() override = 0;
diff --git a/runtime/onert/backend/acl_common/AclActivationBuilder.h b/runtime/onert/backend/acl_common/AclActivationBuilder.h
index bfdea6ea0..5d92a7856 100644
--- a/runtime/onert/backend/acl_common/AclActivationBuilder.h
+++ b/runtime/onert/backend/acl_common/AclActivationBuilder.h
@@ -49,7 +49,7 @@ std::unique_ptr<exec::IFunction>
AclActivationBuilder<T_Tensor, T_ActivationLayer, T_ExecFunction>::generateReLU(T_Tensor *ifm_alloc)
{
const ::arm_compute::ActivationLayerInfo act_info{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
auto fn = std::make_unique<T_ActivationLayer>();
@@ -61,10 +61,10 @@ AclActivationBuilder<T_Tensor, T_ActivationLayer, T_ExecFunction>::generateReLU(
template <typename T_Tensor, typename T_ActivationLayer, typename T_ExecFunction>
std::unique_ptr<exec::IFunction>
AclActivationBuilder<T_Tensor, T_ActivationLayer, T_ExecFunction>::generateReLU1(
- T_Tensor *ifm_alloc)
+ T_Tensor *ifm_alloc)
{
const ::arm_compute::ActivationLayerInfo act_info{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
auto fn = std::make_unique<T_ActivationLayer>();
@@ -76,10 +76,10 @@ AclActivationBuilder<T_Tensor, T_ActivationLayer, T_ExecFunction>::generateReLU1
template <typename T_Tensor, typename T_ActivationLayer, typename T_ExecFunction>
std::unique_ptr<exec::IFunction>
AclActivationBuilder<T_Tensor, T_ActivationLayer, T_ExecFunction>::generateReLU6(
- T_Tensor *ifm_alloc)
+ T_Tensor *ifm_alloc)
{
const ::arm_compute::ActivationLayerInfo act_info{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f};
auto fn = std::make_unique<T_ActivationLayer>();
diff --git a/runtime/onert/backend/acl_common/AclConstantInitializer.cc b/runtime/onert/backend/acl_common/AclConstantInitializer.cc
index 921d107d9..9748ab111 100644
--- a/runtime/onert/backend/acl_common/AclConstantInitializer.cc
+++ b/runtime/onert/backend/acl_common/AclConstantInitializer.cc
@@ -25,7 +25,7 @@ namespace acl_common
AclConstantInitializer::AclConstantInitializer(const ir::Operands &operands,
const std::shared_ptr<ITensorRegistry> &tensor_reg)
- : cpu_common::ConstantInitializerBase{operands}, _tensor_reg{tensor_reg}
+ : _operands{operands}, _tensor_reg{tensor_reg}, _current_layout{ir::Layout::UNKNOWN}
{
// DO NOTHING
}
@@ -126,6 +126,94 @@ void AclConstantInitializer::visit(const ir::operation::TransposeConv &node)
permuteInputInitialize(node, ir::operation::TransposeConv::KERNEL);
}
+// NOTE Workaround for 16-bit float type. This is enough here since only the byte size matters.
+using float16 = uint16_t;
+
+void AclConstantInitializer::registerCopyInitializer(const ir::OperandIndex &index,
+ const ir::Operand &obj)
+{
+ // For only CONSTANTS
+ // TODO Add to check if tensor has been allocated
+ if (!obj.isConstant())
+ return;
+
+ const auto type = obj.typeInfo().type();
+ using ir::DataType;
+
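+ // Choose a typed copy routine; for bool and quantized types only the element width matters.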
+ switch (type)
+ {
+ case DataType::FLOAT32:
+ _init_map[index] = copyInit<float>;
+ break;
+ case DataType::INT32:
+ _init_map[index] = copyInit<int32_t>;
+ break;
+ case DataType::UINT32:
+ _init_map[index] = copyInit<uint32_t>;
+ break;
+ case DataType::BOOL8:
+ case DataType::QUANT_UINT8_ASYMM:
+ _init_map[index] = copyInit<uint8_t>;
+ break;
+ case DataType::QUANT_INT8_SYMM:
+ case DataType::QUANT_INT8_ASYMM:
+ _init_map[index] = copyInit<int8_t>;
+ break;
+ case DataType::FLOAT16:
+ _init_map[index] = copyInit<float16>;
+ break;
+ case DataType::INT64:
+ _init_map[index] = copyInit<int64_t>;
+ break;
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+}
+
+void AclConstantInitializer::registerPermuteInitializer(const ir::OperandIndex &index,
+ const ir::Operand &obj)
+{
+ // For only CONSTANTS
+ // TODO Add to check if tensor has been allocated
+ if (!obj.isConstant())
+ return;
+
+ const auto type = obj.typeInfo().type();
+ using ir::DataType;
+ using namespace std::placeholders;
+
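+ // _current_layout is copied into the bound callable here, so the layout in effect at
+ // registration time is what the initializer later uses.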
+ switch (type)
+ {
+ case DataType::FLOAT32:
+ _init_map[index] = std::bind(permuteInit<float>, _1, _2, _current_layout);
+ break;
+ case DataType::INT32:
+ _init_map[index] = std::bind(permuteInit<int32_t>, _1, _2, _current_layout);
+ break;
+ case DataType::UINT32:
+ _init_map[index] = std::bind(permuteInit<uint32_t>, _1, _2, _current_layout);
+ break;
+ case DataType::BOOL8:
+ case DataType::QUANT_UINT8_ASYMM:
+ _init_map[index] = std::bind(permuteInit<uint8_t>, _1, _2, _current_layout);
+ break;
+ case DataType::QUANT_INT8_SYMM:
+ case DataType::QUANT_INT8_ASYMM:
+ _init_map[index] = std::bind(permuteInit<int8_t>, _1, _2, _current_layout);
+ break;
+ case DataType::FLOAT16:
+ _init_map[index] = std::bind(permuteInit<float16>, _1, _2, _current_layout);
+ break;
+ case DataType::INT64:
+ _init_map[index] = std::bind(permuteInit<int64_t>, _1, _2, _current_layout);
+ break;
+ default:
+ throw std::runtime_error("Not supported, yet");
+ break;
+ }
+}
+
} // namespace acl_common
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/backend/acl_common/AclConstantInitializer.h b/runtime/onert/backend/acl_common/AclConstantInitializer.h
index 894e2e7d1..b7f66b50e 100644
--- a/runtime/onert/backend/acl_common/AclConstantInitializer.h
+++ b/runtime/onert/backend/acl_common/AclConstantInitializer.h
@@ -17,10 +17,19 @@
#ifndef __ONERT_COMPILER_ACL_COMMON_ACLCONSTANT_INITIALIZER_H__
#define __ONERT_COMPILER_ACL_COMMON_ACLCONSTANT_INITIALIZER_H__
-#include <backend/cpu_common/ConstantInitializerBase.h>
-#include <ir/Operands.h>
#include "AclTensorRegistry.h"
+#include <unordered_map>
+#include <functional>
+
+#include <ir/Coordinates.h>
+#include <ir/Layout.h>
+#include <ir/Operand.h>
+#include <ir/Operands.h>
+#include <ir/OperationVisitor.h>
+#include <backend/ITensorRegistry.h>
+#include <util/logging.h>
+
namespace onert
{
namespace backend
@@ -28,13 +37,162 @@ namespace backend
namespace acl_common
{
-class AclConstantInitializer : public cpu_common::ConstantInitializerBase
+template <typename T>
+static void Init(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj, const bool copy,
+ const onert::ir::Layout frontend_layout = onert::ir::Layout::UNKNOWN)
+{
+ const auto shape = model_obj.shape();
+ assert(model_obj.data());
+ auto base = reinterpret_cast<const T *>(model_obj.data()->base());
+
+ obj.access([&](::onert::backend::ITensor &tensor) {
+ switch (shape.rank())
+ {
+ case 0:
+ {
+ assert(model_obj.data()->size() == sizeof(T));
+ const auto value = *reinterpret_cast<const T *>(base);
+ T *into = reinterpret_cast<T *>(tensor.buffer());
+ *into = value;
+ break;
+ }
+ case 1:
+ {
+ auto vec_size = shape.dim(0);
+ for (int32_t n = 0; n < vec_size; ++n)
+ {
+ const T *from = reinterpret_cast<const T *>(base) + n;
+ const auto value = *from;
+
+ T *into = reinterpret_cast<T *>(tensor.buffer()) + n;
+
+ *into = value;
+ }
+ break;
+ }
+ case 2:
+ {
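+ // Rank 2: copy row by row; calcOffset() maps (i, 0) to the tensor's internal byte offset.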
+ const int32_t copy_len = shape.dim(1);
+
+ for (auto i = 0; i < shape.dim(0); ++i)
+ {
+ ::onert::ir::Coordinates coords{i, 0};
+ memcpy(tensor.buffer() + tensor.calcOffset(coords), base + i * copy_len,
+ copy_len * sizeof(T));
+ }
+ break;
+ }
+ case 3:
+ {
+ const int32_t width = shape.dim(1);
+ const int32_t copy_len = shape.dim(2);
+
+ for (auto i = 0; i < shape.dim(0); ++i)
+ {
+ for (auto j = 0; j < shape.dim(1); ++j)
+ {
+ ::onert::ir::Coordinates coords{i, j, 0};
+ memcpy(tensor.buffer() + tensor.calcOffset(coords),
+ base + i * width * copy_len + j * copy_len, copy_len * sizeof(T));
+ }
+ }
+ break;
+ }
+ case 4:
+ {
+ const int32_t height = shape.dim(1);
+ const int32_t width = shape.dim(2);
+ const int32_t copy_len = shape.dim(3);
+ for (auto i = 0; i < shape.dim(0); ++i)
+ {
+ for (auto j = 0; j < shape.dim(1); ++j)
+ {
+ for (auto k = 0; k < shape.dim(2); ++k)
+ {
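+ // Same layout: memcpy whole innermost rows. Different layout: place each element
+ // via convertCoordinates (e.g. NHWC to NCHW).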
+ if (copy)
+ {
+ ::onert::ir::Coordinates coords{i, j, k, 0};
+ memcpy(tensor.buffer() + tensor.calcOffset(coords),
+ base + i * height * width * copy_len + j * width * copy_len + k * copy_len,
+ copy_len * sizeof(T));
+ }
+ else
+ {
+ for (auto l = 0; l < shape.dim(3); ++l)
+ {
+ const auto coords =
+ ::onert::ir::convertCoordinates({i, j, k, l}, frontend_layout, tensor.layout());
+ T *into = reinterpret_cast<T *>(tensor.buffer() + tensor.calcOffset(coords));
+ T value = *(base + i * height * width * copy_len + j * width * copy_len +
+ k * copy_len + l);
+ *into = value;
+ }
+ }
+ }
+ }
+ }
+ break;
+ }
+ default:
+ throw std::runtime_error{"Not yet supported"};
+ }
+ });
+}
+
+template <typename T>
+void copyInit(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj)
+{
+ Init<T>(model_obj, obj, true);
+}
+
+template <typename T>
+void permuteInit(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj,
+ const onert::ir::Layout frontend_layout)
{
+ const bool copy = frontend_layout == obj.layout();
+ Init<T>(model_obj, obj, copy, frontend_layout);
+}
+
+class AclConstantInitializer : public ir::OperationVisitor
+{
+public:
+ void run()
+ {
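+ // Run each registered initializer once, then clear the map so constants are filled only once.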
+ assert(_tensor_reg);
+ for (const auto &it : _init_map)
+ {
+ const auto &ind = it.first;
+ const auto &fn = it.second;
+
+ const auto &model_obj = _operands.at(ind);
+ auto tensor_obj = _tensor_reg->getNativeITensor(ind);
+ assert(tensor_obj != nullptr);
+ fn(model_obj, *tensor_obj);
+ VERBOSE(FillOperandData) << "Fill data for operand " << ind << std::endl;
+ }
+ _init_map.clear();
+ }
+
public:
AclConstantInitializer(const ir::Operands &operands,
const std::shared_ptr<ITensorRegistry> &tensor_reg);
public:
+ using Initializer = std::function<void(const ir::Operand &, backend::ITensor &)>;
+
+public:
+ void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj)
+ {
+ registerPermuteInitializer(index, obj);
+ }
+ void registerCopyInitializer(const ir::OperandIndex &index, const ir::Operand &obj);
+ void registerPermuteInitializer(const ir::OperandIndex &index, const ir::Operand &obj);
+
+public:
+ void setLayout(ir::Layout layout) { _current_layout = layout; }
+ bool exist(const ir::OperandIndex &ind) { return _init_map.find(ind) != _init_map.end(); }
+
+public:
void visit(const ir::operation::BatchToSpaceND &) override;
void visit(const ir::operation::Conv2D &) override;
void visit(const ir::operation::DepthwiseConv2D &) override;
@@ -47,11 +205,11 @@ protected:
void copyInputInitialize(const ir::Operation &node, uint32_t index);
void permuteInputInitialize(const ir::Operation &node, uint32_t index);
-private:
- std::shared_ptr<ITensorRegistry> tensor_registry() const final { return _tensor_reg; }
-
protected:
+ const ir::Operands &_operands;
std::shared_ptr<ITensorRegistry> _tensor_reg;
+ std::unordered_map<ir::OperandIndex, Initializer> _init_map;
+ ir::Layout _current_layout;
};
} // namespace acl_common
diff --git a/runtime/onert/backend/acl_common/AclInternalBufferManager.h b/runtime/onert/backend/acl_common/AclInternalBufferManager.h
index f893bb44b..cca5778d4 100644
--- a/runtime/onert/backend/acl_common/AclInternalBufferManager.h
+++ b/runtime/onert/backend/acl_common/AclInternalBufferManager.h
@@ -20,7 +20,6 @@
#include <arm_compute/runtime/IMemoryManager.h>
#include <cassert>
#include <memory>
-#include <backend/IMemoryManager.h>
namespace onert
{
@@ -34,10 +33,13 @@ namespace acl_common
/**
* @brief Interface for InternalBufferManager which has ::arm_compute::IMemoryManager pointer
*/
-struct IInternalBufferManager : public backend::IMemoryManager
+struct IInternalBufferManager
{
virtual ~IInternalBufferManager() = default;
+ virtual void allocate(void) = 0;
+ virtual void deallocate(void) = 0;
+
/**
* @brief Get shared_ptr of ::arm_compute::IMemoryManager
*/
diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h
index 3d0813f81..e05d36a12 100644
--- a/runtime/onert/backend/acl_common/AclKernelGen.h
+++ b/runtime/onert/backend/acl_common/AclKernelGen.h
@@ -32,16 +32,16 @@ namespace acl_common
void enableDimCorrection(IACLTensor *tensor)
{
- size_t input_rank = tensor->num_dimensions();
+ size_t input_rank = tensor->getShape().rank();
const_cast<arm_compute::TensorShape &>(tensor->info()->tensor_shape())
- .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), true);
+ .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), true);
}
void disableDimCorrection(IACLTensor *tensor)
{
- size_t input_rank = tensor->num_dimensions();
+ size_t input_rank = tensor->getShape().rank();
const_cast<arm_compute::TensorShape &>(tensor->info()->tensor_shape())
- .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), false);
+ .set(input_rank - 1, tensor->info()->dimension(input_rank - 1), false);
}
template <typename Layer, typename... Args>
@@ -74,49 +74,49 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
// TODO Support dynamic rnn
// TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
const auto scratch_buffer_index{
- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
const auto output_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
const auto cell_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
const auto input_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
const auto input_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
const auto input_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
const auto input_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
const auto recurrent_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
const auto recurrent_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
const auto recurrent_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
const auto recurrent_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
const auto cell_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
const auto cell_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
const auto cell_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
const auto input_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
const auto forget_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+ node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
const auto output_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
const auto projection_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
const auto projection_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
const auto output_state_in_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
const auto cell_threshold = node.param().cell_threshold;
const auto projection_threshold = node.param().projection_threshold;
@@ -124,8 +124,8 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 &&
operands.at(input_to_input_weights_index).shape().dim(1) != 0;
bool has_recurrent_to_input_weights =
- operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+ operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+ operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0;
bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0;
bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 &&
@@ -163,10 +163,10 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
auto input_to_cell_weights_tensor = tensor_reg->getAclTensor(input_to_cell_weights_index);
auto input_to_output_weights_tensor = tensor_reg->getAclTensor(input_to_output_weights_index);
auto recurrent_to_forget_weights_tensor =
- tensor_reg->getAclTensor(recurrent_to_forget_weights_index);
+ tensor_reg->getAclTensor(recurrent_to_forget_weights_index);
auto recurrent_to_cell_weights_tensor = tensor_reg->getAclTensor(recurrent_to_cell_weights_index);
auto recurrent_to_output_weights_tensor =
- tensor_reg->getAclTensor(recurrent_to_output_weights_index);
+ tensor_reg->getAclTensor(recurrent_to_output_weights_index);
auto forget_gate_bias_tensor = tensor_reg->getAclTensor(forget_gate_bias_index);
auto cell_bias_tensor = tensor_reg->getAclTensor(cell_bias_index);
@@ -180,12 +180,12 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
if (has_cifg_param)
{
auto input_to_input_weights_tensor =
- tensor_reg->getAclTensor(input_to_input_weights_index); // optional
+ tensor_reg->getAclTensor(input_to_input_weights_index); // optional
auto recurrent_to_input_weights_tensor =
- tensor_reg->getAclTensor(recurrent_to_input_weights_index); // optional
+ tensor_reg->getAclTensor(recurrent_to_input_weights_index); // optional
auto cell_to_input_weights_handle =
- has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index)->handle()
- : nullptr; // optional (non-cifg && peephole)
+ has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index)->handle()
+ : nullptr; // optional (non-cifg && peephole)
auto input_gate_bias_tensor = tensor_reg->getAclTensor(input_gate_bias_index); // optional
lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(),
recurrent_to_input_weights_tensor->handle(),
@@ -194,9 +194,9 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
if (has_peephole_param)
{
auto cell_to_forget_weights_tensor =
- tensor_reg->getAclTensor(cell_to_forget_weights_index); // optional
+ tensor_reg->getAclTensor(cell_to_forget_weights_index); // optional
auto cell_to_output_weights_tensor =
- tensor_reg->getAclTensor(cell_to_output_weights_index); // optional
+ tensor_reg->getAclTensor(cell_to_output_weights_index); // optional
lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(),
cell_to_output_weights_tensor->handle());
}
@@ -204,21 +204,20 @@ std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
{
auto projection_weights_tensor = tensor_reg->getAclTensor(projection_weights_index); // optional
auto projection_bias_handle = has_projection_bias
- ? tensor_reg->getAclTensor(projection_bias_index)->handle()
- : nullptr; // optional
+ ? tensor_reg->getAclTensor(projection_bias_index)->handle()
+ : nullptr; // optional
lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle);
}
auto fn = generateLayer<T_ACLLayer>(
- input_tensor->handle(), input_to_forget_weights_tensor->handle(),
- input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(),
- recurrent_to_forget_weights_tensor->handle(), recurrent_to_cell_weights_tensor->handle(),
- recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(),
- cell_bias_tensor->handle(), output_gate_bias_tensor->handle(),
- output_state_in_tensor->handle(), cell_state_in_tensor->handle(),
- scratch_buffer_tensor->handle(), output_state_out_tensor->handle(),
- cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info, cell_clip,
- projection_clip);
+ input_tensor->handle(), input_to_forget_weights_tensor->handle(),
+ input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(),
+ recurrent_to_forget_weights_tensor->handle(), recurrent_to_cell_weights_tensor->handle(),
+ recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(),
+ cell_bias_tensor->handle(), output_gate_bias_tensor->handle(), output_state_in_tensor->handle(),
+ cell_state_in_tensor->handle(), scratch_buffer_tensor->handle(),
+ output_state_out_tensor->handle(), cell_state_out_tensor->handle(), output_tensor->handle(),
+ lstm_params, act_info, cell_clip, projection_clip);
return std::make_unique<T_FunctionWrapper>(std::move(fn));
}
@@ -240,14 +239,14 @@ kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Ope
const auto input_rank = operands.at(input_index).shape().rank();
const auto output_size =
- operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1);
+ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1);
UNUSED_RELEASE(output_size);
assert(bias_index.undefined() || operands.at(bias_index).shape().dim(0) == output_size);
assert(operands.at(weight_index).shape().dim(0) == output_size);
const auto batch_size =
- operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2);
+ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2);
const auto input_size =
- operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1);
+ operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1);
  // Check whether the input's shape needs reshaping into rank-2
bool needs_reshape = false;
@@ -285,10 +284,10 @@ kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Ope
}
auto fn = generateLayer<T_ACLLayer>(
- tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
- weight_tensor->handle(), bias_tensor != nullptr ? bias_tensor->handle() : nullptr,
- output_tensor->handle(), needs_reshape,
- asTensorShape(reshape, frontend_layout, asRuntimeLayout(acl_layout)), kernel_type);
+ tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
+ weight_tensor->handle(), bias_tensor != nullptr ? bias_tensor->handle() : nullptr,
+ output_tensor->handle(), needs_reshape,
+ asTensorShape(reshape, frontend_layout, asRuntimeLayout(acl_layout)), kernel_type);
return std::make_unique<T_FunctionWrapper>(std::move(fn));
}
@@ -309,7 +308,7 @@ kernelGenPool2D(const T_PoolOp &node, const ir::Operands &operands,
const auto kw = node.param().kw;
const auto stride = node.param().stride;
const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
VERBOSE(Pool2DParam) << "IFM_H: " << ifm_shape.H << std::endl;
VERBOSE(Pool2DParam) << "IFM_W: " << ifm_shape.W << std::endl;
@@ -328,8 +327,8 @@ kernelGenPool2D(const T_PoolOp &node, const ir::Operands &operands,
auto ifm_tensor = tensor_reg->getAclTensor(ifm_index);
::arm_compute::PoolingLayerInfo info{
- pooling_type, ::arm_compute::Size2D{kw, kh}, ifm_tensor->info()->data_layout(),
- asPadStrideInfo(padding, stride), true /* exclude_padding */};
+ pooling_type, ::arm_compute::Size2D{kw, kh}, ifm_tensor->info()->data_layout(),
+ asPadStrideInfo(padding, stride), true /* exclude_padding */};
auto fn = generateLayer<T_ACLLayer>(ifm_tensor->handle(), ofm_tensor->handle(), info);
diff --git a/runtime/onert/backend/acl_common/AclLinearMemoryManager.h b/runtime/onert/backend/acl_common/AclLinearMemoryManager.h
index 09f25e7a8..5c546b77a 100644
--- a/runtime/onert/backend/acl_common/AclLinearMemoryManager.h
+++ b/runtime/onert/backend/acl_common/AclLinearMemoryManager.h
@@ -23,7 +23,11 @@
#include "ir/OperandIndexMap.h"
#include "util/logging.h"
-namespace
+namespace onert
+{
+namespace backend
+{
+namespace acl_common
{
template <typename T_MemoryManager, typename T_PoolManager, typename T_LifetimeManager>
@@ -33,19 +37,10 @@ std::shared_ptr<T_MemoryManager> createMemoryManager()
std::shared_ptr<T_PoolManager> pool_mgr = std::make_shared<T_PoolManager>();
std::shared_ptr<T_MemoryManager> mem_mgr =
- std::make_shared<T_MemoryManager>(lifetime_mgr, pool_mgr);
+ std::make_shared<T_MemoryManager>(lifetime_mgr, pool_mgr);
return mem_mgr;
}
-} // namespace
-
-namespace onert
-{
-namespace backend
-{
-namespace acl_common
-{
-
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor, typename T_MemoryManager,
typename T_PoolManager, typename T_LifetimeManager, typename T_Allocator,
typename T_MemoryGroup>
@@ -53,9 +48,9 @@ class AclLinearMemoryManager : public AclMemoryManager<T_ITensor, T_Tensor, T_Su
{
public:
AclLinearMemoryManager()
- : _allocator{nullptr},
- _io_manager{createMemoryManager<T_MemoryManager, T_PoolManager, T_LifetimeManager>()},
- _io_group{std::make_shared<T_MemoryGroup>(_io_manager)}
+ : _allocator{nullptr},
+ _io_manager{createMemoryManager<T_MemoryManager, T_PoolManager, T_LifetimeManager>()},
+ _io_group{std::make_shared<T_MemoryGroup>(_io_manager)}
{
// DO NOTHING
}
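
With createMemoryManager moved out of the anonymous namespace into onert::backend::acl_common, it becomes reachable from other translation units under its qualified name. A minimal usage sketch; the concrete ACL template arguments below are illustrative, not mandated by this change:

#include <arm_compute/runtime/BlobLifetimeManager.h>
#include <arm_compute/runtime/MemoryManagerOnDemand.h>
#include <arm_compute/runtime/PoolManager.h>

// Builds the lifetime and pool managers, then wires them into the memory manager.
auto mem_mgr = onert::backend::acl_common::createMemoryManager<
  arm_compute::MemoryManagerOnDemand, arm_compute::PoolManager,
  arm_compute::BlobLifetimeManager>();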
diff --git a/runtime/onert/backend/acl_common/AclMemoryManager.h b/runtime/onert/backend/acl_common/AclMemoryManager.h
index eefcec130..8e6bdd86a 100644
--- a/runtime/onert/backend/acl_common/AclMemoryManager.h
+++ b/runtime/onert/backend/acl_common/AclMemoryManager.h
@@ -21,7 +21,6 @@
#include <arm_compute/runtime/IMemoryManager.h>
#include <cassert>
-#include "backend/IMemoryManager.h"
#include "ir/OperandIndexMap.h"
#include "Convert.h"
#include "util/logging.h"
@@ -33,8 +32,7 @@ namespace backend
namespace acl_common
{
-template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
-class AclMemoryManager : public backend::IMemoryManager
+template <typename T_ITensor, typename T_Tensor, typename T_SubTensor> class AclMemoryManager
{
public:
AclMemoryManager()
@@ -44,7 +42,7 @@ public:
virtual ~AclMemoryManager() = default;
- void allocate(void) override
+ virtual void allocate(void)
{
for (const auto &tensor_entry : _tensors)
{
@@ -53,7 +51,7 @@ public:
}
}
- void deallocate(void) override
+ virtual void deallocate(void)
{
for (const auto &tensor_entry : _tensors)
{
@@ -62,8 +60,12 @@ public:
}
}
- virtual void startLifetime(const ir::OperandIndex &) { /* DO NOTHING */}
- virtual void finishLifetime(const ir::OperandIndex &) { /* DO NOTHING */}
+ virtual void startLifetime(const ir::OperandIndex &)
+ { /* DO NOTHING */
+ }
+ virtual void finishLifetime(const ir::OperandIndex &)
+ { /* DO NOTHING */
+ }
void buildTensor(const ir::OperandIndex &ind, const ::arm_compute::TensorInfo &info, size_t rank,
size_t num_uses)
@@ -78,7 +80,7 @@ public:
bool extent_parent)
{
auto subtensor =
- std::make_shared<T_SubTensor>(parent_tensor.get(), shape, coordinates, rank, extent_parent);
+ std::make_shared<T_SubTensor>(parent_tensor.get(), shape, coordinates, rank, extent_parent);
_subtensors[child_ind] = subtensor;
}
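
With the backend::IMemoryManager base removed, AclMemoryManager now stands alone, exposing allocate()/deallocate() as its own virtuals and turning the lifetime hooks into overridable no-ops. A sketch of the override pattern this enables; the subclass here is hypothetical, not part of the change:

template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
class LoggingAclMemoryManager : public AclMemoryManager<T_ITensor, T_Tensor, T_SubTensor>
{
public:
  void allocate(void) override
  {
    VERBOSE(LoggingAclMemoryManager) << "allocate" << std::endl; // extra behavior first
    AclMemoryManager<T_ITensor, T_Tensor, T_SubTensor>::allocate(); // then the base logic
  }
  void startLifetime(const ir::OperandIndex &) override
  {
    // a lifetime-aware manager can begin tracking here instead of doing nothing
  }
};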
diff --git a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h
index 3367f748f..60f4ebf7e 100644
--- a/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h
+++ b/runtime/onert/backend/acl_common/AclSubTensorAnalyzer.h
@@ -95,7 +95,7 @@ public:
coordinate_info.set(axis, axis_point);
_parent_map.emplace(
- input_index, acl_common::ParentInfo{output_index, _current_op_layout, coordinate_info});
+ input_index, acl_common::ParentInfo{output_index, _current_op_layout, coordinate_info});
axis_point += input_shape.dim(axis);
}
diff --git a/runtime/onert/backend/acl_common/AclTensorBuilder.h b/runtime/onert/backend/acl_common/AclTensorBuilder.h
index 12e9ab894..7c1c5dd9a 100644
--- a/runtime/onert/backend/acl_common/AclTensorBuilder.h
+++ b/runtime/onert/backend/acl_common/AclTensorBuilder.h
@@ -131,14 +131,14 @@ namespace acl_common
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::AclTensorBuilder(const ir::Operands &operands,
T_AclTensorManager *tensor_mgr)
- : _operands{operands}, _tensor_mgr{tensor_mgr}
+ : _operands{operands}, _tensor_mgr{tensor_mgr}
{
assert(_tensor_mgr);
}
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::registerTensorInfo(
- const ir::OperandIndex &ind, const ir::OperandInfo &info, ir::Layout backend_layout)
+ const ir::OperandIndex &ind, const ir::OperandInfo &info, ir::Layout backend_layout)
{
assert(_tensor_mgr->constTensors().size() == 0);
assert(_tensor_mgr->nonconstTensors().size() == 0);
@@ -175,7 +175,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::registerTensorInfo(
offset = {offset[0], offset[3], offset[1], offset[2]};
}
else if (_operands.at(parent_index).shape().rank() >= 4 &&
- frontend_layout == ir::Layout::NHWC && backend_layout == ir::Layout::NCHW)
+ frontend_layout == ir::Layout::NCHW && backend_layout == ir::Layout::NHWC)
{
// Permutation changing layout beyond 4-D is not supported yet
const auto parent_rank = _operands.at(parent_index).shape().rank();
@@ -203,7 +203,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::notifyLastUse(const ir:
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
bool AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::isRegistered(
- const ir::OperandIndex &ind) const
+ const ir::OperandIndex &ind) const
{
return _tensor_info_map.find(ind) != _tensor_info_map.end();
}
@@ -221,7 +221,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::allocate(void)
std::unordered_map<ir::OperandIndex, ir::OperandIndex> root_map;
std::function<ir::OperandIndex &(ir::OperandIndex)> find_root =
- [&](ir::OperandIndex ind) -> ir::OperandIndex & {
+ [&](ir::OperandIndex ind) -> ir::OperandIndex & {
ir::OperandIndex &ret = root_map[ind];
// We know the root parent value already
@@ -313,7 +313,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::buildTensors(void)
const auto &info = entry.second;
const auto &backend_layout = _tensor_layout_map[ind];
auto tensor_info =
- asTensorInfo(info.shape(), info.typeInfo(), ir::Layout::UNKNOWN, backend_layout, true);
+ asTensorInfo(info.shape(), info.typeInfo(), ir::Layout::UNKNOWN, backend_layout, true);
_tensor_mgr->buildTensor(ind, tensor_info, info.shape().rank(), info.isConstant(),
_uses_count_map[ind]);
}
@@ -321,7 +321,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::buildTensors(void)
// Subtensors
assert(_tensor_mgr->nonconstSubtensors().size() == 0);
// TODO Iterate `_parent_map` instead, once the optimizer bug is fixed
- // `Optimizer` iterates the entire OpSequences, so there is a bug if iterating _parent_map
+ // `Optimizer` iterates over all Operations, so there is a bug if _parent_map is iterated
for (auto &entry : _tensor_info_map)
{
auto ind = entry.first;
@@ -368,7 +368,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::buildTensors(void)
assert(parent_tensor != nullptr);
  // Child's type should be the same as the parent's
- assert(tensor_info.typeInfo().offset() ==
+ assert(tensor_info.typeInfo().zero_point() ==
parent_tensor->info()->quantization_info().uniform().offset);
assert(tensor_info.typeInfo().scale() ==
parent_tensor->info()->quantization_info().uniform().scale);
@@ -380,7 +380,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::buildTensors(void)
auto shape = asTensorShape(tensor_info.shape(), ir::Layout::UNKNOWN, backend_layout, true);
::arm_compute::Coordinates coordinates =
- asTensorCoordinate(parent_info.coordinates, ir::Layout::UNKNOWN, backend_layout);
+ asTensorCoordinate(parent_info.coordinates, ir::Layout::UNKNOWN, backend_layout);
_tensor_mgr->buildSubtensor(parent, current, shape, coordinates, tensor_info.shape().rank(),
true);
stack.pop();
@@ -390,7 +390,7 @@ void AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::buildTensors(void)
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
bool AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::areSubTensorsOf(
- const ir::OperandIndex &parent, const ir::OperandIndexSequence &seq)
+ const ir::OperandIndex &parent, const ir::OperandIndexSequence &seq)
{
for (auto &cand : seq)
{
@@ -404,7 +404,7 @@ bool AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::areSubTensorsOf(
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
bool AclTensorBuilder<T_ITensor, T_Tensor, T_SubTensor>::isSubTensorOf(
- const ir::OperandIndex &parent, const ir::OperandIndex &child)
+ const ir::OperandIndex &parent, const ir::OperandIndex &child)
{
auto itr = _parent_map.find(child);
if (itr == _parent_map.end())
diff --git a/runtime/onert/backend/acl_common/AclTensorManager.h b/runtime/onert/backend/acl_common/AclTensorManager.h
index d0a56c762..268cec201 100644
--- a/runtime/onert/backend/acl_common/AclTensorManager.h
+++ b/runtime/onert/backend/acl_common/AclTensorManager.h
@@ -19,7 +19,6 @@
#include <arm_compute/runtime/IMemoryManager.h>
-#include "backend/ITensorManager.h"
#include "AclMemoryManager.h"
#include "AclInternalBufferManager.h"
#include "ir/OperandIndexMap.h"
@@ -31,8 +30,7 @@ namespace backend
namespace acl_common
{
-template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
-class AclTensorManager : public backend::ITensorManager
+template <typename T_ITensor, typename T_Tensor, typename T_SubTensor> class AclTensorManager
{
public:
using T_AclMemoryManager = AclMemoryManager<T_ITensor, T_Tensor, T_SubTensor>;
@@ -97,9 +95,9 @@ namespace acl_common
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>::AclTensorManager(
- T_AclMemoryManager *const_mgr, T_AclMemoryManager *nonconst_mgr,
- IInternalBufferManager *inter_mgr)
- : _const_mgr{const_mgr}, _nonconst_mgr{nonconst_mgr}, _inter_mgr{inter_mgr}
+ T_AclMemoryManager *const_mgr, T_AclMemoryManager *nonconst_mgr,
+ IInternalBufferManager *inter_mgr)
+ : _const_mgr{const_mgr}, _nonconst_mgr{nonconst_mgr}, _inter_mgr{inter_mgr}
{
// DO NOTHING
}
@@ -142,8 +140,8 @@ void AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>::deallocateInternalBuffe
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
void AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>::buildTensor(
- const ir::OperandIndex &ind, const ::arm_compute::TensorInfo &info, size_t rank, bool as_const,
- size_t num_uses)
+ const ir::OperandIndex &ind, const ::arm_compute::TensorInfo &info, size_t rank, bool as_const,
+ size_t num_uses)
{
assert(_ind_to_mgr.find(ind) == _ind_to_mgr.end());
if (as_const)
@@ -160,9 +158,9 @@ void AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>::buildTensor(
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
void AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>::buildSubtensor(
- const ir::OperandIndex &parent, const ir::OperandIndex &child,
- const ::arm_compute::TensorShape &shape, const ::arm_compute::Coordinates &coordinates,
- size_t rank, bool extent_parent)
+ const ir::OperandIndex &parent, const ir::OperandIndex &child,
+ const ::arm_compute::TensorShape &shape, const ::arm_compute::Coordinates &coordinates,
+ size_t rank, bool extent_parent)
{
assert(_ind_to_mgr.find(child) == _ind_to_mgr.end());
std::shared_ptr<T_ITensor> parent_tensor = findTensorAsParent(parent);
@@ -261,7 +259,7 @@ AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>::internal_buffer_manager(void
template <typename T_ITensor, typename T_Tensor, typename T_SubTensor>
void AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>::iterate(
- const std::function<void(const ir::OperandIndex &)> &fn)
+ const std::function<void(const ir::OperandIndex &)> &fn)
{
for (auto it : _nonconst_mgr->tensors())
fn(it.first);
@@ -286,7 +284,7 @@ void AclTensorManager<T_ITensor, T_Tensor, T_SubTensor>::tryDeallocConstants(voi
// used in several nodes.
if (tensor->handle() && !tensor->handle()->is_used() && tensor->num_uses() < 2)
{
- VERBOSE(AclTensorManager) << "Tensor #" << ind.value()
+ VERBOSE(AclTensorManager) << "Tensor " << ind
<< " will be deallocated as an unused constant tensor" << std::endl;
tensor->allocator()->free();
tensor.reset();
diff --git a/runtime/onert/backend/acl_common/Convert.cc b/runtime/onert/backend/acl_common/Convert.cc
index 7d3a69032..673d524e3 100644
--- a/runtime/onert/backend/acl_common/Convert.cc
+++ b/runtime/onert/backend/acl_common/Convert.cc
@@ -136,8 +136,8 @@ namespace acl_common
bool apply_dim_correction)
{
::arm_compute::TensorInfo info(
- asTensorShape(shape, frontend_layout, backend_layout, apply_dim_correction), 1,
- asDataType(typeInfo.type()), asQuantizationInfo(typeInfo.scale(), typeInfo.offset()));
+ asTensorShape(shape, frontend_layout, backend_layout, apply_dim_correction), 1,
+ asDataType(typeInfo.type()), asQuantizationInfo(typeInfo.scale(), typeInfo.zero_point()));
info.set_data_layout(asDataLayout(backend_layout));
return info;
}
@@ -162,24 +162,24 @@ namespace acl_common
return ::arm_compute::ActivationLayerInfo{};
case ir::Activation::RELU:
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
case ir::Activation::RELU1:
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
case ir::Activation::RELU6:
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.0f, 0.0f};
// Cases for activation of LSTM.
case ir::Activation::TANH:
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
case ir::Activation::SIGMOID:
// NOTE The sigmoid function is a special case of the Logistic function when L=1, k=1, x0=0.
  // TODO In the ACL and NNAPI specs, Logistic's L is always 1, k is always 1, and x0 is
  // always 0 (i.e. always sigmoid) regardless of the parameter values.
  // If ACL supports a non-sigmoid logistic, the param values should be fixed.
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.0f, 0.0f};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC, 0.0f, 0.0f};
default:
throw std::runtime_error{"Not supported internal activation, yet"};
break;
@@ -198,32 +198,32 @@ asActivationLayerInfo(const ir::operation::ElementwiseActivation::Type op_type,
if (alpha == ir::operation::ElementwiseActivation::infinity)
{
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
}
else
{
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, alpha};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, alpha};
}
}
else
{
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, alpha, beta};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, alpha, beta};
}
case ir::operation::ElementwiseActivation::Type::TANH:
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, alpha, beta};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, alpha, beta};
case ir::operation::ElementwiseActivation::Type::LOGISTIC:
// NOTE The sigmoid function is a special case of the Logistic function when L=1, k=1, x0=0.
  // TODO In the ACL and NNAPI specs, Logistic's L is always 1, k is always 1, and x0 is
  // always 0 (i.e. always sigmoid) regardless of the parameter values.
  // If ACL supports a non-sigmoid logistic, the param values should be fixed.
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
case ir::operation::ElementwiseActivation::Type::LEAKY_RELU:
return ::arm_compute::ActivationLayerInfo{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::LEAKY_RELU, alpha};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::LEAKY_RELU, alpha};
default:
throw std::runtime_error{"Not supported internal elementwise activation, yet"};
break;
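
For reference, the NOTE comments above on LOGISTIC: the general logistic function is

  f(x) = L / (1 + exp(-k * (x - x0)))

and with L = 1, k = 1, x0 = 0 it reduces to the sigmoid

  sigmoid(x) = 1 / (1 + exp(-x))

which is why the fixed-parameter LOGISTIC activation always behaves as a sigmoid.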
diff --git a/runtime/onert/backend/acl_common/IACLTensor.cc b/runtime/onert/backend/acl_common/IACLTensor.cc
index 70988bd11..9920750fc 100644
--- a/runtime/onert/backend/acl_common/IACLTensor.cc
+++ b/runtime/onert/backend/acl_common/IACLTensor.cc
@@ -25,26 +25,14 @@ namespace backend
namespace acl_common
{
-size_t IACLTensor::dimension(size_t index) const
-{
- // Assume that the front is higher dimensional.
- // e.g. N: 0, C: 1, H: 2, W: 3 for NCHW layout
- // NOTE This tensor must not have dim correction applied
- auto rank = num_dimensions();
- rank = rank == 0 ? 1 : rank;
- assert(rank > index);
- const ARMComputeAxis reversed{(static_cast<uint32_t>(rank - index) - 1)};
- return info()->dimension(reversed.value());
-}
-
size_t IACLTensor::calcOffset(const ir::Coordinates &coords) const
{
- auto rank = num_dimensions();
+ auto rank = _rank;
rank = rank == 0 ? 1 : rank;
- assert(rank == coords.size());
+ assert(static_cast<size_t>(rank) == coords.size());
::arm_compute::Coordinates acl_coords;
- for (uint32_t i = 0; i < rank; ++i)
+ for (size_t i = 0; i < rank; ++i)
{
const ARMComputeAxis reversed{static_cast<uint32_t>((rank - i) - 1)};
acl_coords.set(reversed.value(), coords[i]);
@@ -66,12 +54,22 @@ float IACLTensor::data_scale() const
return info()->quantization_info().uniform().scale;
}
-int32_t IACLTensor::data_offset() const
+int32_t IACLTensor::data_zero_point() const
{
// FIXME What if quantization info is non-uniform?
return info()->quantization_info().uniform().offset;
}
+const std::vector<float> &IACLTensor::data_scales() const
+{
+ throw std::runtime_error("IACLTensor::data_scales() is not supported.");
+}
+
+const std::vector<int32_t> &IACLTensor::data_zero_points() const
+{
+ throw std::runtime_error("IACLTensor::data_zero_points() is not supported.");
+}
+
} // namespace acl_common
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/backend/acl_common/IACLTensor.h b/runtime/onert/backend/acl_common/IACLTensor.h
index 3d1268940..7ea6327a7 100644
--- a/runtime/onert/backend/acl_common/IACLTensor.h
+++ b/runtime/onert/backend/acl_common/IACLTensor.h
@@ -19,6 +19,7 @@
#include <backend/ITensor.h>
#include <arm_compute/core/ITensor.h>
+#include "Swizzle.h"
namespace onert
{
@@ -42,17 +43,27 @@ public:
IACLTensor(IACLTensor &&) = default;
IACLTensor &operator=(IACLTensor &&) = default;
+ IACLTensor(size_t rank) : _rank{rank} {}
+
public:
uint8_t *buffer() const final { return handle()->buffer(); }
size_t total_size() const final { return info()->total_size(); }
- size_t dimension(size_t index) const final;
size_t calcOffset(const ir::Coordinates &coords) const final;
ir::Layout layout() const final;
ir::DataType data_type() const final;
float data_scale() const override;
- int32_t data_offset() const override;
+ int32_t data_zero_point() const override;
+ const std::vector<float> &data_scales() const override;
+ const std::vector<int32_t> &data_zero_points() const override;
bool has_padding() const override { return info()->has_padding(); }
bool is_dynamic() const override { return false; }
+ ir::Shape getShape() const override
+ {
+ onert::ir::Shape shape(num_dimensions());
+ for (uint32_t d = 0; d < num_dimensions(); d++)
+ shape.dim(d) = dimension(d);
+ return shape;
+ }
public:
virtual const arm_compute::ITensor *handle() const = 0;
@@ -60,6 +71,22 @@ public:
const arm_compute::ITensorInfo *info() const { return handle()->info(); }
arm_compute::ITensorInfo *info() { return handle()->info(); }
+
+ size_t dimension(size_t index) const
+ {
+ // Assume that the front is higher dimensional.
+ // e.g. N: 0, C: 1, H: 2, W: 3 for NCHW layout
+ // NOTE This tensor must not have dim correction applied
+ auto rank = _rank;
+ rank = rank == 0 ? 1 : rank;
+ assert(rank > index);
+ const ARMComputeAxis reversed{(static_cast<uint32_t>(rank - index) - 1)};
+ return info()->dimension(reversed.value());
+ }
+ size_t num_dimensions() const { return _rank; }
+
+protected:
+ size_t _rank; // Actual rank (reflects extended rank)
};
} // namespace acl_common
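
The relocated dimension() and the _rank-based calcOffset() above share one convention: onert coordinates are ordered outermost-first while ACL stores dimensions innermost-first, so index i maps to rank - i - 1. A standalone sketch of that mapping (names here are illustrative only):

#include <cassert>
#include <cstddef>

inline std::size_t toAclAxis(std::size_t rank, std::size_t onert_axis)
{
  rank = rank == 0 ? 1 : rank; // rank-0 tensors are handled as rank-1
  assert(onert_axis < rank);
  return rank - onert_axis - 1; // reverse the axis order
}

// For a rank-4 NCHW tensor: N(0) -> 3, C(1) -> 2, H(2) -> 1, W(3) -> 0.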
diff --git a/runtime/onert/backend/acl_common/Swizzle.h b/runtime/onert/backend/acl_common/Swizzle.h
index e1c7f8041..61338f972 100644
--- a/runtime/onert/backend/acl_common/Swizzle.h
+++ b/runtime/onert/backend/acl_common/Swizzle.h
@@ -131,7 +131,7 @@ getARMComputePermutationVector(uint32_t rank, const std::vector<int32_t> runtime
}
::arm_compute::PermutationVector ACL_PV =
- ::arm_compute::PermutationVector{new_pv[0], new_pv[1], new_pv[2], new_pv[3]};
+ ::arm_compute::PermutationVector{new_pv[0], new_pv[1], new_pv[2], new_pv[3]};
ACL_PV.set_num_dimensions(rank);
return ACL_PV;
@@ -146,7 +146,7 @@ inline T ReorderBits(T in, size_t numOfBits, const ir::Layout org_layout = ir::L
for (int32_t i = numOfBits - 1; i >= 0; --i)
{
const uint32_t toShift =
- numOfBits - ToARMComputeAxis(numOfBits, i, org_layout, acl_layout).value() - 1;
+ numOfBits - ToARMComputeAxis(numOfBits, i, org_layout, acl_layout).value() - 1;
out += ((in & 1) << toShift);
in >>= 1;
}
diff --git a/runtime/onert/backend/acl_neon/Backend.h b/runtime/onert/backend/acl_neon/Backend.h
index b11c19733..62b163b11 100644
--- a/runtime/onert/backend/acl_neon/Backend.h
+++ b/runtime/onert/backend/acl_neon/Backend.h
@@ -42,20 +42,18 @@ public:
std::shared_ptr<IConfig> config() const override { return _config; }
- std::unique_ptr<backend::BackendContext>
- newContext(const ir::Graph &graph, const std::shared_ptr<custom::IKernelBuilder> &,
- bool is_linear_executor) const override
+ std::unique_ptr<backend::BackendContext> newContext(ContextData &&data) const override
{
- const auto &operands = graph.operands();
- const auto &operations = graph.operations();
- auto context = std::make_unique<acl_neon::BackendContext>(this, &graph);
- auto tm = createTensorManager(is_linear_executor);
+ const auto &graph = *data.graph;
+ const auto &operands = data.graph->operands();
+ auto context = std::make_unique<acl_neon::BackendContext>(this, std::move(data));
+ auto tm = createTensorManager(data.is_linear_executor);
auto tr = std::make_shared<acl_common::AclTensorRegistry<TensorManager>>(tm);
auto tb = std::make_shared<TensorBuilder>(operands, tm);
context->tensor_registry = tr;
context->tensor_builder = tb;
context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tr);
- context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, tr);
+ context->kernel_gen = std::make_shared<KernelGenerator>(graph, tb, tr);
context->optimizer = std::make_shared<Optimizer>(context.get());
return context;
}
diff --git a/runtime/onert/backend/acl_neon/BackendContext.cc b/runtime/onert/backend/acl_neon/BackendContext.cc
index 8b53171f7..4de3de02d 100644
--- a/runtime/onert/backend/acl_neon/BackendContext.cc
+++ b/runtime/onert/backend/acl_neon/BackendContext.cc
@@ -33,46 +33,34 @@ namespace acl_neon
void BackendContext::initConsts()
{
- for (auto &op : operation_list())
- {
- constant_initializer->setLayout(op.layout);
- graph()->operations().at(op.index).accept(*constant_initializer);
- }
+ _data.graph->operations().iterate([&](const ir::OperationIndex &, const ir::Operation &op) {
+ constant_initializer->setLayout(graph()->layout());
+ op.accept(*constant_initializer);
+ });
- for (auto ind : operand_list())
- {
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
+ if (_data.external_operands.contains(ind) || !operand.isConstant())
+ return;
const auto &obj = graph()->operands().at(ind);
if (obj.isConstant() && !constant_initializer->exist(ind))
{
constant_initializer->registerDefaultInitializer(ind, obj);
}
- }
+ });
constant_initializer->run();
}
-void BackendContext::planTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info)
+void BackendContext::planTensors()
{
ir::OperandIndexMap<uint32_t> uses_map;
ir::OperandIndexMap<uint32_t> def_map;
ir::OperandIndexSequence constants;
// Prepare scanning
- for (auto ind : operand_list())
- {
- const auto &obj = graph()->operands().at(ind);
- const auto &li = lower_info.operand.at(ind);
- if (li->def_factors().getOnlyElement().backend() != backend())
- continue;
-
- // Ignore unused tensor
- if (li->def_factors().size() == 0 && li->use_factors().size() == 0)
- {
- VERBOSE(planTensors) << "Operand #" << ind.value() << " will not be used. no more process."
- << std::endl;
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ if (_data.external_operands.contains(ind))
return;
- }
uses_map[ind] = obj.getUses().size();
def_map[ind] = obj.getDef().valid() ? 1 : 0;
@@ -80,16 +68,15 @@ void BackendContext::planTensors(const std::vector<onert::ir::OpSequenceIndex> &
if (obj.isConstant())
constants.append(ind);
- auto factor = li->def_factors().getOnlyElement();
if (!tensor_builder->isRegistered(ind))
{
- // These tensors do not exist in any op_seq (No use and def)
+ // These tensors do not exist in any operation (No use and def)
const auto info = obj.info();
- const auto backend_layout = factor.layout();
+ const auto layout = _data.operand_layouts.at(ind);
// TODO Change tensor info to have permuted shape
- tensor_builder->registerTensorInfo(ind, info, backend_layout);
+ tensor_builder->registerTensorInfo(ind, info, layout);
}
- }
+ });
// Start scanning to do notify{First|Last}Use for each tensor
@@ -107,64 +94,66 @@ void BackendContext::planTensors(const std::vector<onert::ir::OpSequenceIndex> &
// 1. Scan DEF of outputs. If the DEF, allocate it
// 2. Scan DEF of inputs. If variable tensor, allocate it
// 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0
- for (const auto op_seq_ind : order)
+ for (const auto op_ind : _data.op_order)
{
- const auto &op_seq = op_seqs.at(op_seq_ind);
- for (const auto &op_idx : op_seq.operations())
- {
- auto &op = graph()->operations().at(op_idx);
- auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
- auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+ auto op_inputs =
+ graph()->operations().at(op_ind).getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+ auto op_outputs = graph()->operations().at(op_ind).getOutputs() | ir::Remove::DUPLICATED |
+ ir::Remove::UNDEFINED;
- // Define outputs
- for (const auto &ind : op_outputs)
+ // Define outputs
+ for (const auto &ind : op_outputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ assert(def_map.find(ind) != def_map.end());
+ if (def_map[ind])
{
- if (!tensor_builder->isRegistered(ind))
- continue;
- assert(def_map.find(ind) != def_map.end());
- if (def_map[ind])
- {
- def_map[ind] = 0;
- tensor_builder->notifyFirstUse(ind);
- }
+ def_map[ind] = 0;
+ tensor_builder->notifyFirstUse(ind);
}
+ }
- // Scan variable tensors
- // These tensors have constant-like features, but OperandInfo and LowerInfo treat them as
- // non-constant so that memory planning here uses less memory
- for (const auto &ind : op_inputs)
+ // Scan variable tensors
+ // These tensors have constant-like features, but OperandInfo and LowerInfo treat them as
+ // non-constant so that memory planning here uses less memory
+ for (const auto &ind : op_inputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ const auto &operand = graph()->operands().at(ind);
+ if (operand.info().isVariable())
{
- if (!tensor_builder->isRegistered(ind))
- continue;
- const auto &operand = graph()->operands().at(ind);
- if (operand.info().isVariable())
- {
- // The variable tensor with buffer is not supported yet
- assert(operand.data() == nullptr);
- assert(operand.getUses().size() == 1 && !operand.getDef().valid());
- assert(lower_info.operand.at(ind)->def_factors().size() == 1 &&
- lower_info.operand.at(ind)->use_factors().size() == 1);
- assert(uses_map[ind] == 1 && def_map[ind] == 0);
- tensor_builder->notifyFirstUse(ind);
- }
+ // The variable tensor with buffer is not supported yet
+ assert(operand.data() == nullptr);
+ assert(operand.getUses().size() == 1 && !operand.getDef().valid());
+ assert(uses_map[ind] == 1 && def_map[ind] == 0);
+ tensor_builder->notifyFirstUse(ind);
}
+ }
- for (const auto &ind : op_inputs)
+ for (const auto &ind : op_inputs)
+ {
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ assert(uses_map.find(ind) != uses_map.end());
+ assert(uses_map[ind] > 0);
+ uses_map[ind]--;
+ if (uses_map[ind] == 0)
{
- if (!tensor_builder->isRegistered(ind))
- continue;
- assert(uses_map.find(ind) != uses_map.end());
- assert(uses_map[ind] > 0);
- uses_map[ind]--;
- if (uses_map[ind] == 0)
- {
- // plan for deallocation of static tensor node
- tensor_builder->notifyLastUse(ind);
- }
  // plan for deallocation of static tensor node
+ tensor_builder->notifyLastUse(ind);
}
}
}
+ _data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
+ if (uses_map[ind] == 0)
+ {
+ tensor_builder->notifyLastUse(ind);
+ }
+ });
+
// Dispose and validate
for (const auto &ind : constants)
{
@@ -176,77 +165,42 @@ void BackendContext::planTensors(const std::vector<onert::ir::OpSequenceIndex> &
}
assert(
- std::all_of(uses_map.begin(), uses_map.end(),
- [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+ std::all_of(uses_map.begin(), uses_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
assert(
- std::all_of(def_map.begin(), def_map.end(),
- [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+ std::all_of(def_map.begin(), def_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
}
-ITensorRegistry *BackendContext::genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info)
+ITensorRegistry *BackendContext::genTensors()
{
optimizer->optimize();
- for (const auto op_seq_ind : order)
- {
- const auto &op_seq = op_seqs.at(op_seq_ind);
- auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED |
- ir::Remove::DUPLICATED;
- for (const auto op_ind : op_seq)
- {
- bool op_assigned = [&]() {
- for (auto &op_info : operation_list())
- if (op_info.index == op_ind)
- return true;
- return false;
- }();
- if (!op_assigned)
- continue;
+ graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ if (external_operands().contains(ind))
+ return;
- const auto &op = graph()->operations().at(op_ind);
- for (const auto &index : (op.getInputs() + op.getOutputs()) | ir::Remove::UNDEFINED)
- {
- if (!tensor_builder->isRegistered(index) && !model_io.contains(index) &&
- find(operand_list().begin(), operand_list().end(), index) != operand_list().end())
- {
- const auto &operand_lower_info =
- lower_info.operand.at(index)->def_factors().getOnlyElement();
-
- // E.g., permute (CPU) -> tensor A -> MaxPool2D(acl_cl)
- // op.getOutputs() of permute (CPU) returns tensor A
- // but tensor A belongs to the backend of acl_cl.
- // So, we have to make this tensor NOT registered for CPU.
- if (operand_lower_info.backend() != backend())
- continue;
-
- const auto &obj = graph()->operands().at(index);
- const auto frontend_layout = op_seq.getLayout();
- const auto backend_layout = operand_lower_info.layout();
- ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
- obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
- tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
- }
- }
- }
- }
+ const auto frontend_layout = graph()->layout();
+ const auto backend_layout = operand_layouts().at(ind);
+ ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
+ obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
+ tensor_builder->registerTensorInfo(ind, backend_info, backend_layout);
+ });
// TODO Get compiler options from compiler, and use it rather than getting it from Env
if (util::getConfigString(util::config::EXECUTOR) == "Linear")
{
- planTensors(order, op_seqs, lower_info);
+ planTensors();
}
else
{
  // For the executors that do not have a fixed linear execution order:
// To make tensors never be deallocated, this is a workaround to use static memory planner
- for (auto ind : operand_list())
- {
+ graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
if (tensor_builder->isRegistered(ind))
tensor_builder->notifyFirstUse(ind);
- }
+ });
}
tensor_builder->prepare();
@@ -254,36 +208,23 @@ ITensorRegistry *BackendContext::genTensors(const std::vector<onert::ir::OpSeque
return tensor_registry.get();
}
-FunctionMap BackendContext::genKernels(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs)
+FunctionMap BackendContext::genKernels()
{
FunctionMap ret;
- for (auto op_seq_ind : order)
+ for (auto op_ind : _data.op_order)
{
- const auto &op_seq = op_seqs.at(op_seq_ind);
- bool assigned = [&]() {
- for (auto op_info : operation_list())
- if (op_seq.exist(op_info.index))
- return true;
- return false;
- }();
- if (!assigned)
- continue;
- auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind));
- ret.emplace_back(op_seq_ind, std::move(fn_seq));
+ auto fn_seq = kernel_gen->generate(op_ind);
+ ret.emplace_back(op_ind, std::move(fn_seq));
}
tensor_builder->allocate();
initConsts();
// NOTE For memory optimization, we want to free some operand data
- for (auto ind : operand_list())
- {
- // TODO Remove const_cast
- auto &obj = const_cast<ir::Graph *>(graph())->operands().at(ind);
- obj.releaseData();
- }
+ const_cast<ir::Graph &>(*_data.graph)
+ .operands()
+ .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
for (auto &it : ret)
{
@@ -297,6 +238,6 @@ FunctionMap BackendContext::genKernels(const std::vector<onert::ir::OpSequenceIn
return ret;
}
-} // namespace neon
+} // namespace acl_neon
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/backend/acl_neon/BackendContext.h b/runtime/onert/backend/acl_neon/BackendContext.h
index dd764c091..35d777f7b 100644
--- a/runtime/onert/backend/acl_neon/BackendContext.h
+++ b/runtime/onert/backend/acl_neon/BackendContext.h
@@ -34,27 +34,23 @@ class Optimizer;
class BackendContext : public onert::backend::BackendContext
{
public:
- BackendContext(const Backend *backend, const ir::Graph *graph,
+ BackendContext(const Backend *backend, ContextData &&data,
std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
std::shared_ptr<TensorBuilder> tensor_builder = nullptr,
std::shared_ptr<ConstantInitializer> constant_initializer = nullptr,
std::shared_ptr<KernelGenerator> kernel_gen = nullptr)
- : onert::backend::BackendContext(backend, graph, tensor_registry),
- tensor_builder{tensor_builder}, constant_initializer{constant_initializer},
- kernel_gen{kernel_gen}
+ : onert::backend::BackendContext(backend, std::move(data), tensor_registry),
+ tensor_builder{tensor_builder}, constant_initializer{constant_initializer}, kernel_gen{
+ kernel_gen}
{
}
- ITensorRegistry *genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info) override;
- FunctionMap genKernels(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs) override;
+ ITensorRegistry *genTensors() override;
+ FunctionMap genKernels() override;
private:
void initConsts();
- void planTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info);
+ void planTensors();
public:
// TODO Make it private
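
With the OpSequence-based parameters gone, the context is driven entirely by the ContextData captured at construction. A minimal sketch of how a driver might now exercise the narrowed interface (hypothetical caller code, assuming ctx was produced by Backend::newContext):

ITensorRegistry *registry = ctx->genTensors(); // registers, plans, and allocates tensors
FunctionMap kernels = ctx->genKernels();       // one FunctionSequence per operation index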
diff --git a/runtime/onert/backend/acl_neon/ConstantInitializer.cc b/runtime/onert/backend/acl_neon/ConstantInitializer.cc
index 79edb9ded..35da7c952 100644
--- a/runtime/onert/backend/acl_neon/ConstantInitializer.cc
+++ b/runtime/onert/backend/acl_neon/ConstantInitializer.cc
@@ -25,7 +25,7 @@ namespace acl_neon
ConstantInitializer::ConstantInitializer(const ir::Operands &operands,
const std::shared_ptr<ITensorRegistry> &tensor_reg)
- : acl_common::AclConstantInitializer{operands, tensor_reg}
+ : acl_common::AclConstantInitializer{operands, tensor_reg}
{
// DO NOTHING
}
@@ -72,11 +72,11 @@ void ConstantInitializer::visit(const ir::operation::SpaceToBatchND &node)
{
const int32_t value = base[i * 2 + j];
int32_t *into = reinterpret_cast<int32_t *>(
- // The coordinates of NETensor are different from the coordinates of CLTensor in
- // this operand.
- // NEON : {j, reversed i}
- // CL : {reversed i, j}
- tensor.buffer() + tensor.calcOffset({j, shape.dim(0) - i - 1}));
+ // The coordinates of NETensor are different from the coordinates of CLTensor in
+ // this operand.
+ // NEON : {j, reversed i}
+ // CL : {reversed i, j}
+ tensor.buffer() + tensor.calcOffset({j, shape.dim(0) - i - 1}));
*into = value;
}
}
diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc
index e712dfa81..94ea86dcf 100644
--- a/runtime/onert/backend/acl_neon/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc
@@ -41,33 +41,27 @@ namespace acl_neon
using ::onert::backend::acl_common::asAclFunction;
using ActivationBuilder = ::onert::backend::acl_common::AclActivationBuilder<
- ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
+ ::arm_compute::ITensor, ::arm_compute::NEActivationLayer, acl_common::AclFunction>;
KernelGenerator::KernelGenerator(
- const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
- : _ctx(operands_ctx), _operations_ctx(operations_ctx), _tensor_builder(tensor_builder),
- _tensor_reg(tensor_reg), _current_layout(ir::Layout::UNKNOWN)
+ const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &tensor_reg)
+ : basic::KernelGeneratorBase{graph}, _ctx(graph.operands()),
+ _operations_ctx(graph.operations()), _current_layout{graph.layout()},
+ _tensor_builder(tensor_builder), _tensor_reg(tensor_reg)
{
// DO NOTHING
}
-void KernelGenerator::visit(const ir::OpSequence &op_seq)
+std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
- // TODO Move this to IKernelGenerator
- // (all derivatives have the same implementation for this)
- assert(!_return_fn_seq);
- _return_fn_seq = std::make_unique<exec::FunctionSequence>();
- _return_fn_seq->enableDynamicShapeInferer(false);
-
- _current_layout = op_seq.getLayout();
- for (const auto &operation_idx : op_seq.operations())
- {
- const auto &node = _operations_ctx.at(operation_idx);
- node.accept(*this);
- _return_fn_seq->append(releaseFunction());
- }
+ auto ret = std::make_unique<exec::FunctionSequence>();
+ ret->enableDynamicShapeInferer(false);
+
+ const auto &op = _graph.operations().at(ind);
+ op.accept(*this);
+ ret->append(releaseFunction());
+ return ret;
}
void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
@@ -90,12 +84,12 @@ void KernelGenerator::visit(const ir::operation::ArgMinMax &node)
}
assert(axis_value >= 0 && axis_value < ifm_rank);
const auto fixed_axis =
- acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
+ acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
auto reduce_type = node.param().is_arg_max ? ::arm_compute::ReductionOperation::ARG_IDX_MAX
: ::arm_compute::ReductionOperation::ARG_IDX_MIN;
auto fn = acl_common::generateLayer<arm_compute::NEArgMinMaxLayer>(
- ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type);
+ ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(), reduce_type);
_return_fn = asAclFunction(std::move(fn));
}
@@ -105,7 +99,7 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
const auto block_size_index{
- node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
+ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
const auto NNApiInputs = 2;
if (node.getInputs().size() != NNApiInputs)
@@ -133,7 +127,7 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
assert(_ctx.at(block_size_index).data());
auto fn = acl_common::generateLayer<arm_compute::NEBatchToSpaceLayer>(
- ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
+ ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -156,29 +150,29 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
case ir::operation::BinaryArithmetic::ArithmeticType::ADD:
{
fn = acl_common::generateLayer<arm_compute::NEArithmeticAddition>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
- arm_compute::ConvertPolicy::SATURATE);
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
+ arm_compute::ConvertPolicy::SATURATE);
break;
}
case ir::operation::BinaryArithmetic::ArithmeticType::SUB:
{
fn = acl_common::generateLayer<arm_compute::NEArithmeticSubtraction>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
- arm_compute::ConvertPolicy::SATURATE);
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
+ arm_compute::ConvertPolicy::SATURATE);
break;
}
case ir::operation::BinaryArithmetic::ArithmeticType::MUL:
{
      // For scale 1.0, the only allowed RoundingPolicy is RoundingPolicy::TO_ZERO
fn = acl_common::generateLayer<arm_compute::NEPixelWiseMultiplication>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
- arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
+ arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
break;
}
case ir::operation::BinaryArithmetic::ArithmeticType::DIV:
{
fn = acl_common::generateLayer<arm_compute::NEElementwiseDivision>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
break;
}
default:
@@ -186,7 +180,7 @@ void KernelGenerator::visit(const ir::operation::BinaryArithmetic &node)
break;
}
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Conv2D &node)
@@ -206,8 +200,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_width = ker_shape.dim(2);
const auto stride = node.param().stride;
- const auto padding = ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride,
- ker_width, ker_height);
+ const auto padding =
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
const auto activation = node.param().activation;
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
@@ -219,9 +213,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto act_info = acl_common::asActivationLayerInfo(activation);
auto fn = acl_common::generateLayer<arm_compute::NEConvolutionLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
- ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
- ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), ifm_tensor->handle(),
+ ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(), conv_info,
+ ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -238,7 +232,7 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
auto input_tensor = _tensor_reg->getAclTensor(input_index);
auto fn = acl_common::generateLayer<arm_compute::NEDepthToSpaceLayer>(
- input_tensor->handle(), output_tensor->handle(), block_size);
+ input_tensor->handle(), output_tensor->handle(), block_size);
_return_fn = asAclFunction(std::move(fn));
}
@@ -262,8 +256,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto stride = node.param().stride;
const auto dilation = node.param().dilation;
const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width,
- ker_height, dilation.width_factor, dilation.height_factor);
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
+ dilation.width_factor, dilation.height_factor);
const auto multiplier = node.param().multiplier;
const auto activation = node.param().activation;
@@ -277,8 +271,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto dilation_info = acl_common::asDilation(dilation.width_factor, dilation.height_factor);
auto fn = acl_common::generateLayer<arm_compute::NEDepthwiseConvolutionLayer>(
- ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
- conv_info, multiplier, act_info, dilation_info);
+ ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(), ofm_tensor->handle(),
+ conv_info, multiplier, act_info, dilation_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -304,15 +298,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
}
auto output_tensor = _tensor_reg->getAclTensor(ofm_index);
- std::vector<::arm_compute::ITensor *> input_tensors;
+ std::vector<const ::arm_compute::ITensor *> input_tensors;
for (const auto &ifm_ind : input_indexes)
input_tensors.emplace_back(_tensor_reg->getAclTensor(ifm_ind)->handle());
std::unique_ptr<::arm_compute::IFunction> fn;
if (input_indexes.size() < 2)
{
- fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensors.at(0),
- output_tensor->handle());
+ ::arm_compute::ITensor *input_tensor = _tensor_reg->getAclTensor(input_indexes.at(0))->handle();
+ fn = acl_common::generateLayer<arm_compute::NECopy>(input_tensor, output_tensor->handle());
}
else
{
@@ -320,9 +314,9 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
const auto frontend_layout = _current_layout;
const auto backend_layout = output_tensor->layout();
const auto fixed_axis =
- acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
+ acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
fn = acl_common::generateLayer<arm_compute::NEConcatenateLayer>(
- input_tensors, output_tensor->handle(), fixed_axis);
+ input_tensors, output_tensor->handle(), fixed_axis);
}
_return_fn = asAclFunction(std::move(fn));
@@ -336,12 +330,12 @@ void KernelGenerator::visit(const ir::operation::ElementwiseActivation &node)
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
- const ::arm_compute::ActivationLayerInfo act_info = acl_common::asActivationLayerInfo(
- node.param().op_type, node.param().alpha, node.param().beta);
+ const ::arm_compute::ActivationLayerInfo act_info =
+ acl_common::asActivationLayerInfo(node.param().op_type, node.param().alpha, node.param().beta);
std::unique_ptr<arm_compute::IFunction> fn =
- acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
- ofm_tensor->handle(), act_info);
+ acl_common::generateLayer<arm_compute::NEActivationLayer>(ifm_tensor->handle(),
+ ofm_tensor->handle(), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -362,25 +356,25 @@ void KernelGenerator::visit(const ir::operation::ElementwiseBinary &node)
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_AND:
{
fn = acl_common::generateLayer<arm_compute::NELogicalAnd>(
- lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
break;
}
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::LOGICAL_OR:
{
fn = acl_common::generateLayer<arm_compute::NELogicalOr>(
- lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
break;
}
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MAX:
{
fn = acl_common::generateLayer<arm_compute::NEElementwiseMax>(
- lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
break;
}
case ir::operation::ElementwiseBinary::ElementwiseBinaryType::MIN:
{
fn = acl_common::generateLayer<arm_compute::NEElementwiseMin>(
- lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), output_tensor->handle());
break;
}
default:
@@ -408,10 +402,10 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
case ir::operation::ElementwiseUnary::Type::ABS:
{
const ::arm_compute::ActivationLayerInfo act_info{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
- input_tensor->handle(), output_tensor->handle(), act_info);
+ input_tensor->handle(), output_tensor->handle(), act_info);
break;
}
case ir::operation::ElementwiseUnary::Type::CAST:
@@ -429,7 +423,7 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
else
{
fn = acl_common::generateLayer<arm_compute::NECast>(
- input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
+ input_tensor->handle(), output_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
}
break;
}
@@ -472,10 +466,10 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
case ir::operation::ElementwiseUnary::Type::SQRT:
{
const ::arm_compute::ActivationLayerInfo act_info{
- ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
+ ::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
fn = acl_common::generateLayer<arm_compute::NEActivationLayer>(
- input_tensor->handle(), output_tensor->handle(), act_info);
+ input_tensor->handle(), output_tensor->handle(), act_info);
break;
}
default:
@@ -499,7 +493,7 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
auto values_tensor = _tensor_reg->getAclTensor(values_index);
auto fn = acl_common::generateLayer<arm_compute::NEEmbeddingLookup>(
- values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
+ values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -511,13 +505,13 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node)
const auto activation = node.param().activation;
if (node.param().weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32)
throw std::runtime_error(
- "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights is not supported.");
+ "KernelGenerator(acl_neon): FullyConnected 16x1Float32 weights is not supported.");
auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
::arm_compute::NEFullyConnectedReshapingLayer>(
- node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
+ node, _ctx, _tensor_builder, _tensor_reg, _current_layout);
_return_fn = std::make_unique<exec::FunctionSequence>(
- std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
+ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
@@ -537,8 +531,8 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
auto values_tensor = _tensor_reg->getAclTensor(values_index);
auto fn = acl_common::generateLayer<arm_compute::NEHashtableLookup>(
- lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
- output_tensor->handle(), hits_tensor->handle());
+ lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
+ output_tensor->handle(), hits_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -593,7 +587,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
}
auto fn = acl_common::generateLayer<arm_compute::NEGatherEx>(
- ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
+ ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
// Revert disabling applied dim_correction
if (ifm_tensor->dimension(0) == 1)
@@ -623,11 +617,11 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
auto activation = node.param().activation;
auto fn = acl_common::generateLayer<arm_compute::NEInstanceNormalizationLayerEx>(
- ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
- epsilon);
+ ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(), beta_tensor->handle(),
+ epsilon);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::L2Normalization &node)
@@ -644,10 +638,10 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
// TODO Support optional constant dimension that normalization would be performed on
const auto normalization_axis = _ctx.at(ifm_index).shape().rank() - 1;
int32_t radius =
- 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
- float alpha = 1.0f; // In the implementation to make alpha_ become 1
- float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
- float bias = 0.0f; // Don't offset the reduction.
+ 2 * ifm_shape.dim(normalization_axis) + 1; // normSize = depth(last dimension) * 2 + 1
+ float alpha = 1.0f; // In the implementation to make alpha_ become 1
+ float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
+ float bias = 0.0f; // Don't offset the reduction.
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
@@ -656,7 +650,7 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
radius, alpha, beta, bias, false);
auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
+ ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -665,7 +659,7 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
{
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{
- node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
+ node.getInputs().at(ir::operation::LocalResponseNormalization::Input::INPUT)};
auto radius = node.param().radius;
auto alpha = node.param().alpha;
@@ -676,10 +670,10 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
const auto norm_info = ::arm_compute::NormalizationLayerInfo(
- ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
+ ::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
auto fn = acl_common::generateLayer<arm_compute::NENormalizationLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
+ ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -761,7 +755,7 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
const auto frontend_layout = _current_layout;
const auto backend_layout = _tensor_reg->getAclTensor(input_index)->layout();
const auto axis =
- acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
+ acl_common::ToARMComputeAxis(rank, n, frontend_layout, backend_layout).value();
padding_list[axis] = ::arm_compute::PaddingInfo{from[0], from[1]};
}
@@ -769,12 +763,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
UNUSED_RELEASE(input_type);
assert(input->info()->data_type() == acl_common::asDataType(input_type.type()));
assert(input->info()->quantization_info() ==
- ::arm_compute::QuantizationInfo(input_type.scale(), input_type.offset()));
+ ::arm_compute::QuantizationInfo(input_type.scale(), input_type.zero_point()));
const auto pixel_value =
- ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
+ ::arm_compute::PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
auto fn =
- acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);
+ acl_common::generateLayer<arm_compute::NEPadLayer>(input, output, padding_list, pixel_value);
_return_fn = asAclFunction(std::move(fn));
}
@@ -782,14 +776,14 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
void KernelGenerator::visit(const ir::operation::Pool2D &node)
{
auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
- node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
+ node, _ctx, _tensor_reg, _current_layout, acl_common::convertPoolType(node.param().op_type));
const auto ofm_index{node.getOutputs().at(0)};
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
const auto activation = node.param().activation;
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(raw_fn)),
- ActivationBuilder::generate(activation, ofm_tensor->handle()));
+ asAclFunction(std::move(raw_fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Permute &node)
@@ -838,7 +832,7 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
auto alpha_tensor = _tensor_reg->getAclTensor(alpha_index);
auto fn = acl_common::generateLayer<arm_compute::NEPReluLayer>(
- ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
+ ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -858,7 +852,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto frontend_layout = _current_layout;
const auto backend_layout = input_tensor->layout();
const auto reduce_axes =
- acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
+ acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
const auto reduce_type = node.param().reduce_type;
const auto keep_dims = node.param().keep_dims;
@@ -876,8 +870,8 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
else
{
fn = acl_common::generateLayer<arm_compute::NEReduceOperation>(
- input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
- acl_common::convertReduceType(reduce_type));
+ input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
+ acl_common::convertReduceType(reduce_type));
}
_return_fn = asAclFunction(std::move(fn));
}
@@ -914,9 +908,11 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
auto fn = acl_common::generateLayer<arm_compute::NEScale>(
- ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::InterpolationPolicy::BILINEAR,
- ::arm_compute::BorderMode::REPLICATE, ::arm_compute::PixelValue(0.f),
- ::arm_compute::SamplingPolicy::TOP_LEFT);
+ ifm_tensor->handle(), ofm_tensor->handle(),
+ ::arm_compute::ScaleKernelInfo{::arm_compute::InterpolationPolicy::BILINEAR,
+ ::arm_compute::BorderMode::REPLICATE,
+ ::arm_compute::PixelValue(0.f),
+ ::arm_compute::SamplingPolicy::TOP_LEFT, false /*use padding*/});
_return_fn = asAclFunction(std::move(fn));
}
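
For context on the ResizeBilinear hunk above: newer Compute Library releases fold NEScale's loose scaling parameters into a single ScaleKernelInfo argument, which is why the call site grows a brace-initialized aggregate. A minimal sketch of the new-style call, assuming the ACL >= 20.08 constructor order (InterpolationPolicy, BorderMode, PixelValue, SamplingPolicy, use_padding) rather than a verified spec:

#include <arm_compute/runtime/NEON/functions/NEScale.h>

// Sketch only: mirrors the configure call in the hunk above.
void configure_bilinear_resize(arm_compute::ITensor *ifm, arm_compute::ITensor *ofm,
                               arm_compute::NEScale &scale)
{
  scale.configure(ifm, ofm,
                  arm_compute::ScaleKernelInfo{arm_compute::InterpolationPolicy::BILINEAR,
                                               arm_compute::BorderMode::REPLICATE,
                                               arm_compute::PixelValue(0.f),
                                               arm_compute::SamplingPolicy::TOP_LEFT,
                                               /*use_padding=*/false});
}
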
@@ -925,12 +921,12 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
{
const auto output_index{node.getOutputs().at(ir::operation::RNN::Output::OUTPUT)};
const auto hidden_state_out_index{
- node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
+ node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
const auto recurrent_weights_index{
- node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
+ node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
@@ -947,13 +943,13 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
auto copy_layer = acl_common::generateLayer<arm_compute::NECopy>(
- hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
+ hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
_return_fn = asAclFunction(std::move(copy_layer));
auto fn = acl_common::generateLayer<arm_compute::NERNNLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
- weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
- hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
+ weights_tensor->handle(), recurrent_weights_tensor->handle(), bias_tensor->handle(),
+ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -985,22 +981,10 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
auto output_tensor = _tensor_reg->getAclTensor(output_index);
auto input_tensor = _tensor_reg->getAclTensor(input_index);
- // Disable applied dim_correction
- if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
- {
- // This means that high dimension's value is 1 and input tensor is applied dim_correction
- acl_common::disableDimCorrection(input_tensor);
- }
-
+ // NOTE NESoftmaxLayer's default axis is -1
auto fn = acl_common::generateLayer<arm_compute::NESoftmaxLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
- output_tensor->handle(), beta);
-
- // Revert disabling applied dim_correction
- if (input_tensor->dimension(0) == 1)
- {
- acl_common::disableDimCorrection(input_tensor);
- }
+ _tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
+ output_tensor->handle(), beta);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1010,7 +994,7 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
const auto block_size_index{
- node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
+ node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
auto ofm_tensor = _tensor_reg->getAclTensor(ofm_index);
@@ -1022,8 +1006,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
assert(_ctx.at(paddings_index).data());
auto fn = acl_common::generateLayer<arm_compute::NESpaceToBatchLayer>(
- ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
- ofm_tensor->handle());
+ ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
+ ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1039,7 +1023,7 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
auto ifm_tensor = _tensor_reg->getAclTensor(ifm_index);
auto fn = acl_common::generateLayer<arm_compute::NESpaceToDepthLayer>(
- ifm_tensor->handle(), ofm_tensor->handle(), block_size);
+ ifm_tensor->handle(), ofm_tensor->handle(), block_size);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1074,7 +1058,7 @@ void KernelGenerator::visit(const ir::operation::Split &node)
axis = acl_common::ToARMComputeAxis(ifm_rank, axis, frontend_layout, backend_layout).value();
auto fn =
- acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);
+ acl_common::generateLayer<arm_compute::NESplit>(ifm_tensor->handle(), output_tensors, axis);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1090,7 +1074,7 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
auto rhs_tensor = _tensor_reg->getAclTensor(rhs_index);
auto fn = acl_common::generateLayer<arm_compute::NEElementwiseSquaredDiff>(
- lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
+ lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1134,7 +1118,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
{
auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
backend_layout)
- .value();
+ .value();
int32_t begin_value = *(reinterpret_cast<const int32_t *>(beginData_base) + n);
starts[axis] = begin_value;
@@ -1154,7 +1138,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
}
auto fn = acl_common::generateLayer<arm_compute::NESlice>(
- inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
+ inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1206,7 +1190,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
{
auto axis = ::onert::backend::acl_common::ToARMComputeAxis(input_rank, n, frontend_layout,
backend_layout)
- .value();
+ .value();
int32_t start_value = *(reinterpret_cast<const int32_t *>(startData_base) + n);
starts[axis] = start_value;
@@ -1224,7 +1208,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
const auto begin_mask = acl_common::ReorderBits<int32_t>(node.param().begin_mask, input_rank);
const auto end_mask = acl_common::ReorderBits<int32_t>(node.param().end_mask, input_rank);
const auto shrink_axis_mask =
- acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
+ acl_common::ReorderBits<int32_t>(node.param().shrink_axis_mask, input_rank);
::arm_compute::Coordinates starts_set;
::arm_compute::Coordinates ends_set;
@@ -1238,18 +1222,19 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
}
// Disable applied dim_correction
- if (inputData_tensor->num_dimensions() != inputData_tensor->info()->num_dimensions())
+ if (static_cast<size_t>(inputData_tensor->getShape().rank()) !=
+ inputData_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and input tensor is applied dim_correction
acl_common::disableDimCorrection(inputData_tensor);
}
auto fn = acl_common::generateLayer<arm_compute::NEStridedSlice>(
- inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
- begin_mask, end_mask, shrink_axis_mask);
+ inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set, strides_set,
+ begin_mask, end_mask, shrink_axis_mask);
// Revert disabling applied dim_correction
- if (inputData_tensor->dimension(0) == 1)
+ if (inputData_tensor->getShape().dim(0) == 1)
{
acl_common::enableDimCorrection(inputData_tensor);
}
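
The disable/enable pair around NEStridedSlice is a recurring pattern in this backend (the Unpack hunk below uses the same shape-rank test). A condensed sketch of that pattern, using the repo-internal acl_common helpers named in this patch; treat the wrapper itself as illustrative, not code from the tree:

// Sketch: wrap a layer-configure step with the dim_correction toggling used
// above. IACLTensor and the acl_common helpers are onert-internal types.
template <typename ConfigureFn>
void withDimCorrectionDisabled(IACLTensor *tensor, ConfigureFn &&configure)
{
  // ACL may have collapsed trailing size-1 dimensions ("dim_correction");
  // a rank mismatch against the TensorInfo reveals that, so disable it first.
  if (static_cast<size_t>(tensor->getShape().rank()) != tensor->info()->num_dimensions())
    acl_common::disableDimCorrection(tensor);

  configure();

  // Revert only for the dim(0) == 1 case, matching the code above.
  if (tensor->getShape().dim(0) == 1)
    acl_common::enableDimCorrection(tensor);
}
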
@@ -1279,7 +1264,7 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
if (node.param().padding.type == ir::PaddingType::VALID)
{
invalid_horizontal =
- ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
+ ofm_shape.W - (1 + (ifm_shape.W - 1) * stride.horizontal) - (ker_shape.W - 1);
invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
}
@@ -1290,8 +1275,8 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
auto fn = acl_common::generateLayer<arm_compute::NETransposeConvLayer>(
- ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
- invalid_horizontal, invalid_vertical);
+ ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(), tconv_info,
+ invalid_horizontal, invalid_vertical);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1335,7 +1320,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
else
{
auto backend_pv =
- acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
+ acl_common::getARMComputePermutationVector(rank, pv, frontend_layout, backend_layout);
fn = acl_common::generateLayer<arm_compute::NEPermute>(ifm_tensor->handle(),
ofm_tensor->handle(), backend_pv);
@@ -1366,17 +1351,18 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
axis = acl_common::ToARMComputeAxis(input_rank, axis, frontend_layout, backend_layout).value();
// Disable applied dim_correction
- if (input_tensor->num_dimensions() != input_tensor->info()->num_dimensions())
+ if (static_cast<size_t>(input_tensor->getShape().rank()) !=
+ input_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and input tensor is applied dim_correction
acl_common::disableDimCorrection(input_tensor);
}
auto fn =
- acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);
+ acl_common::generateLayer<arm_compute::NEUnstack>(input_tensor->handle(), outputs, axis);
// Revert disabling applied dim_correction
- if (input_tensor->dimension(0) == 1)
+ if (input_tensor->getShape().dim(0) == 1)
{
acl_common::enableDimCorrection(input_tensor);
}
@@ -1411,8 +1397,8 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
auto input1_tensor = _tensor_reg->getAclTensor(input1_index);
auto fn = acl_common::generateLayer<arm_compute::NEElementwiseComparison>(
- input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
- (arm_compute::ComparisonOperation)comparison_type);
+ input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
+ (arm_compute::ComparisonOperation)comparison_type);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1438,8 +1424,8 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
axis = acl_common::ToARMComputeAxis(output_rank, axis, frontend_layout, backend_layout).value();
auto fn = acl_common::generateLayer<arm_compute::NEOneHot>(
- indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
- offvalue_tensor->handle(), output_tensor->handle(), axis);
+ indices_tensor->handle(), depth_tensor->handle(), onvalue_tensor->handle(),
+ offvalue_tensor->handle(), output_tensor->handle(), axis);
_return_fn = asAclFunction(std::move(fn));
}
diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.h b/runtime/onert/backend/acl_neon/KernelGenerator.h
index 2a4b307b8..0ccf21328 100644
--- a/runtime/onert/backend/acl_neon/KernelGenerator.h
+++ b/runtime/onert/backend/acl_neon/KernelGenerator.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__
#define __ONERT_BACKEND_ACL_NEON_KERNEL_GENERATOR_H__
-#include <backend/cpu_common/KernelGeneratorBase.h>
+#include <backend/basic/KernelGeneratorBase.h>
#include "ir/Operands.h"
#include "TensorBuilder.h"
@@ -31,15 +31,15 @@ namespace backend
namespace acl_neon
{
-class KernelGenerator : public cpu_common::KernelGeneratorBase
+class KernelGenerator : public basic::KernelGeneratorBase
{
public:
- KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
+ KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
const std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> &_tensor_reg);
- void visit(const ir::OpSequence &) override;
+ std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) override;
+private:
void visit(const ir::operation::ArgMinMax &) override;
void visit(const ir::operation::BatchToSpaceND &) override;
void visit(const ir::operation::BinaryArithmetic &) override;
@@ -85,9 +85,9 @@ public:
private:
const ir::Operands &_ctx;
const ir::Operations &_operations_ctx;
+ const ir::Layout _current_layout;
std::shared_ptr<TensorBuilder> _tensor_builder;
std::shared_ptr<acl_common::AclTensorRegistry<TensorManager>> _tensor_reg;
- ir::Layout _current_layout;
};
} // namespace acl_neon
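
This header captures the core refactor of the patch: KernelGenerator no longer visits a whole ir::OpSequence; callers request one exec::FunctionSequence per operation index. A caller-side sketch, where op_order stands in for the context's _data.op_order seen in the cpu backend's genKernels below:

// Sketch: generate kernels operation-by-operation in scheduled order.
FunctionMap fns;
for (auto op_ind : op_order) // topologically ordered ir::OperationIndex values
{
  fns.emplace_back(op_ind, kernel_gen->generate(op_ind));
}
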
diff --git a/runtime/onert/backend/acl_neon/Optimizer.cc b/runtime/onert/backend/acl_neon/Optimizer.cc
index ac80901cc..781103f9c 100644
--- a/runtime/onert/backend/acl_neon/Optimizer.cc
+++ b/runtime/onert/backend/acl_neon/Optimizer.cc
@@ -31,8 +31,8 @@ namespace acl_neon
{
Optimizer::Optimizer(BackendContext *context)
- : _context{context},
- _tensor_builder{std::dynamic_pointer_cast<TensorBuilder>(context->tensor_builder)}
+ : _context{context}, _tensor_builder{
+ std::dynamic_pointer_cast<TensorBuilder>(context->tensor_builder)}
{
assert(context);
}
@@ -42,14 +42,12 @@ void Optimizer::optimize()
// Concat elimination (build subtensor info)
{
acl_common::AclSubTensorAnalyzer sa{*_context->graph()};
- for (auto op_info : _context->operation_list())
- {
- auto &op = _context->graph()->operations().at(op_info.index);
- sa.setLayout(op_info.layout);
- op.accept(sa);
- }
-
- _tensor_builder->parent_map(sa.releaseParentMap());
+ sa.setUsePadding();
+ _context->graph()->operations().iterate(
+ [&](const ir::OperationIndex &, const ir::Operation &op) {
+ sa.setLayout(_context->graph()->layout());
+ op.accept(sa);
+ });
}
}
diff --git a/runtime/onert/backend/acl_neon/TensorBuilder.h b/runtime/onert/backend/acl_neon/TensorBuilder.h
index 070dc20ac..7b6e8406b 100644
--- a/runtime/onert/backend/acl_neon/TensorBuilder.h
+++ b/runtime/onert/backend/acl_neon/TensorBuilder.h
@@ -30,7 +30,7 @@ namespace acl_neon
{
using TensorBuilder =
- acl_common::AclTensorBuilder<operand::INETensor, operand::NETensor, operand::NESubTensor>;
+ acl_common::AclTensorBuilder<operand::INETensor, operand::NETensor, operand::NESubTensor>;
} // namespace acl_neon
} // namespace backend
diff --git a/runtime/onert/backend/acl_neon/TensorManager.h b/runtime/onert/backend/acl_neon/TensorManager.h
index 3b7cfbcfd..5ecc0fbb3 100644
--- a/runtime/onert/backend/acl_neon/TensorManager.h
+++ b/runtime/onert/backend/acl_neon/TensorManager.h
@@ -41,16 +41,16 @@ namespace acl_neon
{
using MemoryManager =
- acl_common::AclMemoryManager<operand::INETensor, operand::NETensor, operand::NESubTensor>;
+ acl_common::AclMemoryManager<operand::INETensor, operand::NETensor, operand::NESubTensor>;
using LinearMemoryManager = acl_common::AclLinearMemoryManager<
- operand::INETensor, operand::NETensor, operand::NESubTensor,
- ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
- ::arm_compute::OffsetLifetimeManager, ::arm_compute::Allocator, ::arm_compute::MemoryGroup>;
+ operand::INETensor, operand::NETensor, operand::NESubTensor, ::arm_compute::MemoryManagerOnDemand,
+ ::arm_compute::PoolManager, ::arm_compute::OffsetLifetimeManager, ::arm_compute::Allocator,
+ ::arm_compute::MemoryGroup>;
using InternalBufferManager = acl_common::AclInternalBufferManager<
- ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
- ::arm_compute::OffsetLifetimeManager, ::arm_compute::Allocator>;
+ ::arm_compute::MemoryManagerOnDemand, ::arm_compute::PoolManager,
+ ::arm_compute::OffsetLifetimeManager, ::arm_compute::Allocator>;
using TensorManager = acl_common::AclTensorManager<acl_neon::operand::INETensor, operand::NETensor,
operand::NESubTensor>;
diff --git a/runtime/onert/backend/acl_neon/operand/INETensor.h b/runtime/onert/backend/acl_neon/operand/INETensor.h
index db0ce6fdc..3747b12b7 100644
--- a/runtime/onert/backend/acl_neon/operand/INETensor.h
+++ b/runtime/onert/backend/acl_neon/operand/INETensor.h
@@ -33,6 +33,7 @@ namespace operand
class INETensor : public acl_common::IACLTensor
{
public:
+ INETensor(size_t rank) : IACLTensor{rank} {}
const arm_compute::ITensor *handle() const override = 0;
arm_compute::ITensor *handle() override = 0;
void access(const std::function<void(ITensor &tensor)> &fn) final;
diff --git a/runtime/onert/backend/acl_neon/operand/NESubTensor.cc b/runtime/onert/backend/acl_neon/operand/NESubTensor.cc
index 457addd55..fe82f6206 100644
--- a/runtime/onert/backend/acl_neon/operand/NESubTensor.cc
+++ b/runtime/onert/backend/acl_neon/operand/NESubTensor.cc
@@ -27,9 +27,8 @@ namespace operand
NESubTensor::NESubTensor(INETensor *parent, const arm_compute::TensorShape &tensor_shape,
const arm_compute::Coordinates &coords, size_t rank, bool extend_parent)
- : _ne_sub_tensor(std::make_shared<arm_compute::SubTensor>(parent->handle(), tensor_shape,
- coords, extend_parent)),
- _rank{rank}
+ : INETensor{rank}, _ne_sub_tensor(std::make_shared<arm_compute::SubTensor>(
+ parent->handle(), tensor_shape, coords, extend_parent))
{
// DO NOTHING
}
diff --git a/runtime/onert/backend/acl_neon/operand/NESubTensor.h b/runtime/onert/backend/acl_neon/operand/NESubTensor.h
index 9944e4ba0..74dbe9011 100644
--- a/runtime/onert/backend/acl_neon/operand/NESubTensor.h
+++ b/runtime/onert/backend/acl_neon/operand/NESubTensor.h
@@ -39,9 +39,6 @@ public:
const arm_compute::Coordinates &coords, size_t rank, bool extend_parent = false);
public:
- size_t num_dimensions() const final { return _rank; }
-
-public:
const arm_compute::SubTensor *handle() const override;
arm_compute::SubTensor *handle() override;
@@ -52,7 +49,6 @@ public:
private:
std::shared_ptr<arm_compute::SubTensor> _ne_sub_tensor;
- size_t _rank;
};
} // namespace operand
diff --git a/runtime/onert/backend/acl_neon/operand/NETensor.cc b/runtime/onert/backend/acl_neon/operand/NETensor.cc
index 53dbb3021..4b237d731 100644
--- a/runtime/onert/backend/acl_neon/operand/NETensor.cc
+++ b/runtime/onert/backend/acl_neon/operand/NETensor.cc
@@ -28,7 +28,7 @@ namespace operand
{
NETensor::NETensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses)
- : _ne_tensor(std::make_shared<arm_compute::Tensor>()), _rank{rank}, _num_uses{num_uses}
+ : INETensor{rank}, _ne_tensor(std::make_shared<arm_compute::Tensor>()), _num_uses{num_uses}
{
allocator()->init(info);
}
diff --git a/runtime/onert/backend/acl_neon/operand/NETensor.h b/runtime/onert/backend/acl_neon/operand/NETensor.h
index 0dd81afec..69f8b2111 100644
--- a/runtime/onert/backend/acl_neon/operand/NETensor.h
+++ b/runtime/onert/backend/acl_neon/operand/NETensor.h
@@ -40,9 +40,6 @@ public:
NETensor(const arm_compute::TensorInfo &info, size_t rank, size_t num_uses);
public:
- size_t num_dimensions() const final { return _rank; }
-
-public:
const arm_compute::Tensor *handle() const override;
arm_compute::Tensor *handle() override;
size_t num_uses() const { return _num_uses; }
@@ -52,7 +49,6 @@ public:
private:
std::shared_ptr<arm_compute::Tensor> _ne_tensor;
- size_t _rank;
size_t _num_uses;
};
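
Net effect of the operand changes above: the _rank bookkeeping moves out of NETensor and NESubTensor into the IACLTensor base, and the subclasses only forward it through their constructors. A simplified sketch of the resulting hierarchy (repo-internal names, reduced to the constructor chain shown in the hunks):

class IACLTensor
{
public:
  explicit IACLTensor(size_t rank) : _rank{rank} {}

protected:
  size_t _rank; // stored once for every ACL-backed tensor kind
};

class INETensor : public IACLTensor
{
public:
  explicit INETensor(size_t rank) : IACLTensor{rank} {}
};

class NETensor : public INETensor
{
public:
  NETensor(size_t rank, size_t num_uses) : INETensor{rank}, _num_uses{num_uses} {}

private:
  size_t _num_uses; // unchanged by this patch
};
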
diff --git a/runtime/onert/backend/cpu/Backend.h b/runtime/onert/backend/cpu/Backend.h
index 0b416a7e9..398c188a8 100644
--- a/runtime/onert/backend/cpu/Backend.h
+++ b/runtime/onert/backend/cpu/Backend.h
@@ -19,7 +19,6 @@
#include "BackendContext.h"
#include "Config.h"
-#include "ConstantInitializer.h"
#include "KernelGenerator.h"
#include <backend/Backend.h>
@@ -40,19 +39,16 @@ public:
std::shared_ptr<IConfig> config() const override { return _config; }
- std::unique_ptr<onert::backend::BackendContext>
- newContext(const ir::Graph &graph, const std::shared_ptr<custom::IKernelBuilder> &kb,
- bool) const override
+ std::unique_ptr<onert::backend::BackendContext> newContext(ContextData &&data) const override
{
- const auto &operands = graph.operands();
- const auto &operations = graph.operations();
- auto context = std::make_unique<BackendContext>(this, &graph);
- auto tr = std::make_shared<cpu_common::TensorRegistry>();
+ auto custom_kernel_builder = data.custom_kernel_builder;
+ auto &graph = *data.graph;
+ auto context = std::make_unique<BackendContext>(this, std::move(data));
+ auto tr = std::make_shared<basic::TensorRegistry>();
auto tb = std::make_shared<TensorBuilder>(tr);
context->tensor_registry = tr;
context->tensor_builder = tb;
- context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tr);
- context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, tr, kb,
+ context->kernel_gen = std::make_shared<KernelGenerator>(graph, tb, tr, custom_kernel_builder,
context->external_context());
return context;
}
diff --git a/runtime/onert/backend/cpu/BackendContext.cc b/runtime/onert/backend/cpu/BackendContext.cc
index 6b958c1b7..e6f7b8470 100644
--- a/runtime/onert/backend/cpu/BackendContext.cc
+++ b/runtime/onert/backend/cpu/BackendContext.cc
@@ -22,7 +22,7 @@
#include "ir/Index.h"
#include "ir/OperandIndexMap.h"
#include "ir/OperandIndexSequence.h"
-#include "backend/cpu_common/BackendContextHelpers.h"
+#include "backend/basic/BackendContextHelpers.h"
namespace onert
{
@@ -31,107 +31,24 @@ namespace backend
namespace cpu
{
-void BackendContext::initConsts()
-{
- for (auto &op : operation_list())
- {
- constant_initializer->setLayout(op.layout);
- graph()->operations().at(op.index).accept(*constant_initializer);
- }
-
- for (auto ind : operand_list())
- {
- const auto &obj = graph()->operands().at(ind);
- if (obj.isConstant() && !constant_initializer->exist(ind))
- {
- constant_initializer->registerDefaultInitializer(ind, obj);
- }
- }
-
- constant_initializer->run();
-}
-
-ITensorRegistry *BackendContext::genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info)
-{
- auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED |
- ir::Remove::DUPLICATED;
- for (auto index : operand_list())
- {
- if (model_io.contains(index))
- continue;
- const auto &obj = graph()->operands().at(index);
- const auto frontend_layout = [&]() {
- if (obj.getUses().size() == 0)
- return ir::Layout::UNKNOWN;
- auto use_op_ind = *obj.getUses().begin(); // FIXME What if it has two or more uses?
- for (auto &operation_info : operation_list())
- {
- if (operation_info.index == use_op_ind)
- return operation_info.layout;
- }
- return ir::Layout::UNKNOWN;
- }();
- const auto &permute_factor = lower_info.operand.at(index)->def_factors().getOnlyElement();
- if (permute_factor.backend() != backend())
- continue;
- const auto backend_layout = permute_factor.layout();
- ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
- obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
- tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
- }
-
- // TODO Get compiler options from compiler, and use it rather than getting it from Env
- if (util::getConfigString(util::config::EXECUTOR) == "Linear")
- {
- cpu_common::planTensors(*this, order, op_seqs, lower_info);
- }
- else
- {
- // For the executors that does not have fixed linear execution order:
- // To make tensors never be deallocated, this is a workaround to use static memory planner
- for (auto ind : operand_list())
- {
- if (tensor_builder->isRegistered(ind))
- tensor_builder->notifyFirstUse(ind);
- }
- }
+ITensorRegistry *BackendContext::genTensors() { return basic::genTensors(*this); }
- tensor_builder->prepare();
-
- return tensor_registry.get();
-}
-
-FunctionMap BackendContext::genKernels(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs)
+FunctionMap BackendContext::genKernels()
{
FunctionMap ret;
- for (auto op_seq_ind : order)
+ for (auto op_ind : _data.op_order)
{
- const auto &op_seq = op_seqs.at(op_seq_ind);
- bool assigned = [&]() {
- for (auto op_info : operation_list())
- if (op_seq.exist(op_info.index))
- return true;
- return false;
- }();
- if (!assigned)
- continue;
- auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind));
- ret.emplace_back(op_seq_ind, std::move(fn_seq));
+ auto fn_seq = kernel_gen->generate(op_ind);
+ ret.emplace_back(op_ind, std::move(fn_seq));
}
- initConsts();
+ basic::initConsts(*this);
// NOTE For memory optimization, we want to free some operand data
- for (auto ind : operand_list())
- {
- // TODO Remove const_cast
- auto &obj = const_cast<ir::Graph *>(graph())->operands().at(ind);
- obj.releaseData();
- }
+ const_cast<ir::Graph &>(*_data.graph)
+ .operands()
+ .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
for (auto &it : ret)
{
diff --git a/runtime/onert/backend/cpu/BackendContext.h b/runtime/onert/backend/cpu/BackendContext.h
index 0a4106d33..69ab30c82 100644
--- a/runtime/onert/backend/cpu/BackendContext.h
+++ b/runtime/onert/backend/cpu/BackendContext.h
@@ -19,7 +19,6 @@
#include <backend/BackendContext.h>
#include "TensorBuilder.h"
-#include "ConstantInitializer.h"
#include "KernelGenerator.h"
#include "ExternalContext.h"
@@ -33,34 +32,23 @@ namespace cpu
class BackendContext : public onert::backend::BackendContext
{
public:
- BackendContext(const Backend *backend, const ir::Graph *graph,
+ BackendContext(const Backend *backend, ContextData &&data,
std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
std::shared_ptr<TensorBuilder> tensor_builder = nullptr,
- std::shared_ptr<ConstantInitializer> constant_initializer = nullptr,
std::shared_ptr<KernelGenerator> kernel_gen = nullptr)
- : onert::backend::BackendContext(backend, graph, tensor_registry),
- tensor_builder{tensor_builder}, constant_initializer{constant_initializer},
- kernel_gen{kernel_gen}, _external_context(new ExternalContext)
+ : onert::backend::BackendContext(backend, std::move(data), tensor_registry),
+ tensor_builder{tensor_builder}, kernel_gen{kernel_gen}, _external_context(new ExternalContext)
{
}
- ITensorRegistry *genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info) override;
- FunctionMap genKernels(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs) override;
+ ITensorRegistry *genTensors() override;
+ FunctionMap genKernels() override;
std::shared_ptr<ExternalContext> external_context() { return _external_context; }
-private:
- void initConsts();
- void planTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info);
-
public:
// TODO Make it private
std::shared_ptr<TensorBuilder> tensor_builder;
- std::shared_ptr<ConstantInitializer> constant_initializer;
std::shared_ptr<KernelGenerator> kernel_gen;
private:
diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h
deleted file mode 100644
index d7858c0f6..000000000
--- a/runtime/onert/backend/cpu/ConstantInitializer.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_CPU_CONSTANT_INITIALIZER_H__
-#define __ONERT_BACKEND_CPU_CONSTANT_INITIALIZER_H__
-
-#include <backend/cpu_common/ConstantInitializer.h>
-
-namespace onert
-{
-namespace backend
-{
-namespace cpu
-{
-
-using ConstantInitializer = cpu_common::ConstantInitializer;
-
-} // namespace cpu
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_CPU_CONSTANT_INITIALIZER_H__
diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h
index f5d11f4f1..ab0bb5f10 100644
--- a/runtime/onert/backend/cpu/ExternalContext.h
+++ b/runtime/onert/backend/cpu/ExternalContext.h
@@ -20,11 +20,6 @@
#include <util/ConfigSource.h>
#include <ruy/context.h>
-namespace
-{
-const int kDefaultNumThreadpoolThreads = 1;
-}
-
namespace onert
{
namespace backend
@@ -34,6 +29,9 @@ namespace cpu
class ExternalContext
{
+private:
+ static const int kDefaultNumThreadpoolThreads = 1;
+
public:
ExternalContext() : _ruy_context(new ruy::Context)
{
@@ -43,7 +41,7 @@ public:
void setMaxNumThreads(int max_num_threads)
{
const int target_num_threads =
- max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
+ max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
_ruy_context->set_max_num_threads(target_num_threads);
}
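
A small usage sketch of the relocated constant: any value not greater than -1 passed to setMaxNumThreads acts as an "unset" sentinel and falls back to kDefaultNumThreadpoolThreads, i.e. a single thread:

ExternalContext ctx;
ctx.setMaxNumThreads(4);  // ruy thread pool capped at 4 threads
ctx.setMaxNumThreads(-1); // not set by the user -> falls back to 1 thread
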
diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc
index 25756eced..d5096ff09 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.cc
+++ b/runtime/onert/backend/cpu/KernelGenerator.cc
@@ -41,6 +41,7 @@
#include "ops/PadLayer.h"
#include "ops/PoolLayer.h"
#include "ops/PowLayer.h"
+#include "ops/QuantizeLayer.h"
#include "ops/RangeLayer.h"
#include "ops/RankLayer.h"
#include "ops/ReduceLayer.h"
@@ -221,78 +222,74 @@ ops::ReduceType convertReduceType(ir::operation::Reduce::ReduceType reduce_type_
} // namespace
KernelGenerator::KernelGenerator(
- const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg,
- const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder,
- const std::shared_ptr<ExternalContext> &external_context)
- : _ctx(operands_ctx), _operations_ctx{operations_ctx}, _tensor_builder(tensor_builder),
- _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder),
- _current_layout(ir::Layout::UNKNOWN), _external_context(external_context)
+ const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<basic::TensorRegistry> &tensor_reg,
+ const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder,
+ const std::shared_ptr<ExternalContext> &external_context)
+ : basic::KernelGeneratorBase{graph},
+ _ctx(graph.operands()), _operations_ctx{graph.operations()}, _current_layout{graph.layout()},
+ _tensor_builder(tensor_builder), _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder),
+ _external_context(external_context)
{
// DO NOTHING
}
-void KernelGenerator::visit(const ir::operation::AddN &node)
+std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
- const auto output_index{node.getOutputs().at(0)};
-
- std::vector<const IPortableTensor *> input_tensors;
- for (auto &input_idx : node.getInputs())
- input_tensors.emplace_back(_tensor_reg->getPortableTensor(input_idx));
-
- auto output_tensor = _tensor_reg->getPortableTensor(output_index);
-
- auto fn = std::make_unique<ops::AddNLayer>();
-
- fn->configure(std::move(input_tensors), output_tensor);
-
- _return_fn = std::move(fn);
-}
+ auto ret = std::make_unique<exec::FunctionSequence>();
-void KernelGenerator::visit(const ir::OpSequence &op_seq)
-{
- assert(!_return_fn_seq);
assert(_tensor_builder->dynamicTensorManager());
assert(_tensor_reg);
auto dyn_shape_inferer = std::make_shared<exec::DynamicShapeInferer>(_ctx, _tensor_reg);
- _return_fn_seq = std::make_unique<exec::FunctionSequence>();
-
// Prepare to handle dynamic tensors later
auto dyn_ctx = std::make_shared<exec::FunctionSequence::DynamicTensorCtx>();
{
- dyn_ctx->op_seq = &op_seq;
+ dyn_ctx->op_ind = ind;
dyn_ctx->operations = &_operations_ctx;
dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer);
- dyn_ctx->dynamic_tensor_manager = _tensor_builder->dynamicTensorManager();
- _return_fn_seq->dynamic_tensor_ctx(dyn_ctx);
+ ret->dynamic_tensor_ctx(dyn_ctx);
}
- _current_layout = op_seq.getLayout();
- for (const auto &operation_idx : op_seq.operations())
+ auto &op = _graph.operations().at(ind);
+ op.accept(*this);
+ assert(_return_fn); // _return_fn must have been generated
+ ret->append(std::move(_return_fn));
+
+ for (auto ind : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
{
- const auto &node = _operations_ctx.at(operation_idx);
- node.accept(*this);
- _return_fn_seq->append(releaseFunction());
+ auto portable_tensor = _tensor_reg->getPortableTensor(ind);
+ if (portable_tensor)
+ {
+ assert(portable_tensor->layout() == ir::Layout::NHWC);
+ }
- for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs())
+ auto tensor = _tensor_reg->getNativeTensor(ind);
+ if (tensor)
{
- auto portable_tensor = _tensor_reg->getPortableTensor(ind);
- if (portable_tensor)
- {
- assert(portable_tensor->layout() == ir::Layout::NHWC);
- }
-
- auto tensor = _tensor_reg->getNativeTensor(ind);
- if (tensor)
- {
- tensor->increase_ref();
- }
+ tensor->increase_ref();
}
}
+ return ret;
+}
+
+void KernelGenerator::visit(const ir::operation::AddN &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+
+ std::vector<const IPortableTensor *> input_tensors;
+ for (auto &input_idx : node.getInputs())
+ input_tensors.emplace_back(_tensor_reg->getPortableTensor(input_idx));
+
+ auto output_tensor = _tensor_reg->getPortableTensor(output_index);
+
+ auto fn = std::make_unique<ops::AddNLayer>();
+
+ fn->configure(std::move(input_tensors), output_tensor);
+
+ _return_fn = std::move(fn);
}
void KernelGenerator::visit(const ir::operation::Conv2D &node)
@@ -333,8 +330,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_width = ker_shape.dim(2);
const auto padding =
- ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
- dilation.width_factor, dilation.height_factor);
+ ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
+ dilation.width_factor, dilation.height_factor);
fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left,
padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical,
@@ -615,7 +612,7 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
auto offvalue_tensor = _tensor_reg->getPortableTensor(offvalue_index);
assert(indices_tensor->data_type() == OperandType::INT32);
- assert(axis <= static_cast<int>(indices_tensor->num_dimensions()));
+ assert(axis <= static_cast<int>(indices_tensor->getShape().rank()));
auto fn = std::make_unique<ops::OneHotLayer>();
@@ -712,11 +709,18 @@ void KernelGenerator::visit(const ir::operation::ElementwiseUnary &node)
auto output_tensor = _tensor_reg->getPortableTensor(output_index);
auto input_tensor = _tensor_reg->getPortableTensor(input_index);
- auto fn = std::make_unique<ops::ElementwiseUnaryLayer>();
-
- fn->configure(input_tensor, output_tensor, convertElementwiseUnaryType(node.param().op_type));
-
- _return_fn = std::move(fn);
+ if (node.param().op_type == ir::operation::ElementwiseUnary::Type::QUANTIZE)
+ {
+ auto fn = std::make_unique<ops::QuantizeLayer>();
+ fn->configure(input_tensor, output_tensor);
+ _return_fn = std::move(fn);
+ }
+ else
+ {
+ auto fn = std::make_unique<ops::ElementwiseUnaryLayer>();
+ fn->configure(input_tensor, output_tensor, convertElementwiseUnaryType(node.param().op_type));
+ _return_fn = std::move(fn);
+ }
}
void KernelGenerator::visit(const ir::operation::ExpandDims &node)
@@ -1041,7 +1045,7 @@ void KernelGenerator::visit(const ir::operation::Pool2D &node)
const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_layout);
const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_layout);
const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
const auto activation = node.param().activation;
auto ofm_tensor = _tensor_reg->getPortableTensor(ofm_index);
@@ -1337,49 +1341,49 @@ void KernelGenerator::visit(const ir::operation::SplitV &node)
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
const auto scratch_buffer_index{
- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
const auto output_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
const auto cell_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
const auto input_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
const auto input_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
const auto input_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
const auto input_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
const auto recurrent_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
const auto recurrent_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
const auto recurrent_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
const auto recurrent_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
const auto cell_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
const auto cell_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
const auto cell_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
const auto input_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
const auto forget_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+ node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
const auto cell_gate_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
const auto output_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
const auto projection_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
const auto projection_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
const auto output_state_in_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
const auto time_major = node.param().time_major;
@@ -1391,9 +1395,9 @@ void KernelGenerator::visit(const ir::operation::LSTM &node)
(_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
_ctx.at(input_to_input_weights_index).shape().dim(1) != 0);
bool has_recurrent_to_input_weights =
- _ctx.exist(recurrent_to_input_weights_index) &&
- (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0);
+ _ctx.exist(recurrent_to_input_weights_index) &&
+ (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+ _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0);
// NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
// But the cell_to_input_weights does not exist in regular CIFG although peephole.
@@ -1405,56 +1409,56 @@ void KernelGenerator::visit(const ir::operation::LSTM &node)
_ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
bool has_input_gate_bias =
- _ctx.exist(input_gate_bias_index) && _ctx.at(input_gate_bias_index).shape().dim(0);
+ _ctx.exist(input_gate_bias_index) && _ctx.at(input_gate_bias_index).shape().dim(0);
bool has_projection_weights = _ctx.exist(projection_weights_index) &&
(_ctx.at(projection_weights_index).shape().dim(0) != 0 &&
_ctx.at(projection_weights_index).shape().dim(1) != 0);
bool has_projection_bias =
- _ctx.exist(projection_bias_index) && _ctx.at(projection_bias_index).shape().dim(0);
+ _ctx.exist(projection_bias_index) && _ctx.at(projection_bias_index).shape().dim(0);
auto scratch_buffer_tensor = _ctx.exist(scratch_buffer_index)
- ? _tensor_reg->getPortableTensor(scratch_buffer_index)
- : nullptr; // optional
+ ? _tensor_reg->getPortableTensor(scratch_buffer_index)
+ : nullptr; // optional
auto output_state_out_tensor = _ctx.exist(output_state_out_index)
- ? _tensor_reg->getPortableTensor(output_state_out_index)
- : nullptr; // optional
- auto cell_state_out_tensor = _ctx.exist(cell_state_out_index)
- ? _tensor_reg->getPortableTensor(cell_state_out_index)
+ ? _tensor_reg->getPortableTensor(output_state_out_index)
: nullptr; // optional
+ auto cell_state_out_tensor = _ctx.exist(cell_state_out_index)
+ ? _tensor_reg->getPortableTensor(cell_state_out_index)
+ : nullptr; // optional
auto output_tensor = _tensor_reg->getPortableTensor(output_index);
auto input_tensor = _tensor_reg->getPortableTensor(input_index);
auto input_to_input_weights_tensor =
- has_input_to_input_weights ? _tensor_reg->getPortableTensor(input_to_input_weights_index)
- : nullptr; // optional
+ has_input_to_input_weights ? _tensor_reg->getPortableTensor(input_to_input_weights_index)
+ : nullptr; // optional
auto input_to_forget_weights_tensor =
- _tensor_reg->getPortableTensor(input_to_forget_weights_index);
+ _tensor_reg->getPortableTensor(input_to_forget_weights_index);
auto input_to_cell_weights_tensor = _tensor_reg->getPortableTensor(input_to_cell_weights_index);
auto input_to_output_weights_tensor =
- _tensor_reg->getPortableTensor(input_to_output_weights_index);
+ _tensor_reg->getPortableTensor(input_to_output_weights_index);
auto recurrent_to_input_weights_tensor =
- has_recurrent_to_input_weights
- ? _tensor_reg->getPortableTensor(recurrent_to_input_weights_index)
- : nullptr; // optional
+ has_recurrent_to_input_weights
+ ? _tensor_reg->getPortableTensor(recurrent_to_input_weights_index)
+ : nullptr; // optional
auto recurrent_to_forget_weights_tensor =
- _tensor_reg->getPortableTensor(recurrent_to_forget_weights_index);
+ _tensor_reg->getPortableTensor(recurrent_to_forget_weights_index);
auto recurrent_to_cell_weights_tensor =
- _tensor_reg->getPortableTensor(recurrent_to_cell_weights_index);
+ _tensor_reg->getPortableTensor(recurrent_to_cell_weights_index);
auto recurrent_to_output_weights_tensor =
- _tensor_reg->getPortableTensor(recurrent_to_output_weights_index);
+ _tensor_reg->getPortableTensor(recurrent_to_output_weights_index);
auto cell_to_input_weights_tensor = _tensor_reg->getPortableTensor(cell_to_input_weights_index);
auto cell_to_forget_weights_tensor =
- has_cell_to_forget_weights ? _tensor_reg->getPortableTensor(cell_to_forget_weights_index)
- : nullptr; // optional
+ has_cell_to_forget_weights ? _tensor_reg->getPortableTensor(cell_to_forget_weights_index)
+ : nullptr; // optional
auto cell_to_output_weights_tensor =
- has_cell_to_output_weights ? _tensor_reg->getPortableTensor(cell_to_output_weights_index)
- : nullptr; // optional
+ has_cell_to_output_weights ? _tensor_reg->getPortableTensor(cell_to_output_weights_index)
+ : nullptr; // optional
auto input_gate_bias_tensor =
- has_input_gate_bias ? _tensor_reg->getPortableTensor(input_gate_bias_index) : nullptr;
+ has_input_gate_bias ? _tensor_reg->getPortableTensor(input_gate_bias_index) : nullptr;
auto forget_gate_bias_tensor = _tensor_reg->getPortableTensor(forget_gate_bias_index);
auto cell_gate_bias_tensor = _tensor_reg->getPortableTensor(cell_gate_bias_index);
auto output_gate_bias_tensor = _tensor_reg->getPortableTensor(output_gate_bias_index);
@@ -1462,11 +1466,11 @@ void KernelGenerator::visit(const ir::operation::LSTM &node)
auto cell_state_in_tensor = _tensor_reg->getPortableTensor(cell_state_in_index);
auto projection_weights_tensor = has_projection_weights
- ? _tensor_reg->getPortableTensor(projection_weights_index)
- : nullptr; // optional
+ ? _tensor_reg->getPortableTensor(projection_weights_index)
+ : nullptr; // optional
auto projection_bias_tensor = has_projection_bias
- ? _tensor_reg->getPortableTensor(projection_bias_index)
- : nullptr; // optional
+ ? _tensor_reg->getPortableTensor(projection_bias_index)
+ : nullptr; // optional
IPortableTensor *input_layer_norm_weights_tensor = nullptr;
IPortableTensor *forget_layer_norm_weights_tensor = nullptr;
@@ -1475,45 +1479,45 @@ void KernelGenerator::visit(const ir::operation::LSTM &node)
if (node.getInputs().size() == 24)
{
const auto input_layer_norm_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_LAYER_NORMALIZATION_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_LAYER_NORMALIZATION_WEIGHTS)};
const auto forget_layer_norm_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::FORGET_LAYER_NORMALIZATION_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::FORGET_LAYER_NORMALIZATION_WEIGHTS)};
const auto cell_layer_norm_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_LAYER_NORMALIZATION_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_LAYER_NORMALIZATION_WEIGHTS)};
const auto output_layer_norm_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_LAYER_NORMALIZATION_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_LAYER_NORMALIZATION_WEIGHTS)};
input_layer_norm_weights_tensor =
- _tensor_reg->getPortableTensor(input_layer_norm_weights_index);
+ _tensor_reg->getPortableTensor(input_layer_norm_weights_index);
forget_layer_norm_weights_tensor =
- _tensor_reg->getPortableTensor(forget_layer_norm_weights_index);
+ _tensor_reg->getPortableTensor(forget_layer_norm_weights_index);
cell_layer_norm_weights_tensor = _tensor_reg->getPortableTensor(cell_layer_norm_weights_index);
output_layer_norm_weights_tensor =
- _tensor_reg->getPortableTensor(output_layer_norm_weights_index);
+ _tensor_reg->getPortableTensor(output_layer_norm_weights_index);
}
auto fn = std::make_unique<ops::LSTMLayer>();
fn->configure(
- input_tensor, input_to_input_weights_tensor, input_to_forget_weights_tensor,
- input_to_cell_weights_tensor, input_to_output_weights_tensor,
- recurrent_to_input_weights_tensor, recurrent_to_forget_weights_tensor,
- recurrent_to_cell_weights_tensor, recurrent_to_output_weights_tensor,
- cell_to_input_weights_tensor, cell_to_forget_weights_tensor, cell_to_output_weights_tensor,
- input_layer_norm_weights_tensor, forget_layer_norm_weights_tensor,
- cell_layer_norm_weights_tensor, output_layer_norm_weights_tensor,
- /*aux_input=*/nullptr,
- /*aux_input_to_input_weights=*/nullptr,
- /*aux_input_to_forget_weights=*/nullptr,
- /*aux_input_to_cell_weights=*/nullptr,
- /*aux_input_to_output_weights=*/nullptr, input_gate_bias_tensor, forget_gate_bias_tensor,
- cell_gate_bias_tensor, output_gate_bias_tensor, projection_weights_tensor,
- projection_bias_tensor, output_state_in_tensor, cell_state_in_tensor, node.param(),
- /*forward_sequence=*/true, time_major,
- /*output_offset=*/0, scratch_buffer_tensor, output_state_out_tensor, cell_state_out_tensor,
- output_tensor,
- !_ctx.at(output_state_in_index).info().isVariable() /* means empty buffer on frontend now */,
- !_ctx.at(cell_state_in_index).info().isVariable());
+ input_tensor, input_to_input_weights_tensor, input_to_forget_weights_tensor,
+ input_to_cell_weights_tensor, input_to_output_weights_tensor, recurrent_to_input_weights_tensor,
+ recurrent_to_forget_weights_tensor, recurrent_to_cell_weights_tensor,
+ recurrent_to_output_weights_tensor, cell_to_input_weights_tensor, cell_to_forget_weights_tensor,
+ cell_to_output_weights_tensor, input_layer_norm_weights_tensor,
+ forget_layer_norm_weights_tensor, cell_layer_norm_weights_tensor,
+ output_layer_norm_weights_tensor,
+ /*aux_input=*/nullptr,
+ /*aux_input_to_input_weights=*/nullptr,
+ /*aux_input_to_forget_weights=*/nullptr,
+ /*aux_input_to_cell_weights=*/nullptr,
+ /*aux_input_to_output_weights=*/nullptr, input_gate_bias_tensor, forget_gate_bias_tensor,
+ cell_gate_bias_tensor, output_gate_bias_tensor, projection_weights_tensor,
+ projection_bias_tensor, output_state_in_tensor, cell_state_in_tensor, node.param(),
+ /*forward_sequence=*/true, time_major,
+ /*output_offset=*/0, scratch_buffer_tensor, output_state_out_tensor, cell_state_out_tensor,
+ output_tensor,
+ !_ctx.at(output_state_in_index).info().isVariable() /* means empty buffer on frontend now */,
+ !_ctx.at(cell_state_in_index).info().isVariable());
_return_fn = std::move(fn);
}
diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h
index 3a4cfbffa..d452d0ba6 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.h
+++ b/runtime/onert/backend/cpu/KernelGenerator.h
@@ -19,11 +19,11 @@
#include "ExternalContext.h"
#include "TensorBuilder.h"
-#include "backend/cpu_common/TensorRegistry.h"
+#include "backend/basic/TensorRegistry.h"
#include "Tensor.h"
#include <backend/CustomKernelBuilder.h>
-#include <backend/cpu_common/KernelGeneratorBase.h>
+#include <backend/basic/KernelGeneratorBase.h>
#include <ir/Operands.h>
#include <ir/Operations.h>
@@ -34,16 +34,15 @@ namespace backend
namespace cpu
{
-class KernelGenerator : public cpu_common::KernelGeneratorBase
+class KernelGenerator : public basic::KernelGeneratorBase
{
public:
- KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg,
+ KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<basic::TensorRegistry> &tensor_reg,
const std::shared_ptr<custom::IKernelBuilder> &kernel_builder,
const std::shared_ptr<ExternalContext> &external_context);
- void visit(const ir::OpSequence &) override;
+ std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex op_ind) override;
void visit(const ir::operation::AddN &) override;
void visit(const ir::operation::ArgMinMax &) override;
@@ -100,10 +99,10 @@ public:
private:
const ir::Operands &_ctx;
const ir::Operations &_operations_ctx;
+ ir::Layout _current_layout;
std::shared_ptr<TensorBuilder> _tensor_builder;
- std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
+ std::shared_ptr<basic::TensorRegistry> _tensor_reg;
std::shared_ptr<backend::custom::IKernelBuilder> _kernel_builder;
- ir::Layout _current_layout;
const std::shared_ptr<ExternalContext> _external_context;
};
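The hunk above replaces the per-OpSequence visit() entry point with a per-operation generate() that returns an exec::FunctionSequence. The body is not part of this diff; a hypothetical sketch of the assumed contract, using only members visible above (_operations_ctx, _return_fn) plus an assumed Operation::accept() dispatch:

    // Hypothetical sketch; the real implementation lives in KernelGenerator.cc.
    std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
    {
      auto ret = std::make_unique<exec::FunctionSequence>();
      _operations_ctx.at(ind).accept(*this); // dispatch to the matching visit() overload
      assert(_return_fn);                    // each visit() leaves its kernel in _return_fn
      ret->append(std::move(_return_fn));
      return ret;
    }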
diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h
index d07f0c814..bcbb569ea 100644
--- a/runtime/onert/backend/cpu/StaticTensorManager.h
+++ b/runtime/onert/backend/cpu/StaticTensorManager.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
-#include "backend/cpu_common/StaticTensorManager.h"
+#include "backend/basic/StaticTensorManager.h"
namespace onert
{
@@ -26,7 +26,7 @@ namespace backend
namespace cpu
{
-using StaticTensorManager = cpu_common::StaticTensorManager;
+using StaticTensorManager = basic::StaticTensorManager;
} // namespace cpu
} // namespace backend
diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h
index d663c3f50..f42d3d068 100644
--- a/runtime/onert/backend/cpu/Tensor.h
+++ b/runtime/onert/backend/cpu/Tensor.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_BACKEND_CPU_TENSOR_H__
#define __ONERT_BACKEND_CPU_TENSOR_H__
-#include <backend/cpu_common/Tensor.h>
+#include <backend/basic/Tensor.h>
#include <ir/Data.h>
namespace onert
@@ -27,8 +27,8 @@ namespace backend
namespace cpu
{
-using Tensor = cpu_common::Tensor;
-using ExternalTensor = cpu_common::ExternalTensor;
+using Tensor = basic::Tensor;
+using ExternalTensor = basic::ExternalTensor;
} // namespace cpu
} // namespace backend
diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc
deleted file mode 100644
index e6bc55b0b..000000000
--- a/runtime/onert/backend/cpu/TensorBuilder.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "TensorBuilder.h"
-
-#include <util/logging.h>
-
-#include <cassert>
-
-namespace onert
-{
-namespace backend
-{
-namespace cpu
-{
-
-TensorBuilder::TensorBuilder(const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg)
- : _tensor_reg{tensor_reg},
- _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)},
- _static_tensor_mgr{new StaticTensorManager(_tensor_reg, _dynamic_tensor_mgr.get())}
-{
- /* empty */
-}
-
-void TensorBuilder::registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
- ir::Layout layout)
-{
- _tensor_info_map.emplace(ind, info);
-
- // CPU backend supports only one layout as NHWC
- assert(layout == ir::Layout::NHWC);
- if (info.isDynamic())
- {
- _dynamic_tensor_mgr->buildTensor(ind, info, layout);
- }
- else
- {
- _static_tensor_mgr->buildTensor(ind, info, layout, info.isConstant());
- }
-}
-
-void TensorBuilder::notifyFirstUse(const ir::OperandIndex &ind)
-{
- assert(_tensor_info_map.find(ind) != _tensor_info_map.end());
- const auto tensor_info = _tensor_info_map.at(ind);
-
- if (!_tensor_reg->getNativeTensor(ind)->is_dynamic())
- {
- const auto size = tensor_info.total_size();
- _static_tensor_mgr->claimPlan(ind, size);
- }
-}
-
-void TensorBuilder::notifyLastUse(const ir::OperandIndex &ind)
-{
- if (!_tensor_reg->getNativeTensor(ind)->is_dynamic())
- {
- _static_tensor_mgr->releasePlan(ind);
- }
-}
-
-bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const
-{
- return _tensor_info_map.find(ind) != _tensor_info_map.end();
-}
-
-void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); }
-
-void TensorBuilder::allocate()
-{
- // NOTE For now nothing to do. Allocation is done in prepare stage, which is not appropriate
- // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation.
-}
-
-} // namespace cpu
-} // namespace backend
-} // namespace onert
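The deleted builder implemented liveness-based static memory planning: notifyFirstUse() claims a block of total_size() bytes, notifyLastUse() releases it so later tensors can reuse the block, and prepare() materializes the plan before kernel generation. Since the call sites are unchanged, basic::TensorBuilder presumably keeps the same protocol; a hypothetical call sequence under that assumption:

    // Planning happens per operand, in liveness order, before any allocation.
    builder.registerTensorInfo(ind, info, ir::Layout::NHWC); // static or dynamic
    builder.notifyFirstUse(ind); // static case: claimPlan(ind, info.total_size())
    builder.notifyLastUse(ind);  // releasePlan(ind): the block becomes reusable
    builder.prepare();           // allocateNonconsts() backs the plan with memory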
diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h
index 9d8a5deb5..a7a410f17 100644
--- a/runtime/onert/backend/cpu/TensorBuilder.h
+++ b/runtime/onert/backend/cpu/TensorBuilder.h
@@ -17,15 +17,7 @@
#ifndef __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__
#define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__
-#include <backend/cpu_common/DynamicTensorManager.h>
-#include <backend/cpu_common/TensorRegistry.h>
-
-#include <ir/OperandIndexMap.h>
-
-#include "StaticTensorManager.h"
-#include "Tensor.h"
-
-#include <unordered_map>
+#include <backend/basic/TensorBuilder.h>
namespace onert
{
@@ -34,37 +26,7 @@ namespace backend
namespace cpu
{
-class TensorBuilder
-{
-public:
- TensorBuilder(const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg);
-
- /**
- * @brief Register tensor information to allocate on CPU backend
- * @param[in] ind Operand index
- * @param[in] info Operand information
- * @param[in] layout Operand data layout
- */
- void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
- ir::Layout backend_layout);
-
- void notifyFirstUse(const ir::OperandIndex &);
- void notifyLastUse(const ir::OperandIndex &);
-
- bool isRegistered(const ir::OperandIndex &) const;
-
- void prepare(void);
- void allocate();
- void postFunctionPrepare() { /* DO NOTHING */}
-
- IDynamicTensorManager *dynamicTensorManager(void) { return _dynamic_tensor_mgr.get(); }
-
-private:
- const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
- std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr;
- std::unique_ptr<StaticTensorManager> _static_tensor_mgr;
- ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
-};
+using TensorBuilder = basic::TensorBuilder;
} // namespace cpu
} // namespace backend
diff --git a/runtime/onert/backend/cpu/ops/AddNLayer.cc b/runtime/onert/backend/cpu/ops/AddNLayer.cc
index 5c0395dcc..967991295 100644
--- a/runtime/onert/backend/cpu/ops/AddNLayer.cc
+++ b/runtime/onert/backend/cpu/ops/AddNLayer.cc
@@ -44,20 +44,18 @@ void AddNLayer::run()
std::vector<const int32_t *> input_buffers(input_size);
for (size_t i = 0; i < input_size; i++)
{
- input_buffers[i] = reinterpret_cast<int32_t *>(_inputs[i]->buffer());
+ input_buffers[i] = getBuffer<int32_t>(_inputs[i]);
}
- AddN(getTensorShape(_inputs[0]), input_size, input_buffers.data(),
- reinterpret_cast<int32_t *>(_output->buffer()));
+ AddN(getShape(_inputs[0]), input_size, input_buffers.data(), getBuffer<int32_t>(_output));
}
else if (_output->data_type() == ir::DataType::FLOAT32)
{
std::vector<const float *> input_buffers(input_size);
for (size_t i = 0; i < input_size; i++)
{
- input_buffers[i] = reinterpret_cast<float *>(_inputs[i]->buffer());
+ input_buffers[i] = getBuffer<float>(_inputs[i]);
}
- AddN(getTensorShape(_inputs[0]), input_size, input_buffers.data(),
- reinterpret_cast<float *>(_output->buffer()));
+ AddN(getShape(_inputs[0]), input_size, input_buffers.data(), getBuffer<float>(_output));
}
else
{
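This file shows the mechanical change repeated across the kernel files below: getTensorShape()/reinterpret_cast pairs become getShape()/getBuffer<T>() calls. The helpers themselves are not in this diff; judging from the call sites, getBuffer<T> is presumably a thin typed wrapper such as:

    // Assumed shape of the helper (likely defined in OperationUtils.h).
    template <typename T> const T *getBuffer(const IPortableTensor *tensor)
    {
      return reinterpret_cast<const T *>(tensor->buffer());
    }

    template <typename T> T *getBuffer(IPortableTensor *tensor)
    {
      return reinterpret_cast<T *>(tensor->buffer());
    }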
diff --git a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc
index d5ffdef0b..a1b8bfce3 100644
--- a/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ArgMinMaxLayer.cc
@@ -42,7 +42,7 @@ template <typename T> std::function<bool(T, T)> GetComparefunction(bool is_arg_m
return std::less<T>();
}
}
-}
+} // namespace
void ArgMinMaxLayer::configure(const IPortableTensor *input, IPortableTensor *output,
const IPortableTensor *axis, bool is_arg_max)
@@ -59,15 +59,14 @@ void ArgMinMaxLayer::run()
{
throw std::runtime_error("ArgMinMax: wrong shape of axis");
}
- auto axis = *reinterpret_cast<const int32_t *>(_axis->buffer());
+ auto axis = *getBuffer<int32_t>(_axis);
if (axis < 0)
{
- axis += _input->num_dimensions();
+ axis += _input->getShape().rank();
}
-#define TF_LITE_ARG_MIN_MAX(input_type, axis_type, output_type) \
- ArgMinMax(getTensorShape(_input), reinterpret_cast<const input_type *>(_input->buffer()), \
- getTensorShape(_output), reinterpret_cast<output_type *>(_output->buffer()), axis, \
- GetComparefunction<input_type>(_is_arg_max));
+#define TF_LITE_ARG_MIN_MAX(input_type, axis_type, output_type) \
+ ArgMinMax(getShape(_input), getBuffer<input_type>(_input), getShape(_output), \
+ getBuffer<output_type>(_output), axis, GetComparefunction<input_type>(_is_arg_max));
if (_output->data_type() == ir::DataType::INT32)
{
switch (_input->data_type())
diff --git a/runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc b/runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc
index ba9655924..3b08fd5b1 100644
--- a/runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc
+++ b/runtime/onert/backend/cpu/ops/BatchMatMulLayer.cc
@@ -28,8 +28,8 @@ namespace ops
{
BatchMatMulLayer::BatchMatMulLayer()
- : _lhs(nullptr), _rhs(nullptr), _output(nullptr), _adj_x(false), _adj_y(false),
- _kernel(new nnfw::cker::BatchMatMul())
+ : _lhs(nullptr), _rhs(nullptr), _output(nullptr), _adj_x(false), _adj_y(false),
+ _kernel(new nnfw::cker::BatchMatMul())
{
// DO NOTHING
}
@@ -39,16 +39,15 @@ BatchMatMulLayer::~BatchMatMulLayer() = default;
void BatchMatMulLayer::batchMatMulFloat32()
{
nnfw::cker::BatchMatMul &batchmatmul_kernel = *_kernel;
- nnfw::cker::Shape lhs_shape = getTensorShape(_lhs);
- nnfw::cker::Shape rhs_shape = getTensorShape(_rhs);
- nnfw::cker::Shape output_shape = getTensorShape(_output);
+ nnfw::cker::Shape lhs_shape = getShape(_lhs);
+ nnfw::cker::Shape rhs_shape = getShape(_rhs);
+ nnfw::cker::Shape output_shape = getShape(_output);
// TODO implement for constant input
batchmatmul_kernel.prepare(lhs_shape, rhs_shape, _adj_x, _adj_y);
- batchmatmul_kernel(lhs_shape, reinterpret_cast<const float *>(_lhs->buffer()), rhs_shape,
- reinterpret_cast<const float *>(_rhs->buffer()), _adj_x, _adj_y, output_shape,
- reinterpret_cast<float *>(_output->buffer()));
+ batchmatmul_kernel(lhs_shape, getBuffer<float>(_lhs), rhs_shape, getBuffer<float>(_rhs), _adj_x,
+ _adj_y, output_shape, getBuffer<float>(_output));
}
void BatchMatMulLayer::configure(const IPortableTensor *lhs, const IPortableTensor *rhs, bool adj_x,
diff --git a/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc
index f2f10eb9d..2609481fb 100644
--- a/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc
+++ b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc
@@ -28,7 +28,7 @@ namespace ops
{
BatchToSpaceNDLayer::BatchToSpaceNDLayer()
- : _input(nullptr), _output(nullptr), _block_shape(nullptr), _crops(nullptr)
+ : _input(nullptr), _output(nullptr), _block_shape(nullptr), _crops(nullptr)
{
// DO NOTHING
}
@@ -44,12 +44,11 @@ template <typename T> void BatchToSpaceNDLayer::batchToSpaceNDGeneric()
}
else
{
- _crops_buffer = reinterpret_cast<const int32_t *>(_crops->buffer());
+ _crops_buffer = getBuffer<int32_t>(_crops);
}
- nnfw::cker::BatchToSpaceND<T>(
- getTensorShape(_input), reinterpret_cast<const T *>(_input->buffer()),
- reinterpret_cast<const int32_t *>(_block_shape->buffer()), _crops_buffer,
- getTensorShape(_output), reinterpret_cast<T *>(_output->buffer()));
+ nnfw::cker::BatchToSpaceND<T>(getShape(_input), getBuffer<T>(_input),
+ getBuffer<int32_t>(_block_shape), _crops_buffer, getShape(_output),
+ getBuffer<T>(_output));
}
void BatchToSpaceNDLayer::configure(const IPortableTensor *input, IPortableTensor *output,
diff --git a/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc b/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc
index ff1126932..e0d5a3ccb 100644
--- a/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc
+++ b/runtime/onert/backend/cpu/ops/BinaryArithmeticLayer.cc
@@ -40,7 +40,7 @@ template <nnfw::cker::BinaryArithmeticOpType arithmetic_type, typename T> struct
Eval(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output,
nnfw::cker::BinaryArithmeticOpParam op_params)
- : _op_params(std::move(op_params)), _need_broadcast(false)
+ : _op_params(std::move(op_params)), _need_broadcast(false)
{
if (!output->is_dynamic())
updateCache(lhs, rhs, output);
@@ -48,9 +48,9 @@ template <nnfw::cker::BinaryArithmeticOpType arithmetic_type, typename T> struct
void updateCache(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output)
{
- _lhs_shape.ReplaceWith(getTensorShape(lhs));
- _rhs_shape.ReplaceWith(getTensorShape(rhs));
- _output_shape.ReplaceWith(getTensorShape(output));
+ _lhs_shape.ReplaceWith(getShape(lhs));
+ _rhs_shape.ReplaceWith(getShape(rhs));
+ _output_shape.ReplaceWith(getShape(output));
_need_broadcast = nnfw::cker::ProcessBroadcastShapes(_lhs_shape, _rhs_shape, &_op_params);
}
@@ -61,20 +61,20 @@ template <nnfw::cker::BinaryArithmeticOpType arithmetic_type, typename T> struct
if (output->is_dynamic())
updateCache(lhs, rhs, output);
else
- assert(_lhs_shape == getTensorShape(lhs) && _rhs_shape == getTensorShape(rhs) &&
- _output_shape == getTensorShape(output));
- auto lhs_buffer = reinterpret_cast<const T *>(lhs->buffer());
- auto rhs_buffer = reinterpret_cast<const T *>(rhs->buffer());
- auto output_buffer = reinterpret_cast<T *>(output->buffer());
+ assert(_lhs_shape == getShape(lhs) && _rhs_shape == getShape(rhs) &&
+ _output_shape == getShape(output));
+ auto lhs_buffer = getBuffer<T>(lhs);
+ auto rhs_buffer = getBuffer<T>(rhs);
+ auto output_buffer = getBuffer<T>(output);
if (_need_broadcast)
{
nnfw::cker::BroadcastBinaryArithmeticOp<arithmetic_type>(
- _op_params, _lhs_shape, lhs_buffer, _rhs_shape, rhs_buffer, _output_shape, output_buffer);
+ _op_params, _lhs_shape, lhs_buffer, _rhs_shape, rhs_buffer, _output_shape, output_buffer);
}
else
{
nnfw::cker::BinaryArithmeticOp<arithmetic_type>(
- _op_params, _lhs_shape, lhs_buffer, _rhs_shape, rhs_buffer, _output_shape, output_buffer);
+ _op_params, _lhs_shape, lhs_buffer, _rhs_shape, rhs_buffer, _output_shape, output_buffer);
}
}
};
@@ -115,19 +115,17 @@ void setAddOrSubQuant8Params(const IPortableTensor *lhs, const IPortableTensor *
nnfw::cker::BinaryArithmeticOpParam *params)
{
int32_t output_activation_min, output_activation_max;
- CalculateActivationRangeUint8(activation, output, &output_activation_min, &output_activation_max);
+ CalculateActivationRangeQuantized(activation, output, &output_activation_min,
+ &output_activation_max);
nnfw::cker::BinaryArithmeticOpParam &op_params = *params;
op_params.quantized_activation_max = output_activation_max;
op_params.quantized_activation_min = output_activation_min;
// Parameters for scaled quantized computation
op_params.left_shift = 20;
// Zero-points of input and output tensors
- op_params.input1_offset = -lhs->data_offset();
- op_params.input2_offset = -rhs->data_offset();
- op_params.output_offset = output->data_offset();
- assert((op_params.input1_offset <= 0) && (op_params.input1_offset >= -255));
- assert((op_params.input2_offset <= 0) && (op_params.input2_offset >= -255));
- assert((op_params.output_offset >= 0) && (op_params.output_offset <= 255));
+ op_params.input1_offset = -lhs->data_zero_point();
+ op_params.input2_offset = -rhs->data_zero_point();
+ op_params.output_offset = output->data_zero_point();
// Compute normalized scale for _lhs and _rhs values,
// and represent in 32-bit fixed point
@@ -136,7 +134,7 @@ void setAddOrSubQuant8Params(const IPortableTensor *lhs, const IPortableTensor *
const double real_rhs_scale = rhs->data_scale() / norm_max_scale;
// output scale is used to normalize final result, so we invert the scale here
const double real_output_scale =
- norm_max_scale / (output->data_scale() * (1 << op_params.left_shift));
+ norm_max_scale / (output->data_scale() * (1 << op_params.left_shift));
// Represent the scales as fixed int32_t multipliers, and int32_t shifts
QuantizeMultiplier(real_lhs_scale, &op_params.input1_multiplier, &op_params.input1_shift);
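The arithmetic above rescales both inputs against norm_max_scale (twice the larger input scale) so their sum fits the fixed-point range, pre-shifts by 1 << 20, and folds that shift back out of the output scale. QuantizeMultiplier then turns each real scale into a Q31 multiplier plus a power-of-two shift; a sketch of the standard decomposition (shown for illustration, not the verbatim nnfw routine):

    #include <cmath>
    #include <cstdint>

    // Decompose real ~= multiplier * 2^(shift - 31), with multiplier in Q31.
    void quantizeMultiplierSketch(double real, int32_t *multiplier, int *shift)
    {
      if (real == 0.0)
      {
        *multiplier = 0;
        *shift = 0;
        return;
      }
      const double q = std::frexp(real, shift); // mantissa in [0.5, 1)
      auto q_fixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
      if (q_fixed == (1ll << 31)) // rounding overflowed the mantissa
      {
        q_fixed /= 2;
        ++*shift;
      }
      *multiplier = static_cast<int32_t>(q_fixed);
    }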
@@ -149,14 +147,15 @@ void setMulQuant8Params(const IPortableTensor *lhs, const IPortableTensor *rhs,
nnfw::cker::BinaryArithmeticOpParam *params)
{
int32_t output_activation_min, output_activation_max;
- CalculateActivationRangeUint8(activation, output, &output_activation_min, &output_activation_max);
+ CalculateActivationRangeQuantized(activation, output, &output_activation_min,
+ &output_activation_max);
nnfw::cker::BinaryArithmeticOpParam &op_params = *params;
op_params.quantized_activation_max = output_activation_max;
op_params.quantized_activation_min = output_activation_min;
- op_params.input1_offset = -lhs->data_offset();
- op_params.input2_offset = -rhs->data_offset();
- op_params.output_offset = output->data_offset();
+ op_params.input1_offset = -lhs->data_zero_point();
+ op_params.input2_offset = -rhs->data_zero_point();
+ op_params.output_offset = output->data_zero_point();
double real_multiplier = lhs->data_scale() * rhs->data_scale() / output->data_scale();
QuantizeMultiplier(real_multiplier, &op_params.output_multiplier, &op_params.output_shift);
@@ -184,12 +183,19 @@ void BinaryArithmeticLayer::configure(const IPortableTensor *lhs, const IPortabl
{
setAddOrSubQuant8Params(_lhs, _rhs, _output, activation, &op_params);
_kernel =
- Eval<nnfw::cker::BinaryArithmeticOpType::ADD, uint8_t>(_lhs, _rhs, _output, op_params);
+ Eval<nnfw::cker::BinaryArithmeticOpType::ADD, uint8_t>(_lhs, _rhs, _output, op_params);
}
+ else if (_lhs->data_type() == OperandType::QUANT_INT8_ASYMM)
+ {
+ setAddOrSubQuant8Params(_lhs, _rhs, _output, activation, &op_params);
+ _kernel =
+ Eval<nnfw::cker::BinaryArithmeticOpType::ADD, int8_t>(_lhs, _rhs, _output, op_params);
+ }
+
else
{
_kernel = generateKernelGeneric<nnfw::cker::BinaryArithmeticOpType::ADD>(
- _lhs, _rhs, _output, activation, op_params);
+ _lhs, _rhs, _output, activation, op_params);
}
break;
case ArithmeticType::kSub:
@@ -198,12 +204,20 @@ void BinaryArithmeticLayer::configure(const IPortableTensor *lhs, const IPortabl
setAddOrSubQuant8Params(_lhs, _rhs, _output, activation, &op_params);
op_params.input2_multiplier *= -1;
_kernel =
- Eval<nnfw::cker::BinaryArithmeticOpType::SUB, uint8_t>(_lhs, _rhs, _output, op_params);
+ Eval<nnfw::cker::BinaryArithmeticOpType::SUB, uint8_t>(_lhs, _rhs, _output, op_params);
+ }
+ else if (_lhs->data_type() == OperandType::QUANT_INT8_ASYMM)
+ {
+ setAddOrSubQuant8Params(_lhs, _rhs, _output, activation, &op_params);
+ op_params.input2_multiplier *= -1;
+ _kernel =
+ Eval<nnfw::cker::BinaryArithmeticOpType::SUB, int8_t>(_lhs, _rhs, _output, op_params);
}
+
else
{
_kernel = generateKernelGeneric<nnfw::cker::BinaryArithmeticOpType::SUB>(
- _lhs, _rhs, _output, activation, op_params);
+ _lhs, _rhs, _output, activation, op_params);
}
break;
case ArithmeticType::kMul:
@@ -212,19 +226,26 @@ void BinaryArithmeticLayer::configure(const IPortableTensor *lhs, const IPortabl
nnfw::cker::BinaryArithmeticOpParam op_params;
setMulQuant8Params(_lhs, _rhs, _output, activation, &op_params);
_kernel =
- Eval<nnfw::cker::BinaryArithmeticOpType::MUL, uint8_t>(_lhs, _rhs, _output, op_params);
+ Eval<nnfw::cker::BinaryArithmeticOpType::MUL, uint8_t>(_lhs, _rhs, _output, op_params);
+ }
+ else if (_lhs->data_type() == OperandType::QUANT_INT8_ASYMM)
+ {
+ nnfw::cker::BinaryArithmeticOpParam op_params;
+ setMulQuant8Params(_lhs, _rhs, _output, activation, &op_params);
+ _kernel =
+ Eval<nnfw::cker::BinaryArithmeticOpType::MUL, int8_t>(_lhs, _rhs, _output, op_params);
}
else
{
_kernel = generateKernelGeneric<nnfw::cker::BinaryArithmeticOpType::MUL>(
- _lhs, _rhs, _output, activation, op_params);
+ _lhs, _rhs, _output, activation, op_params);
}
break;
case ArithmeticType::kDiv:
if (_lhs->data_type() == OperandType::QUANT_UINT8_ASYMM)
{
throw std::runtime_error{
- "BinaryArithmetic(Div): Div operation does not support quantization"};
+ "BinaryArithmetic(Div): Div operation does not support quantization"};
}
else if (_lhs->data_type() == OperandType::INT32)
{
@@ -233,7 +254,7 @@ void BinaryArithmeticLayer::configure(const IPortableTensor *lhs, const IPortabl
else
{
_kernel = generateKernelGeneric<nnfw::cker::BinaryArithmeticOpType::DIV>(
- _lhs, _rhs, _output, activation, op_params);
+ _lhs, _rhs, _output, activation, op_params);
}
break;
default:
diff --git a/runtime/onert/backend/cpu/ops/BroadcastToLayer.cc b/runtime/onert/backend/cpu/ops/BroadcastToLayer.cc
index d9c1bbfc5..d31b814bb 100644
--- a/runtime/onert/backend/cpu/ops/BroadcastToLayer.cc
+++ b/runtime/onert/backend/cpu/ops/BroadcastToLayer.cc
@@ -49,19 +49,18 @@ void BroadcastToLayer::run()
{
// TODO: Support INT8 and UINT8 as well once quantization is applied.
case OperandType::FLOAT32:
- nnfw::cker::BroadcastTo<float>(
- getTensorShape(_input), reinterpret_cast<float *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::BroadcastTo<float>(getShape(_input), reinterpret_cast<float *>(_input->buffer()),
+ getShape(_output), getBuffer<float>(_output));
break;
case OperandType::INT32:
- nnfw::cker::BroadcastTo<int32_t>(
- getTensorShape(_input), reinterpret_cast<int32_t *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<int32_t *>(_output->buffer()));
+ nnfw::cker::BroadcastTo<int32_t>(getShape(_input),
+ reinterpret_cast<int32_t *>(_input->buffer()),
+ getShape(_output), getBuffer<int32_t>(_output));
break;
case OperandType::UINT32:
- nnfw::cker::BroadcastTo<uint32_t>(
- getTensorShape(_input), reinterpret_cast<uint32_t *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<uint32_t *>(_output->buffer()));
+ nnfw::cker::BroadcastTo<uint32_t>(getShape(_input),
+ reinterpret_cast<uint32_t *>(_input->buffer()),
+ getShape(_output), getBuffer<uint32_t>(_output));
break;
default:
throw std::runtime_error{"BroadcastToLayer: unsupported data type"};
diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc
index adf902aaf..b621952cc 100644
--- a/runtime/onert/backend/cpu/ops/CompareLayer.cc
+++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc
@@ -49,10 +49,10 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
{
nnfw::cker::ComparisonParams params;
params.left_shift = 8;
- params.input1_offset = -lhs->data_offset();
- params.input2_offset = -rhs->data_offset();
+ params.input1_offset = -lhs->data_zero_point();
+ params.input2_offset = -rhs->data_zero_point();
const double norm_max_scale =
- 2 * std::max(std::abs(lhs->data_scale()), std::abs(rhs->data_scale()));
+ 2 * std::max(std::abs(lhs->data_scale()), std::abs(rhs->data_scale()));
const double adjusted_lhs_scale = lhs->data_scale() / norm_max_scale;
const double adjusted_rhs_scale = rhs->data_scale() / norm_max_scale;
QuantizeMultiplierSmallerThanOneExp(adjusted_lhs_scale, &params.input1_multiplier,
@@ -61,19 +61,18 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
&params.input2_shift);
params.is_broadcast = !HaveSameShapes(lhs, rhs);
- using CompareFunction =
- void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data,
- const Shape &input2_shape, const T *input2_data, const Shape &output_shape,
- bool *output_data);
+ using CompareFunction = void (*)(
+ ComparisonParams & params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape, bool *output_data);
static const CompareFunction broadcast_fns[] = {
- Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling,
- Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling,
- Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling,
+ Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling,
+ Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling,
+ Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling,
};
static const CompareFunction non_broadcast_fns[] = {
- EqualWithScaling, NotEqualWithScaling, GreaterWithScaling,
- GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling,
+ EqualWithScaling, NotEqualWithScaling, GreaterWithScaling,
+ GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling,
};
static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns),
@@ -85,9 +84,8 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
CompareFunction fn = (params.is_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]);
- fn(params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+ fn(params, getExtendedTensorShape(lhs), getBuffer<T>(lhs), getExtendedTensorShape(rhs),
+ getBuffer<T>(rhs), getExtendedTensorShape(output), getBuffer<bool>(output));
}
template <typename T>
@@ -97,16 +95,16 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
bool requires_broadcast = !HaveSameShapes(lhs, rhs);
using CompareFunction =
- void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
- const T *input2_data, const Shape &output_shape, bool *output_data);
+ void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, bool *output_data);
static const CompareFunction broadcast_fns[] = {
- Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater,
- Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual,
+ Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater,
+ Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual,
};
static const CompareFunction non_broadcast_fns[] = {
- EqualNoScaling, NotEqualNoScaling, GreaterNoScaling,
- GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling,
+ EqualNoScaling, NotEqualNoScaling, GreaterNoScaling,
+ GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling,
};
static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns),
@@ -118,16 +116,15 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]);
- fn(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+ fn(getExtendedTensorShape(lhs), getBuffer<T>(lhs), getExtendedTensorShape(rhs), getBuffer<T>(rhs),
+ getExtendedTensorShape(output), getBuffer<bool>(output));
}
} // namespace
CompareLayer::CompareLayer()
- : _lhs(nullptr), _rhs(nullptr), _output(nullptr),
- _op_type(ir::operation::Comparison::ComparisonType::Equal)
+ : _lhs(nullptr), _rhs(nullptr), _output(nullptr),
+ _op_type(ir::operation::Comparison::ComparisonType::Equal)
{
// DO NOTHING
}
diff --git a/runtime/onert/backend/cpu/ops/ConcatLayer.cc b/runtime/onert/backend/cpu/ops/ConcatLayer.cc
index edfdfc1a6..5d48b0e7f 100644
--- a/runtime/onert/backend/cpu/ops/ConcatLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ConcatLayer.cc
@@ -49,7 +49,7 @@ template <typename T> void ConcatLayer::concatenationGeneral()
for (uint32_t i = 0; i < num_inputs; i++)
{
- inputDims.push_back(getTensorShape(_inputs[i]));
+ inputDims.push_back(getShape(_inputs[i]));
inputDimsPtr.push_back(&inputDims[i]);
}
@@ -57,11 +57,11 @@ template <typename T> void ConcatLayer::concatenationGeneral()
for (const auto input : _inputs)
{
- inputDataPtrs.emplace_back(reinterpret_cast<const T *>(input->buffer()));
+ inputDataPtrs.emplace_back(getBuffer<T>(input));
}
nnfw::cker::Concatenation<T>(op_params, inputDimsPtr.data(), inputDataPtrs.data(),
- getTensorShape(_output), reinterpret_cast<T *>(_output->buffer()));
+ getShape(_output), getBuffer<T>(_output));
}
void ConcatLayer::concatenationQuant8()
{
@@ -71,7 +71,7 @@ void ConcatLayer::concatenationQuant8()
std::vector<float> input_scales(num_inputs);
for (uint32_t i = 0; i < num_inputs; i++)
{
- input_zeropoints[i] = _inputs[i]->data_offset();
+ input_zeropoints[i] = _inputs[i]->data_zero_point();
input_scales[i] = _inputs[i]->data_scale();
}
@@ -80,7 +80,7 @@ void ConcatLayer::concatenationQuant8()
op_params.inputs_count = num_inputs;
op_params.input_zeropoint = input_zeropoints.data();
op_params.input_scale = input_scales.data();
- op_params.output_zeropoint = _output->data_offset();
+ op_params.output_zeropoint = _output->data_zero_point();
op_params.output_scale = _output->data_scale();
std::vector<nnfw::cker::Shape *> inputDimsPtr;
@@ -89,19 +89,18 @@ void ConcatLayer::concatenationQuant8()
inputDims.reserve(num_inputs);
for (uint32_t i = 0; i < num_inputs; i++)
{
- inputDims.push_back(getTensorShape(_inputs[i]));
+ inputDims.push_back(getShape(_inputs[i]));
inputDimsPtr.push_back(&inputDims[i]);
}
std::vector<const uint8_t *> inputDataPtrs;
for (const auto input : _inputs)
{
- inputDataPtrs.emplace_back(reinterpret_cast<const uint8_t *>(input->buffer()));
+ inputDataPtrs.emplace_back(getBuffer<uint8_t>(input));
}
nnfw::cker::ConcatenationWithScaling(op_params, inputDimsPtr.data(), inputDataPtrs.data(),
- getTensorShape(_output),
- reinterpret_cast<uint8_t *>(_output->buffer()));
+ getShape(_output), getBuffer<uint8_t>(_output));
}
void ConcatLayer::configure(const std::vector<const IPortableTensor *> &inputs, int32_t axis,
diff --git a/runtime/onert/backend/cpu/ops/ConvolutionLayer.cc b/runtime/onert/backend/cpu/ops/ConvolutionLayer.cc
index c964e38f9..2255d5e9f 100644
--- a/runtime/onert/backend/cpu/ops/ConvolutionLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ConvolutionLayer.cc
@@ -15,6 +15,7 @@
*/
#include "ConvolutionLayer.h"
+#include "OperationUtils.h"
#include "../Tensor.h"
#include "ir/Padding.h"
@@ -29,11 +30,11 @@ namespace cpu
namespace ops
{
ConvolutionLayer::ConvolutionLayer()
- : _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
- _paddingType(ir::PaddingType::EXPLICIT), _paddingLeft(0), _paddingTop(0), _paddingRight(0),
- _paddingBottom(0), _strideWidth(0), _strideHeight(0), _dilationWidthFactor(1),
- _dilationHeightFactor(1), _activation(ir::Activation::NONE),
- _conv_kernel(new nnfw::cker::Conv()), _prepare(false)
+ : _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
+ _paddingType(ir::PaddingType::EXPLICIT), _paddingLeft(0), _paddingTop(0), _paddingRight(0),
+ _paddingBottom(0), _strideWidth(0), _strideHeight(0), _dilationWidthFactor(1),
+ _dilationHeightFactor(1), _activation(ir::Activation::NONE),
+ _conv_kernel(new nnfw::cker::Conv()), _prepare(false)
{
// DO NOTHING
}
@@ -57,18 +58,17 @@ void ConvolutionLayer::convFloat32()
op_params.float_activation_max = output_activation_max;
nnfw::cker::Conv &kernel = *_conv_kernel;
- kernel(op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_kernel), reinterpret_cast<const float *>(_kernel->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ kernel(op_params, getShape(_input), getBuffer<float>(_input), getShape(_kernel),
+ getBuffer<float>(_kernel), getShape(_bias), getBuffer<float>(_bias), getShape(_output),
+ getBuffer<float>(_output));
}
void ConvolutionLayer::convQuant8()
{
int32_t output_activation_min = 0;
int32_t output_activation_max = 0;
- CalculateActivationRangeUint8(_activation, _output, &output_activation_min,
- &output_activation_max);
+ CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
+ &output_activation_max);
double real_multiplier = 0.0;
int32_t output_multiplier = 0;
@@ -84,9 +84,9 @@ void ConvolutionLayer::convQuant8()
op_params.padding_type = getPaddingType(_paddingType);
op_params.padding_values.width = _paddingLeft;
op_params.padding_values.height = _paddingTop;
- op_params.input_offset = -_input->data_offset();
- op_params.weights_offset = -_kernel->data_offset();
- op_params.output_offset = _output->data_offset();
+ op_params.input_offset = -_input->data_zero_point();
+ op_params.weights_offset = -_kernel->data_zero_point();
+ op_params.output_offset = _output->data_zero_point();
op_params.output_multiplier = output_multiplier;
op_params.output_shift = output_shift;
op_params.quantized_activation_min = output_activation_min;
@@ -94,10 +94,35 @@ void ConvolutionLayer::convQuant8()
op_params.is_replaced_weights = true;
nnfw::cker::Conv &kernel = *_conv_kernel;
- kernel(op_params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
- getTensorShape(_kernel), reinterpret_cast<const uint8_t *>(_kernel->buffer()),
- getTensorShape(_bias), reinterpret_cast<const int32_t *>(_bias->buffer()),
- getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+ kernel(op_params, getShape(_input), getBuffer<uint8_t>(_input), getShape(_kernel),
+ getBuffer<uint8_t>(_kernel), getShape(_bias), getBuffer<int32_t>(_bias), getShape(_output),
+ getBuffer<uint8_t>(_output));
+}
+
+void ConvolutionLayer::convQuant8PerChannel()
+{
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
+ &output_activation_max);
+
+ nnfw::cker::ConvParams op_params;
+ op_params.input_offset = -_input->data_zero_point();
+ op_params.output_offset = _output->data_zero_point();
+ op_params.stride_height = _strideHeight;
+ op_params.stride_width = _strideWidth;
+ op_params.dilation_height_factor = _dilationHeightFactor;
+ op_params.dilation_width_factor = _dilationWidthFactor;
+ op_params.padding_values.height = _paddingTop;
+ op_params.padding_values.width = _paddingLeft;
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ nnfw::cker::Conv &kernel = *_conv_kernel;
+ kernel(op_params, getShape(_input), reinterpret_cast<const int8_t *>(_input->buffer()),
+ getShape(_kernel), reinterpret_cast<const int8_t *>(_kernel->buffer()), getShape(_bias),
+ reinterpret_cast<const int32_t *>(_bias->buffer()), getShape(_output),
+ reinterpret_cast<int8_t *>(_output->buffer()));
}
void ConvolutionLayer::configure(const IPortableTensor *input, const IPortableTensor *kernel,
@@ -150,8 +175,8 @@ void ConvolutionLayer::run()
param_padding.param.bottom = _paddingBottom;
const auto padding =
- ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
- _dilationWidthFactor, _dilationHeightFactor);
+ ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
+ _dilationWidthFactor, _dilationHeightFactor);
_paddingLeft = padding.left;
_paddingRight = padding.right;
@@ -166,6 +191,10 @@ void ConvolutionLayer::run()
{
convQuant8();
}
+ else if (_input->data_type() == OperandType::QUANT_INT8_ASYMM)
+ {
+ convQuant8PerChannel();
+ }
else
{
throw std::runtime_error{"Conv: unsupported data type"};
@@ -181,9 +210,8 @@ void ConvolutionLayer::prepare()
if (_input->data_type() == OperandType::FLOAT32 && _kernel->is_constant())
{
bool is_transposed = false;
- kernel.prepare(getTensorShape(_kernel), reinterpret_cast<const float *>(_kernel->buffer()),
- getPaddingType(_paddingType), is_transposed, _dilationWidthFactor,
- _dilationHeightFactor);
+ kernel.prepare(getShape(_kernel), getBuffer<float>(_kernel), getPaddingType(_paddingType),
+ is_transposed, _dilationWidthFactor, _dilationHeightFactor);
// Decrease reference of _kernel(weights) only when _kernel is constant
if (is_transposed)
@@ -197,8 +225,22 @@ void ConvolutionLayer::prepare()
else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM && _kernel->is_constant() &&
!_input->is_dynamic() && !_output->is_dynamic())
{
- kernel.prepareQuant(getTensorShape(_input), getTensorShape(_kernel), getTensorShape(_output),
- _strideWidth, _strideHeight, _dilationWidthFactor, _dilationHeightFactor);
+ kernel.prepareQuant(getShape(_input), getShape(_kernel), getShape(_output), _strideWidth,
+ _strideHeight, _dilationWidthFactor, _dilationHeightFactor);
+ }
+ else if (_input->data_type() == OperandType::QUANT_INT8_ASYMM)
+ {
+ if (_kernel->is_constant() && !_input->is_dynamic() && !_output->is_dynamic())
+ {
+ GetQuantizedConvolutionMultipliersAndShifts(
+ _input->data_scale(), _output->data_scale(), _kernel->data_scales().data(),
+ _kernel->data_scales().size(), getShape(_kernel).Dims(0),
+ kernel.per_channel_output_multiplier(), kernel.per_channel_output_shift());
+ }
+ else
+ {
+ throw std::runtime_error{"Conv2D: Int8 dynamic weight is not supported"};
+ }
}
_prepare = true;
}
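GetQuantizedConvolutionMultipliersAndShifts (used here and by the depthwise layer below) is not shown in this diff; from its arguments it presumably folds the input, output, and per-channel filter scales into one requantization pair per output channel, reusing the same decomposition sketched above:

    #include <cstdint>
    #include <vector>

    // Assumed semantics: the effective scale per channel is
    // input_scale * filter_scale[c] / output_scale.
    void perChannelMultipliersSketch(float input_scale, float output_scale,
                                     const float *filter_scales, size_t channels,
                                     std::vector<int32_t> &multipliers, std::vector<int> &shifts)
    {
      multipliers.resize(channels);
      shifts.resize(channels);
      for (size_t c = 0; c < channels; ++c)
      {
        const double real = static_cast<double>(input_scale) * filter_scales[c] / output_scale;
        quantizeMultiplierSketch(real, &multipliers[c], &shifts[c]); // see sketch above
      }
    }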
diff --git a/runtime/onert/backend/cpu/ops/ConvolutionLayer.h b/runtime/onert/backend/cpu/ops/ConvolutionLayer.h
index 398892e65..5d7f7c296 100644
--- a/runtime/onert/backend/cpu/ops/ConvolutionLayer.h
+++ b/runtime/onert/backend/cpu/ops/ConvolutionLayer.h
@@ -52,6 +52,8 @@ public:
void convQuant8();
+ void convQuant8PerChannel();
+
void configure(const IPortableTensor *input, const IPortableTensor *kernel,
const IPortableTensor *bias, ir::PaddingType _paddingType,
const uint32_t paddingLeft, const uint32_t paddingRight, const uint32_t paddingTop,
diff --git a/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.cc b/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.cc
index d265d0ac2..e23b7c14a 100644
--- a/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/DepthToSpaceLayer.cc
@@ -35,9 +35,8 @@ DepthToSpaceLayer::DepthToSpaceLayer() : _input(nullptr), _block_size(0), _outpu
template <typename T> void DepthToSpaceLayer::depthToSpace()
{
- nnfw::cker::DepthToSpace(getTensorShape(_input), reinterpret_cast<const T *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<T *>(_output->buffer()),
- _block_size);
+ nnfw::cker::DepthToSpace(getShape(_input), getBuffer<T>(_input), getShape(_output),
+ getBuffer<T>(_output), _block_size);
}
void DepthToSpaceLayer::configure(const IPortableTensor *input, const int32_t block_size,
diff --git a/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc b/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc
index 85553d14d..30641ecae 100644
--- a/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc
+++ b/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.cc
@@ -44,19 +44,17 @@ void DepthwiseConvolutionLayer::convFloat32()
op_params.float_activation_max = output_activation_max;
nnfw::cker::DepthwiseConv<float, float>(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_kernel), reinterpret_cast<const float *>(_kernel->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()),
- _external_context->ruy_context());
+ op_params, getShape(_input), getBuffer<float>(_input), getShape(_kernel),
+ getBuffer<float>(_kernel), getShape(_bias), getBuffer<float>(_bias), getShape(_output),
+ getBuffer<float>(_output), _external_context->ruy_context());
}
void DepthwiseConvolutionLayer::convQuant8()
{
int32_t output_activation_min = 0;
int32_t output_activation_max = 0;
- CalculateActivationRangeUint8(_activation, _output, &output_activation_min,
- &output_activation_max);
+ CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
+ &output_activation_max);
double real_multiplier = 0.0;
int32_t output_multiplier = 0;
@@ -72,29 +70,70 @@ void DepthwiseConvolutionLayer::convQuant8()
op_params.padding_values.width = _paddingLeft;
op_params.padding_values.height = _paddingTop;
op_params.depth_multiplier = _multiplier;
- op_params.input_offset = -_input->data_offset();
- op_params.weights_offset = -_kernel->data_offset();
- op_params.output_offset = _output->data_offset();
+ op_params.input_offset = -_input->data_zero_point();
+ op_params.weights_offset = -_kernel->data_zero_point();
+ op_params.output_offset = _output->data_zero_point();
op_params.output_multiplier = output_multiplier;
op_params.output_shift = output_shift;
op_params.quantized_activation_min = output_activation_min;
op_params.quantized_activation_max = output_activation_max;
nnfw::cker::DepthwiseConv<uint8_t, int32_t>(
- op_params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
- getTensorShape(_kernel), reinterpret_cast<const uint8_t *>(_kernel->buffer()),
- getTensorShape(_bias), reinterpret_cast<const int32_t *>(_bias->buffer()),
- getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()),
- _external_context->ruy_context());
+ op_params, getShape(_input), getBuffer<uint8_t>(_input), getShape(_kernel),
+ getBuffer<uint8_t>(_kernel), getShape(_bias), getBuffer<int32_t>(_bias), getShape(_output),
+ getBuffer<uint8_t>(_output), _external_context->ruy_context());
+}
+
+void DepthwiseConvolutionLayer::convQuant8PerChannel()
+{
+ if (!_prepared)
+ {
+ prepareQuant8PerChannel();
+ _prepared = true;
+ }
+
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
+ &output_activation_max);
+
+ nnfw::cker::DepthwiseConvParams op_params;
+ op_params.padding_type = nnfw::cker::PaddingType::kSame;
+ op_params.padding_values.width = _paddingLeft;
+ op_params.padding_values.height = _paddingTop;
+ op_params.depth_multiplier = _multiplier;
+ op_params.stride_width = _strideWidth;
+ op_params.stride_height = _strideHeight;
+ op_params.dilation_width_factor = _dilationWidth;
+ op_params.dilation_height_factor = _dilationHeight;
+ op_params.input_offset = -_input->data_zero_point();
+ op_params.weights_offset = 0;
+ op_params.output_offset = _output->data_zero_point();
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+
+ nnfw::cker::optimized_integer_ops::DepthwiseConvPerChannel(
+ op_params, _per_channel_output_multiplier.data(), _per_channel_output_shift.data(),
+ getShape(_input), getBuffer<int8_t>(_input), getShape(_kernel), getBuffer<int8_t>(_kernel),
+ getShape(_bias), getBuffer<int32_t>(_bias), getShape(_output), getBuffer<int8_t>(_output),
+ _external_context->ruy_context());
+}
+
+void DepthwiseConvolutionLayer::prepareQuant8PerChannel()
+{
+ GetQuantizedConvolutionMultipliersAndShifts(
+ _input->data_scale(), _output->data_scale(), _kernel->data_scales().data(),
+ _kernel->data_scales().size(), getShape(_kernel).Dims(3), _per_channel_output_multiplier,
+ _per_channel_output_shift);
}
void DepthwiseConvolutionLayer::configure(
- const IPortableTensor *input, const IPortableTensor *kernel, const IPortableTensor *bias,
- const uint32_t paddingLeft, const uint32_t paddingRight, const uint32_t paddingTop,
- const uint32_t paddingBottom, const uint32_t strideWidth, const uint32_t strideHeight,
- const uint32_t multiplier, const uint32_t dilationWidth, const uint32_t dilationHeight,
- const ir::Activation activation, IPortableTensor *output,
- const std::shared_ptr<ExternalContext> &external_context)
+ const IPortableTensor *input, const IPortableTensor *kernel, const IPortableTensor *bias,
+ const uint32_t paddingLeft, const uint32_t paddingRight, const uint32_t paddingTop,
+ const uint32_t paddingBottom, const uint32_t strideWidth, const uint32_t strideHeight,
+ const uint32_t multiplier, const uint32_t dilationWidth, const uint32_t dilationHeight,
+ const ir::Activation activation, IPortableTensor *output,
+ const std::shared_ptr<ExternalContext> &external_context)
{
_input = input;
_kernel = kernel;
@@ -111,6 +150,15 @@ void DepthwiseConvolutionLayer::configure(
_activation = activation;
_output = output;
_external_context = external_context;
+
+ if (_input->data_type() == OperandType::QUANT_INT8_ASYMM)
+ {
+ if (_kernel->is_constant() && !_input->is_dynamic() && !_output->is_dynamic())
+ {
+ prepareQuant8PerChannel();
+ _prepared = true;
+ }
+ }
}
void DepthwiseConvolutionLayer::run()
@@ -123,6 +171,10 @@ void DepthwiseConvolutionLayer::run()
{
convQuant8();
}
+ else if (_input->data_type() == OperandType::QUANT_INT8_ASYMM)
+ {
+ convQuant8PerChannel();
+ }
else
{
throw std::runtime_error{"DepthwiseConv: unsupported data type"};
diff --git a/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h b/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h
index fe1fcc182..720550636 100644
--- a/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h
+++ b/runtime/onert/backend/cpu/ops/DepthwiseConvolutionLayer.h
@@ -42,6 +42,8 @@ public:
void convQuant8();
+ void convQuant8PerChannel();
+
void configure(const IPortableTensor *input, const IPortableTensor *kernel,
const IPortableTensor *bias, const uint32_t paddingLeft,
const uint32_t paddingRight, const uint32_t paddingTop,
@@ -53,6 +55,9 @@ public:
void run() override;
private:
+ void prepareQuant8PerChannel();
+
+private:
const IPortableTensor *_input{nullptr};
const IPortableTensor *_kernel{nullptr};
const IPortableTensor *_bias{nullptr};
@@ -74,6 +79,12 @@ private:
ir::Activation _activation{ir::Activation::NONE};
std::shared_ptr<ExternalContext> _external_context;
+
+ bool _prepared{false};
+
+ // Per channel output multiplier and shift.
+ std::vector<int32_t> _per_channel_output_multiplier;
+ std::vector<int> _per_channel_output_shift;
};
} // namespace ops
diff --git a/runtime/onert/backend/cpu/ops/EinsumLayer.cc b/runtime/onert/backend/cpu/ops/EinsumLayer.cc
index 8c16740a3..8e10c4642 100644
--- a/runtime/onert/backend/cpu/ops/EinsumLayer.cc
+++ b/runtime/onert/backend/cpu/ops/EinsumLayer.cc
@@ -28,7 +28,7 @@ namespace ops
{
EinsumLayer::EinsumLayer()
- : _inputs(), _output(nullptr), _equation(), _einsum_kernel(new nnfw::cker::Einsum())
+ : _inputs(), _output(nullptr), _equation(), _einsum_kernel(new nnfw::cker::Einsum())
{
// DO NOTHING
}
@@ -47,12 +47,11 @@ void EinsumLayer::einsumFloat32()
for (uint32_t i = 0; i < num_inputs; i++)
{
- inputShapes.emplace_back(getTensorShape(_inputs[i]));
- inputFloatPtrs.emplace_back(reinterpret_cast<const float *>(_inputs[i]->buffer()));
+ inputShapes.emplace_back(getShape(_inputs[i]));
+ inputFloatPtrs.emplace_back(getBuffer<float>(_inputs[i]));
}
- kernel(_equation, inputShapes, inputFloatPtrs, getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()));
+ kernel(_equation, inputShapes, inputFloatPtrs, getShape(_output), getBuffer<float>(_output));
}
void EinsumLayer::run()
diff --git a/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc b/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc
index 3e1da5ec0..27b2cdf68 100644
--- a/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ElementwiseActivationLayer.cc
@@ -35,7 +35,7 @@ namespace ops
{
ElementwiseActivationLayer::ElementwiseActivationLayer()
- : _input(nullptr), _output(nullptr), _kernel()
+ : _input(nullptr), _output(nullptr), _kernel()
{
// DO NOTHING
}
@@ -43,9 +43,9 @@ ElementwiseActivationLayer::ElementwiseActivationLayer()
void ElementwiseActivationLayer::PopulateLookupTable(const ElementwiseActivationType op_type)
{
const auto input_scale = static_cast<double>(_input->data_scale());
- const auto input_zero_point = static_cast<int32_t>(_input->data_offset());
+ const auto input_zero_point = static_cast<int32_t>(_input->data_zero_point());
const auto output_scale = static_cast<double>(_output->data_scale());
- const auto output_zero_point = static_cast<int32_t>(_output->data_offset());
+ const auto output_zero_point = static_cast<int32_t>(_output->data_zero_point());
const float inverse_scale = 1 / output_scale;
int32_t maxval = std::numeric_limits<uint8_t>::max();
int32_t minval = std::numeric_limits<uint8_t>::min();
@@ -74,9 +74,9 @@ void ElementwiseActivationLayer::PopulateLookupTable(const ElementwiseActivation
void ElementwiseActivationLayer::EvalUsingLookupTable(const IPortableTensor *input,
IPortableTensor *output)
{
- const int size = MatchingFlatSize(getTensorShape(input), getTensorShape(output));
- const uint8_t *input_data = reinterpret_cast<const uint8_t *>(input->buffer());
- uint8_t *output_data = reinterpret_cast<uint8_t *>(output->buffer());
+ const int size = MatchingFlatSize(getShape(input), getShape(output));
+ const uint8_t *input_data = getBuffer<uint8_t>(input);
+ uint8_t *output_data = getBuffer<uint8_t>(output);
for (int i = 0; i < size; ++i)
{
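PopulateLookupTable precomputes the activation over all 256 uint8 input values, so EvalUsingLookupTable above reduces to one table read per element. The population loop sits outside this hunk; a sketch implied by the parameters that are visible (input_scale, the zero points, inverse_scale, minval/maxval), with the loop body itself an assumption:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <functional>
    #include <limits>

    // Hypothetical: dequantize each possible input, apply the activation,
    // requantize, and clamp into the uint8 output range.
    void populateTableSketch(uint8_t (&table)[256], float input_scale, int32_t input_zero_point,
                             float inverse_scale, int32_t output_zero_point,
                             const std::function<float(float)> &act)
    {
      const int32_t minval = std::numeric_limits<uint8_t>::min();
      const int32_t maxval = std::numeric_limits<uint8_t>::max();
      for (int32_t in = minval; in <= maxval; ++in)
      {
        const float x = input_scale * static_cast<float>(in - input_zero_point);
        const int32_t out =
          static_cast<int32_t>(std::round(act(x) * inverse_scale)) + output_zero_point;
        table[in] = static_cast<uint8_t>(std::min(std::max(out, minval), maxval));
      }
    }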
@@ -97,8 +97,8 @@ void ElementwiseActivationLayer::configure(const IPortableTensor *input, IPortab
if (input->data_type() == OperandType::FLOAT32)
{
_kernel = [](const IPortableTensor *input, IPortableTensor *output) {
- nnfw::cker::ELU(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::ELU(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
};
}
else
@@ -116,9 +116,8 @@ void ElementwiseActivationLayer::configure(const IPortableTensor *input, IPortab
else if (_input->data_type() == OperandType::FLOAT32)
{
_kernel = [](const IPortableTensor *input, IPortableTensor *output) {
- nnfw::cker::Logistic(getTensorShape(input),
- reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Logistic(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
};
}
else
@@ -132,23 +131,20 @@ void ElementwiseActivationLayer::configure(const IPortableTensor *input, IPortab
if (alpha == std::numeric_limits<float>::infinity() && beta == 0.f)
{
_kernel = [](const IPortableTensor *input, IPortableTensor *output) {
- nnfw::cker::ReLU(getTensorShape(input),
- reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::ReLU(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
};
}
else if (alpha == 6.f && beta == 0.f)
{
_kernel = [](const IPortableTensor *input, IPortableTensor *output) {
- nnfw::cker::ReLU6(getTensorShape(input),
- reinterpret_cast<const float *>(input->buffer()),
- reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::ReLU6(getShape(input), getBuffer<float>(input), getBuffer<float>(output));
};
}
else
{
throw std::runtime_error(
- "ElementwiseActivationLayer : This layer suppports only ReLU(0-inf) and ReLU6(0-6)");
+        "ElementwiseActivationLayer : This layer supports only ReLU(0-inf) and ReLU6(0-6)");
}
}
else
@@ -166,8 +162,8 @@ void ElementwiseActivationLayer::configure(const IPortableTensor *input, IPortab
else if (_input->data_type() == OperandType::FLOAT32)
{
_kernel = [](const IPortableTensor *input, IPortableTensor *output) {
- nnfw::cker::Tanh(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Tanh(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
};
}
else
@@ -179,10 +175,9 @@ void ElementwiseActivationLayer::configure(const IPortableTensor *input, IPortab
if (_input->data_type() == OperandType::FLOAT32)
{
_kernel = [alpha](const IPortableTensor *input, IPortableTensor *output) {
- nnfw::cker::LeakyReLU(nnfw::cker::LeakyReluParams{alpha}, getTensorShape(input),
- reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output),
- reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::LeakyReLU(nnfw::cker::LeakyReluParams{alpha}, getShape(input),
+ getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
};
}
else
diff --git a/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc b/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc
index 1e17a0828..1704c7cc6 100644
--- a/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ElementwiseBinaryLayer.cc
@@ -39,16 +39,13 @@ void logicalAndGeneric(const IPortableTensor *lhs, const IPortableTensor *rhs,
{
if (!HaveSameShapes(lhs, rhs))
{
- nnfw::cker::LogicalAndBroadcast<T>(
- getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), getTensorShape(rhs),
- reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output),
- reinterpret_cast<T *>(output->buffer()));
+ nnfw::cker::LogicalAndBroadcast<T>(getShape(lhs), getBuffer<T>(lhs), getShape(rhs),
+ getBuffer<T>(rhs), getShape(output), getBuffer<T>(output));
}
else
{
- nnfw::cker::LogicalAndElementwise<T>(
- getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- reinterpret_cast<const T *>(rhs->buffer()), reinterpret_cast<T *>(output->buffer()));
+ nnfw::cker::LogicalAndElementwise<T>(getShape(lhs), getBuffer<T>(lhs), getBuffer<T>(rhs),
+ getBuffer<T>(output));
}
}
@@ -58,40 +55,36 @@ void logicalOrGeneric(const IPortableTensor *lhs, const IPortableTensor *rhs,
{
if (!HaveSameShapes(lhs, rhs))
{
- nnfw::cker::LogicalOrBroadcast<T>(
- getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()), getTensorShape(rhs),
- reinterpret_cast<const T *>(rhs->buffer()), getTensorShape(output),
- reinterpret_cast<T *>(output->buffer()));
+ nnfw::cker::LogicalOrBroadcast<T>(getShape(lhs), getBuffer<T>(lhs), getShape(rhs),
+ getBuffer<T>(rhs), getShape(output), getBuffer<T>(output));
}
else
{
- nnfw::cker::LogicalOrElementwise<T>(
- getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- reinterpret_cast<const T *>(rhs->buffer()), reinterpret_cast<T *>(output->buffer()));
+ nnfw::cker::LogicalOrElementwise<T>(getShape(lhs), getBuffer<T>(lhs), getBuffer<T>(rhs),
+ getBuffer<T>(output));
}
}
template <typename T>
void maximumGeneric(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output)
{
- nnfw::cker::Max<T>(getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getTensorShape(output), reinterpret_cast<T *>(output->buffer()));
+ nnfw::cker::Max<T>(getShape(lhs), getBuffer<T>(lhs), getShape(rhs), getBuffer<T>(rhs),
+ getShape(output), getBuffer<T>(output));
}
template <typename T>
void minimumGeneric(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output)
{
- nnfw::cker::Min<T>(getTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getTensorShape(output), reinterpret_cast<T *>(output->buffer()));
+ nnfw::cker::Min<T>(getShape(lhs), getBuffer<T>(lhs), getShape(rhs), getBuffer<T>(rhs),
+ getShape(output), getBuffer<T>(output));
}
bool haveSameQauntInfo(const IPortableTensor *lhs, const IPortableTensor *rhs,
const IPortableTensor *output)
{
return (lhs->data_scale() == rhs->data_scale() && lhs->data_scale() == output->data_scale()) &&
- (lhs->data_offset() == rhs->data_offset() && lhs->data_offset() == output->data_offset());
+ (lhs->data_zero_point() == rhs->data_zero_point() &&
+ lhs->data_zero_point() == output->data_zero_point());
}
} // namespace
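The other rename threaded through the backend is data_offset() to data_zero_point(), matching the affine quantization convention real = scale * (q - zero_point). Identical (scale, zero_point) on both operands and the output, which is exactly what haveSameQauntInfo() checks above, is what lets Max/Min compare raw quantized values directly. A minimal standalone sketch of the mapping (hypothetical helper, not part of the onert API):

#include <cstdint>

inline float dequantize(uint8_t q, float scale, int32_t zero_point)
{
  // real = scale * (q - zero_point)
  return scale * (static_cast<int32_t>(q) - zero_point);
}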
diff --git a/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc b/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc
index 15d7f3049..d58937b5f 100644
--- a/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ElementwiseUnaryLayer.cc
@@ -23,7 +23,6 @@
#include <cker/operation/Erf.h>
#include <cker/operation/Exp.h>
#include <cker/operation/LogicalNot.h>
-#include <cker/operation/Quantize.h>
#include <cker/operation/Round.h>
namespace onert
@@ -39,8 +38,8 @@ namespace
{
void absFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Abs(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Abs(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
template <typename FromT>
@@ -83,8 +82,8 @@ void cast(const IPortableTensor *input, IPortableTensor *output)
const auto in = *reinterpret_cast<const DataPtr *>(&input_buf);
auto out = *reinterpret_cast<DataPtr *>(&output_buf);
- auto input_shape = getTensorShape(input);
- auto output_shape = getTensorShape(output);
+ auto input_shape = getShape(input);
+ auto output_shape = getShape(output);
const auto num_elements = MatchingFlatSize(input_shape, output_shape);
switch (input->data_type())
@@ -115,96 +114,85 @@ void cast(const IPortableTensor *input, IPortableTensor *output)
void cosFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Cos(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Cos(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void dequantizeInt8(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Dequantize(getTensorShape(input), reinterpret_cast<const int8_t *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()),
- input->data_scale(), input->data_offset());
+ nnfw::cker::Dequantize(getShape(input), getBuffer<int8_t>(input), getShape(output),
+ getBuffer<float>(output), input->data_scale(), input->data_zero_point());
}
void dequantizeUint8(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Dequantize(getTensorShape(input), reinterpret_cast<const uint8_t *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()),
- input->data_scale(), input->data_offset());
+ nnfw::cker::Dequantize(getShape(input), getBuffer<uint8_t>(input), getShape(output),
+ getBuffer<float>(output), input->data_scale(), input->data_zero_point());
}
void expFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Exp(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Exp(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void erfFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Erf(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Erf(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void floorFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Floor(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Floor(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void logFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Log(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Log(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void logicalNot(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::LogicalNot(getTensorShape(input), reinterpret_cast<const bool *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
+ nnfw::cker::LogicalNot(getShape(input), getBuffer<bool>(input), getShape(output),
+ getBuffer<bool>(output));
}
template <typename T> void neg(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Neg<T>(getTensorShape(input), reinterpret_cast<const T *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<T *>(output->buffer()));
-}
-
-template <typename InputT, typename OutputT>
-void affineQuantize(const IPortableTensor *input, IPortableTensor *output)
-{
- nnfw::cker::Quantize(getTensorShape(input), reinterpret_cast<const InputT *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<OutputT *>(output->buffer()),
- output->data_scale(), output->data_offset());
+ nnfw::cker::Neg<T>(getShape(input), getBuffer<T>(input), getShape(output), getBuffer<T>(output));
}
void roundFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Round(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Round(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void rsqrtFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Rsqrt(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Rsqrt(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void sinFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Sin(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Sin(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void sqrtFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Sqrt(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Sqrt(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
void squareFloat32(const IPortableTensor *input, IPortableTensor *output)
{
- nnfw::cker::Square(getTensorShape(input), reinterpret_cast<const float *>(input->buffer()),
- getTensorShape(output), reinterpret_cast<float *>(output->buffer()));
+ nnfw::cker::Square(getShape(input), getBuffer<float>(input), getShape(output),
+ getBuffer<float>(output));
}
template <typename T> void zerosLikeFloat32(const IPortableTensor *input, IPortableTensor *output)
@@ -212,9 +200,9 @@ template <typename T> void zerosLikeFloat32(const IPortableTensor *input, IPorta
if (!HaveSameShapes(input, output))
throw std::runtime_error{"ZerosLike: input and output shape don't match."};
- auto element_size = getTensorShape(input).FlatSize();
+ auto element_size = getShape(input).FlatSize();
- memset(reinterpret_cast<T *>(output->buffer()), 0, element_size * sizeof(T));
+ memset(getBuffer<T>(output), 0, element_size * sizeof(T));
}
} // namespace
@@ -335,16 +323,6 @@ void ElementwiseUnaryLayer::configure(const IPortableTensor *input, IPortableTen
throw std::runtime_error{"Neg: Unsupported data type"};
}
break;
- case ElementwiseUnaryType::kQuantize:
- if ((input->data_type() == OperandType::FLOAT32))
- {
- _kernel = affineQuantize<float, uint8_t>;
- }
- else
- {
- throw std::runtime_error{"Quantize: Unsupported data type"};
- }
- break;
case ElementwiseUnaryType::kRound:
if ((input->data_type() == OperandType::FLOAT32))
{
@@ -410,7 +388,7 @@ void ElementwiseUnaryLayer::configure(const IPortableTensor *input, IPortableTen
}
break;
default:
- throw std::runtime_error{"ElementwiseBinary: Unsupported ElementwiseBinary type"};
+ throw std::runtime_error{"ElementwiseUnary: Unsupported ElementwiseUnary type"};
}
}
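Besides the accessor renames, this file loses the kQuantize case and its affineQuantize helper, and fixes an error message that had been copy-pasted from ElementwiseBinary. For reference, the removed affineQuantize<float, uint8_t> computed the inverse of the dequantize mapping sketched earlier; a per-element sketch, with saturation assumed:

#include <algorithm>
#include <cmath>
#include <cstdint>

inline uint8_t quantize(float f, float scale, int32_t zero_point)
{
  const int32_t q = zero_point + static_cast<int32_t>(std::round(f / scale));
  return static_cast<uint8_t>(std::min(255, std::max(0, q))); // saturate to uint8 range
}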
diff --git a/runtime/onert/backend/cpu/ops/FillLayer.cc b/runtime/onert/backend/cpu/ops/FillLayer.cc
index 5b7c17907..cc12fcbd8 100644
--- a/runtime/onert/backend/cpu/ops/FillLayer.cc
+++ b/runtime/onert/backend/cpu/ops/FillLayer.cc
@@ -45,24 +45,20 @@ void FillLayer::run()
switch (_output->data_type())
{
case OperandType::FLOAT32:
- nnfw::cker::Fill<float *>(reinterpret_cast<float *>(_value->buffer()),
- getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::Fill<float>(getBuffer<float>(_value), getShape(_output),
+ getBuffer<float>(_output));
break;
case OperandType::INT32:
- nnfw::cker::Fill<int32_t *>(reinterpret_cast<int32_t *>(_value->buffer()),
- getTensorShape(_output),
- reinterpret_cast<int32_t *>(_output->buffer()));
+ nnfw::cker::Fill<int32_t>(getBuffer<int32_t>(_value), getShape(_output),
+ getBuffer<int32_t>(_output));
break;
case OperandType::INT64:
- nnfw::cker::Fill<int64_t *>(reinterpret_cast<int64_t *>(_value->buffer()),
- getTensorShape(_output),
- reinterpret_cast<int64_t *>(_output->buffer()));
+ nnfw::cker::Fill<int64_t>(getBuffer<int64_t>(_value), getShape(_output),
+ getBuffer<int64_t>(_output));
break;
case OperandType::UINT32:
- nnfw::cker::Fill<uint32_t *>(reinterpret_cast<uint32_t *>(_value->buffer()),
- getTensorShape(_output),
- reinterpret_cast<uint32_t *>(_output->buffer()));
+ nnfw::cker::Fill<uint32_t>(getBuffer<uint32_t>(_value), getShape(_output),
+ getBuffer<uint32_t>(_output));
break;
default:
throw std::runtime_error{"Fill: unsupported data type"};
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
index 47ac1d873..6857f7f9f 100644
--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
+++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
@@ -31,9 +31,9 @@ namespace ops
{
FullyConnectedLayer::FullyConnectedLayer()
- : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
- _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()),
- _external_context(nullptr), _is_hybrid(false), _is_shuffled16x1float32(false)
+ : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
+ _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()),
+ _external_context(nullptr), _is_hybrid(false), _is_shuffled16x1float32(false)
{
// DO NOTHING
}
@@ -45,11 +45,10 @@ void FullyConnectedLayer::fullyConnectedFloat32()
nnfw::cker::FullyConnectedParams op_params;
op_params.activation = convertActivationType(_activation);
- nnfw::cker::FullyConnected(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<float>(_input),
+ getShape(_weights), getBuffer<float>(_weights), getShape(_bias),
+ _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
+ getBuffer<float>(_output));
}
// executionMutex is used to protect concurrent access of non-threadsafe resources
@@ -63,23 +62,22 @@ void FullyConnectedLayer::fullyConnectedQuant8()
int32_t output_activation_max = 0;
GetQuantizedConvolutionMultiplier(_input, _weights, _bias, _output, &real_multiplier);
QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
- CalculateActivationRangeUint8(_activation, _output, &output_activation_min,
- &output_activation_max);
+ CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
+ &output_activation_max);
nnfw::cker::FullyConnectedParams op_params;
- op_params.input_offset = -_input->data_offset();
- op_params.weights_offset = -_weights->data_offset();
- op_params.output_offset = _output->data_offset();
+ op_params.input_offset = -_input->data_zero_point();
+ op_params.weights_offset = -_weights->data_zero_point();
+ op_params.output_offset = _output->data_zero_point();
op_params.output_multiplier = output_multiplier;
op_params.output_shift = output_shift;
op_params.quantized_activation_min = output_activation_min;
op_params.quantized_activation_max = output_activation_max;
- nnfw::cker::FullyConnected(
- op_params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
- getTensorShape(_weights), reinterpret_cast<const uint8_t *>(_weights->buffer()),
- getTensorShape(_bias), reinterpret_cast<const int32_t *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+ nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<uint8_t>(_input),
+ getShape(_weights), getBuffer<uint8_t>(_weights), getShape(_bias),
+ _bias ? getBuffer<int32_t>(_bias) : nullptr, getShape(_output),
+ getBuffer<uint8_t>(_output));
}
void FullyConnectedLayer::fullyConnectedHybrid()
@@ -87,7 +85,7 @@ void FullyConnectedLayer::fullyConnectedHybrid()
nnfw::cker::FCTempArena &temp_arena = *_temp_arena;
if (!temp_arena.prepared)
{
- temp_arena.prepare(getTensorShape(_input), getTensorShape(_weights));
+ temp_arena.prepare(getShape(_input), getShape(_weights));
}
nnfw::cker::FullyConnectedParams op_params;
@@ -96,20 +94,16 @@ void FullyConnectedLayer::fullyConnectedHybrid()
#ifndef USE_RUY_GEMV
nnfw::cker::FullyConnectedHybrid(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_weights), reinterpret_cast<const int8_t *>(_weights->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena,
- _external_context->ruy_context());
+ op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
+ getBuffer<int8_t>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
+ getShape(_output), getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
#else
nnfw::cker::FullyConnectedHybrid(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_weights),
- (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
- : reinterpret_cast<const int8_t *>(_weights->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena,
- _external_context->ruy_context());
+ op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
+ (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
+ : getBuffer<int8_t>(_weights),
+ getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
+ getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
if (_cached_weights == nullptr || _is_weights_freed)
return;
@@ -120,8 +114,8 @@ void FullyConnectedLayer::fullyConnectedHybrid()
// If input's elements are all zero, it bypasses the ruy kernel path,
// so handle this case here
- const int input_size = getTensorShape(_input).FlatSize();
- if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_input->buffer()), input_size))
+ const int input_size = getShape(_input).FlatSize();
+ if (nnfw::cker::IsZeroVector(getBuffer<float>(_input), input_size))
return;
auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights);
@@ -138,6 +132,10 @@ void FullyConnectedLayer::fullyConnectedHybrid()
tensor->decrease_ref();
if (tensor->buffer() == nullptr) // ref == 0?
{
+#if defined(__ANDROID__) && (__ANDROID_API__ >= 26)
+ // NOTE This line forces OS to release any unused memory immediately
+ mallopt(M_PURGE, 0);
+#endif
_is_weights_freed = true;
}
#endif
@@ -155,20 +153,16 @@ void FullyConnectedLayer::fullyConnectedSparseWeight()
if (block_size.size() == 0)
{
nnfw::cker::FullyConnectedSparseWeightRandom(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w1_segments,
- w1_indices);
+ op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
+ getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
+ getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
}
else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1)
{
nnfw::cker::FullyConnectedSparseWeight16x1(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w1_segments,
- w1_indices);
+ op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
+ getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
+ getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
}
else
throw std::runtime_error{"FullyConnected: unsupported sparsity"};
@@ -183,11 +177,10 @@ void FullyConnectedLayer::fullyConnected16x1Float32()
nnfw::cker::FullyConnectedParams op_params;
op_params.activation = convertActivationType(_activation);
- nnfw::cker::FullyConnected16x1Float32(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::FullyConnected16x1Float32(op_params, getShape(_input), getBuffer<float>(_input),
+ getShape(_weights), getBuffer<float>(_weights),
+ getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
+ getShape(_output), getBuffer<float>(_output));
#else
throw std::runtime_error{"FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
#endif
@@ -211,7 +204,7 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl
if (_is_shuffled16x1float32)
{
throw std::runtime_error{
- "FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
+ "FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
}
#endif
_external_context = external_context;
@@ -245,8 +238,8 @@ void FullyConnectedLayer::prepare()
{
if (_bias && _bias->is_constant())
{
- const int bias_size = getTensorShape(_bias).FlatSize();
- if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_bias->buffer()), bias_size))
+ const int bias_size = getShape(_bias).FlatSize();
+ if (nnfw::cker::IsZeroVector(getBuffer<float>(_bias), bias_size))
{
_bias = nullptr;
}
@@ -268,7 +261,7 @@ void FullyConnectedLayer::prepare()
if (_input->is_dynamic() || !_weights->is_constant())
return;
- const int rows = getTensorShape(_weights).Dims(0);
+ const int rows = getShape(_weights).Dims(0);
if (rows % 4 == 0)
{
// TODO If it's possible to extract precaching from ruy kernel,
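Two substantive changes hide among the renames in FullyConnectedLayer: the quant8 path now uses the generalized CalculateActivationRangeQuantized (defined later in this diff), and the hybrid path asks bionic to purge freed weight pages via mallopt(M_PURGE) on Android API 26+. The fixed-point parameters fed to the quantized kernel come from real_multiplier = input_scale * weights_scale / output_scale, which QuantizeMultiplier splits into a Q31 multiplier and a power-of-two shift; a sketch of that decomposition, following the usual TFLite scheme:

#include <cmath>
#include <cstdint>

void quantize_multiplier(double m, int32_t *fixed_point, int *shift)
{
  if (m == 0.0) { *fixed_point = 0; *shift = 0; return; }
  const double q = std::frexp(m, shift);             // m = q * 2^shift, q in [0.5, 1)
  int64_t q31 = static_cast<int64_t>(std::round(q * (1ll << 31)));
  if (q31 == (1ll << 31)) { q31 /= 2; ++(*shift); }  // rounding pushed q up to 1.0
  *fixed_point = static_cast<int32_t>(q31);
}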
diff --git a/runtime/onert/backend/cpu/ops/FusedBatchNormLayer.cc b/runtime/onert/backend/cpu/ops/FusedBatchNormLayer.cc
index c2c592db7..1bec15a08 100644
--- a/runtime/onert/backend/cpu/ops/FusedBatchNormLayer.cc
+++ b/runtime/onert/backend/cpu/ops/FusedBatchNormLayer.cc
@@ -28,8 +28,8 @@ namespace ops
{
FusedBatchNormLayer::FusedBatchNormLayer()
- : _inputs(), _output(nullptr), _epsilon(0), _is_training(true),
- _fusedbatchnorm_kernel(new nnfw::cker::FusedBatchNorm())
+ : _inputs(), _output(nullptr), _epsilon(0), _is_training(true),
+ _fusedbatchnorm_kernel(new nnfw::cker::FusedBatchNorm())
{
// DO NOTHING
}
@@ -48,8 +48,8 @@ void FusedBatchNormLayer::fusedbatchnormFloat32()
for (uint32_t i = 0; i < num_inputs; i++)
{
- inputShapes.emplace_back(getTensorShape(_inputs[i]));
- inputFloatPtrs.emplace_back(reinterpret_cast<const float *>(_inputs[i]->buffer()));
+ inputShapes.emplace_back(getShape(_inputs[i]));
+ inputFloatPtrs.emplace_back(getBuffer<float>(_inputs[i]));
}
nnfw::cker::FusedBatchNormParams param;
@@ -58,8 +58,7 @@ void FusedBatchNormLayer::fusedbatchnormFloat32()
param.is_training = _is_training;
param.data_format = _data_format;
- kernel(inputShapes, inputFloatPtrs, getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()), param);
+ kernel(inputShapes, inputFloatPtrs, getShape(_output), getBuffer<float>(_output), param);
}
void FusedBatchNormLayer::run()
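For context on the kernel invoked above: fused batch normalization at inference time applies a per-channel affine normalization. A sketch of the scalar math, assuming the usual scale (gamma), offset (beta), mean, and variance inputs:

#include <cmath>

inline float batch_norm(float x, float gamma, float beta, float mean,
                        float variance, float epsilon)
{
  return gamma * (x - mean) / std::sqrt(variance + epsilon) + beta;
}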
diff --git a/runtime/onert/backend/cpu/ops/GatherLayer.cc b/runtime/onert/backend/cpu/ops/GatherLayer.cc
index 641daa972..f955eef16 100644
--- a/runtime/onert/backend/cpu/ops/GatherLayer.cc
+++ b/runtime/onert/backend/cpu/ops/GatherLayer.cc
@@ -51,9 +51,8 @@ template <typename InputType> void GatherLayer::runByInputType()
using IndicesType = int32_t;
nnfw::cker::Gather<InputType, IndicesType>(
- op_params, getTensorShape(_input), reinterpret_cast<const InputType *>(_input->buffer()),
- getTensorShape(_indices), reinterpret_cast<const IndicesType *>(_indices->buffer()),
- getTensorShape(_output), reinterpret_cast<OutputType *>(_output->buffer()));
+ op_params, getShape(_input), getBuffer<InputType>(_input), getShape(_indices),
+ getBuffer<IndicesType>(_indices), getShape(_output), getBuffer<OutputType>(_output));
break;
}
case OperandType::INT64:
@@ -61,9 +60,8 @@ template <typename InputType> void GatherLayer::runByInputType()
using IndicesType = int64_t;
nnfw::cker::Gather<InputType, IndicesType>(
- op_params, getTensorShape(_input), reinterpret_cast<const InputType *>(_input->buffer()),
- getTensorShape(_indices), reinterpret_cast<const IndicesType *>(_indices->buffer()),
- getTensorShape(_output), reinterpret_cast<OutputType *>(_output->buffer()));
+ op_params, getShape(_input), getBuffer<InputType>(_input), getShape(_indices),
+ getBuffer<IndicesType>(_indices), getShape(_output), getBuffer<OutputType>(_output));
break;
}
default:
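GatherLayer dispatches on both the input element type and the indices type (int32 or int64). For axis 0 on a rank-2 input the operation reduces to row selection; a sketch:

// Row-selection special case of Gather (sketch).
template <typename T, typename I>
void gather_rows(const T *in, int cols, const I *indices, int n_indices, T *out)
{
  for (int i = 0; i < n_indices; ++i)
    for (int j = 0; j < cols; ++j)
      out[i * cols + j] = in[static_cast<int>(indices[i]) * cols + j];
}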
diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc
index 0d99b0586..fe5019de6 100644
--- a/runtime/onert/backend/cpu/ops/L2NormLayer.cc
+++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc
@@ -44,19 +44,17 @@ void L2NormLayer::run()
switch (_input->data_type())
{
case OperandType::FLOAT32:
- nnfw::cker::L2NormalizeFloat32(
- getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::L2NormalizeFloat32(getShape(_input), getBuffer<float>(_input), getShape(_output),
+ getBuffer<float>(_output));
break;
case OperandType::QUANT_UINT8_ASYMM:
{
nnfw::cker::L2NormParams params;
- assert(_input->data_offset() == 128);
- params.input_zero_point = _input->data_offset();
- nnfw::cker::L2NormalizeQuant8(
- params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+ assert(_input->data_zero_point() == 128);
+ params.input_zero_point = _input->data_zero_point();
+ nnfw::cker::L2NormalizeQuant8(params, getShape(_input), getBuffer<uint8_t>(_input),
+ getShape(_output), getBuffer<uint8_t>(_output));
}
break;
diff --git a/runtime/onert/backend/cpu/ops/LSTMLayer.cc b/runtime/onert/backend/cpu/ops/LSTMLayer.cc
index a1b67565b..16b0feec8 100644
--- a/runtime/onert/backend/cpu/ops/LSTMLayer.cc
+++ b/runtime/onert/backend/cpu/ops/LSTMLayer.cc
@@ -44,7 +44,7 @@ T *getOptionalOutputBuffer(onert::backend::IPortableTensor *tensor, std::vector<
else
{
assert(tensor->total_size() == total_size);
- return reinterpret_cast<T *>(tensor->buffer());
+ return getBuffer<T>(tensor);
}
}
@@ -58,28 +58,29 @@ inline void initializeStateBuffer(const onert::backend::IPortableTensor *tensor_
else
memset(buffer, 0, tensor_in->total_size());
}
-}
+} // namespace
void LSTMLayer::LSTMFloat()
{
- assert(_input->num_dimensions() >= 2 && _input->num_dimensions() <= 3);
+ auto in_shape = _input->getShape();
+ assert(in_shape.rank() >= 2 && in_shape.rank() <= 3);
int max_time, n_batch;
- if (_input->num_dimensions() == 3)
+ if (in_shape.rank() == 3)
{
- max_time = (_time_major) ? _input->dimension(0) : _input->dimension(1);
- n_batch = (_time_major) ? _input->dimension(1) : _input->dimension(0);
+ max_time = (_time_major) ? in_shape.dim(0) : in_shape.dim(1);
+ n_batch = (_time_major) ? in_shape.dim(1) : in_shape.dim(0);
}
else
{
max_time = 1;
- n_batch = _input->dimension(0);
+ n_batch = in_shape.dim(0);
}
- const int n_input = _input->dimension(_input->num_dimensions() - 1);
+ const int n_input = in_shape.dim(_input->getShape().rank() - 1);
const int aux_input_size = 0;
// n_cell and n_output will be the same size when there is no projection.
- const int n_cell = _input_to_output_weights->dimension(0);
- const int n_output = _recurrent_to_output_weights->dimension(1);
+ const int n_cell = _input_to_output_weights->getShape().dim(0);
+ const int n_output = _recurrent_to_output_weights->getShape().dim(1);
// Since we have already checked that weights are all there or none, we can
// check the existence of only one to get the condition.
@@ -89,14 +90,14 @@ void LSTMLayer::LSTMFloat()
float *output_state_buf = getOptionalOutputBuffer<float>(_output_state, &_output_state_vec,
_output_state_in->total_size());
float *cell_state_buf =
- getOptionalOutputBuffer<float>(_cell_state, &_cell_state_vec, _cell_state_in->total_size());
+ getOptionalOutputBuffer<float>(_cell_state, &_cell_state_vec, _cell_state_in->total_size());
initializeStateBuffer(_output_state_in, output_state_buf, _has_output_state_data);
initializeStateBuffer(_cell_state_in, cell_state_buf, _has_cell_state_data);
// Index the scratch buffers pointers to the global scratch buffer.
float *scratch_buffer_buf = getOptionalOutputBuffer<float>(
- _scratch_buffer, &_scratch_vec, n_batch * n_cell * (use_cifg ? 3 : 4) * sizeof(float));
+ _scratch_buffer, &_scratch_vec, n_batch * n_cell * (use_cifg ? 3 : 4) * sizeof(float));
float *input_gate_scratch = nullptr;
float *cell_gate_scratch = nullptr;
float *forget_gate_scratch = nullptr;
@@ -117,22 +118,25 @@ void LSTMLayer::LSTMFloat()
auto optional_tensor_ptr = [](const IPortableTensor *tensor) {
// If tensor is not given or its size is 0, consider it not given
- return (tensor && tensor->total_size() > 0) ? reinterpret_cast<float *>(tensor->buffer())
- : nullptr;
+ return (tensor && tensor->total_size() > 0) ? getBuffer<float>(tensor) : nullptr;
};
// Optional inputs
- float *input_to_input_weights_ptr = optional_tensor_ptr(_input_to_input_weights);
- float *recurrent_to_input_weights_ptr = optional_tensor_ptr(_recurrent_to_input_weights);
- float *cell_to_input_weights_ptr = optional_tensor_ptr(_cell_to_input_weights);
- float *cell_to_forget_weights_ptr = optional_tensor_ptr(_cell_to_forget_weights);
- float *cell_to_output_weights_ptr = optional_tensor_ptr(_cell_to_output_weights);
- float *input_gate_bias_ptr = optional_tensor_ptr(_input_gate_bias);
- float *projection_weights_ptr = optional_tensor_ptr(_projection_weights);
- float *projection_bias_ptr = optional_tensor_ptr(_projection_bias);
- float *input_layer_norm_coefficients_ptr = optional_tensor_ptr(_input_layer_norm_coefficients);
- float *forget_layer_norm_coefficients_ptr = optional_tensor_ptr(_forget_layer_norm_coefficients);
- float *cell_layer_norm_coefficients_ptr = optional_tensor_ptr(_cell_layer_norm_coefficients);
- float *output_layer_norm_coefficients_ptr = optional_tensor_ptr(_output_layer_norm_coefficients);
+ const float *input_to_input_weights_ptr = optional_tensor_ptr(_input_to_input_weights);
+ const float *recurrent_to_input_weights_ptr = optional_tensor_ptr(_recurrent_to_input_weights);
+ const float *cell_to_input_weights_ptr = optional_tensor_ptr(_cell_to_input_weights);
+ const float *cell_to_forget_weights_ptr = optional_tensor_ptr(_cell_to_forget_weights);
+ const float *cell_to_output_weights_ptr = optional_tensor_ptr(_cell_to_output_weights);
+ const float *input_gate_bias_ptr = optional_tensor_ptr(_input_gate_bias);
+ const float *projection_weights_ptr = optional_tensor_ptr(_projection_weights);
+ const float *projection_bias_ptr = optional_tensor_ptr(_projection_bias);
+ const float *input_layer_norm_coefficients_ptr =
+ optional_tensor_ptr(_input_layer_norm_coefficients);
+ const float *forget_layer_norm_coefficients_ptr =
+ optional_tensor_ptr(_forget_layer_norm_coefficients);
+ const float *cell_layer_norm_coefficients_ptr =
+ optional_tensor_ptr(_cell_layer_norm_coefficients);
+ const float *output_layer_norm_coefficients_ptr =
+ optional_tensor_ptr(_output_layer_norm_coefficients);
// Copy out the LSTM specific params so they can be passed in the function.
nnfw::cker::LSTMParams lstm_params;
@@ -140,7 +144,8 @@ void LSTMLayer::LSTMFloat()
lstm_params.cell_clip = _params.cell_threshold;
lstm_params.proj_clip = _params.projection_threshold;
- const int output_batch_leading_dim = _output->dimension(_output->num_dimensions() - 1);
+ auto out_shape = _output->getShape();
+ const int output_batch_leading_dim = out_shape.dim(out_shape.rank() - 1);
if (_time_major)
{
// Loop through the sequence.
@@ -151,36 +156,33 @@ void LSTMLayer::LSTMFloat()
// If this is the forward_sequence, step forward, otherwise step
// backwards.
const int t_rel = _forward_sequence ? t : max_time - t - 1;
- const float *input_ptr = reinterpret_cast<float *>(_input->buffer()) + t_rel * input_step;
+ const float *input_ptr = getBuffer<float>(_input) + t_rel * input_step;
const float *aux_input_ptr = nullptr;
if (_aux_input)
{
- aux_input_ptr = reinterpret_cast<float *>(_aux_input->buffer()) + t_rel * input_step;
+ aux_input_ptr = getBuffer<float>(_aux_input) + t_rel * input_step;
}
- float *output_ptr =
- reinterpret_cast<float *>(_output->buffer()) + t_rel * output_step + _output_offset;
+ float *output_ptr = getBuffer<float>(_output) + t_rel * output_step + _output_offset;
LstmStepFloat(
- input_ptr, input_to_input_weights_ptr,
- reinterpret_cast<float *>(_input_to_forget_weights->buffer()),
- reinterpret_cast<float *>(_input_to_cell_weights->buffer()),
- reinterpret_cast<float *>(_input_to_output_weights->buffer()), aux_input_ptr,
- /*aux_input_to_input_weights=*/nullptr,
- /*aux_input_to_forget_weights=*/nullptr,
- /*aux_input_to_cell_weights=*/nullptr,
- /*aux_input_to_output_weights=*/nullptr, recurrent_to_input_weights_ptr,
- reinterpret_cast<float *>(_recurrent_to_forget_weights->buffer()),
- reinterpret_cast<float *>(_recurrent_to_cell_weights->buffer()),
- reinterpret_cast<float *>(_recurrent_to_output_weights->buffer()),
- cell_to_input_weights_ptr, cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
- input_layer_norm_coefficients_ptr, forget_layer_norm_coefficients_ptr,
- cell_layer_norm_coefficients_ptr, output_layer_norm_coefficients_ptr, input_gate_bias_ptr,
- reinterpret_cast<float *>(_forget_gate_bias->buffer()),
- reinterpret_cast<float *>(_cell_gate_bias->buffer()),
- reinterpret_cast<float *>(_output_gate_bias->buffer()), projection_weights_ptr,
- projection_bias_ptr, &lstm_params, n_batch, n_cell, n_input, aux_input_size, n_output,
- output_batch_leading_dim, output_state_buf, cell_state_buf, input_gate_scratch,
- forget_gate_scratch, cell_gate_scratch, output_gate_scratch, output_ptr);
+ input_ptr, input_to_input_weights_ptr, getBuffer<float>(_input_to_forget_weights),
+ getBuffer<float>(_input_to_cell_weights), getBuffer<float>(_input_to_output_weights),
+ aux_input_ptr,
+ /*aux_input_to_input_weights=*/nullptr,
+ /*aux_input_to_forget_weights=*/nullptr,
+ /*aux_input_to_cell_weights=*/nullptr,
+ /*aux_input_to_output_weights=*/nullptr, recurrent_to_input_weights_ptr,
+ getBuffer<float>(_recurrent_to_forget_weights),
+ getBuffer<float>(_recurrent_to_cell_weights),
+ getBuffer<float>(_recurrent_to_output_weights), cell_to_input_weights_ptr,
+ cell_to_forget_weights_ptr, cell_to_output_weights_ptr, input_layer_norm_coefficients_ptr,
+ forget_layer_norm_coefficients_ptr, cell_layer_norm_coefficients_ptr,
+ output_layer_norm_coefficients_ptr, input_gate_bias_ptr,
+ getBuffer<float>(_forget_gate_bias), getBuffer<float>(_cell_gate_bias),
+ getBuffer<float>(_output_gate_bias), projection_weights_ptr, projection_bias_ptr,
+ &lstm_params, n_batch, n_cell, n_input, aux_input_size, n_output, output_batch_leading_dim,
+ output_state_buf, cell_state_buf, input_gate_scratch, forget_gate_scratch,
+ cell_gate_scratch, output_gate_scratch, output_ptr);
}
}
else
@@ -195,78 +197,69 @@ void LSTMLayer::LSTMFloat()
// backwards.
const int t_rel = _forward_sequence ? t : max_time - t - 1;
const int time_offset = b * max_time + t_rel;
- const float *input_ptr =
- reinterpret_cast<float *>(_input->buffer()) + time_offset * input_step;
+ const float *input_ptr = getBuffer<float>(_input) + time_offset * input_step;
const float *aux_input_ptr = nullptr;
if (_aux_input)
{
- aux_input_ptr =
- reinterpret_cast<float *>(_aux_input->buffer()) + time_offset * input_step;
+ aux_input_ptr = getBuffer<float>(_aux_input) + time_offset * input_step;
}
- float *output_ptr = reinterpret_cast<float *>(_output->buffer()) +
- time_offset * output_step + _output_offset;
+ float *output_ptr = getBuffer<float>(_output) + time_offset * output_step + _output_offset;
// Offset the {output,cell}_state pointers to the right batch.
float *output_state_ptr = output_state_buf + b * output_batch_leading_dim;
float *cell_state_ptr = cell_state_buf + b * n_cell;
// Offset the scratch pointers to the right batch.
float *input_gate_scratch_ptr =
- input_gate_scratch ? input_gate_scratch + b * n_cell : nullptr;
+ input_gate_scratch ? input_gate_scratch + b * n_cell : nullptr;
float *forget_gate_scratch_ptr = forget_gate_scratch + b * n_cell;
float *cell_gate_scratch_ptr = cell_gate_scratch + b * n_cell;
float *output_gate_scratch_ptr = output_gate_scratch + b * n_cell;
LstmStepFloat(
- input_ptr, input_to_input_weights_ptr,
- reinterpret_cast<float *>(_input_to_forget_weights->buffer()),
- reinterpret_cast<float *>(_input_to_cell_weights->buffer()),
- reinterpret_cast<float *>(_input_to_output_weights->buffer()), aux_input_ptr,
- /*aux_input_to_input_weights=*/nullptr,
- /*aux_input_to_forget_weights=*/nullptr,
- /*aux_input_to_cell_weights=*/nullptr,
- /*aux_input_to_output_weights=*/nullptr, recurrent_to_input_weights_ptr,
- reinterpret_cast<float *>(_recurrent_to_forget_weights->buffer()),
- reinterpret_cast<float *>(_recurrent_to_cell_weights->buffer()),
- reinterpret_cast<float *>(_recurrent_to_output_weights->buffer()),
- cell_to_input_weights_ptr, cell_to_forget_weights_ptr, cell_to_output_weights_ptr,
- input_layer_norm_coefficients_ptr, forget_layer_norm_coefficients_ptr,
- cell_layer_norm_coefficients_ptr, output_layer_norm_coefficients_ptr,
- input_gate_bias_ptr, reinterpret_cast<float *>(_forget_gate_bias->buffer()),
- reinterpret_cast<float *>(_cell_gate_bias->buffer()),
- reinterpret_cast<float *>(_output_gate_bias->buffer()), projection_weights_ptr,
- projection_bias_ptr, &lstm_params, /*n_batch=*/1, n_cell, n_input, aux_input_size,
- n_output, output_batch_leading_dim, output_state_ptr, cell_state_ptr,
- input_gate_scratch_ptr, forget_gate_scratch_ptr, cell_gate_scratch_ptr,
- output_gate_scratch_ptr, output_ptr);
+ input_ptr, input_to_input_weights_ptr, getBuffer<float>(_input_to_forget_weights),
+ getBuffer<float>(_input_to_cell_weights), getBuffer<float>(_input_to_output_weights),
+ aux_input_ptr,
+ /*aux_input_to_input_weights=*/nullptr,
+ /*aux_input_to_forget_weights=*/nullptr,
+ /*aux_input_to_cell_weights=*/nullptr,
+ /*aux_input_to_output_weights=*/nullptr, recurrent_to_input_weights_ptr,
+ getBuffer<float>(_recurrent_to_forget_weights),
+ getBuffer<float>(_recurrent_to_cell_weights),
+ getBuffer<float>(_recurrent_to_output_weights), cell_to_input_weights_ptr,
+ cell_to_forget_weights_ptr, cell_to_output_weights_ptr, input_layer_norm_coefficients_ptr,
+ forget_layer_norm_coefficients_ptr, cell_layer_norm_coefficients_ptr,
+ output_layer_norm_coefficients_ptr, input_gate_bias_ptr,
+ getBuffer<float>(_forget_gate_bias), getBuffer<float>(_cell_gate_bias),
+ getBuffer<float>(_output_gate_bias), projection_weights_ptr, projection_bias_ptr,
+ &lstm_params, /*n_batch=*/1, n_cell, n_input, aux_input_size, n_output,
+ output_batch_leading_dim, output_state_ptr, cell_state_ptr, input_gate_scratch_ptr,
+ forget_gate_scratch_ptr, cell_gate_scratch_ptr, output_gate_scratch_ptr, output_ptr);
}
}
}
}
void LSTMLayer::configure(
- const IPortableTensor *input, const IPortableTensor *input_to_input_weights,
- const IPortableTensor *input_to_forget_weights, const IPortableTensor *input_to_cell_weights,
- const IPortableTensor *input_to_output_weights,
- const IPortableTensor *recurrent_to_input_weights,
- const IPortableTensor *recurrent_to_forget_weights,
- const IPortableTensor *recurrent_to_cell_weights,
- const IPortableTensor *recurrent_to_output_weights,
- const IPortableTensor *cell_to_input_weights, const IPortableTensor *cell_to_forget_weights,
- const IPortableTensor *cell_to_output_weights, const IPortableTensor *input_layer_norm_weights,
- const IPortableTensor *forget_layer_norm_weights,
- const IPortableTensor *cell_layer_norm_weights,
- const IPortableTensor *output_layer_norm_weights, const IPortableTensor *aux_input,
- const IPortableTensor *aux_input_to_input_weights,
- const IPortableTensor *aux_input_to_forget_weights,
- const IPortableTensor *aux_input_to_cell_weights,
- const IPortableTensor *aux_input_to_output_weights, const IPortableTensor *input_gate_bias,
- const IPortableTensor *forget_gate_bias, const IPortableTensor *cell_gate_bias,
- const IPortableTensor *output_gate_bias, const IPortableTensor *projection_weights,
- const IPortableTensor *projection_bias, const IPortableTensor *output_state_in,
- const IPortableTensor *cell_state_in, const ir::operation::LSTM::Param &params,
- bool forward_sequence, bool time_major, int output_offset, IPortableTensor *scratch_buffer,
- IPortableTensor *output_state, IPortableTensor *cell_state, IPortableTensor *output,
- bool has_output_state_data, bool has_cell_state_data)
+ const IPortableTensor *input, const IPortableTensor *input_to_input_weights,
+ const IPortableTensor *input_to_forget_weights, const IPortableTensor *input_to_cell_weights,
+ const IPortableTensor *input_to_output_weights, const IPortableTensor *recurrent_to_input_weights,
+ const IPortableTensor *recurrent_to_forget_weights,
+ const IPortableTensor *recurrent_to_cell_weights,
+ const IPortableTensor *recurrent_to_output_weights, const IPortableTensor *cell_to_input_weights,
+ const IPortableTensor *cell_to_forget_weights, const IPortableTensor *cell_to_output_weights,
+ const IPortableTensor *input_layer_norm_weights, const IPortableTensor *forget_layer_norm_weights,
+ const IPortableTensor *cell_layer_norm_weights, const IPortableTensor *output_layer_norm_weights,
+ const IPortableTensor *aux_input, const IPortableTensor *aux_input_to_input_weights,
+ const IPortableTensor *aux_input_to_forget_weights,
+ const IPortableTensor *aux_input_to_cell_weights,
+ const IPortableTensor *aux_input_to_output_weights, const IPortableTensor *input_gate_bias,
+ const IPortableTensor *forget_gate_bias, const IPortableTensor *cell_gate_bias,
+ const IPortableTensor *output_gate_bias, const IPortableTensor *projection_weights,
+ const IPortableTensor *projection_bias, const IPortableTensor *output_state_in,
+ const IPortableTensor *cell_state_in, const ir::operation::LSTM::Param &params,
+ bool forward_sequence, bool time_major, int output_offset, IPortableTensor *scratch_buffer,
+ IPortableTensor *output_state, IPortableTensor *cell_state, IPortableTensor *output,
+ bool has_output_state_data, bool has_cell_state_data)
{
_input = input;
_input_to_input_weights = input_to_input_weights;
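The LSTM rewrite is mostly mechanical: getShape().rank() and dim() replace num_dimensions() and dimension(), and the optional weight pointers become const-correct. The scratch arithmetic deserves a note: the scratch buffer holds n_batch * n_cell floats per gate, four gates normally and three under CIFG, where the input gate is coupled to the forget gate. One plausible partition as a sketch; the gate ordering here is an assumption, the real layout is owned by the cker implementation:

// Partition a flat scratch buffer into per-gate regions (sketch; gate order assumed).
void partition_scratch(float *scratch, int n_batch, int n_cell, bool use_cifg,
                       float **input_gate, float **forget_gate,
                       float **cell_gate, float **output_gate)
{
  const int stride = n_batch * n_cell;
  *input_gate = use_cifg ? nullptr : scratch; // absent under CIFG
  float *base = use_cifg ? scratch : scratch + stride;
  *forget_gate = base;
  *cell_gate = base + stride;
  *output_gate = base + 2 * stride;
}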
diff --git a/runtime/onert/backend/cpu/ops/LSTMLayer.h b/runtime/onert/backend/cpu/ops/LSTMLayer.h
index 5978cce63..72ac2ed04 100644
--- a/runtime/onert/backend/cpu/ops/LSTMLayer.h
+++ b/runtime/onert/backend/cpu/ops/LSTMLayer.h
@@ -49,33 +49,30 @@ public:
public:
void LSTMFloat();
- void configure(const IPortableTensor *input, const IPortableTensor *input_to_input_weights,
- const IPortableTensor *input_to_forget_weights,
- const IPortableTensor *input_to_cell_weights,
- const IPortableTensor *input_to_output_weights,
- const IPortableTensor *recurrent_to_input_weights,
- const IPortableTensor *recurrent_to_forget_weights,
- const IPortableTensor *recurrent_to_cell_weights,
- const IPortableTensor *recurrent_to_output_weights,
- const IPortableTensor *cell_to_input_weights,
- const IPortableTensor *cell_to_forget_weights,
- const IPortableTensor *cell_to_output_weights,
- const IPortableTensor *input_layer_norm_weights,
- const IPortableTensor *forget_layer_norm_weights,
- const IPortableTensor *cell_layer_norm_weights,
- const IPortableTensor *output_layer_norm_weights, const IPortableTensor *aux_input,
- const IPortableTensor *aux_input_to_input_weights,
- const IPortableTensor *aux_input_to_forget_weights,
- const IPortableTensor *aux_input_to_cell_weights,
- const IPortableTensor *aux_input_to_output_weights,
- const IPortableTensor *input_gate_bias, const IPortableTensor *forget_gate_bias,
- const IPortableTensor *cell_gate_bias, const IPortableTensor *output_gate_bias,
- const IPortableTensor *projection_weights, const IPortableTensor *projection_bias,
- const IPortableTensor *output_state_in, const IPortableTensor *cell_state_in,
- const ir::operation::LSTM::Param &params, bool forward_sequence, bool time_major,
- int32_t output_offset, IPortableTensor *scratch_buffer,
- IPortableTensor *output_state, IPortableTensor *cell_state,
- IPortableTensor *output, bool has_output_state_data, bool has_cell_state_data);
+ void configure(
+ const IPortableTensor *input, const IPortableTensor *input_to_input_weights,
+ const IPortableTensor *input_to_forget_weights, const IPortableTensor *input_to_cell_weights,
+ const IPortableTensor *input_to_output_weights,
+ const IPortableTensor *recurrent_to_input_weights,
+ const IPortableTensor *recurrent_to_forget_weights,
+ const IPortableTensor *recurrent_to_cell_weights,
+ const IPortableTensor *recurrent_to_output_weights,
+ const IPortableTensor *cell_to_input_weights, const IPortableTensor *cell_to_forget_weights,
+ const IPortableTensor *cell_to_output_weights, const IPortableTensor *input_layer_norm_weights,
+ const IPortableTensor *forget_layer_norm_weights,
+ const IPortableTensor *cell_layer_norm_weights,
+ const IPortableTensor *output_layer_norm_weights, const IPortableTensor *aux_input,
+ const IPortableTensor *aux_input_to_input_weights,
+ const IPortableTensor *aux_input_to_forget_weights,
+ const IPortableTensor *aux_input_to_cell_weights,
+ const IPortableTensor *aux_input_to_output_weights, const IPortableTensor *input_gate_bias,
+ const IPortableTensor *forget_gate_bias, const IPortableTensor *cell_gate_bias,
+ const IPortableTensor *output_gate_bias, const IPortableTensor *projection_weights,
+ const IPortableTensor *projection_bias, const IPortableTensor *output_state_in,
+ const IPortableTensor *cell_state_in, const ir::operation::LSTM::Param &params,
+ bool forward_sequence, bool time_major, int32_t output_offset, IPortableTensor *scratch_buffer,
+ IPortableTensor *output_state, IPortableTensor *cell_state, IPortableTensor *output,
+ bool has_output_state_data, bool has_cell_state_data);
void run() override;
diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
index 1d7ee6caa..a544dd970 100644
--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
+++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
@@ -49,9 +49,8 @@ void LogSoftMaxLayer::logsoftmaxFloat32()
nnfw::cker::SoftmaxParams op_params;
op_params.beta = _beta;
op_params.axis = _axis;
- nnfw::cker::LogSoftmax(op_params, getTensorShape(_input),
- reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::LogSoftmax(op_params, getShape(_input), getBuffer<float>(_input), getShape(_output),
+ getBuffer<float>(_output));
}
void LogSoftMaxLayer::logsoftmaxQuant8()
@@ -60,11 +59,11 @@ void LogSoftMaxLayer::logsoftmaxQuant8()
op_params.beta = _beta;
op_params.axis = _axis;
op_params.table = _table;
- op_params.zero_point = _output->data_offset();
+ op_params.zero_point = _output->data_zero_point();
op_params.scale = _output->data_scale();
- nnfw::cker::LogSoftmax(op_params, _input->data_scale(), getTensorShape(_input),
- reinterpret_cast<const uint8_t *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+ nnfw::cker::LogSoftmax(op_params, _input->data_scale(), getShape(_input),
+ getBuffer<uint8_t>(_input), getShape(_output),
+ getBuffer<uint8_t>(_output));
}
void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis,
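For reference, the float path above computes log-softmax along the configured axis with temperature beta, stabilized by subtracting the slice maximum. A minimal sketch over one slice:

#include <algorithm>
#include <cmath>

void log_softmax(const float *x, float *y, int n, float beta)
{
  float max_x = x[0];
  for (int i = 1; i < n; ++i)
    max_x = std::max(max_x, x[i]);
  float sum = 0.f;
  for (int i = 0; i < n; ++i)
    sum += std::exp(beta * (x[i] - max_x));
  const float log_sum = std::log(sum);
  for (int i = 0; i < n; ++i)
    y[i] = beta * (x[i] - max_x) - log_sum; // y_i = beta*x_i - log(sum_j exp(beta*x_j))
}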
diff --git a/runtime/onert/backend/cpu/ops/MatrixBandPartLayer.cc b/runtime/onert/backend/cpu/ops/MatrixBandPartLayer.cc
index b770cce5d..7220a2bab 100644
--- a/runtime/onert/backend/cpu/ops/MatrixBandPartLayer.cc
+++ b/runtime/onert/backend/cpu/ops/MatrixBandPartLayer.cc
@@ -30,7 +30,7 @@ namespace ops
{
MatrixBandPartLayer::MatrixBandPartLayer()
- : _input(nullptr), _num_lower_diag(nullptr), _num_upper_diag(nullptr), _output(nullptr)
+ : _input(nullptr), _num_lower_diag(nullptr), _num_upper_diag(nullptr), _output(nullptr)
{
// DO NOTHING
}
@@ -40,18 +40,14 @@ void MatrixBandPartLayer::matrixBandPartFloat32()
if (_num_lower_diag->data_type() == OperandType::INT64)
{
nnfw::cker::MatrixBandPart<int64_t>(
- *reinterpret_cast<const int64_t *>(_num_lower_diag->buffer()),
- *reinterpret_cast<const int64_t *>(_num_upper_diag->buffer()), getTensorShape(_input),
- reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()));
+ *getBuffer<int64_t>(_num_lower_diag), *getBuffer<int64_t>(_num_upper_diag), getShape(_input),
+ getBuffer<float>(_input), getShape(_output), getBuffer<float>(_output));
}
else
{
nnfw::cker::MatrixBandPart<int32_t>(
- *reinterpret_cast<const int32_t *>(_num_lower_diag->buffer()),
- *reinterpret_cast<const int32_t *>(_num_upper_diag->buffer()), getTensorShape(_input),
- reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()));
+ *getBuffer<int32_t>(_num_lower_diag), *getBuffer<int32_t>(_num_upper_diag), getShape(_input),
+ getBuffer<float>(_input), getShape(_output), getBuffer<float>(_output));
}
}
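MatrixBandPart keeps a diagonal band of each innermost matrix; the *getBuffer<int64_t>(...) dereferences above read the scalar band bounds. The membership test, following the TensorFlow MatrixBandPart contract in which a negative bound keeps the whole triangle:

// Element (i, j) survives when it lies inside the band (sketch).
inline bool in_band(int i, int j, long num_lower, long num_upper)
{
  return (num_lower < 0 || (i - j) <= num_lower) &&
         (num_upper < 0 || (j - i) <= num_upper);
}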
diff --git a/runtime/onert/backend/cpu/ops/MeanLayer.cc b/runtime/onert/backend/cpu/ops/MeanLayer.cc
index f130692ee..c86a9d126 100644
--- a/runtime/onert/backend/cpu/ops/MeanLayer.cc
+++ b/runtime/onert/backend/cpu/ops/MeanLayer.cc
@@ -36,33 +36,29 @@ MeanLayer::MeanLayer() : _input(nullptr), _axes(nullptr), _output(nullptr), _kee
void MeanLayer::MeanFloat32()
{
- const auto inputShape = getTensorShape(_input);
+ const auto inputShape = getShape(_input);
const auto axisVec = getReducerAxes(_axes);
bool axis_is_1_and_2 =
- _keep_dims && inputShape.DimensionsCount() == 4 && axisVec.size() == 2 &&
- ((axisVec[0] == 1 && axisVec[1] == 2) || (axisVec[0] == 2 && axisVec[1] == 1));
+ _keep_dims && inputShape.DimensionsCount() == 4 && axisVec.size() == 2 &&
+ ((axisVec[0] == 1 && axisVec[1] == 2) || (axisVec[0] == 2 && axisVec[1] == 1));
if (axis_is_1_and_2)
{
- nnfw::cker::MeanAxis1And2(inputShape, reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::MeanAxis1And2(inputShape, getBuffer<float>(_input), getShape(_output),
+ getBuffer<float>(_output));
}
else
{
- nnfw::cker::Mean(inputShape, reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()),
- axisVec);
+ nnfw::cker::Mean(inputShape, getBuffer<float>(_input), getShape(_output),
+ getBuffer<float>(_output), axisVec);
}
}
void MeanLayer::MeanQuant8()
{
- nnfw::cker::MeanQ8Asymm(getTensorShape(_input),
- reinterpret_cast<const uint8_t *>(_input->buffer()), _input->data_scale(),
- _input->data_offset(), getTensorShape(_output),
- reinterpret_cast<uint8_t *>(_output->buffer()), _output->data_scale(),
- _output->data_offset(), getReducerAxes(_axes));
+ nnfw::cker::MeanQ8Asymm(getShape(_input), getBuffer<uint8_t>(_input), _input->data_scale(),
+ _input->data_zero_point(), getShape(_output), getBuffer<uint8_t>(_output),
+ _output->data_scale(), _output->data_zero_point(), getReducerAxes(_axes));
}
void MeanLayer::configure(const IPortableTensor *input, const IPortableTensor *axes,
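The axis_is_1_and_2 branch above routes rank-4 NHWC reductions over the spatial axes to a dedicated kernel; with keep_dims set it is a per-(batch, channel) spatial mean. A sketch of the equivalent slow path:

// Mean over H and W of an NHWC tensor (sketch).
void mean_hw_nhwc(const float *in, float *out, int n, int h, int w, int c)
{
  for (int b = 0; b < n; ++b)
    for (int k = 0; k < c; ++k)
    {
      float sum = 0.f;
      for (int y = 0; y < h; ++y)
        for (int x = 0; x < w; ++x)
          sum += in[((b * h + y) * w + x) * c + k];
      out[b * c + k] = sum / static_cast<float>(h * w);
    }
}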
diff --git a/runtime/onert/backend/cpu/ops/OneHotLayer.cc b/runtime/onert/backend/cpu/ops/OneHotLayer.cc
index 2a82b00ee..66773a608 100644
--- a/runtime/onert/backend/cpu/ops/OneHotLayer.cc
+++ b/runtime/onert/backend/cpu/ops/OneHotLayer.cc
@@ -33,10 +33,8 @@ template <typename T> void OneHotLayer::oneHotImpl()
{
// It assumes index is int32_t type.
nnfw::cker::OneHot<T, int32_t>(
- *reinterpret_cast<const int32_t *>(_depth->buffer()),
- *reinterpret_cast<T *>(_on_value->buffer()), *reinterpret_cast<T *>(_off_value->buffer()),
- _axis, getTensorShape(_indices), reinterpret_cast<const int32_t *>(_indices->buffer()),
- getTensorShape(_output), reinterpret_cast<T *>(_output->buffer()));
+ *getBuffer<int32_t>(_depth), *getBuffer<T>(_on_value), *getBuffer<T>(_off_value), _axis,
+ getShape(_indices), getBuffer<int32_t>(_indices), getShape(_output), getBuffer<T>(_output));
}
void OneHotLayer::configure(const IPortableTensor *indices, const IPortableTensor *depth,
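The scalar dereferences above (depth, on_value, off_value) feed nnfw::cker::OneHot, which expands int32 indices into a one-hot axis. For the common axis == -1 case the semantics are, as a sketch:

#include <cstdint>

template <typename T>
void one_hot_last_axis(const int32_t *indices, int n, int32_t depth,
                       T on_value, T off_value, T *out)
{
  for (int i = 0; i < n; ++i)
    for (int32_t d = 0; d < depth; ++d)
      out[i * depth + d] = (indices[i] == d) ? on_value : off_value;
}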
diff --git a/runtime/onert/backend/cpu/ops/OneHotLayer.h b/runtime/onert/backend/cpu/ops/OneHotLayer.h
index c05498440..b0f03a261 100644
--- a/runtime/onert/backend/cpu/ops/OneHotLayer.h
+++ b/runtime/onert/backend/cpu/ops/OneHotLayer.h
@@ -34,8 +34,8 @@ class OneHotLayer : public ::onert::exec::IFunction
{
public:
OneHotLayer()
- : _indices(nullptr), _depth(nullptr), _on_value(nullptr), _off_value(nullptr),
- _output(nullptr), _axis(-1)
+ : _indices(nullptr), _depth(nullptr), _on_value(nullptr), _off_value(nullptr), _output(nullptr),
+ _axis(-1)
{
// DO NOTHING
}
diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.cc b/runtime/onert/backend/cpu/ops/OperationUtils.cc
index 2eee6dc85..8ac875842 100644
--- a/runtime/onert/backend/cpu/ops/OperationUtils.cc
+++ b/runtime/onert/backend/cpu/ops/OperationUtils.cc
@@ -32,16 +32,17 @@ namespace ops
uint32_t getNumberOfDimensions(const IPortableTensor *tensor)
{
assert(tensor);
- return tensor->num_dimensions();
+ return tensor->getShape().rank();
}
uint32_t getNumberOfElements(const IPortableTensor *tensor)
{
assert(tensor);
uint32_t count = 1;
- for (size_t i = 0; i < tensor->num_dimensions(); i++)
+ auto shape = tensor->getShape();
+ for (int i = 0; i < shape.rank(); i++)
{
- count *= tensor->dimension(i);
+ count *= shape.dim(i);
}
return count;
}
@@ -49,12 +50,13 @@ uint32_t getNumberOfElements(const IPortableTensor *tensor)
uint32_t getSizeOfDimension(const IPortableTensor *tensor, uint32_t dimensionIdx)
{
assert(tensor);
- if (dimensionIdx >= tensor->num_dimensions())
+ auto shape = tensor->getShape();
+ if (dimensionIdx >= static_cast<uint32_t>(shape.rank()))
{
// TODO: log the error
return 0;
}
- return tensor->dimension(dimensionIdx);
+ return shape.dim(dimensionIdx);
}
void QuantizeMultiplier(double double_multiplier, int32_t *quantized_multiplier, int *shift)
@@ -94,6 +96,34 @@ void GetQuantizedConvolutionMultiplier(const IPortableTensor *input, const IPort
*multiplier = input_product_scale / output_scale;
}
+void GetQuantizedConvolutionMultipliersAndShifts(
+ float input_scale, float output_scale, const float *filter_scales, size_t filter_scales_size,
+ int num_channels, std::vector<int32_t> &per_channel_output_multiplier,
+ std::vector<int> &per_channel_output_shift)
+{
+ // Originates from tflite's PopulateConvolutionQuantizationParams()
+ per_channel_output_multiplier.resize(num_channels);
+ per_channel_output_shift.resize(num_channels);
+
+ const bool is_per_channel = filter_scales_size > 1;
+ auto per_channel_multiplier = per_channel_output_multiplier.data();
+ auto per_channel_shift = per_channel_output_shift.data();
+ for (int i = 0; i < num_channels; ++i)
+ {
+ // If per-tensor quantization parameter is specified, broadcast it along the
+ // quantization dimension (channels_out).
+ const float scale = is_per_channel ? filter_scales[i] : filter_scales[0];
+ const double filter_scale = static_cast<double>(scale);
+ const double effective_output_scale =
+ static_cast<double>(input_scale) * filter_scale / static_cast<double>(output_scale);
+ int32_t significand;
+ int channel_shift;
+ QuantizeMultiplier(effective_output_scale, &significand, &channel_shift);
+ per_channel_multiplier[i] = significand;
+ per_channel_shift[i] = channel_shift;
+ }
+}
+
void QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantized_multiplier,
int *left_shift)
{
@@ -111,13 +141,29 @@ void QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantiz
*quantized_multiplier = static_cast<int32_t>(q_fixed);
}
-void CalculateActivationRangeUint8(ir::Activation activation, const IPortableTensor *output,
- int32_t *act_min, int32_t *act_max)
+void CalculateActivationRangeQuantized(ir::Activation activation, const IPortableTensor *output,
+ int32_t *act_min, int32_t *act_max)
{
- const int32_t qmin = std::numeric_limits<uint8_t>::min();
- const int32_t qmax = std::numeric_limits<uint8_t>::max();
+ int32_t qmin = 0;
+ int32_t qmax = 0;
+
+ switch (output->data_type())
+ {
+ case OperandType::QUANT_UINT8_ASYMM:
+ qmin = std::numeric_limits<uint8_t>::min();
+ qmax = std::numeric_limits<uint8_t>::max();
+ break;
+ case OperandType::QUANT_INT8_ASYMM:
+ case OperandType::QUANT_INT8_SYMM:
+ qmin = std::numeric_limits<int8_t>::min();
+ qmax = std::numeric_limits<int8_t>::max();
+ break;
+ default:
+ throw std::runtime_error("CalculateActivationRangeQuantized: Not supported operand type.");
+ }
+
const auto scale = output->data_scale();
- const auto zero_point = output->data_offset();
+ const auto zero_point = output->data_zero_point();
auto quantize = [scale, zero_point](float f) {
return zero_point + static_cast<int32_t>(std::round(f / scale));
};
@@ -167,8 +213,10 @@ bool HaveSameShapes(const IPortableTensor *input1, const IPortableTensor *input2
if (getNumberOfDimensions(input1) != getNumberOfDimensions(input2))
return false;
+ auto shape1 = input1->getShape();
+ auto shape2 = input2->getShape();
for (uint32_t i = 0; i < getNumberOfDimensions(input1); i++)
- if (input1->dimension(i) != input2->dimension(i))
+ if (shape1.dim(i) != shape2.dim(i))
return false;
return true;
@@ -237,20 +285,21 @@ std::vector<int32_t> getReducerAxes(const IPortableTensor *axes)
{
std::vector<int32_t> ret;
+ auto axes_vals = (axes->getShape().rank() == 0) ? 1 : axes->getShape().dim(0);
assert(axes->layout() == ir::Layout::NHWC);
- assert(axes->dimension(0) == axes->getShape().num_elements());
+ assert(static_cast<size_t>(axes_vals) == axes->getShape().num_elements());
switch (axes->data_type())
{
case ir::DataType::INT32:
{
- for (size_t i = 0; i < axes->dimension(0); ++i)
- ret.emplace_back(*(reinterpret_cast<const int32_t *>(axes->buffer()) + i));
+ for (int i = 0; i < axes_vals; ++i)
+ ret.emplace_back(*(getBuffer<int32_t>(axes) + i));
break;
}
case ir::DataType::INT64:
{
- for (size_t i = 0; i < axes->dimension(0); ++i)
- ret.emplace_back(*(reinterpret_cast<const int64_t *>(axes->buffer()) + i));
+ for (int i = 0; i < axes_vals; ++i)
+ ret.emplace_back(*(getBuffer<int64_t>(axes) + i));
break;
}
default:
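The axes_vals expression above exists for the rank-0 case: a scalar axes tensor still carries exactly one axis value, while dim(0) is only meaningful from rank 1 up. Restated as a standalone helper (a sketch, not part of the patch):

// Returns how many axis values an axes tensor holds: 1 for a scalar (rank 0),
// otherwise the length of its single dimension.
int32_t numAxisValues(const IPortableTensor *axes)
{
  const auto shape = axes->getShape();
  return (shape.rank() == 0) ? 1 : shape.dim(0);
}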
diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h
index ea44aeb7a..ac2fbb84f 100644
--- a/runtime/onert/backend/cpu/ops/OperationUtils.h
+++ b/runtime/onert/backend/cpu/ops/OperationUtils.h
@@ -74,7 +74,8 @@ inline nnfw::cker::Shape getExtendedTensorShape(const IPortableTensor *tensor)
assert(tensor);
const int32_t extended_rank = 4;
int32_t raw_shape[extended_rank];
- uint32_t src = extended_rank - tensor->num_dimensions();
+ auto shape = tensor->getShape();
+ uint32_t src = extended_rank - shape.rank();
for (uint32_t i = 0; i < extended_rank; ++i)
{
if (i < src)
@@ -83,14 +84,14 @@ inline nnfw::cker::Shape getExtendedTensorShape(const IPortableTensor *tensor)
}
else
{
- raw_shape[i] = tensor->dimension(i - src);
+ raw_shape[i] = shape.dim(i - src);
}
}
return nnfw::cker::Shape(extended_rank, raw_shape);
}
-inline nnfw::cker::Shape getTensorShape(const IPortableTensor *tensor)
+inline nnfw::cker::Shape getShape(const IPortableTensor *tensor)
{
if (tensor == nullptr)
return nnfw::cker::Shape();
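getExtendedTensorShape left-pads a shape with 1s up to rank 4 so rank-agnostic cker kernels can treat every tensor as 4-D; a rank-2 shape [3, 4], for example, becomes [1, 1, 3, 4]. The same rule as a standalone sketch:

#include <array>
#include <cstddef>
#include <cstdint>

// Sketch of the padding rule for a rank-2 shape: {3, 4} -> {1, 1, 3, 4}.
std::array<int32_t, 4> extendTo4D(const std::array<int32_t, 2> &dims)
{
  std::array<int32_t, 4> raw_shape{1, 1, 1, 1}; // left-pad with 1s
  const std::size_t src = 4 - dims.size();      // two leading 1s here
  for (std::size_t i = src; i < 4; ++i)
    raw_shape[i] = dims[i - src];
  return raw_shape;
}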
@@ -160,6 +161,11 @@ void GetQuantizedConvolutionMultiplier(const IPortableTensor *inputDescr,
void QuantizeMultiplierGreaterThanOne(double double_multiplier, int32_t *quantized_multiplier,
int *left_shift);
+void GetQuantizedConvolutionMultipliersAndShifts(
+ float input_scale, float output_scale, const float *filter_scales, size_t filter_scales_size,
+ int num_channels, std::vector<int32_t> &per_channel_output_multiplier,
+ std::vector<int> &per_channel_output_shift);
+
template <typename T>
void CalculateActivationRange(ir::Activation activation, T *activation_min, T *activation_max)
{
@@ -194,8 +200,8 @@ void CalculateActivationRange(ir::Activation activation, T *activation_min, T *a
}
}
-void CalculateActivationRangeUint8(ir::Activation activation, const IPortableTensor *output,
- int32_t *act_min, int32_t *act_max);
+void CalculateActivationRangeQuantized(ir::Activation activation, const IPortableTensor *output,
+ int32_t *act_min, int32_t *act_max);
bool HaveSameShapes(const IPortableTensor *input1, const IPortableTensor *input2);
@@ -207,6 +213,16 @@ nnfw::cker::PaddingType getPaddingType(ir::PaddingType ir_padding_type);
std::vector<int32_t> getReducerAxes(const IPortableTensor *axes);
+template <typename T> const T *getBuffer(const IPortableTensor *tensor)
+{
+ return reinterpret_cast<const T *>(tensor->buffer());
+}
+
+template <typename T> T *getBuffer(IPortableTensor *tensor)
+{
+ return reinterpret_cast<T *>(tensor->buffer());
+}
+
} // namespace ops
} // namespace cpu
} // namespace backend
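Most of the churn in this patch is mechanical substitution of these getBuffer helpers for hand-written reinterpret_casts of IPortableTensor::buffer(); const-correctness then falls out of overload resolution rather than the caller's cast. A small usage sketch:

// The const overload is chosen for const tensors and yields const T *;
// the non-const overload yields a writable T *.
float readFirst(const IPortableTensor *input) { return getBuffer<float>(input)[0]; }
void zeroFirst(IPortableTensor *output) { getBuffer<float>(output)[0] = 0.0f; }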
diff --git a/runtime/onert/backend/cpu/ops/PackLayer.cc b/runtime/onert/backend/cpu/ops/PackLayer.cc
index 314b192a2..beac6c73b 100644
--- a/runtime/onert/backend/cpu/ops/PackLayer.cc
+++ b/runtime/onert/backend/cpu/ops/PackLayer.cc
@@ -48,7 +48,7 @@ template <typename T> void PackLayer::packImpl()
for (uint32_t i = 0; i < num_inputs; i++)
{
- inputDims.push_back(getTensorShape(_inputs[i]));
+ inputDims.push_back(getShape(_inputs[i]));
inputDimsPtr.push_back(&inputDims[i]);
}
@@ -56,11 +56,10 @@ template <typename T> void PackLayer::packImpl()
for (const auto input : _inputs)
{
- inputPtrs.emplace_back(reinterpret_cast<const T *>(input->buffer()));
+ inputPtrs.emplace_back(getBuffer<T>(input));
}
- nnfw::cker::Pack<T>(op_params, inputPtrs.data(), getTensorShape(_output),
- reinterpret_cast<T *>(_output->buffer()));
+ nnfw::cker::Pack<T>(op_params, inputPtrs.data(), getShape(_output), getBuffer<T>(_output));
}
void PackLayer::configure(const std::vector<const IPortableTensor *> &inputs, int32_t axis,
diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc
index 6a2bf9da0..d9da564c4 100644
--- a/runtime/onert/backend/cpu/ops/PadLayer.cc
+++ b/runtime/onert/backend/cpu/ops/PadLayer.cc
@@ -28,16 +28,15 @@ namespace ops
{
PadLayer::PadLayer()
- : _input(nullptr), _output(nullptr), _padData(), _padRank(), _constantValueData()
+ : _input(nullptr), _output(nullptr), _padData(), _padRank(), _constantValueData()
{
// DO NOTHING
}
template <typename T> void PadLayer::padImpl(const T *constant_value_data)
{
- nnfw::cker::Pad<T>(_padData, _padRank, getTensorShape(_input),
- reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<T *>(_output->buffer()), constant_value_data);
+ nnfw::cker::Pad<T>(_padData, _padRank, getShape(_input), getBuffer<T>(_input), getShape(_output),
+ getBuffer<T>(_output), constant_value_data);
}
void PadLayer::configure(const IPortableTensor *input, IPortableTensor *output,
@@ -52,25 +51,35 @@ void PadLayer::configure(const IPortableTensor *input, IPortableTensor *output,
void PadLayer::run()
{
- if (_input->data_type() == OperandType::FLOAT32)
+ switch (_input->data_type())
{
- padImpl<float>(_constantValueData.f);
- }
- else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
- {
- if (_constantValueData.u8 == nullptr)
- {
- uint8_t pad_value = static_cast<uint8_t>(_output->data_offset());
- padImpl<uint8_t>(&pad_value);
- }
- else
- {
- padImpl<uint8_t>(_constantValueData.u8);
- }
- }
- else
- {
- throw std::runtime_error{"Pad: unsupported data type"};
+ case OperandType::FLOAT32:
+ padImpl<float>(_constantValueData.f);
+ break;
+ case OperandType::QUANT_UINT8_ASYMM:
+ if (_constantValueData.u8 == nullptr)
+ {
+ uint8_t pad_value = static_cast<uint8_t>(_output->data_zero_point());
+ padImpl<uint8_t>(&pad_value);
+ }
+ else
+ {
+ padImpl<uint8_t>(_constantValueData.u8);
+ }
+ break;
+ case OperandType::QUANT_INT8_ASYMM:
+ if (_constantValueData.i8 == nullptr)
+ {
+ int8_t pad_value = static_cast<int8_t>(_output->data_zero_point());
+ padImpl<int8_t>(&pad_value);
+ }
+ else
+ {
+ padImpl<int8_t>(_constantValueData.i8);
+ }
+ break;
+ default:
+ throw std::runtime_error{"Pad: unsupported data type"};
}
}
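The default pad values above follow from the affine quantization formula real = scale * (q - zero_point): the quantized code equal to the zero point is the encoding of real 0.0, so padding with it keeps the border at 0.0 after dequantization. As a one-line sketch:

#include <cstdint>

// q == zero_point dequantizes to scale * (zero_point - zero_point) == 0.0f.
int8_t defaultPadValue(int32_t zero_point) { return static_cast<int8_t>(zero_point); }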
diff --git a/runtime/onert/backend/cpu/ops/PoolLayer.cc b/runtime/onert/backend/cpu/ops/PoolLayer.cc
index 85d02a751..101b6f266 100644
--- a/runtime/onert/backend/cpu/ops/PoolLayer.cc
+++ b/runtime/onert/backend/cpu/ops/PoolLayer.cc
@@ -36,18 +36,16 @@ template <typename T>
void avgPool2D(const nnfw::cker::PoolParams &params, const IPortableTensor *input,
IPortableTensor *output)
{
- nnfw::cker::AveragePool<T>(params, getTensorShape(input),
- reinterpret_cast<const T *>(input->buffer()), getTensorShape(output),
- reinterpret_cast<T *>(output->buffer()));
+ nnfw::cker::AveragePool<T>(params, getShape(input), getBuffer<T>(input), getShape(output),
+ getBuffer<T>(output));
}
template <typename T>
void maxPool2D(const nnfw::cker::PoolParams &params, const IPortableTensor *input,
IPortableTensor *output)
{
- nnfw::cker::MaxPool<T>(params, getTensorShape(input),
- reinterpret_cast<const T *>(input->buffer()), getTensorShape(output),
- reinterpret_cast<T *>(output->buffer()));
+ nnfw::cker::MaxPool<T>(params, getShape(input), getBuffer<T>(input), getShape(output),
+ getBuffer<T>(output));
}
template <typename T>
@@ -96,29 +94,44 @@ void PoolLayer::configure(const IPortableTensor *input, const uint32_t paddingLe
_output = output;
POOLING_PARAMETERS
- if (_input->data_type() == OperandType::FLOAT32)
- {
- float output_activation_min = 0;
- float output_activation_max = 0;
- CalculateActivationRange<float>(activation, &output_activation_min, &output_activation_max);
- op_params.float_activation_min = output_activation_min;
- op_params.float_activation_max = output_activation_max;
- _kernel = generateKernelGeneric<float>(op_params, op_type);
- }
- else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
- {
- int32_t output_activation_min = 0;
- int32_t output_activation_max = 0;
- CalculateActivationRangeUint8(activation, _output, &output_activation_min,
- &output_activation_max);
- op_params.quantized_activation_min = output_activation_min;
- op_params.quantized_activation_max = output_activation_max;
- _kernel = generateKernelGeneric<uint8_t>(op_params, op_type);
- }
- else
+ switch (_input->data_type())
{
- throw std::runtime_error{"Pool: unsupported data type"};
+ case OperandType::FLOAT32:
+ {
+ float output_activation_min = 0;
+ float output_activation_max = 0;
+ CalculateActivationRange<float>(activation, &output_activation_min, &output_activation_max);
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+
+ _kernel = generateKernelGeneric<float>(op_params, op_type);
+ break;
+ }
+ case OperandType::QUANT_UINT8_ASYMM:
+ {
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ CalculateActivationRangeQuantized(activation, _output, &output_activation_min,
+ &output_activation_max);
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+ _kernel = generateKernelGeneric<uint8_t>(op_params, op_type);
+ break;
+ }
+ case OperandType::QUANT_INT8_ASYMM:
+ {
+ int32_t output_activation_min = 0;
+ int32_t output_activation_max = 0;
+ CalculateActivationRangeQuantized(activation, _output, &output_activation_min,
+ &output_activation_max);
+ op_params.quantized_activation_min = output_activation_min;
+ op_params.quantized_activation_max = output_activation_max;
+ _kernel = generateKernelGeneric<int8_t>(op_params, op_type);
+ break;
+ }
+ default:
+ throw std::runtime_error{"Pool: unsupported data type"};
}
}
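The QUANT_UINT8_ASYMM and QUANT_INT8_ASYMM cases above are identical except for the element type handed to generateKernelGeneric. One possible factoring, sketched under the assumption that generateKernelGeneric has the signature its call sites imply (OpType stands in for the actual type of op_type, which this hunk does not show):

// Hypothetical helper collapsing the two quantized cases; not part of the patch.
template <typename T, typename OpType>
auto makeQuantizedPoolKernel(nnfw::cker::PoolParams op_params, OpType op_type,
                             ir::Activation activation, const IPortableTensor *output)
{
  int32_t act_min = 0, act_max = 0;
  CalculateActivationRangeQuantized(activation, output, &act_min, &act_max);
  op_params.quantized_activation_min = act_min;
  op_params.quantized_activation_max = act_max;
  return generateKernelGeneric<T>(op_params, op_type);
}

// Usage: _kernel = makeQuantizedPoolKernel<uint8_t>(op_params, op_type, activation, _output);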
diff --git a/runtime/onert/backend/cpu/ops/PowLayer.cc b/runtime/onert/backend/cpu/ops/PowLayer.cc
index 04a1af1e1..efd024dee 100644
--- a/runtime/onert/backend/cpu/ops/PowLayer.cc
+++ b/runtime/onert/backend/cpu/ops/PowLayer.cc
@@ -39,15 +39,13 @@ void PowLayer::powFloat32()
if (!HaveSameShapes(_lhs, _rhs))
{
nnfw::cker::BroadcastBinaryArithmeticOp<nnfw::cker::BinaryArithmeticOpType::POW>(
- op_params, getTensorShape(_lhs), reinterpret_cast<const float *>(_lhs->buffer()),
- getTensorShape(_rhs), reinterpret_cast<const float *>(_rhs->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ op_params, getShape(_lhs), getBuffer<float>(_lhs), getShape(_rhs), getBuffer<float>(_rhs),
+ getShape(_output), getBuffer<float>(_output));
return;
}
- nnfw::cker::powImpl(getTensorShape(_lhs), reinterpret_cast<const float *>(_lhs->buffer()),
- getTensorShape(_rhs), reinterpret_cast<const float *>(_rhs->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::powImpl(getShape(_lhs), getBuffer<float>(_lhs), getShape(_rhs),
+ getBuffer<float>(_rhs), getShape(_output), getBuffer<float>(_output));
}
void PowLayer::configure(const IPortableTensor *lhs, const IPortableTensor *rhs,
diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc
new file mode 100644
index 000000000..08550e7c9
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "OperationUtils.h"
+#include "QuantizeLayer.h"
+
+#include <cker/operation/Dequantize.h>
+#include <cker/operation/Erf.h>
+#include <cker/operation/Exp.h>
+#include <cker/operation/LogicalNot.h>
+#include <cker/operation/Quantize.h>
+#include <cker/operation/Round.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+template <typename InputT, typename OutputT>
+void affineQuantize(const IPortableTensor *input, IPortableTensor *output)
+{
+ nnfw::cker::Quantize(getShape(input), getBuffer<InputT>(input), getShape(output),
+ getBuffer<OutputT>(output), output->data_scale(), output->data_zero_point());
+}
+
+void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output)
+{
+ assert(input != nullptr);
+ assert(output != nullptr);
+
+ _input = input;
+ _output = output;
+
+ if ((_input->data_type() == OperandType::FLOAT32))
+ {
+ // DO NOTHING
+ }
+ else if (((input->data_type() == OperandType::QUANT_UINT8_ASYMM) &&
+ (output->data_type() == OperandType::QUANT_INT8_ASYMM)) ||
+ ((input->data_type() == OperandType::QUANT_INT8_ASYMM) &&
+ (output->data_type() == OperandType::QUANT_UINT8_ASYMM)))
+ {
+ const double effective_output_scale =
+ static_cast<double>(input->data_scale()) / static_cast<double>(output->data_scale());
+ QuantizeMultiplier(effective_output_scale, &_output_multiplier, &_output_shift);
+ }
+ else
+ {
+ throw std::runtime_error{"Quantize: Unsupported data type"};
+ }
+}
+
+void QuantizeLayer::run()
+{
+ if ((_input->data_type() == OperandType::FLOAT32))
+ {
+ affineQuantize<float, uint8_t>(_input, _output);
+ }
+ else if ((_input->data_type() == OperandType::QUANT_UINT8_ASYMM) &&
+ (_output->data_type() == OperandType::QUANT_INT8_ASYMM))
+ {
+ nnfw::cker::Requantize<uint8_t, int8_t>(
+ getBuffer<uint8_t>(_input), MatchingFlatSize(getShape(_input), getShape(_output)),
+ _output_multiplier, _output_shift, _input->data_zero_point(), _output->data_zero_point(),
+ getBuffer<int8_t>(_output));
+ }
+ else if ((_input->data_type() == OperandType::QUANT_INT8_ASYMM) &&
+ (_output->data_type() == OperandType::QUANT_UINT8_ASYMM))
+ {
+ nnfw::cker::Requantize<int8_t, uint8_t>(
+ getBuffer<int8_t>(_input), MatchingFlatSize(getShape(_input), getShape(_output)),
+ _output_multiplier, _output_shift, _input->data_zero_point(), _output->data_zero_point(),
+ getBuffer<uint8_t>(_output));
+ }
+ else
+ {
+ throw std::runtime_error{"Quantize: Unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
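The Requantize calls above re-encode the same real values under a new scale and zero point. From real = in_scale * (q_in - in_zp) and q_out = round(real / out_scale) + out_zp, the mapping is q_out = out_zp + (in_scale / out_scale) * (q_in - in_zp); configure() folds the scale ratio into _output_multiplier/_output_shift via QuantizeMultiplier so run() stays integer-only. A scalar floating-point reference of the same mapping (a sketch; the cker kernel applies the fixed-point multiplier instead):

#include <algorithm>
#include <cmath>
#include <cstdint>

int8_t requantizeOneSketch(uint8_t q_in, float in_scale, int32_t in_zp,
                           float out_scale, int32_t out_zp)
{
  const float real = in_scale * (static_cast<int32_t>(q_in) - in_zp);
  const int32_t q_out = out_zp + static_cast<int32_t>(std::round(real / out_scale));
  return static_cast<int8_t>(std::min(127, std::max(-128, q_out))); // clamp to int8
}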
diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h
new file mode 100644
index 000000000..112d31562
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+class QuantizeLayer : public ::onert::exec::IFunction
+{
+public:
+ QuantizeLayer() : _input(nullptr), _output(nullptr), _output_multiplier(0), _output_shift(0)
+ {
+ // DO NOTHING
+ }
+
+public:
+ void configure(const IPortableTensor *input, IPortableTensor *output);
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
+ int32_t _output_multiplier;
+ int _output_shift;
+};
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/RangeLayer.cc b/runtime/onert/backend/cpu/ops/RangeLayer.cc
index f00101fa8..a41b31b3f 100644
--- a/runtime/onert/backend/cpu/ops/RangeLayer.cc
+++ b/runtime/onert/backend/cpu/ops/RangeLayer.cc
@@ -47,16 +47,12 @@ void RangeLayer::run()
switch (_output->data_type())
{
case OperandType::FLOAT32:
- nnfw::cker::Range<float>(reinterpret_cast<float *>(_start->buffer()),
- reinterpret_cast<float *>(_limit->buffer()),
- reinterpret_cast<float *>(_delta->buffer()),
- reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::Range<float>(getBuffer<float>(_start), getBuffer<float>(_limit),
+ getBuffer<float>(_delta), getBuffer<float>(_output));
break;
case OperandType::INT32:
- nnfw::cker::Range<int32_t>(reinterpret_cast<int32_t *>(_start->buffer()),
- reinterpret_cast<int32_t *>(_limit->buffer()),
- reinterpret_cast<int32_t *>(_delta->buffer()),
- reinterpret_cast<int32_t *>(_output->buffer()));
+ nnfw::cker::Range<int32_t>(getBuffer<int32_t>(_start), getBuffer<int32_t>(_limit),
+ getBuffer<int32_t>(_delta), getBuffer<int32_t>(_output));
break;
default:
throw std::runtime_error{"Range: unsupported data type"};
diff --git a/runtime/onert/backend/cpu/ops/RankLayer.cc b/runtime/onert/backend/cpu/ops/RankLayer.cc
index 184f4925b..765c595ff 100644
--- a/runtime/onert/backend/cpu/ops/RankLayer.cc
+++ b/runtime/onert/backend/cpu/ops/RankLayer.cc
@@ -40,8 +40,8 @@ void RankLayer::configure(const IPortableTensor *input, IPortableTensor *output)
void RankLayer::run()
{
- int32_t *output_data = reinterpret_cast<int32_t *>(_output->buffer());
- output_data[0] = _input->num_dimensions();
+ int32_t *output_data = getBuffer<int32_t>(_output);
+ output_data[0] = _input->getShape().rank();
}
} // namespace ops
diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.cc b/runtime/onert/backend/cpu/ops/ReduceLayer.cc
index 4a55b2a33..66b5abb15 100644
--- a/runtime/onert/backend/cpu/ops/ReduceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ReduceLayer.cc
@@ -38,10 +38,10 @@ void evalLogic(const IPortableTensor *input, IPortableTensor *output, const std:
bool keep_dims, T init_value, nnfw::cker::Reduce &reduce_kernel,
T reducer(const T current, const T in))
{
- reduce_kernel.prepare(input->num_dimensions(), axes.size());
- bool result = reduce_kernel.ReduceGeneric<T>(
- getTensorShape(input), reinterpret_cast<const T *>(input->buffer()), getTensorShape(output),
- reinterpret_cast<T *>(output->buffer()), axes, keep_dims, init_value, reducer);
+ reduce_kernel.prepare(input->getShape().rank(), axes.size());
+ bool result =
+ reduce_kernel.ReduceGeneric<T>(getShape(input), getBuffer<T>(input), getShape(output),
+ getBuffer<T>(output), axes, keep_dims, init_value, reducer);
if (!result)
{
@@ -67,15 +67,15 @@ evalType(bool keep_dims, nnfw::cker::Reduce &reduce_kernel, ReduceType reduce_ty
break;
case ReduceType::kMax:
return std::bind(
- &evalLogic<T>, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3,
- keep_dims, std::numeric_limits<T>::lowest(), reduce_kernel,
- [](const T current, const T in) -> T { return (in > current) ? in : current; });
+ &evalLogic<T>, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3,
+ keep_dims, std::numeric_limits<T>::lowest(), reduce_kernel,
+ [](const T current, const T in) -> T { return (in > current) ? in : current; });
break;
case ReduceType::kMin:
return std::bind(
- &evalLogic<T>, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3,
- keep_dims, std::numeric_limits<T>::max(), reduce_kernel,
- [](const T current, const T in) -> T { return (in < current) ? in : current; });
+ &evalLogic<T>, std::placeholders::_1, std::placeholders::_2, std::placeholders::_3,
+ keep_dims, std::numeric_limits<T>::max(), reduce_kernel,
+ [](const T current, const T in) -> T { return (in < current) ? in : current; });
break;
default:
throw std::runtime_error{"Reduce: Unsupported reduce type"};
@@ -127,21 +127,21 @@ void evalSumQuantized(const IPortableTensor *input, IPortableTensor *output,
nnfw::cker::Reduce &reduce_kernel)
{
const bool same_scale = (input->data_scale() == output->data_scale() &&
- input->data_offset() == output->data_offset());
+ input->data_zero_point() == output->data_zero_point());
- reduce_kernel.prepare(input->num_dimensions(), axes.size());
+ reduce_kernel.prepare(input->getShape().rank(), axes.size());
if (!same_scale)
{
std::vector<int32_t> temp_sum(output->getShape().num_elements());
bool result = reduce_kernel.QuantizedMeanOrSum<uint8_t, int32_t>(
- reinterpret_cast<const uint8_t *>(input->buffer()), input->data_offset(),
- input->data_scale(), getTensorShape(input), reinterpret_cast<uint8_t *>(output->buffer()),
- output->data_offset(), output->data_scale(), getTensorShape(output), axes, keep_dims,
- temp_sum.data(), true, [](const int32_t current, const uint8_t in) -> int32_t {
- const int32_t actual_in = static_cast<int32_t>(in);
- return current + actual_in;
- });
+ getBuffer<uint8_t>(input), input->data_zero_point(), input->data_scale(), getShape(input),
+ getBuffer<uint8_t>(output), output->data_zero_point(), output->data_scale(), getShape(output),
+ axes, keep_dims, temp_sum.data(), true,
+ [](const int32_t current, const uint8_t in) -> int32_t {
+ const int32_t actual_in = static_cast<int32_t>(in);
+ return current + actual_in;
+ });
if (!result)
{
@@ -158,8 +158,8 @@ void evalSumQuantized(const IPortableTensor *input, IPortableTensor *output,
} // namespace
ReduceLayer::ReduceLayer()
- : _input(nullptr), _axes(nullptr), _output(nullptr), _reduce_kernel(new nnfw::cker::Reduce()),
- _kernel(), _reduceType(ReduceType::kInvalid)
+ : _input(nullptr), _axes(nullptr), _output(nullptr), _reduce_kernel(new nnfw::cker::Reduce()),
+ _kernel(), _reduceType(ReduceType::kInvalid)
{
// DO NOTHING
}
@@ -209,12 +209,11 @@ void ReduceLayer::run()
{
const auto axes = getReducerAxes(_axes);
#ifdef USE_NEON
- int32_t rank = _input->num_dimensions();
+ int32_t rank = _input->getShape().rank();
if (_input->data_type() == ir::DataType::FLOAT32 && _reduceType == ReduceType::kSum &&
axes.size() == 1 && (axes[0] == -1 || axes[0] == rank - 1))
{
- OptimizedReduceSum(reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_input),
- reinterpret_cast<float *>(_output->buffer()));
+ OptimizedReduceSum(getBuffer<float>(_input), getShape(_input), getBuffer<float>(_output));
return;
}
#endif // NEON
diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.h b/runtime/onert/backend/cpu/ops/ReduceLayer.h
index 8265dd41f..e70f0fcb2 100644
--- a/runtime/onert/backend/cpu/ops/ReduceLayer.h
+++ b/runtime/onert/backend/cpu/ops/ReduceLayer.h
@@ -72,7 +72,7 @@ private:
std::unique_ptr<nnfw::cker::Reduce> _reduce_kernel;
std::function<void(const IPortableTensor *input, IPortableTensor *output,
const std::vector<int> &axes)>
- _kernel;
+ _kernel;
ReduceType _reduceType;
};
diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc
index 1fe56cb99..c32015fdc 100644
--- a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc
@@ -28,8 +28,8 @@ namespace ops
{
ResizeBilinearLayer::ResizeBilinearLayer()
- : _input(nullptr), _output(nullptr), _size(nullptr), _output_height(0), _output_width(0),
- _align_corners(false), _half_pixel_centers(false)
+ : _input(nullptr), _output(nullptr), _size(nullptr), _output_height(0), _output_width(0),
+ _align_corners(false), _half_pixel_centers(false)
{
// DO NOTHING
}
@@ -79,7 +79,7 @@ void ResizeBilinearLayer::run()
}
else
{
- const auto size_buf = reinterpret_cast<const int32_t *>(_size->buffer());
+ const auto size_buf = getBuffer<int32_t>(_size);
params.output_height = size_buf[0];
params.output_width = size_buf[1];
}
@@ -89,15 +89,18 @@ void ResizeBilinearLayer::run()
switch (_input->data_type())
{
case OperandType::FLOAT32:
- nnfw::cker::ResizeBilinear(
- params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::ResizeBilinear(params, getShape(_input), getBuffer<float>(_input),
+ getShape(_output), getBuffer<float>(_output));
break;
case OperandType::QUANT_UINT8_ASYMM:
- nnfw::cker::ResizeBilinear(
- params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+ nnfw::cker::ResizeBilinear(params, getShape(_input), getBuffer<uint8_t>(_input),
+ getShape(_output), getBuffer<uint8_t>(_output));
+ break;
+
+ case OperandType::QUANT_INT8_ASYMM:
+ nnfw::cker::ResizeBilinear(params, getShape(_input), getBuffer<int8_t>(_input),
+ getShape(_output), getBuffer<int8_t>(_output));
break;
case OperandType::UINT8:
diff --git a/runtime/onert/backend/cpu/ops/ReverseLayer.cc b/runtime/onert/backend/cpu/ops/ReverseLayer.cc
index 7979e77a0..cddab302a 100644
--- a/runtime/onert/backend/cpu/ops/ReverseLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ReverseLayer.cc
@@ -36,18 +36,17 @@ void ReverseLayer::run()
{
throw std::runtime_error{"Reverse: only support 1 axis"};
}
- int32_t axis = *(reinterpret_cast<int32_t *>(_axis->buffer()));
+ int32_t axis = *getBuffer<int32_t>(_axis);
if (axis < 0)
{
- axis += _input->num_dimensions();
+ axis += _input->getShape().rank();
}
switch (_input->data_type())
{
case OperandType::FLOAT32:
- nnfw::cker::Reverse<float>(
- axis, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::Reverse<float>(axis, getShape(_input), getBuffer<float>(_input),
+ getShape(_output), getBuffer<float>(_output));
break;
default:
throw std::runtime_error{"Reverse: unsupported data type"};
diff --git a/runtime/onert/backend/cpu/ops/SelectLayer.cc b/runtime/onert/backend/cpu/ops/SelectLayer.cc
index 95cfe1df0..4c28d1471 100644
--- a/runtime/onert/backend/cpu/ops/SelectLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SelectLayer.cc
@@ -30,7 +30,7 @@ namespace ops
{
SelectLayer::SelectLayer()
- : _cond(nullptr), _input_true(nullptr), _input_false(nullptr), _output(nullptr)
+ : _cond(nullptr), _input_true(nullptr), _input_false(nullptr), _output(nullptr)
{
// DO NOTHING
}
@@ -47,11 +47,10 @@ void SelectLayer::configure(const IPortableTensor *cond, const IPortableTensor *
void SelectLayer::run()
{
-#define KERNEL_SELECT(type, op) \
- nnfw::cker::op(getTensorShape(_cond), reinterpret_cast<uint8_t *>(_cond->buffer()), \
- getTensorShape(_input_true), reinterpret_cast<type *>(_input_true->buffer()), \
- getTensorShape(_input_false), reinterpret_cast<type *>(_input_false->buffer()), \
- getTensorShape(_output), reinterpret_cast<type *>(_output->buffer()));
+#define KERNEL_SELECT(type, op) \
+ nnfw::cker::op(getShape(_cond), getBuffer<uint8_t>(_cond), getShape(_input_true), \
+ getBuffer<type>(_input_true), getShape(_input_false), \
+ getBuffer<type>(_input_false), getShape(_output), getBuffer<type>(_output));
#define KERNEL_SWITCH(type, op) \
switch (type) \
@@ -66,8 +65,8 @@ void SelectLayer::run()
auto input_type = _input_true->data_type();
bool require_broadcast =
- !HaveSameShapes(_input_true, _cond) || !HaveSameShapes(_input_false, _cond);
- bool rank_one_select = ((_input_true->num_dimensions() == 1) && !require_broadcast);
+ !HaveSameShapes(_input_true, _cond) || !HaveSameShapes(_input_false, _cond);
+ bool rank_one_select = ((_input_true->getShape().rank() == 1) && !require_broadcast);
if (rank_one_select)
{
diff --git a/runtime/onert/backend/cpu/ops/ShapeLayer.cc b/runtime/onert/backend/cpu/ops/ShapeLayer.cc
index 7268a89fa..46294e948 100644
--- a/runtime/onert/backend/cpu/ops/ShapeLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ShapeLayer.cc
@@ -34,9 +34,10 @@ ShapeLayer::ShapeLayer() : _input(nullptr), _output(nullptr)
template <typename T> void GetRawShape(const IPortableTensor *input, T *output_data)
{
- for (uint32_t i = 0; i < input->num_dimensions(); ++i)
+ auto shape = input->getShape();
+ for (int i = 0; i < shape.rank(); ++i)
{
- output_data[i] = static_cast<T>(input->dimension(i));
+ output_data[i] = static_cast<T>(shape.dim(i));
}
}
@@ -50,15 +51,15 @@ void ShapeLayer::run()
{
if (_output->data_type() == OperandType::UINT32)
{
- GetRawShape(_input, reinterpret_cast<uint32_t *>(_output->buffer()));
+ GetRawShape(_input, getBuffer<uint32_t>(_output));
}
else if (_output->data_type() == OperandType::INT32)
{
- GetRawShape(_input, reinterpret_cast<int32_t *>(_output->buffer()));
+ GetRawShape(_input, getBuffer<int32_t>(_output));
}
else if (_output->data_type() == OperandType::INT64)
{
- GetRawShape(_input, reinterpret_cast<int64_t *>(_output->buffer()));
+ GetRawShape(_input, getBuffer<int64_t>(_output));
}
else
{
diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc
index 449c073e6..6332fbb56 100644
--- a/runtime/onert/backend/cpu/ops/SliceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc
@@ -41,8 +41,8 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b
{
for (int idx = dimensions - 1; idx >= 0; --idx)
{
- begins->push_back(reinterpret_cast<T *>(begin->buffer())[idx]);
- sizes->push_back(reinterpret_cast<T *>(size->buffer())[idx]);
+ begins->push_back(getBuffer<T>(begin)[idx]);
+ sizes->push_back(getBuffer<T>(size)[idx]);
}
}
@@ -55,10 +55,21 @@ template <typename T> void SliceLayer::sliceImpl()
begins.reserve(kMaxDim);
sizes.reserve(kMaxDim);
- GetBeginAndSizeVectors<int32_t>(_input->num_dimensions(), _begin, _size, &begins, &sizes);
+ if (_begin->data_type() == OperandType::INT32)
+ {
+ GetBeginAndSizeVectors<int32_t>(_input->getShape().rank(), _begin, _size, &begins, &sizes);
+ }
+ else if (_begin->data_type() == OperandType::INT64)
+ {
+ GetBeginAndSizeVectors<int64_t>(_input->getShape().rank(), _begin, _size, &begins, &sizes);
+ }
+ else
+ {
+ throw std::runtime_error{"Slice: unsupported begin and/or size data type"};
+ }
// begins : 0-based, sizes : 1-based
- for (int i = _input->num_dimensions(); i < kMaxDim; ++i)
+ for (int i = _input->getShape().rank(); i < kMaxDim; ++i)
{
begins.push_back(0);
sizes.push_back(1);
@@ -73,9 +84,8 @@ template <typename T> void SliceLayer::sliceImpl()
op_params.size[i] = sizes[3 - i];
}
- nnfw::cker::Slice(op_params, getExtendedTensorShape(_input),
- reinterpret_cast<const T *>(_input->buffer()),
- reinterpret_cast<T *>(_output->buffer()));
+ nnfw::cker::Slice(op_params, getExtendedTensorShape(_input), getBuffer<T>(_input),
+ getBuffer<T>(_output));
}
void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin,
diff --git a/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc
index b42be3042..320914dae 100644
--- a/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SoftMaxLayer.cc
@@ -39,8 +39,7 @@ void SoftMaxLayer::softmaxFloat32()
if (getNumberOfDimensions(_input) == 1)
{
uint32_t input_size = getNumberOfElements(_input);
- nnfw::cker::Softmax(reinterpret_cast<const float *>(_input->buffer()), input_size, 1, _beta,
- reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::Softmax(getBuffer<float>(_input), input_size, 1, _beta, getBuffer<float>(_output));
}
else if (getNumberOfDimensions(_input) == 2)
{
@@ -49,73 +48,41 @@ void SoftMaxLayer::softmaxFloat32()
throw std::runtime_error("batch_size should not be 0");
uint32_t input_size = getNumberOfElements(_input) / batch_size;
- nnfw::cker::Softmax(reinterpret_cast<const float *>(_input->buffer()), input_size, batch_size,
- _beta, reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::Softmax(getBuffer<float>(_input), input_size, batch_size, _beta,
+ getBuffer<float>(_output));
}
else if (getNumberOfDimensions(_input) == 4)
{
nnfw::cker::SoftmaxParams op_params;
op_params.beta = _beta;
- nnfw::cker::Softmax(op_params, getTensorShape(_input),
- reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::Softmax(op_params, getShape(_input), getBuffer<float>(_input), getShape(_output),
+ getBuffer<float>(_output));
}
else
{
nnfw::cker::SoftmaxParams op_params;
op_params.beta = _beta;
- nnfw::cker::reference::Softmax(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::reference::Softmax(op_params, getShape(_input), getBuffer<float>(_input),
+ getShape(_output), getBuffer<float>(_output));
}
}
-void SoftMaxLayer::softmaxQuant8()
+template <typename T> void SoftMaxLayer::softmaxQuant8()
{
- nnfw::cker::Shape descrIn4D(4);
-
- if (getNumberOfDimensions(_input) == 2)
- {
- auto batch_size = getSizeOfDimension(_input, 0);
- if (batch_size == 0)
- throw std::runtime_error("batch_size should not be 0");
-
- auto input_size = getNumberOfElements(_input) / batch_size;
- descrIn4D.SetDim(0, batch_size);
- descrIn4D.SetDim(1, 1);
- descrIn4D.SetDim(2, 1);
- descrIn4D.SetDim(3, input_size);
- }
- else if (getNumberOfDimensions(_input) == 4)
- {
- descrIn4D.SetDim(0, _input->dimension(0));
- descrIn4D.SetDim(1, _input->dimension(1));
- descrIn4D.SetDim(2, _input->dimension(2));
- descrIn4D.SetDim(3, _input->dimension(3));
- }
- else
- {
- throw std::runtime_error{"only 2D and 4D tensors supported"};
- }
- if (_output->data_offset() != 0 || _output->data_scale() != 1.f / 256)
- {
- throw std::runtime_error{"incorrect scale / offset for output"};
- }
- static const int32_t kScaledDiffIntegerBits = 5;
- const double input_beta_real_multiplier = std::min(
- 1.0 * _beta * _input->data_scale() * (1 << (31 - kScaledDiffIntegerBits)), (1ll << 31) - 1.0);
- int32_t input_multiplier = 0;
- int32_t input_left_shift = 0;
- QuantizeMultiplierGreaterThanOne(input_beta_real_multiplier, &input_multiplier,
- &input_left_shift);
- float diff_min = -1.0f * CalculateInputRadius(kScaledDiffIntegerBits, input_left_shift);
-
nnfw::cker::SoftmaxParams op_params;
- op_params.input_multiplier = input_multiplier;
- op_params.input_left_shift = input_left_shift;
- op_params.diff_min = diff_min;
- nnfw::cker::Softmax(op_params, descrIn4D, reinterpret_cast<const uint8_t *>(_input->buffer()),
- descrIn4D, reinterpret_cast<uint8_t *>(_output->buffer()));
+ op_params.scale = _output->data_scale();
+ op_params.zero_point = _output->data_zero_point();
+ op_params.uint8_table1 = _uint8_table1;
+ op_params.uint8_table2 = _uint8_table2;
+ op_params.table = _table;
+
+#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
+ nnfw::cker::SoftmaxInt8LUT<T, T>(op_params, getShape(_input), getBuffer<T>(_input),
+ getShape(_output), getBuffer<T>(_output));
+#else
+ nnfw::cker::Softmax<T, T>(op_params, getShape(_input), getBuffer<T>(_input), getShape(_output),
+ getBuffer<T>(_output));
+#endif
}
void SoftMaxLayer::configure(const IPortableTensor *input, const float beta,
@@ -124,21 +91,36 @@ void SoftMaxLayer::configure(const IPortableTensor *input, const float beta,
_input = input;
_output = output;
_beta = beta;
+
+ if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM ||
+ _input->data_type() == OperandType::QUANT_INT8_ASYMM)
+ {
+#ifdef TFLITE_SOFTMAX_USE_UINT16_LUT
+ // Applied only when both input and output are uint8/int8 and the runtime is
+ // built with clang on aarch64.
+ nnfw::cker::PopulateSoftmaxUInt8LookupTable(_uint8_table1, _uint8_table2, _input->data_scale(),
+ _beta);
+#else
+ nnfw::cker::PopulateSoftmaxLookupTable(_table, _input->data_scale(), _beta);
+#endif
+ }
}
void SoftMaxLayer::run()
{
- if (_input->data_type() == OperandType::FLOAT32)
- {
- softmaxFloat32();
- }
- else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
- {
- softmaxQuant8();
- }
- else
+ switch (_input->data_type())
{
- throw std::runtime_error{"SoftMax: unsupported data type"};
+ case OperandType::FLOAT32:
+ softmaxFloat32();
+ break;
+ case OperandType::QUANT_UINT8_ASYMM:
+ softmaxQuant8<uint8_t>();
+ break;
+ case OperandType::QUANT_INT8_ASYMM:
+ softmaxQuant8<int8_t>();
+ break;
+ default:
+ throw std::runtime_error{"SoftMax: unsupported data type"};
}
}
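The quantized softmax path now precomputes exponentials once in configure() and reuses them on every run(). Following the TFLite scheme the comment cites, the non-LUT16 table holds an exponential for every possible difference between an input byte and the row maximum, so the maximum element lands on exp(0) == 1. A sketch of that table construction (assumed from the TFLite convention; the authoritative version is nnfw::cker::PopulateSoftmaxLookupTable):

#include <cmath>

// Sketch: table[255 - d] = exp(-input_scale * beta * d) for d = 0..255, i.e.
// the largest input maps to index 255 and contributes exp(0) == 1.
void populateSoftmaxTableSketch(float *table /* 256 entries */, float input_scale, float beta)
{
  for (int d = 0; d < 256; ++d)
    table[255 - d] = std::exp(-input_scale * beta * static_cast<float>(d));
}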
diff --git a/runtime/onert/backend/cpu/ops/SoftMaxLayer.h b/runtime/onert/backend/cpu/ops/SoftMaxLayer.h
index d0c704c2c..e63be0c3e 100644
--- a/runtime/onert/backend/cpu/ops/SoftMaxLayer.h
+++ b/runtime/onert/backend/cpu/ops/SoftMaxLayer.h
@@ -38,7 +38,7 @@ public:
public:
void softmaxFloat32();
- void softmaxQuant8();
+ template <typename T> void softmaxQuant8();
void configure(const IPortableTensor *input, const float beta, IPortableTensor *output);
@@ -49,6 +49,10 @@ private:
IPortableTensor *_output;
float _beta;
+
+ float _table[256];
+ uint8_t _uint8_table1[256];
+ uint8_t _uint8_table2[256];
};
} // namespace ops
diff --git a/runtime/onert/backend/cpu/ops/SpaceToBatchNDLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToBatchNDLayer.cc
index 896e262ba..8dd0a01a5 100644
--- a/runtime/onert/backend/cpu/ops/SpaceToBatchNDLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SpaceToBatchNDLayer.cc
@@ -29,7 +29,7 @@ namespace cpu
namespace ops
{
SpaceToBatchNDLayer::SpaceToBatchNDLayer()
- : _input(nullptr), _block_shape(nullptr), _padding(nullptr), _output(nullptr)
+ : _input(nullptr), _block_shape(nullptr), _padding(nullptr), _output(nullptr)
{
// DO NOTHING
}
@@ -38,7 +38,7 @@ SpaceToBatchNDLayer::SpaceToBatchNDLayer()
void SpaceToBatchNDLayer::checkDimension()
{
const int kSpatialDimensionNum = 2;
- if (_block_shape->dimension(0) != kSpatialDimensionNum)
+ if (_block_shape->getShape().dim(0) != kSpatialDimensionNum)
{
throw std::runtime_error("SpaceToBatchND : block_shape(block_size) tensor's rank is wrong\n");
}
@@ -47,18 +47,17 @@ void SpaceToBatchNDLayer::checkDimension()
// shape height and width.
for (int dim = 0; dim < kSpatialDimensionNum; ++dim)
{
- int final_dim_size =
- (_input->dimension(dim + 1) + reinterpret_cast<int32_t *>(_padding->buffer())[dim * 2] +
- reinterpret_cast<int32_t *>(_padding->buffer())[dim * 2 + 1]);
+ int final_dim_size = (_input->getShape().dim(dim + 1) + getBuffer<int32_t>(_padding)[dim * 2] +
+ getBuffer<int32_t>(_padding)[dim * 2 + 1]);
- if (final_dim_size % reinterpret_cast<int32_t *>(_block_shape->buffer())[dim] != 0)
+ if (final_dim_size % getBuffer<int32_t>(_block_shape)[dim] != 0)
{
throw std::runtime_error(
- "SpaceToBatchND : padded input's dimension is not a multiple of block size\n");
+ "SpaceToBatchND : padded input's dimension is not a multiple of block size\n");
}
- if ((int32_t)_output->dimension(dim + 1) !=
- final_dim_size / reinterpret_cast<int32_t *>(_block_shape->buffer())[dim])
+ if ((int32_t)_output->getShape().dim(dim + 1) !=
+ final_dim_size / getBuffer<int32_t>(_block_shape)[dim])
{
throw std::runtime_error("SpaceToBatchND : wrong output dimension\n");
}
@@ -66,7 +65,7 @@ void SpaceToBatchNDLayer::checkDimension()
}
template <> uint32_t SpaceToBatchNDLayer::getPad<float>() { return 0; }
-template <> uint32_t SpaceToBatchNDLayer::getPad<uint8_t>() { return _output->data_offset(); }
+template <> uint32_t SpaceToBatchNDLayer::getPad<uint8_t>() { return _output->data_zero_point(); }
template <typename T> void SpaceToBatchNDLayer::spaceToBatchND()
{
@@ -75,11 +74,10 @@ template <typename T> void SpaceToBatchNDLayer::spaceToBatchND()
nnfw::cker::SpaceToBatchParams params;
params.output_offset = getPad<T>();
- nnfw::cker::SpaceToBatchND(
- params, getTensorShape(_input), reinterpret_cast<const T *>(_input->buffer()),
- getTensorShape(_block_shape), reinterpret_cast<const int32_t *>(_block_shape->buffer()),
- getTensorShape(_padding), reinterpret_cast<const int32_t *>(_padding->buffer()),
- getTensorShape(_output), reinterpret_cast<T *>(_output->buffer()));
+ nnfw::cker::SpaceToBatchND(params, getShape(_input), getBuffer<T>(_input), getShape(_block_shape),
+ getBuffer<int32_t>(_block_shape), getShape(_padding),
+ getBuffer<int32_t>(_padding), getShape(_output),
+ getBuffer<T>(_output));
}
void SpaceToBatchNDLayer::configure(const IPortableTensor *input,
diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
index a0869aed8..8271daf42 100644
--- a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
@@ -39,9 +39,8 @@ template <typename T> void SpaceToDepthLayer::spaceToDepth()
nnfw::cker::SpaceToDepthParams params;
params.block_size = _block_size;
- nnfw::cker::SpaceToDepth(params, getTensorShape(_input),
- reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<T *>(_output->buffer()));
+ nnfw::cker::SpaceToDepth(params, getShape(_input), getBuffer<T>(_input), getShape(_output),
+ getBuffer<T>(_output));
}
void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size,
diff --git a/runtime/onert/backend/cpu/ops/SplitLayer.cc b/runtime/onert/backend/cpu/ops/SplitLayer.cc
index 922cde2e3..6e4eaccd4 100644
--- a/runtime/onert/backend/cpu/ops/SplitLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SplitLayer.cc
@@ -41,10 +41,10 @@ template <typename T> void SplitLayer::split(void)
{
throw std::runtime_error("ArgMinMax: wrong shape of axis");
}
- auto axis = *reinterpret_cast<const int32_t *>(_axis->buffer());
+ auto axis = *getBuffer<int32_t>(_axis);
if (axis < 0)
{
- axis += _input->num_dimensions();
+ axis += _input->getShape().rank();
}
op_params.axis = axis;
op_params.num_split = _num_splits;
@@ -54,12 +54,12 @@ template <typename T> void SplitLayer::split(void)
for (const auto output : _outputs)
{
assert(output->total_size() == sizeOfData(output->data_type(), output->getShape().dims()));
- outputPtrs.emplace_back(reinterpret_cast<T *>(output->buffer()));
+ outputPtrs.emplace_back(getBuffer<T>(output));
}
assert(_input->total_size() == sizeOfData(_input->data_type(), _input->getShape().dims()));
- nnfw::cker::Split<T>(op_params, getTensorShape(_input), reinterpret_cast<T *>(_input->buffer()),
- getTensorShape(_outputs[0]), outputPtrs.data());
+ nnfw::cker::Split<T>(op_params, getShape(_input), getBuffer<T>(_input), getShape(_outputs[0]),
+ outputPtrs.data());
}
void SplitLayer::configure(const IPortableTensor *input, const IPortableTensor *axis,
diff --git a/runtime/onert/backend/cpu/ops/SplitVLayer.cc b/runtime/onert/backend/cpu/ops/SplitVLayer.cc
index d6ca12442..166e6e6fd 100644
--- a/runtime/onert/backend/cpu/ops/SplitVLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SplitVLayer.cc
@@ -30,7 +30,7 @@ namespace ops
{
SplitVLayer::SplitVLayer()
- : _input(nullptr), _size_splits(nullptr), _split_dim(nullptr), _num_splits(0), _outputs()
+ : _input(nullptr), _size_splits(nullptr), _split_dim(nullptr), _num_splits(0), _outputs()
{
// DO NOTHING
}
@@ -38,7 +38,7 @@ SplitVLayer::SplitVLayer()
template <typename T> void SplitVLayer::splitV(void)
{
nnfw::cker::SplitVParams op_params;
- op_params.axis = *(reinterpret_cast<const int32_t *>(_split_dim->buffer()));
+ op_params.axis = *getBuffer<int32_t>(_split_dim);
op_params.num_split = _num_splits;
std::vector<T *> outputPtrs;
@@ -47,13 +47,13 @@ template <typename T> void SplitVLayer::splitV(void)
for (const auto output : _outputs)
{
assert(output->total_size() == sizeOfData(output->data_type(), output->getShape().dims()));
- outputPtrs.emplace_back(reinterpret_cast<T *>(output->buffer()));
- outshape.emplace_back(getTensorShape(output));
+ outputPtrs.emplace_back(getBuffer<T>(output));
+ outshape.emplace_back(getShape(output));
}
assert(_input->total_size() == sizeOfData(_input->data_type(), _input->getShape().dims()));
- nnfw::cker::SplitV<T>(op_params, getTensorShape(_input), reinterpret_cast<T *>(_input->buffer()),
- outshape, outputPtrs.data());
+ nnfw::cker::SplitV<T>(op_params, getShape(_input), getBuffer<T>(_input), outshape,
+ outputPtrs.data());
}
void SplitVLayer::configure(const IPortableTensor *input, const IPortableTensor *size_splits,
diff --git a/runtime/onert/backend/cpu/ops/SquaredDiffLayer.cc b/runtime/onert/backend/cpu/ops/SquaredDiffLayer.cc
index cf67a5c00..78984c5a9 100644
--- a/runtime/onert/backend/cpu/ops/SquaredDiffLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SquaredDiffLayer.cc
@@ -36,9 +36,8 @@ SqDiffLayer::SqDiffLayer() : _input1(nullptr), _input2(nullptr), _output(nullptr
void SqDiffLayer::SqDiffFloat32()
{
- nnfw::cker::SqDiff(getTensorShape(_input1), reinterpret_cast<const float *>(_input1->buffer()),
- getTensorShape(_input2), reinterpret_cast<const float *>(_input2->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::SqDiff(getShape(_input1), getBuffer<float>(_input1), getShape(_input2),
+ getBuffer<float>(_input2), getShape(_output), getBuffer<float>(_output));
}
void SqDiffLayer::configure(const IPortableTensor *input1, const IPortableTensor *input2,
diff --git a/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc
index b8dfcb4b5..587582e8f 100644
--- a/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc
+++ b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc
@@ -28,7 +28,7 @@ namespace ops
{
StatelessRandomUniformLayer::StatelessRandomUniformLayer()
- : _shape(nullptr), _seed(nullptr), _output(nullptr)
+ : _shape(nullptr), _seed(nullptr), _output(nullptr)
{
// DO NOTHING
}
@@ -43,10 +43,9 @@ void StatelessRandomUniformLayer::configure(const IPortableTensor *shape,
void StatelessRandomUniformLayer::StatelessRandomUniformFloat32()
{
- nnfw::cker::StatelessRandomUniform(
- getTensorShape(_shape), reinterpret_cast<const int *>(_shape->buffer()),
- getTensorShape(_seed), reinterpret_cast<const int *>(_seed->buffer()),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ nnfw::cker::StatelessRandomUniform(getShape(_shape), getBuffer<int32_t>(_shape), getShape(_seed),
+ getBuffer<int32_t>(_seed), getShape(_output),
+ getBuffer<float>(_output));
}
void StatelessRandomUniformLayer::run()
diff --git a/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc b/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc
index f77f4d691..bb8550ad0 100644
--- a/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/StridedSliceLayer.cc
@@ -30,24 +30,23 @@ namespace ops
{
StridedSliceLayer::StridedSliceLayer()
- : _input(nullptr), _begin(nullptr), _end(nullptr), _strides(nullptr), _output(nullptr),
- _begin_mask(0), _ellipsis_mask(0), _end_mask(0), _new_axis_mask(0), _shrink_axis_mask(0)
+ : _input(nullptr), _begin(nullptr), _end(nullptr), _strides(nullptr), _output(nullptr),
+ _begin_mask(0), _ellipsis_mask(0), _end_mask(0), _new_axis_mask(0), _shrink_axis_mask(0)
{
}
template <typename T> void StridedSliceLayer::stridedSliceImpl()
{
- const auto input_shape = getTensorShape(_input);
- const auto output_shape = getTensorShape(_output);
+ const auto input_shape = getShape(_input);
+ const auto output_shape = getShape(_output);
auto op_params = nnfw::cker::buildStridedSliceParams(
- reinterpret_cast<uint32_t *>(_begin->buffer()), reinterpret_cast<uint32_t *>(_end->buffer()),
- reinterpret_cast<uint32_t *>(_strides->buffer()), _begin_mask, _end_mask, _shrink_axis_mask,
- input_shape.DimensionsCount());
+ getBuffer<uint32_t>(_begin), getBuffer<uint32_t>(_end), getBuffer<uint32_t>(_strides),
+ _begin_mask, _end_mask, _shrink_axis_mask, input_shape.DimensionsCount());
nnfw::cker::checkOutputSize(op_params, input_shape, output_shape, input_shape.DimensionsCount());
- nnfw::cker::StridedSlice(op_params, input_shape, reinterpret_cast<const T *>(_input->buffer()),
- output_shape, reinterpret_cast<T *>(_output->buffer()));
+ nnfw::cker::StridedSlice(op_params, input_shape, getBuffer<T>(_input), output_shape,
+ getBuffer<T>(_output));
}
void StridedSliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin,
diff --git a/runtime/onert/backend/cpu/ops/TileLayer.cc b/runtime/onert/backend/cpu/ops/TileLayer.cc
index bfc371972..1f018db93 100644
--- a/runtime/onert/backend/cpu/ops/TileLayer.cc
+++ b/runtime/onert/backend/cpu/ops/TileLayer.cc
@@ -36,9 +36,8 @@ TileLayer::TileLayer() : _input(nullptr), _multipliers(nullptr), _output(nullptr
void TileLayer::tileFloat32()
{
- TileOneDimension(getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- reinterpret_cast<const int *>(_multipliers->buffer()),
- reinterpret_cast<float *>(_output->buffer()), 0);
+ TileOneDimension(getShape(_input), getBuffer<float>(_input), getBuffer<int>(_multipliers),
+ getBuffer<float>(_output), 0);
}
void TileLayer::tileQuant8()
diff --git a/runtime/onert/backend/cpu/ops/TransposeLayer.cc b/runtime/onert/backend/cpu/ops/TransposeLayer.cc
index 3362c3396..850c07ab8 100644
--- a/runtime/onert/backend/cpu/ops/TransposeLayer.cc
+++ b/runtime/onert/backend/cpu/ops/TransposeLayer.cc
@@ -38,33 +38,33 @@ TransposeLayer::TransposeLayer() : _input(nullptr), _perm(nullptr), _output(null
template <typename T> void TransposeLayer::transpose()
{
nnfw::cker::TransposeParams param;
- assert(_perm->num_dimensions() == 1);
+ auto perm_shape = _perm->getShape();
+ assert(perm_shape.rank() == 1);
- param.perm_count = _input->num_dimensions();
- if (_perm->dimension(0) == 0) // This means _perm is (n-1...0)
+ param.perm_count = _input->getShape().rank();
+ if (perm_shape.dim(0) == 0) // This means _perm is (n-1...0)
{
const auto begin = param.perm;
- const auto end = param.perm + _input->num_dimensions();
+ const auto end = param.perm + _input->getShape().rank();
std::iota(begin, end, 0);
std::reverse(begin, end);
}
else
{
- assert(param.perm_count == static_cast<int>(_perm->dimension(0)));
+ assert(param.perm_count == static_cast<int>(perm_shape.dim(0)));
for (auto i = 0; i < param.perm_count; i++)
{
- param.perm[i] = *(reinterpret_cast<const int32_t *>(_perm->buffer()) + i);
+ param.perm[i] = *(getBuffer<int32_t>(_perm) + i);
}
}
- nnfw::cker::Transpose(param, getTensorShape(_input),
- reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<T *>(_output->buffer()));
+ nnfw::cker::Transpose(param, getShape(_input), getBuffer<T>(_input), getShape(_output),
+ getBuffer<T>(_output));
}
void TransposeLayer::transposeQuant8()
{
- if (_input->data_offset() != _output->data_offset())
+ if (_input->data_zero_point() != _output->data_zero_point())
{
throw std::runtime_error("TransposeLayer : qassym8 input and output offsets unmatched");
}
diff --git a/runtime/onert/backend/cpu/ops/UnpackLayer.cc b/runtime/onert/backend/cpu/ops/UnpackLayer.cc
index 428b38588..f18fb9483 100644
--- a/runtime/onert/backend/cpu/ops/UnpackLayer.cc
+++ b/runtime/onert/backend/cpu/ops/UnpackLayer.cc
@@ -47,7 +47,7 @@ template <typename T> void UnpackLayer::unpackImpl()
for (int32_t i = 0; i < _num_output; i++)
{
- outputDims.push_back(getTensorShape(_outputs[i]));
+ outputDims.push_back(getShape(_outputs[i]));
outputDimsPtr.push_back(&outputDims[i]);
}
@@ -55,11 +55,11 @@ template <typename T> void UnpackLayer::unpackImpl()
for (const auto output : _outputs)
{
- outputPtrs.emplace_back(reinterpret_cast<T *>(output->buffer()));
+ outputPtrs.emplace_back(getBuffer<T>(output));
}
- nnfw::cker::Unpack<T>(op_params, getTensorShape(_input), reinterpret_cast<T *>(_input->buffer()),
- getTensorShape(_outputs[0]), outputPtrs.data());
+ nnfw::cker::Unpack<T>(op_params, getShape(_input), getBuffer<T>(_input), getShape(_outputs[0]),
+ outputPtrs.data());
}
void UnpackLayer::configure(const IPortableTensor *input, uint32_t axis, int32_t num,
diff --git a/runtime/onert/backend/ruy/Backend.h b/runtime/onert/backend/ruy/Backend.h
index bc8a024d8..4077965c4 100644
--- a/runtime/onert/backend/ruy/Backend.h
+++ b/runtime/onert/backend/ruy/Backend.h
@@ -19,7 +19,6 @@
#include "BackendContext.h"
#include "Config.h"
-#include "ConstantInitializer.h"
#include "KernelGenerator.h"
#include <backend/Backend.h>
@@ -40,19 +39,16 @@ public:
std::shared_ptr<IConfig> config() const override { return _config; }
- std::unique_ptr<onert::backend::BackendContext>
- newContext(const ir::Graph &graph, const std::shared_ptr<custom::IKernelBuilder> &kb,
- bool) const override
+ std::unique_ptr<onert::backend::BackendContext> newContext(ContextData &&data) const override
{
- const auto &operands = graph.operands();
- const auto &operations = graph.operations();
- auto context = std::make_unique<BackendContext>(this, &graph);
- auto tr = std::make_shared<cpu_common::TensorRegistry>();
+ auto custom_kernel_builder = data.custom_kernel_builder;
+ auto &graph = *data.graph;
+ auto context = std::make_unique<BackendContext>(this, std::move(data));
+ auto tr = std::make_shared<basic::TensorRegistry>();
auto tb = std::make_shared<TensorBuilder>(tr);
context->tensor_registry = tr;
context->tensor_builder = tb;
- context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tr);
- context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, tr, kb,
+ context->kernel_gen = std::make_shared<KernelGenerator>(graph, tb, tr, custom_kernel_builder,
context->external_context());
return context;
}
diff --git a/runtime/onert/backend/ruy/BackendContext.cc b/runtime/onert/backend/ruy/BackendContext.cc
index ef686f480..877772619 100644
--- a/runtime/onert/backend/ruy/BackendContext.cc
+++ b/runtime/onert/backend/ruy/BackendContext.cc
@@ -22,7 +22,7 @@
#include "ir/Index.h"
#include "ir/OperandIndexMap.h"
#include "ir/OperandIndexSequence.h"
-#include "backend/cpu_common/BackendContextHelpers.h"
+#include "backend/basic/BackendContextHelpers.h"
namespace onert
{
@@ -31,107 +31,24 @@ namespace backend
namespace ruy
{
-void BackendContext::initConsts()
-{
- for (auto &op : operation_list())
- {
- constant_initializer->setLayout(op.layout);
- graph()->operations().at(op.index).accept(*constant_initializer);
- }
-
- for (auto ind : operand_list())
- {
- const auto &obj = graph()->operands().at(ind);
- if (obj.isConstant() && !constant_initializer->exist(ind))
- {
- constant_initializer->registerDefaultInitializer(ind, obj);
- }
- }
-
- constant_initializer->run();
-}
-
-ITensorRegistry *BackendContext::genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info)
-{
- auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED |
- ir::Remove::DUPLICATED;
- for (auto index : operand_list())
- {
- if (model_io.contains(index))
- continue;
- const auto &obj = graph()->operands().at(index);
- const auto frontend_layout = [&]() {
- if (obj.getUses().size() == 0)
- return ir::Layout::UNKNOWN;
- auto use_op_ind = *obj.getUses().begin(); // FIXME What if it has two or more uses?
- for (auto &operation_info : operation_list())
- {
- if (operation_info.index == use_op_ind)
- return operation_info.layout;
- }
- return ir::Layout::UNKNOWN;
- }();
- const auto &permute_factor = lower_info.operand.at(index)->def_factors().getOnlyElement();
- if (permute_factor.backend() != backend())
- continue;
- const auto backend_layout = permute_factor.layout();
- ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
- obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
- tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
- }
-
- // TODO Get compiler options from compiler, and use it rather than getting it from Env
- if (util::getConfigString(util::config::EXECUTOR) == "Linear")
- {
- cpu_common::planTensors(*this, order, op_seqs, lower_info);
- }
- else
- {
- // For the executors that does not have fixed linear execution order:
- // To make tensors never be deallocated, this is a workaround to use static memory planner
- for (auto ind : operand_list())
- {
- if (tensor_builder->isRegistered(ind))
- tensor_builder->notifyFirstUse(ind);
- }
- }
+ITensorRegistry *BackendContext::genTensors() { return basic::genTensors(*this); }
- tensor_builder->prepare();
-
- return tensor_registry.get();
-}
-
-FunctionMap BackendContext::genKernels(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs)
+FunctionMap BackendContext::genKernels()
{
FunctionMap ret;
- for (auto op_seq_ind : order)
+ for (auto op_ind : _data.op_order)
{
- const auto &op_seq = op_seqs.at(op_seq_ind);
- bool assigned = [&]() {
- for (auto op_info : operation_list())
- if (op_seq.exist(op_info.index))
- return true;
- return false;
- }();
- if (!assigned)
- continue;
- auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind));
- ret.emplace_back(op_seq_ind, std::move(fn_seq));
+ auto fn_seq = kernel_gen->generate(op_ind);
+ ret.emplace_back(op_ind, std::move(fn_seq));
}
- initConsts();
+ basic::initConsts(*this);
// NOTE For memory optimization, we want to free some operand data
- for (auto ind : operand_list())
- {
- // TODO Remove const_cast
- auto &obj = const_cast<ir::Graph *>(graph())->operands().at(ind);
- obj.releaseData();
- }
+ const_cast<ir::Graph &>(*_data.graph)
+ .operands()
+ .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
for (auto &it : ret)
{
diff --git a/runtime/onert/backend/ruy/BackendContext.h b/runtime/onert/backend/ruy/BackendContext.h
index b965c9a9d..0dc30f557 100644
--- a/runtime/onert/backend/ruy/BackendContext.h
+++ b/runtime/onert/backend/ruy/BackendContext.h
@@ -19,7 +19,6 @@
#include <backend/BackendContext.h>
#include "TensorBuilder.h"
-#include "ConstantInitializer.h"
#include "KernelGenerator.h"
#include "ExternalContext.h"
@@ -33,35 +32,28 @@ namespace ruy
class BackendContext : public onert::backend::BackendContext
{
public:
- BackendContext(const Backend *backend, const ir::Graph *graph,
+ BackendContext(const Backend *backend, ContextData &&data,
std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
std::shared_ptr<TensorBuilder> tensor_builder = nullptr,
- std::shared_ptr<ConstantInitializer> constant_initializer = nullptr,
std::shared_ptr<KernelGenerator> kernel_gen = nullptr)
- : onert::backend::BackendContext(backend, graph, tensor_registry),
- tensor_builder{tensor_builder}, constant_initializer{constant_initializer},
- kernel_gen{kernel_gen}, _external_context(new ExternalContext)
+ : onert::backend::BackendContext(backend, std::move(data), tensor_registry),
+ tensor_builder{tensor_builder}, kernel_gen{kernel_gen}, _external_context(new ExternalContext)
{
}
- ITensorRegistry *genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info) override;
+ ITensorRegistry *genTensors() override;
- FunctionMap genKernels(const std::vector<ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs) override;
+ FunctionMap genKernels() override;
std::shared_ptr<ExternalContext> external_context() { return _external_context; }
private:
- void initConsts();
- void planTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info);
+ void planTensors(const std::vector<onert::ir::OperationIndex> &order,
+ const compiler::GraphLowerInfo &lower_info);
public:
// TODO Make it private
std::shared_ptr<TensorBuilder> tensor_builder;
- std::shared_ptr<ConstantInitializer> constant_initializer;
std::shared_ptr<KernelGenerator> kernel_gen;
private:
diff --git a/runtime/onert/backend/ruy/Config.cc b/runtime/onert/backend/ruy/Config.cc
index 179caa9a6..c794f89bf 100644
--- a/runtime/onert/backend/ruy/Config.cc
+++ b/runtime/onert/backend/ruy/Config.cc
@@ -27,6 +27,6 @@ bool Config::initialize() { return true; }
ir::Layout Config::supportLayout(const ir::Operation &, ir::Layout) { return ir::Layout::NHWC; }
-} // namespace cpu
+} // namespace ruy
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/backend/ruy/ExternalContext.h b/runtime/onert/backend/ruy/ExternalContext.h
index f51faccb8..3cc4eaa5a 100644
--- a/runtime/onert/backend/ruy/ExternalContext.h
+++ b/runtime/onert/backend/ruy/ExternalContext.h
@@ -20,11 +20,6 @@
#include <util/ConfigSource.h>
#include <ruy/context.h>
-namespace
-{
-const int kDefaultNumThreadpoolThreads = 4;
-}
-
namespace onert
{
namespace backend
@@ -34,6 +29,9 @@ namespace ruy
class ExternalContext
{
+private:
+ static const int kDefaultNumThreadpoolThreads = 4;
+
public:
ExternalContext() : _ruy_context(new ::ruy::Context)
{
@@ -43,7 +41,7 @@ public:
void setMaxNumThreads(int max_num_threads)
{
const int target_num_threads =
- max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
+ max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
_ruy_context->set_max_num_threads(target_num_threads);
}
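
Note: the ternary above is a plain fallback; any negative request selects the class-scope default. A minimal usage sketch:

    ExternalContext ctx;
    ctx.setMaxNumThreads(-1); // negative -> kDefaultNumThreadpoolThreads (4)
    ctx.setMaxNumThreads(2);  // explicit positive value is used as-is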
diff --git a/runtime/onert/backend/ruy/KernelGenerator.cc b/runtime/onert/backend/ruy/KernelGenerator.cc
index cd2825068..c2f6a1f79 100644
--- a/runtime/onert/backend/ruy/KernelGenerator.cc
+++ b/runtime/onert/backend/ruy/KernelGenerator.cc
@@ -35,62 +35,58 @@ namespace backend
namespace ruy
{
-KernelGenerator::KernelGenerator(
- const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg,
- const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder,
- const std::shared_ptr<ExternalContext> &external_context)
- : _ctx(operands_ctx), _operations_ctx{operations_ctx}, _tensor_builder(tensor_builder),
- _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder),
- _current_layout(ir::Layout::UNKNOWN), _external_context(external_context)
+std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
- // DO NOTHING
-}
+ auto ret = std::make_unique<exec::FunctionSequence>();
-void KernelGenerator::visit(const ir::OpSequence &op_seq)
-{
- assert(!_return_fn_seq);
assert(_tensor_builder->dynamicTensorManager());
assert(_tensor_reg);
auto dyn_shape_inferer = std::make_shared<exec::DynamicShapeInferer>(_ctx, _tensor_reg);
- _return_fn_seq = std::make_unique<exec::FunctionSequence>();
-
// Prepare to handle dynamic tensors later
auto dyn_ctx = std::make_shared<exec::FunctionSequence::DynamicTensorCtx>();
{
- dyn_ctx->op_seq = &op_seq;
+ dyn_ctx->op_ind = ind;
dyn_ctx->operations = &_operations_ctx;
dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer);
- dyn_ctx->dynamic_tensor_manager = _tensor_builder->dynamicTensorManager();
- _return_fn_seq->dynamic_tensor_ctx(dyn_ctx);
+ ret->dynamic_tensor_ctx(dyn_ctx);
}
- _current_layout = op_seq.getLayout();
- for (const auto &operation_idx : op_seq.operations())
+ auto &op = _graph.operations().at(ind);
+ op.accept(*this);
+ assert(_return_fn); // _return_fn must have been generated
+ ret->append(std::move(_return_fn));
+
+ for (auto ind : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
{
- const auto &node = _operations_ctx.at(operation_idx);
- node.accept(*this);
- _return_fn_seq->append(releaseFunction());
+ auto portable_tensor = _tensor_reg->getPortableTensor(ind);
+ if (portable_tensor)
+ {
+ assert(portable_tensor->layout() == ir::Layout::NHWC);
+ }
- for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs())
+ auto tensor = _tensor_reg->getNativeTensor(ind);
+ if (tensor)
{
- auto portable_tensor = _tensor_reg->getPortableTensor(ind);
- if (portable_tensor)
- {
- assert(portable_tensor->layout() == ir::Layout::NHWC);
- }
-
- auto tensor = _tensor_reg->getNativeTensor(ind);
- if (tensor)
- {
- tensor->increase_ref();
- }
+ tensor->increase_ref();
}
}
+ return ret;
+}
+
+KernelGenerator::KernelGenerator(
+ const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<basic::TensorRegistry> &tensor_reg,
+ const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder,
+ const std::shared_ptr<ExternalContext> &external_context)
+ : basic::KernelGeneratorBase{graph},
+ _ctx(graph.operands()), _operations_ctx{graph.operations()}, _current_layout{graph.layout()},
+ _tensor_builder(tensor_builder), _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder),
+ _external_context(external_context)
+{
+ // DO NOTHING
}
void KernelGenerator::visit(const ir::operation::Conv2D &node)
@@ -131,8 +127,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_width = ker_shape.dim(2);
const auto padding =
- ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
- dilation.width_factor, dilation.height_factor);
+ ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
+ dilation.width_factor, dilation.height_factor);
fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left,
padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical,
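
Note: generate() now produces one FunctionSequence per operation, and the genKernels() hunks earlier in this commit consume it in the linear order carried by ContextData. A condensed sketch of that caller side:

    // Sketch: how BackendContext::genKernels() drives generate()
    // (see the BackendContext.cc hunks above).
    FunctionMap ret;
    for (auto op_ind : _data.op_order) // linear op order from ContextData
      ret.emplace_back(op_ind, kernel_gen->generate(op_ind));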
diff --git a/runtime/onert/backend/ruy/KernelGenerator.h b/runtime/onert/backend/ruy/KernelGenerator.h
index 0f6bd590a..31551c46c 100644
--- a/runtime/onert/backend/ruy/KernelGenerator.h
+++ b/runtime/onert/backend/ruy/KernelGenerator.h
@@ -19,11 +19,11 @@
#include "ExternalContext.h"
#include "TensorBuilder.h"
-#include "backend/cpu_common/TensorRegistry.h"
+#include "backend/basic/TensorRegistry.h"
#include "Tensor.h"
#include <backend/CustomKernelBuilder.h>
-#include <backend/cpu_common/KernelGeneratorBase.h>
+#include <backend/basic/KernelGeneratorBase.h>
#include <ir/Operands.h>
#include <ir/Operations.h>
@@ -34,26 +34,27 @@ namespace backend
namespace ruy
{
-class KernelGenerator : public cpu_common::KernelGeneratorBase
+class KernelGenerator : public basic::KernelGeneratorBase
{
public:
- KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg,
+ KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<basic::TensorRegistry> &tensor_reg,
const std::shared_ptr<custom::IKernelBuilder> &kernel_builder,
const std::shared_ptr<ExternalContext> &external_context);
- void visit(const ir::OpSequence &) override;
+ std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) override;
+
+private:
void visit(const ir::operation::Conv2D &) override;
void visit(const ir::operation::FullyConnected &) override;
private:
const ir::Operands &_ctx;
const ir::Operations &_operations_ctx;
+ const ir::Layout _current_layout;
std::shared_ptr<TensorBuilder> _tensor_builder;
- std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
+ std::shared_ptr<basic::TensorRegistry> _tensor_reg;
std::shared_ptr<backend::custom::IKernelBuilder> _kernel_builder;
- ir::Layout _current_layout;
const std::shared_ptr<ExternalContext> _external_context;
};
diff --git a/runtime/onert/backend/ruy/StaticTensorManager.h b/runtime/onert/backend/ruy/StaticTensorManager.h
index af2d25241..867e4dedb 100644
--- a/runtime/onert/backend/ruy/StaticTensorManager.h
+++ b/runtime/onert/backend/ruy/StaticTensorManager.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_BACKEND_RUY_STATICTENSOR_MANAGER_H__
#define __ONERT_BACKEND_RUY_STATICTENSOR_MANAGER_H__
-#include "backend/cpu_common/StaticTensorManager.h"
+#include "backend/basic/StaticTensorManager.h"
namespace onert
{
@@ -26,7 +26,7 @@ namespace backend
namespace ruy
{
-using StaticTensorManager = cpu_common::StaticTensorManager;
+using StaticTensorManager = basic::StaticTensorManager;
} // namespace ruy
} // namespace backend
diff --git a/runtime/onert/backend/ruy/Tensor.h b/runtime/onert/backend/ruy/Tensor.h
index 60d0fbf77..658086018 100644
--- a/runtime/onert/backend/ruy/Tensor.h
+++ b/runtime/onert/backend/ruy/Tensor.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_BACKEND_RUY_TENSOR_H__
#define __ONERT_BACKEND_RUY_TENSOR_H__
-#include <backend/cpu_common/Tensor.h>
+#include <backend/basic/Tensor.h>
#include <ir/Data.h>
namespace onert
@@ -27,8 +27,8 @@ namespace backend
namespace ruy
{
-using Tensor = cpu_common::Tensor;
-using ExternalTensor = cpu_common::ExternalTensor;
+using Tensor = basic::Tensor;
+using ExternalTensor = basic::ExternalTensor;
} // namespace ruy
} // namespace backend
diff --git a/runtime/onert/backend/ruy/TensorBuilder.h b/runtime/onert/backend/ruy/TensorBuilder.h
index 91c07bd82..15d4e5b29 100644
--- a/runtime/onert/backend/ruy/TensorBuilder.h
+++ b/runtime/onert/backend/ruy/TensorBuilder.h
@@ -17,15 +17,7 @@
#ifndef __ONERT_BACKEND_RUY_TENSOR_BUILDER_H__
#define __ONERT_BACKEND_RUY_TENSOR_BUILDER_H__
-#include <backend/cpu_common/DynamicTensorManager.h>
-#include <backend/cpu_common/TensorRegistry.h>
-
-#include <ir/OperandIndexMap.h>
-
-#include "StaticTensorManager.h"
-#include "Tensor.h"
-
-#include <unordered_map>
+#include <backend/basic/TensorBuilder.h>
namespace onert
{
@@ -34,37 +26,7 @@ namespace backend
namespace ruy
{
-class TensorBuilder
-{
-public:
- TensorBuilder(const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg);
-
- /**
- * @brief Register tensor information to allocate on CPU backend
- * @param[in] ind Operand index
- * @param[in] info Operand information
- * @param[in] layout Operand data layout
- */
- void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
- ir::Layout backend_layout);
-
- void notifyFirstUse(const ir::OperandIndex &);
- void notifyLastUse(const ir::OperandIndex &);
-
- bool isRegistered(const ir::OperandIndex &) const;
-
- void prepare(void);
- void allocate();
- void postFunctionPrepare() { /* DO NOTHING */}
-
- IDynamicTensorManager *dynamicTensorManager(void) { return _dynamic_tensor_mgr.get(); }
-
-private:
- const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
- std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr;
- std::unique_ptr<StaticTensorManager> _static_tensor_mgr;
- ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
-};
+using TensorBuilder = basic::TensorBuilder;
} // namespace ruy
} // namespace backend
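
Note: the per-backend TensorBuilder implementations are collapsed into the alias above, so basic::TensorBuilder is assumed to keep the interface the removed class declared. A hedged sketch of the usual call order:

    // Sketch, assuming basic::TensorBuilder preserves the removed interface.
    tb->registerTensorInfo(ind, info, ir::Layout::NHWC); // declare the tensor
    tb->notifyFirstUse(ind);  // claim a memory plan for static tensors
    tb->notifyLastUse(ind);   // release the plan
    tb->prepare();            // allocates non-constants, per the removed comment
    tb->allocate();           // currently a no-op, per the removed comment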
diff --git a/runtime/onert/backend/ruy/ops/ConvolutionLayer.cc b/runtime/onert/backend/ruy/ops/ConvolutionLayer.cc
index d249b2ce3..1a2441082 100644
--- a/runtime/onert/backend/ruy/ops/ConvolutionLayer.cc
+++ b/runtime/onert/backend/ruy/ops/ConvolutionLayer.cc
@@ -28,11 +28,11 @@ namespace ruy
namespace ops
{
ConvolutionLayer::ConvolutionLayer()
- : _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
- _paddingType(ir::PaddingType::EXPLICIT), _paddingLeft(0), _paddingTop(0), _paddingRight(0),
- _paddingBottom(0), _strideWidth(0), _strideHeight(0), _dilationWidthFactor(1),
- _dilationHeightFactor(1), _activation(ir::Activation::NONE),
- _conv_kernel(new nnfw::ruy::Conv()), _prepare(false)
+ : _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
+ _paddingType(ir::PaddingType::EXPLICIT), _paddingLeft(0), _paddingTop(0), _paddingRight(0),
+ _paddingBottom(0), _strideWidth(0), _strideHeight(0), _dilationWidthFactor(1),
+ _dilationHeightFactor(1), _activation(ir::Activation::NONE),
+ _conv_kernel(new nnfw::ruy::Conv()), _prepare(false)
{
// DO NOTHING
}
@@ -115,8 +115,8 @@ void ConvolutionLayer::run()
param_padding.param.bottom = _paddingBottom;
const auto padding =
- ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
- _dilationWidthFactor, _dilationHeightFactor);
+ ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
+ _dilationWidthFactor, _dilationHeightFactor);
_paddingLeft = padding.left;
_paddingRight = padding.right;
diff --git a/runtime/onert/backend/ruy/ops/FullyConnectedLayer.cc b/runtime/onert/backend/ruy/ops/FullyConnectedLayer.cc
index af693e3b4..9c9f31179 100644
--- a/runtime/onert/backend/ruy/ops/FullyConnectedLayer.cc
+++ b/runtime/onert/backend/ruy/ops/FullyConnectedLayer.cc
@@ -30,8 +30,8 @@ namespace ops
{
FullyConnectedLayer::FullyConnectedLayer()
- : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
- _activation(ir::Activation::NONE), _external_context(nullptr)
+ : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
+ _activation(ir::Activation::NONE), _external_context(nullptr)
{
// DO NOTHING
}
@@ -51,11 +51,11 @@ void FullyConnectedLayer::fullyConnectedFloat32()
op_params.rhs_cacheable = _input->is_constant();
nnfw::ruy::FullyConnected(
- op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
- getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
- getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()),
- _external_context->ruy_context());
+ op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+ getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
+ getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()),
+ _external_context->ruy_context());
}
void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights,
diff --git a/runtime/onert/backend/xnnpack/Backend.h b/runtime/onert/backend/xnnpack/Backend.h
index b7aef1625..67494a534 100644
--- a/runtime/onert/backend/xnnpack/Backend.h
+++ b/runtime/onert/backend/xnnpack/Backend.h
@@ -19,7 +19,6 @@
#include "BackendContext.h"
#include "Config.h"
-#include "ConstantInitializer.h"
#include "KernelGenerator.h"
#include <backend/Backend.h>
@@ -40,19 +39,16 @@ public:
std::shared_ptr<IConfig> config() const override { return _config; }
- std::unique_ptr<onert::backend::BackendContext>
- newContext(const ir::Graph &graph, const std::shared_ptr<custom::IKernelBuilder> &kb,
- bool) const override
+ std::unique_ptr<onert::backend::BackendContext> newContext(ContextData &&data) const override
{
- const auto &operands = graph.operands();
- const auto &operations = graph.operations();
- auto context = std::make_unique<BackendContext>(this, &graph);
- auto tr = std::make_shared<cpu_common::TensorRegistry>();
+ auto custom_kernel_builder = data.custom_kernel_builder;
+ auto &graph = *data.graph;
+ auto context = std::make_unique<BackendContext>(this, std::move(data));
+ auto tr = std::make_shared<basic::TensorRegistry>();
auto tb = std::make_shared<TensorBuilder>(tr);
context->tensor_registry = tr;
context->tensor_builder = tb;
- context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tr);
- context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, tr, kb,
+ context->kernel_gen = std::make_shared<KernelGenerator>(graph, tb, tr, custom_kernel_builder,
context->external_context());
return context;
}
diff --git a/runtime/onert/backend/xnnpack/BackendContext.cc b/runtime/onert/backend/xnnpack/BackendContext.cc
index 503d088aa..42fffb608 100644
--- a/runtime/onert/backend/xnnpack/BackendContext.cc
+++ b/runtime/onert/backend/xnnpack/BackendContext.cc
@@ -22,7 +22,7 @@
#include "ir/Index.h"
#include "ir/OperandIndexMap.h"
#include "ir/OperandIndexSequence.h"
-#include "backend/cpu_common/BackendContextHelpers.h"
+#include "backend/basic/BackendContextHelpers.h"
namespace onert
{
@@ -31,107 +31,24 @@ namespace backend
namespace xnnpack
{
-void BackendContext::initConsts()
-{
- for (auto &op : operation_list())
- {
- constant_initializer->setLayout(op.layout);
- graph()->operations().at(op.index).accept(*constant_initializer);
- }
-
- for (auto ind : operand_list())
- {
- const auto &obj = graph()->operands().at(ind);
- if (obj.isConstant() && !constant_initializer->exist(ind))
- {
- constant_initializer->registerDefaultInitializer(ind, obj);
- }
- }
-
- constant_initializer->run();
-}
-
-ITensorRegistry *BackendContext::genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info)
-{
- auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED |
- ir::Remove::DUPLICATED;
- for (auto index : operand_list())
- {
- if (model_io.contains(index))
- continue;
- const auto &obj = graph()->operands().at(index);
- const auto frontend_layout = [&]() {
- if (obj.getUses().size() == 0)
- return ir::Layout::UNKNOWN;
- auto use_op_ind = *obj.getUses().begin(); // FIXME What if it has two or more uses?
- for (auto &operation_info : operation_list())
- {
- if (operation_info.index == use_op_ind)
- return operation_info.layout;
- }
- return ir::Layout::UNKNOWN;
- }();
- const auto &permute_factor = lower_info.operand.at(index)->def_factors().getOnlyElement();
- if (permute_factor.backend() != backend())
- continue;
- const auto backend_layout = permute_factor.layout();
- ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
- obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
- tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
- }
-
- // TODO Get compiler options from compiler, and use it rather than getting it from Env
- if (util::getConfigString(util::config::EXECUTOR) == "Linear")
- {
- cpu_common::planTensors(*this, order, op_seqs, lower_info);
- }
- else
- {
- // For the executors that does not have fixed linear execution order:
- // To make tensors never be deallocated, this is a workaround to use static memory planner
- for (auto ind : operand_list())
- {
- if (tensor_builder->isRegistered(ind))
- tensor_builder->notifyFirstUse(ind);
- }
- }
+ITensorRegistry *BackendContext::genTensors() { return basic::genTensors(*this); }
- tensor_builder->prepare();
-
- return tensor_registry.get();
-}
-
-FunctionMap BackendContext::genKernels(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs)
+FunctionMap BackendContext::genKernels()
{
FunctionMap ret;
- for (auto op_seq_ind : order)
+ for (auto op_ind : _data.op_order)
{
- const auto &op_seq = op_seqs.at(op_seq_ind);
- bool assigned = [&]() {
- for (auto op_info : operation_list())
- if (op_seq.exist(op_info.index))
- return true;
- return false;
- }();
- if (!assigned)
- continue;
- auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind));
- ret.emplace_back(op_seq_ind, std::move(fn_seq));
+ auto fn_seq = kernel_gen->generate(op_ind);
+ ret.emplace_back(op_ind, std::move(fn_seq));
}
- initConsts();
+ basic::initConsts(*this);
// NOTE For memory optimization, we want to free some operand data
- for (auto ind : operand_list())
- {
- // TODO Remove const_cast
- auto &obj = const_cast<ir::Graph *>(graph())->operands().at(ind);
- obj.releaseData();
- }
+ const_cast<ir::Graph &>(*_data.graph)
+ .operands()
+ .iterate([&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
for (auto &it : ret)
{
diff --git a/runtime/onert/backend/xnnpack/BackendContext.h b/runtime/onert/backend/xnnpack/BackendContext.h
index f81175b9e..e3b66eef3 100644
--- a/runtime/onert/backend/xnnpack/BackendContext.h
+++ b/runtime/onert/backend/xnnpack/BackendContext.h
@@ -20,14 +20,10 @@
#include <backend/BackendContext.h>
#include <util/ConfigSource.h>
#include "TensorBuilder.h"
-#include "ConstantInitializer.h"
#include "KernelGenerator.h"
#include "ExternalContext.h"
-namespace
-{
const int kDefaultNumThreadpoolThreads = 1;
-}
namespace onert
{
@@ -39,14 +35,12 @@ namespace xnnpack
class BackendContext : public onert::backend::BackendContext
{
public:
- BackendContext(const Backend *backend, const ir::Graph *graph,
+ BackendContext(const Backend *backend, ContextData &&data,
std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
std::shared_ptr<TensorBuilder> tensor_builder = nullptr,
- std::shared_ptr<ConstantInitializer> constant_initializer = nullptr,
std::shared_ptr<KernelGenerator> kernel_gen = nullptr)
- : onert::backend::BackendContext(backend, graph, tensor_registry),
- tensor_builder{tensor_builder}, constant_initializer{constant_initializer},
- kernel_gen{kernel_gen}, _external_context(nullptr)
+ : onert::backend::BackendContext(backend, std::move(data), tensor_registry),
+ tensor_builder{tensor_builder}, kernel_gen{kernel_gen}, _external_context(nullptr)
{
int num_threads = util::getConfigInt(util::config::XNNPACK_THREADS);
if (num_threads < 1)
@@ -54,24 +48,14 @@ public:
_external_context.reset(new ExternalContext(static_cast<size_t>(num_threads)));
}
- ITensorRegistry *genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info) override;
-
- FunctionMap genKernels(const std::vector<ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs) override;
+ ITensorRegistry *genTensors() override;
+ FunctionMap genKernels() override;
std::shared_ptr<ExternalContext> external_context() { return _external_context; }
-private:
- void initConsts();
- void planTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info);
-
public:
// TODO Make it private
std::shared_ptr<TensorBuilder> tensor_builder;
- std::shared_ptr<ConstantInitializer> constant_initializer;
std::shared_ptr<KernelGenerator> kernel_gen;
private:
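
Note: this header keeps kDefaultNumThreadpoolThreads as a namespace-scope constant in a header, while the ruy hunk above moved its equivalent into the class. A sketch of the same class-scope approach applied here (illustrative only):

    class BackendContext : public onert::backend::BackendContext
    {
    private:
      static constexpr int kDefaultNumThreadpoolThreads = 1;
      // ... remaining members as in the hunk above
    };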
diff --git a/runtime/onert/backend/xnnpack/Config.cc b/runtime/onert/backend/xnnpack/Config.cc
index 4d42a3f18..8783ff390 100644
--- a/runtime/onert/backend/xnnpack/Config.cc
+++ b/runtime/onert/backend/xnnpack/Config.cc
@@ -39,6 +39,6 @@ bool Config::initialize()
ir::Layout Config::supportLayout(const ir::Operation &, ir::Layout) { return ir::Layout::NHWC; }
-} // namespace cpu
+} // namespace xnnpack
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/backend/xnnpack/ConstantInitializer.h b/runtime/onert/backend/xnnpack/ConstantInitializer.h
deleted file mode 100644
index 45cdd8cd9..000000000
--- a/runtime/onert/backend/xnnpack/ConstantInitializer.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_XNNPACK_CONSTANT_INITIALIZER_H__
-#define __ONERT_BACKEND_XNNPACK_CONSTANT_INITIALIZER_H__
-
-#include <backend/cpu_common/ConstantInitializer.h>
-
-namespace onert
-{
-namespace backend
-{
-namespace xnnpack
-{
-
-using ConstantInitializer = cpu_common::ConstantInitializer;
-
-} // namespace xnnpack
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_XNNPACK_CONSTANT_INITIALIZER_H__
diff --git a/runtime/onert/backend/xnnpack/ExternalContext.cc b/runtime/onert/backend/xnnpack/ExternalContext.cc
index 3a9fe1b55..1fbcd4f02 100644
--- a/runtime/onert/backend/xnnpack/ExternalContext.cc
+++ b/runtime/onert/backend/xnnpack/ExternalContext.cc
@@ -26,7 +26,7 @@ namespace xnnpack
{
ExternalContext::ExternalContext(size_t num_threads)
- : _threadpool(pthreadpool_create(num_threads), pthreadpool_destroy)
+ : _threadpool(pthreadpool_create(num_threads), pthreadpool_destroy)
{
assert(_threadpool);
}
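
Note: the initializer above ties pthreadpool_create to pthreadpool_destroy through the smart-pointer deleter. A standalone sketch of the same RAII pattern, using only the pthreadpool calls already present in this file:

    #include <pthreadpool.h>
    #include <memory>

    // Owns a pthreadpool and destroys it on scope exit, as the member above does.
    using ThreadPool = std::unique_ptr<pthreadpool, decltype(&pthreadpool_destroy)>;

    ThreadPool makePool(size_t num_threads)
    {
      return ThreadPool{pthreadpool_create(num_threads), pthreadpool_destroy};
    }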
diff --git a/runtime/onert/backend/xnnpack/KernelGenerator.cc b/runtime/onert/backend/xnnpack/KernelGenerator.cc
index b7d3f60fb..28f729d77 100644
--- a/runtime/onert/backend/xnnpack/KernelGenerator.cc
+++ b/runtime/onert/backend/xnnpack/KernelGenerator.cc
@@ -37,61 +37,57 @@ namespace xnnpack
{
KernelGenerator::KernelGenerator(
- const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg,
- const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder,
- const std::shared_ptr<ExternalContext> &external_context)
- : _ctx(operands_ctx), _operations_ctx{operations_ctx}, _tensor_builder(tensor_builder),
- _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder),
- _current_layout(ir::Layout::UNKNOWN), _external_context(external_context)
+ const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<basic::TensorRegistry> &tensor_reg,
+ const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder,
+ const std::shared_ptr<ExternalContext> &external_context)
+ : basic::KernelGeneratorBase{graph},
+ _ctx(graph.operands()), _operations_ctx{graph.operations()}, _current_layout{graph.layout()},
+ _tensor_builder(tensor_builder), _tensor_reg{tensor_reg}, _kernel_builder(kernel_builder),
+ _external_context(external_context)
{
// DO NOTHING
}
-void KernelGenerator::visit(const ir::OpSequence &op_seq)
+std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
- assert(!_return_fn_seq);
+ auto ret = std::make_unique<exec::FunctionSequence>();
+
assert(_tensor_builder->dynamicTensorManager());
assert(_tensor_reg);
auto dyn_shape_inferer = std::make_shared<exec::DynamicShapeInferer>(_ctx, _tensor_reg);
- _return_fn_seq = std::make_unique<exec::FunctionSequence>();
-
// Prepare to handle dynamic tensors later
auto dyn_ctx = std::make_shared<exec::FunctionSequence::DynamicTensorCtx>();
{
- dyn_ctx->op_seq = &op_seq;
+ dyn_ctx->op_ind = ind;
dyn_ctx->operations = &_operations_ctx;
dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer);
- dyn_ctx->dynamic_tensor_manager = _tensor_builder->dynamicTensorManager();
- _return_fn_seq->dynamic_tensor_ctx(dyn_ctx);
+ ret->dynamic_tensor_ctx(dyn_ctx);
}
- _current_layout = op_seq.getLayout();
- for (const auto &operation_idx : op_seq.operations())
+ auto &op = _graph.operations().at(ind);
+ op.accept(*this);
+ assert(_return_fn); // _return_fn must have been generated
+ ret->append(std::move(_return_fn));
+
+ for (auto ind : (op.getInputs() | ir::Remove::UNDEFINED) + op.getOutputs())
{
- const auto &node = _operations_ctx.at(operation_idx);
- node.accept(*this);
- _return_fn_seq->append(releaseFunction());
+ auto portable_tensor = _tensor_reg->getPortableTensor(ind);
+ if (portable_tensor)
+ {
+ assert(portable_tensor->layout() == ir::Layout::NHWC);
+ }
- for (const auto &ind : (node.getInputs() | ir::Remove::UNDEFINED) + node.getOutputs())
+ auto tensor = _tensor_reg->getNativeTensor(ind);
+ if (tensor)
{
- auto portable_tensor = _tensor_reg->getPortableTensor(ind);
- if (portable_tensor)
- {
- assert(portable_tensor->layout() == ir::Layout::NHWC);
- }
-
- auto tensor = _tensor_reg->getNativeTensor(ind);
- if (tensor)
- {
- tensor->increase_ref();
- }
+ tensor->increase_ref();
}
}
+ return ret;
}
void KernelGenerator::visit(const ir::operation::Conv2D &node)
@@ -122,8 +118,8 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_width = ker_shape.dim(2);
const auto padding =
- ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
- dilation.width_factor, dilation.height_factor);
+ ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height,
+ dilation.width_factor, dilation.height_factor);
fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left,
padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical,
diff --git a/runtime/onert/backend/xnnpack/KernelGenerator.h b/runtime/onert/backend/xnnpack/KernelGenerator.h
index 265824204..271a60653 100644
--- a/runtime/onert/backend/xnnpack/KernelGenerator.h
+++ b/runtime/onert/backend/xnnpack/KernelGenerator.h
@@ -19,11 +19,11 @@
#include "ExternalContext.h"
#include "TensorBuilder.h"
-#include "backend/cpu_common/TensorRegistry.h"
+#include "backend/basic/TensorRegistry.h"
#include "Tensor.h"
#include <backend/CustomKernelBuilder.h>
-#include <backend/cpu_common/KernelGeneratorBase.h>
+#include <backend/basic/KernelGeneratorBase.h>
#include <ir/Operands.h>
#include <ir/Operations.h>
@@ -34,16 +34,17 @@ namespace backend
namespace xnnpack
{
-class KernelGenerator : public cpu_common::KernelGeneratorBase
+class KernelGenerator : public basic::KernelGeneratorBase
{
public:
- KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
- const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg,
+ KernelGenerator(const ir::Graph &graph, const std::shared_ptr<TensorBuilder> &tensor_builder,
+ const std::shared_ptr<basic::TensorRegistry> &tensor_reg,
const std::shared_ptr<custom::IKernelBuilder> &kernel_builder,
const std::shared_ptr<ExternalContext> &external_context);
- void visit(const ir::OpSequence &) override;
+ std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) override;
+
+private:
void visit(const ir::operation::Conv2D &) override;
void visit(const ir::operation::DepthwiseConv2D &) override;
void visit(const ir::operation::FullyConnected &) override;
@@ -51,10 +52,10 @@ public:
private:
const ir::Operands &_ctx;
const ir::Operations &_operations_ctx;
+ ir::Layout _current_layout;
std::shared_ptr<TensorBuilder> _tensor_builder;
- std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
+ std::shared_ptr<basic::TensorRegistry> _tensor_reg;
std::shared_ptr<backend::custom::IKernelBuilder> _kernel_builder;
- ir::Layout _current_layout;
const std::shared_ptr<ExternalContext> _external_context;
};
diff --git a/runtime/onert/backend/xnnpack/StaticTensorManager.h b/runtime/onert/backend/xnnpack/StaticTensorManager.h
index f7344e8d8..adaa3623d 100644
--- a/runtime/onert/backend/xnnpack/StaticTensorManager.h
+++ b/runtime/onert/backend/xnnpack/StaticTensorManager.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_BACKEND_XNNPACK_STATICTENSOR_MANAGER_H__
#define __ONERT_BACKEND_XNNPACK_STATICTENSOR_MANAGER_H__
-#include "backend/cpu_common/StaticTensorManager.h"
+#include "backend/basic/StaticTensorManager.h"
namespace onert
{
@@ -26,7 +26,7 @@ namespace backend
namespace xnnpack
{
-using StaticTensorManager = cpu_common::StaticTensorManager;
+using StaticTensorManager = basic::StaticTensorManager;
} // namespace xnnpack
} // namespace backend
diff --git a/runtime/onert/backend/xnnpack/Tensor.h b/runtime/onert/backend/xnnpack/Tensor.h
index b39cbd266..147361109 100644
--- a/runtime/onert/backend/xnnpack/Tensor.h
+++ b/runtime/onert/backend/xnnpack/Tensor.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_BACKEND_XNNPACK_TENSOR_H__
#define __ONERT_BACKEND_XNNPACK_TENSOR_H__
-#include <backend/cpu_common/Tensor.h>
+#include <backend/basic/Tensor.h>
#include <ir/Data.h>
namespace onert
@@ -27,8 +27,8 @@ namespace backend
namespace xnnpack
{
-using Tensor = cpu_common::Tensor;
-using ExternalTensor = cpu_common::ExternalTensor;
+using Tensor = basic::Tensor;
+using ExternalTensor = basic::ExternalTensor;
} // namespace xnnpack
} // namespace backend
diff --git a/runtime/onert/backend/xnnpack/TensorBuilder.cc b/runtime/onert/backend/xnnpack/TensorBuilder.cc
deleted file mode 100644
index b570144ce..000000000
--- a/runtime/onert/backend/xnnpack/TensorBuilder.cc
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "TensorBuilder.h"
-
-#include <util/logging.h>
-
-#include <cassert>
-
-namespace onert
-{
-namespace backend
-{
-namespace xnnpack
-{
-
-TensorBuilder::TensorBuilder(const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg)
- : _tensor_reg{tensor_reg},
- _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)},
- _static_tensor_mgr{new StaticTensorManager(_tensor_reg, _dynamic_tensor_mgr.get())}
-{
- /* empty */
-}
-
-void TensorBuilder::registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
- ir::Layout layout)
-{
- _tensor_info_map.emplace(ind, info);
-
- // XNNPACK backend supports only one layout as NHWC
- assert(layout == ir::Layout::NHWC);
- if (info.isDynamic())
- {
- _dynamic_tensor_mgr->buildTensor(ind, info, layout);
- }
- else
- {
- _static_tensor_mgr->buildTensor(ind, info, layout, info.isConstant());
- }
-}
-
-void TensorBuilder::notifyFirstUse(const ir::OperandIndex &ind)
-{
- assert(_tensor_info_map.find(ind) != _tensor_info_map.end());
- const auto tensor_info = _tensor_info_map.at(ind);
-
- if (!_tensor_reg->getNativeTensor(ind)->is_dynamic())
- {
- const auto size = tensor_info.total_size();
- _static_tensor_mgr->claimPlan(ind, size);
- }
-}
-
-void TensorBuilder::notifyLastUse(const ir::OperandIndex &ind)
-{
- if (!_tensor_reg->getNativeTensor(ind)->is_dynamic())
- {
- _static_tensor_mgr->releasePlan(ind);
- }
-}
-
-bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const
-{
- return _tensor_info_map.find(ind) != _tensor_info_map.end();
-}
-
-void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); }
-
-void TensorBuilder::allocate()
-{
- // NOTE For now nothing to do. Allocation is done in prepare stage, which is not appropriate
- // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation.
-}
-
-} // namespace xnnpack
-} // namespace backend
-} // namespace onert
diff --git a/runtime/onert/backend/xnnpack/TensorBuilder.h b/runtime/onert/backend/xnnpack/TensorBuilder.h
index dddfedbf9..cbb7c9e18 100644
--- a/runtime/onert/backend/xnnpack/TensorBuilder.h
+++ b/runtime/onert/backend/xnnpack/TensorBuilder.h
@@ -17,15 +17,7 @@
#ifndef __ONERT_BACKEND_XNNPACK_TENSOR_BUILDER_H__
#define __ONERT_BACKEND_XNNPACK_TENSOR_BUILDER_H__
-#include <backend/cpu_common/DynamicTensorManager.h>
-#include <backend/cpu_common/TensorRegistry.h>
-
-#include <ir/OperandIndexMap.h>
-
-#include "StaticTensorManager.h"
-#include "Tensor.h"
-
-#include <unordered_map>
+#include <backend/basic/TensorBuilder.h>
namespace onert
{
@@ -34,37 +26,7 @@ namespace backend
namespace xnnpack
{
-class TensorBuilder
-{
-public:
- TensorBuilder(const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg);
-
- /**
- * @brief Register tensor information to allocate on XNNPACK backend
- * @param[in] ind Operand index
- * @param[in] info Operand information
- * @param[in] layout Operand data layout
- */
- void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
- ir::Layout backend_layout);
-
- void notifyFirstUse(const ir::OperandIndex &);
- void notifyLastUse(const ir::OperandIndex &);
-
- bool isRegistered(const ir::OperandIndex &) const;
-
- void prepare(void);
- void allocate();
- void postFunctionPrepare() { /* DO NOTHING */}
-
- IDynamicTensorManager *dynamicTensorManager(void) { return _dynamic_tensor_mgr.get(); }
-
-private:
- const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
- std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr;
- std::unique_ptr<StaticTensorManager> _static_tensor_mgr;
- ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
-};
+using TensorBuilder = basic::TensorBuilder;
} // namespace xnnpack
} // namespace backend
diff --git a/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc b/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc
index 0612995c2..32ca99460 100644
--- a/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc
+++ b/runtime/onert/backend/xnnpack/ops/ConvolutionLayer.cc
@@ -27,10 +27,10 @@ namespace xnnpack
namespace ops
{
ConvolutionLayer::ConvolutionLayer(const std::shared_ptr<ExternalContext> external_context)
- : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
- _padding_type(ir::PaddingType::EXPLICIT), _padding_left(0), _padding_top(0),
- _padding_right(0), _padding_bottom(0), _stride_width(0), _stride_height(0),
- _dilation_width_factor(1), _dilation_height_factor(1), _activation(ir::Activation::NONE)
+ : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
+ _padding_type(ir::PaddingType::EXPLICIT), _padding_left(0), _padding_top(0), _padding_right(0),
+ _padding_bottom(0), _stride_width(0), _stride_height(0), _dilation_width_factor(1),
+ _dilation_height_factor(1), _activation(ir::Activation::NONE)
{
// DO NOTHING
}
@@ -105,14 +105,13 @@ bool ConvolutionLayer::create()
assert(static_cast<uint32_t>(_output->getShape().dim(3)) == output_channels);
enum xnn_status status = xnn_create_convolution2d_nhwc_f32(
- _padding_top, _padding_right, _padding_bottom, _padding_left, kernel_height, kernel_width,
- _stride_height, _stride_width, _dilation_height_factor, _dilation_width_factor,
- 1 /* groups */, input_channels /* group_input_channels */,
- output_channels /* group_output_channels */, input_channels /* input_channel_stride */,
- output_channels /* output_channel_stride */,
- reinterpret_cast<const float *>(_kernel->buffer()),
- reinterpret_cast<const float *>(_bias->buffer()), output_activation_min,
- output_activation_max, 0, &_kernel_op);
+ _padding_top, _padding_right, _padding_bottom, _padding_left, kernel_height, kernel_width,
+ _stride_height, _stride_width, _dilation_height_factor, _dilation_width_factor, 1 /* groups */,
+ input_channels /* group_input_channels */, output_channels /* group_output_channels */,
+ input_channels /* input_channel_stride */, output_channels /* output_channel_stride */,
+ reinterpret_cast<const float *>(_kernel->buffer()),
+ reinterpret_cast<const float *>(_bias->buffer()), output_activation_min, output_activation_max,
+ 0, &_kernel_op);
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 Convolution operator"};
@@ -133,9 +132,9 @@ bool ConvolutionLayer::setup()
uint32_t input_height = _input->getShape().dim(1);
uint32_t batch_size = _input->getShape().dim(0);
enum xnn_status status = xnn_setup_convolution2d_nhwc_f32(
- _kernel_op, batch_size, input_height, input_width,
- reinterpret_cast<const float *>(_input->buffer()),
- reinterpret_cast<float *>(_output->buffer()), _external_context->getThreadPool());
+ _kernel_op, batch_size, input_height, input_width,
+ reinterpret_cast<const float *>(_input->buffer()), reinterpret_cast<float *>(_output->buffer()),
+ _external_context->getThreadPool());
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 Convolution operator"};
diff --git a/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc b/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc
index 947f04194..9a671d487 100644
--- a/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc
+++ b/runtime/onert/backend/xnnpack/ops/DepthwiseConvolutionLayer.cc
@@ -28,21 +28,21 @@ namespace ops
{
DepthwiseConvolutionLayer::DepthwiseConvolutionLayer(
- const std::shared_ptr<ExternalContext> external_context)
- : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
- _padding_type(ir::PaddingType::EXPLICIT), _padding_left(0), _padding_top(0),
- _padding_right(0), _padding_bottom(0), _stride_width(0), _stride_height(0), _multiplier(1),
- _dilation_width_factor(1), _dilation_height_factor(1), _activation(ir::Activation::NONE)
+ const std::shared_ptr<ExternalContext> external_context)
+ : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
+ _padding_type(ir::PaddingType::EXPLICIT), _padding_left(0), _padding_top(0), _padding_right(0),
+ _padding_bottom(0), _stride_width(0), _stride_height(0), _multiplier(1),
+ _dilation_width_factor(1), _dilation_height_factor(1), _activation(ir::Activation::NONE)
{
// DO NOTHING
}
void DepthwiseConvolutionLayer::configure(
- const IPortableTensor *input, const IPortableTensor *kernel, const IPortableTensor *bias,
- ir::PaddingType padding_type, const uint32_t padding_left, const uint32_t padding_right,
- const uint32_t padding_top, const uint32_t padding_bottom, const uint32_t stride_width,
- const uint32_t stride_height, const uint32_t multiplier, const uint32_t dilation_width_factor,
- const uint32_t dilation_height_factor, const ir::Activation activation, IPortableTensor *output)
+ const IPortableTensor *input, const IPortableTensor *kernel, const IPortableTensor *bias,
+ ir::PaddingType padding_type, const uint32_t padding_left, const uint32_t padding_right,
+ const uint32_t padding_top, const uint32_t padding_bottom, const uint32_t stride_width,
+ const uint32_t stride_height, const uint32_t multiplier, const uint32_t dilation_width_factor,
+ const uint32_t dilation_height_factor, const ir::Activation activation, IPortableTensor *output)
{
_input = input;
_kernel = kernel;
@@ -106,14 +106,13 @@ bool DepthwiseConvolutionLayer::create()
assert(output_channels == input_channels * _multiplier);
enum xnn_status status = xnn_create_convolution2d_nhwc_f32(
- _padding_top, _padding_right, _padding_bottom, _padding_left, kernel_height, kernel_width,
- _stride_height, _stride_width, _dilation_height_factor, _dilation_width_factor,
- input_channels /* groups */, 1 /* group_input_channels */,
- _multiplier /* group_output_channels */, input_channels /* input_channel_stride */,
- output_channels /* output_channel_stride */,
- reinterpret_cast<const float *>(_kernel->buffer()),
- reinterpret_cast<const float *>(_bias->buffer()), output_activation_min,
- output_activation_max, XNN_FLAG_DEPTHWISE_CONVOLUTION, &_kernel_op);
+ _padding_top, _padding_right, _padding_bottom, _padding_left, kernel_height, kernel_width,
+ _stride_height, _stride_width, _dilation_height_factor, _dilation_width_factor,
+ input_channels /* groups */, 1 /* group_input_channels */,
+ _multiplier /* group_output_channels */, input_channels /* input_channel_stride */,
+ output_channels /* output_channel_stride */, reinterpret_cast<const float *>(_kernel->buffer()),
+ reinterpret_cast<const float *>(_bias->buffer()), output_activation_min, output_activation_max,
+ XNN_FLAG_DEPTHWISE_CONVOLUTION, &_kernel_op);
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 DepthwiseConvolution operator"};
@@ -134,9 +133,9 @@ bool DepthwiseConvolutionLayer::setup()
uint32_t input_height = _input->getShape().dim(1);
uint32_t batch_size = _input->getShape().dim(0);
enum xnn_status status = xnn_setup_convolution2d_nhwc_f32(
- _kernel_op, batch_size, input_height, input_width,
- reinterpret_cast<const float *>(_input->buffer()),
- reinterpret_cast<float *>(_output->buffer()), _external_context->getThreadPool());
+ _kernel_op, batch_size, input_height, input_width,
+ reinterpret_cast<const float *>(_input->buffer()), reinterpret_cast<float *>(_output->buffer()),
+ _external_context->getThreadPool());
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 DepthwiseConvolution operator"};
diff --git a/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc b/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc
index d595fda36..66171ad42 100644
--- a/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc
+++ b/runtime/onert/backend/xnnpack/ops/FullyConnectedLayer.cc
@@ -28,8 +28,8 @@ namespace ops
{
FullyConnectedLayer::FullyConnectedLayer(const std::shared_ptr<ExternalContext> external_context)
- : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
- _activation(ir::Activation::NONE)
+ : Layer(external_context), _input(nullptr), _kernel(nullptr), _bias(nullptr), _output(nullptr),
+ _activation(ir::Activation::NONE)
{
// DO NOTHING
}
@@ -102,9 +102,9 @@ bool FullyConnectedLayer::create()
const float *bias_buffer = (_bias) ? reinterpret_cast<const float *>(_bias->buffer()) : nullptr;
enum xnn_status status = xnn_create_fully_connected_nc_f32(
- input_channels, output_channels, input_channels /* input stride */,
- output_channels /* output stride */, kernel_buffer, bias_buffer, output_activation_min,
- output_activation_max, flag, &_kernel_op);
+ input_channels, output_channels, input_channels /* input stride */,
+ output_channels /* output stride */, kernel_buffer, bias_buffer, output_activation_min,
+ output_activation_max, flag, &_kernel_op);
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 FullyConnected operator"};
@@ -123,8 +123,8 @@ bool FullyConnectedLayer::setup()
uint32_t batch_size = _input->getShape().num_elements() / _kernel->getShape().dim(1);
enum xnn_status status = xnn_setup_fully_connected_nc_f32(
- _kernel_op, batch_size, reinterpret_cast<const float *>(_input->buffer()),
- reinterpret_cast<float *>(_output->buffer()), _external_context->getThreadPool());
+ _kernel_op, batch_size, reinterpret_cast<const float *>(_input->buffer()),
+ reinterpret_cast<float *>(_output->buffer()), _external_context->getThreadPool());
if (status != xnn_status_success)
{
throw std::runtime_error{"failed to create FP32 FullyConnected operator"};
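
Note: for reference, the operator lifecycle this layer follows, using the same xnn_* entry points as the hunk above (signatures as used by the XNNPACK revision in this tree; the buffers and sizes below are placeholders):

    // Sketch of create -> setup -> run; not a complete layer.
    xnn_operator_t op = nullptr;
    xnn_create_fully_connected_nc_f32(
      input_channels, output_channels, input_channels /* input stride */,
      output_channels /* output stride */, kernel_buffer, bias_buffer,
      output_activation_min, output_activation_max, /*flags=*/0, &op);
    xnn_setup_fully_connected_nc_f32(op, batch_size, input_ptr, output_ptr, threadpool);
    xnn_run_operator(op, threadpool);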
diff --git a/runtime/onert/backend/xnnpack/ops/Layer.h b/runtime/onert/backend/xnnpack/ops/Layer.h
index 68b610f33..ec07e874f 100644
--- a/runtime/onert/backend/xnnpack/ops/Layer.h
+++ b/runtime/onert/backend/xnnpack/ops/Layer.h
@@ -41,7 +41,7 @@ class Layer : public ::onert::exec::IFunction
{
public:
Layer(const std::shared_ptr<ExternalContext> external_context)
- : _kernel_op{nullptr}, _create{false}, _setup{false}, _external_context{external_context}
+ : _kernel_op{nullptr}, _create{false}, _setup{false}, _external_context{external_context}
{
// DO NOTHING
}
diff --git a/runtime/onert/core/CMakeLists.txt b/runtime/onert/core/CMakeLists.txt
index ea212a42b..6dbadf80b 100644
--- a/runtime/onert/core/CMakeLists.txt
+++ b/runtime/onert/core/CMakeLists.txt
@@ -43,12 +43,12 @@ if(NOT ENABLE_TEST)
endif(NOT ENABLE_TEST)
# Unit Tests
-set(TEST_ONERT_BACKEND_CPU_COMMON test_onert_backend_cpu_common)
+set(TEST_ONERT_CORE test_onert_core)
-add_executable(${TEST_ONERT_BACKEND_CPU_COMMON} ${TESTS})
+add_executable(${TEST_ONERT_CORE} ${TESTS})
-target_link_libraries(${TEST_ONERT_BACKEND_CPU_COMMON} onert_core)
-target_link_libraries(${TEST_ONERT_BACKEND_CPU_COMMON} gtest gtest_main dl ${LIB_PTHREAD})
+target_link_libraries(${TEST_ONERT_CORE} onert_core)
+target_link_libraries(${TEST_ONERT_CORE} gtest gtest_main dl ${LIB_PTHREAD})
-add_test(${TEST_ONERT_BACKEND_CPU_COMMON} ${TEST_ONERT_BACKEND_CPU_COMMON})
-install(TARGETS ${TEST_ONERT_BACKEND_CPU_COMMON} DESTINATION unittest_standalone)
+add_test(${TEST_ONERT_CORE} ${TEST_ONERT_CORE})
+install(TARGETS ${TEST_ONERT_CORE} DESTINATION unittest_standalone)
diff --git a/runtime/onert/core/include/backend/Backend.h b/runtime/onert/core/include/backend/Backend.h
index 4f6ebbba7..136a76fba 100644
--- a/runtime/onert/core/include/backend/Backend.h
+++ b/runtime/onert/core/include/backend/Backend.h
@@ -39,9 +39,7 @@ public:
virtual ~Backend() = default;
virtual std::shared_ptr<onert::backend::IConfig> config() const = 0;
- virtual std::unique_ptr<BackendContext>
- newContext(const ir::Graph &graph, const std::shared_ptr<backend::custom::IKernelBuilder> &kb,
- bool is_linear_executor) const = 0;
+ virtual std::unique_ptr<BackendContext> newContext(ContextData &&) const = 0;
};
} // namespace backend
diff --git a/runtime/onert/core/include/backend/BackendContext.h b/runtime/onert/core/include/backend/BackendContext.h
index 4d212156a..ccecc2d34 100644
--- a/runtime/onert/core/include/backend/BackendContext.h
+++ b/runtime/onert/core/include/backend/BackendContext.h
@@ -19,7 +19,9 @@
#include <memory>
#include "ir/Graph.h"
-#include "ir/LowerInfoMap.h"
+#include "ir/OperationIndexMap.h"
+#include "ir/OperandIndexMap.h"
+#include "compiler/GraphLowerInfo.h"
#include "exec/FunctionSequence.h"
namespace onert
@@ -31,53 +33,47 @@ class Backend;
struct ITensorRegistry;
using FunctionMap =
- std::vector<std::pair<ir::OpSequenceIndex, std::unique_ptr<exec::FunctionSequence>>>;
+ std::vector<std::pair<ir::OperationIndex, std::unique_ptr<exec::FunctionSequence>>>;
-class BackendContext
+struct ContextData
{
-public:
- struct OperationInfo
- {
- ir::OperationIndex index;
- ir::Layout layout;
-
- OperationInfo(ir::OperationIndex index, ir::Layout layout) : index{index}, layout{layout} {}
- };
+ /* A partial graph that only includes the operands/operations used from the original graph */
+ std::unique_ptr<ir::Graph> graph;
+ /* A linear order of operations. This is necessary when a graph is not fully connected */
+ std::vector<onert::ir::OperationIndex> op_order;
+ /* Operands that are defined by other backends */
+ util::Set<ir::OperandIndex> external_operands;
+ /* Operand layout info */
+ ir::OperandIndexMap<ir::Layout> operand_layouts;
+ /* Custom kernel builder */
+ std::shared_ptr<custom::IKernelBuilder> custom_kernel_builder;
+ /* Is linear executor or not */
+ bool is_linear_executor;
+};
+class BackendContext
+{
public:
- BackendContext(const Backend *backend, const ir::Graph *graph,
+ BackendContext(const Backend *backend, ContextData &&data,
std::shared_ptr<ITensorRegistry> tensor_registry = nullptr)
- : _backend{backend}, _graph{graph}, tensor_registry{tensor_registry}
+ : _backend{backend}, _data{std::move(data)}, tensor_registry{tensor_registry}
{
}
virtual ~BackendContext() = default;
- void initialize(const std::vector<OperationInfo> &operation_list,
- const std::vector<ir::OperandIndex> &operand_list);
- void initConsts();
-
const Backend *backend() const { return _backend; }
- const ir::Graph *graph() const { return _graph; }
- const std::vector<OperationInfo> &operation_list() const { return _operation_list; }
- const std::vector<ir::OperandIndex> &operand_list() const { return _operand_list; }
+ const ir::Graph *graph() const { return _data.graph.get(); }
+ const util::Set<ir::OperandIndex> &external_operands() const { return _data.external_operands; }
+ const ir::OperandIndexMap<ir::Layout> &operand_layouts() const { return _data.operand_layouts; }
+ const ContextData &data() const { return _data; }
- virtual ITensorRegistry *genTensors(const std::vector<onert::ir::OpSequenceIndex> &,
- const ir::OpSequences &, const ir::LowerInfoMap &)
- {
- return nullptr;
- }
- virtual FunctionMap genKernels(const std::vector<onert::ir::OpSequenceIndex> &,
- const ir::OpSequences &)
- {
- return {};
- }
+ virtual ITensorRegistry *genTensors() = 0;
+ virtual FunctionMap genKernels() = 0;
-private:
+protected:
const Backend *_backend{nullptr};
- const ir::Graph *_graph{nullptr};
- std::vector<OperationInfo> _operation_list;
- std::vector<ir::OperandIndex> _operand_list;
+ ContextData _data;
public:
std::shared_ptr<ITensorRegistry> tensor_registry;
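
Note: ContextData is now the single hand-off from the compiler to a backend. A hedged sketch of populating it (the compiler-side code is outside this diff, so every variable below is illustrative):

    ContextData data;
    data.graph = std::move(partial_graph);       // backend-local subgraph
    data.op_order = linear_order;                // ir::OperationIndex vector
    data.external_operands = externals;          // operands defined elsewhere
    data.operand_layouts = layouts;              // per-operand ir::Layout
    data.custom_kernel_builder = kernel_builder; // for custom ops
    data.is_linear_executor = true;
    auto context = backend->newContext(std::move(data));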
diff --git a/runtime/onert/core/include/backend/IConfig.h b/runtime/onert/core/include/backend/IConfig.h
index ef9c5cdb2..409fd3d9f 100644
--- a/runtime/onert/core/include/backend/IConfig.h
+++ b/runtime/onert/core/include/backend/IConfig.h
@@ -54,7 +54,7 @@ struct IConfig
*/
virtual ir::Layout supportLayout(const ir::Operation &node, ir::Layout frontend_layout) = 0;
/**
- * @brief The function that is called after each OpSequence run on profiling mode.
+ * @brief The function that is called after each Operation run in profiling mode.
* This may be useful for profiling GPU-based or special computing units.
*/
virtual void sync() const {}
diff --git a/runtime/onert/core/include/backend/IDynamicTensorManager.h b/runtime/onert/core/include/backend/IDynamicTensorManager.h
deleted file mode 100644
index 67cfda24e..000000000
--- a/runtime/onert/core/include/backend/IDynamicTensorManager.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_IDYNAMICTENSOR_MANAGER_H__
-#define __ONERT_BACKEND_IDYNAMICTENSOR_MANAGER_H__
-
-#include "ITensorManager.h"
-
-#include <ir/Index.h>
-#include <ir/Operation.h>
-#include <ir/Shape.h>
-#include <backend/ITensor.h>
-
-namespace onert
-{
-namespace backend
-{
-
-/**
- * @brief Interface as an abstract tensor manager, providing ways to handle memory
- * for dynamic tensors.
- */
-struct IDynamicTensorManager : public ITensorManager
-{
- virtual ~IDynamicTensorManager() = default;
-
-public:
- /**
- * @brief Plan when to delete a tensor. Note this planning is done at compilation time.
- * @param op_ind operation index
- * @param tensor candidate ITensor to dealloc. Tensor can be static
- * or dynamic since tensor type may not be clearly known at compilation time.
- */
- virtual void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) = 0;
-
- /**
- * @brief Deallocate input tensors of op if an input tensor is a dynamic tensor and it won't
- * be used anymore
- * @note This will work after calling planDealloc
- */
- virtual void deallocInput(ir::OperationIndex op_ind) = 0;
-};
-
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_IDYNAMICTENSOR_MANAGER_H__
diff --git a/runtime/onert/core/include/backend/IMemoryManager.h b/runtime/onert/core/include/backend/IMemoryManager.h
deleted file mode 100644
index bad2fd51a..000000000
--- a/runtime/onert/core/include/backend/IMemoryManager.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_IMEMORY_MANAGER_H__
-#define __ONERT_BACKEND_IMEMORY_MANAGER_H__
-
-namespace onert
-{
-namespace backend
-{
-
-struct IMemoryManager
-{
- virtual ~IMemoryManager() = default;
-
- virtual void allocate(void) = 0;
- virtual void deallocate(void) = 0;
-};
-
-} // namespace backend
-} // namespace onert
-
-#include <unordered_set>
-#include <memory>
-
-namespace onert
-{
-namespace backend
-{
-
-using MemoryManagerSet = std::unordered_set<std::unique_ptr<backend::IMemoryManager>>;
-
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_IMEMORY_MANAGER_H__
diff --git a/runtime/onert/core/include/backend/IPortableTensor.h b/runtime/onert/core/include/backend/IPortableTensor.h
index 1b1f05fe1..608ca4407 100644
--- a/runtime/onert/core/include/backend/IPortableTensor.h
+++ b/runtime/onert/core/include/backend/IPortableTensor.h
@@ -43,6 +43,13 @@ public:
virtual ~IPortableTensor();
virtual const ir::Sparsity *sparsity() const { return nullptr; }
const ir::OperandInfo &get_info() const { return _info; }
+ float data_scale() const override { return _info.typeInfo().scale(); }
+ int32_t data_zero_point() const override { return _info.typeInfo().zero_point(); }
+ const std::vector<float> &data_scales() const override { return _info.typeInfo().scales(); }
+ const std::vector<int32_t> &data_zero_points() const override
+ {
+ return _info.typeInfo().zero_points();
+ }
public:
bool has_padding() const final { return false; }
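These overrides forward both the per-tensor and the per-channel quantization parameters straight from the operand's TypeInfo, so kernels no longer need backend-specific plumbing for them. A hedged illustration of what the accessors enable (the helpers below are hypothetical; only the accessor names come from the change):

#include <backend/IPortableTensor.h>

// Per-tensor affine dequantization: real = scale * (q - zero_point).
inline float dequant(const onert::backend::IPortableTensor &t, int32_t q)
{
  return t.data_scale() * static_cast<float>(q - t.data_zero_point());
}

// Channel-wise variant for per-channel quantized operands (e.g. conv weights),
// one scale/zero-point pair per channel index.
inline float dequantPerChannel(const onert::backend::IPortableTensor &t, int32_t q, size_t ch)
{
  return t.data_scales()[ch] * static_cast<float>(q - t.data_zero_points()[ch]);
}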
diff --git a/runtime/onert/core/include/backend/ITensor.h b/runtime/onert/core/include/backend/ITensor.h
index 3fadda1f5..0a4d9c814 100644
--- a/runtime/onert/core/include/backend/ITensor.h
+++ b/runtime/onert/core/include/backend/ITensor.h
@@ -32,23 +32,21 @@ namespace onert
namespace backend
{
-struct IDynamicTensorManager;
-
class ITensor
{
public:
- virtual ~ITensor() = default;
+ virtual ~ITensor();
public:
virtual uint8_t *buffer() const = 0;
virtual size_t total_size() const = 0;
- virtual size_t dimension(size_t index) const = 0;
- virtual size_t num_dimensions() const = 0;
virtual size_t calcOffset(const ir::Coordinates &coords) const = 0;
virtual ir::Layout layout() const = 0;
virtual ir::DataType data_type() const = 0;
virtual float data_scale() const = 0;
- virtual int32_t data_offset() const = 0;
+ virtual int32_t data_zero_point() const = 0;
+ virtual const std::vector<float> &data_scales() const = 0;
+ virtual const std::vector<int32_t> &data_zero_points() const = 0;
virtual bool has_padding() const = 0;
virtual void access(const std::function<void(ITensor &tensor)> &fn) = 0;
@@ -88,6 +86,12 @@ public:
throw std::runtime_error("This backend does not support dynamic tensor");
}
+ /// @brief Dealloc the buffer (only for dynamic tensors)
+ virtual void deallocBuffer()
+ {
+ throw std::runtime_error("This backend does not support resetting buffer");
+ }
+
/**
* @brief Set the shape of the tensor to new_shape
* @note Higher dimensions will be placed in front.
@@ -102,7 +106,7 @@ public:
* @brief Get ir::Shape of tensor
* @note Higher dimensions will be placed in front.
*/
- virtual ir::Shape getShape() const;
+ virtual ir::Shape getShape() const = 0;
virtual bool is_subtensor() const { return false; }
virtual bool needMemoryMap() const { return false; }
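deallocBuffer() joins applyShape() as an optional hook with a throwing default, so callers should gate it on is_dynamic() rather than calling it unconditionally. A small sketch of that guard (the helper is hypothetical):

#include <backend/ITensor.h>

// Drop the backing buffer only for dynamic tensors; static tensors keep their
// planned memory, and backends without support would throw from the default.
inline void tryDeallocBuffer(onert::backend::ITensor &tensor)
{
  if (tensor.is_dynamic())
    tensor.deallocBuffer();
}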
diff --git a/runtime/onert/core/include/backend/ITensorManager.h b/runtime/onert/core/include/backend/ITensorManager.h
deleted file mode 100644
index 4974b6645..000000000
--- a/runtime/onert/core/include/backend/ITensorManager.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_ITENSOR_MANAGER_H__
-#define __ONERT_BACKEND_ITENSOR_MANAGER_H__
-
-namespace onert
-{
-namespace backend
-{
-
-// NOTE This name ITensorManager has been discussed whether or not the name is proper.
-// Anyone can argue with any better name.
-/**
- * @brief Interface as an abstract tensor manager which has MemoryManager
- * This is used as a base class for IStaticTensorManager and IDynamicTensorManager
- */
-struct ITensorManager
-{
- virtual ~ITensorManager() = default;
-};
-
-} // namespace backend
-} // namespace onert
-
-#include <unordered_set>
-#include <memory>
-
-namespace onert
-{
-namespace backend
-{
-
-using TensorManagerSet = std::unordered_set<std::unique_ptr<backend::ITensorManager>>;
-
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_ITENSOR_MANAGER_H__
diff --git a/runtime/onert/core/include/backend/cpu_common/Allocator.h b/runtime/onert/core/include/backend/basic/Allocator.h
index fa67fc7c4..ff609322a 100644
--- a/runtime/onert/core/include/backend/cpu_common/Allocator.h
+++ b/runtime/onert/core/include/backend/basic/Allocator.h
@@ -19,8 +19,8 @@
* @brief       This file contains Allocator related classes
*/
-#ifndef __ONERT_BACKEND_CPU_COMMON_ALLOCATOR_H__
-#define __ONERT_BACKEND_CPU_COMMON_ALLOCATOR_H__
+#ifndef __ONERT_BACKEND_BASIC_ALLOCATOR_H__
+#define __ONERT_BACKEND_BASIC_ALLOCATOR_H__
#include <memory>
@@ -28,7 +28,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
/**
@@ -49,8 +49,8 @@ private:
std::unique_ptr<uint8_t[]> _base;
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CPU_COMMON_ALLOCATOR_H__
+#endif // __ONERT_BACKEND_BASIC_ALLOCATOR_H__
diff --git a/runtime/onert/core/include/backend/basic/BackendContextHelpers.h b/runtime/onert/core/include/backend/basic/BackendContextHelpers.h
new file mode 100644
index 000000000..58bfe3406
--- /dev/null
+++ b/runtime/onert/core/include/backend/basic/BackendContextHelpers.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_BASIC_BACKEND_CONTEXT_HELPERS_H__
+#define __ONERT_BACKEND_BASIC_BACKEND_CONTEXT_HELPERS_H__
+
+#include <vector>
+
+#include "ir/Index.h"
+#include "compiler/GraphLowerInfo.h"
+#include "util/logging.h"
+#include "backend/ITensorRegistry.h"
+#include "backend/BackendContext.h"
+#include "Tensor.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace basic
+{
+
+// TODO Remove the template param BackendContext once unification of cpu backend context is done
+template <typename T_BackendContext> void planTensors(const T_BackendContext &ctx)
+{
+ const ir::Graph &graph = *ctx.graph();
+ const auto &order = ctx.data().op_order;
+ auto tensor_builder = ctx.tensor_builder;
+
+ ir::OperandIndexMap<uint32_t> uses_map;
+ ir::OperandIndexMap<uint32_t> def_map;
+ ir::OperandIndexSequence constants;
+
+ auto model_io =
+ (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+
+ // Prepare scanning
+ graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ if (ctx.external_operands().contains(ind))
+ return;
+
+ // TODO Check if we need to handle unused tensors
+
+ uses_map[ind] = obj.getUses().size();
+ def_map[ind] = obj.getDef().valid() ? 1 : 0;
+
+ if (obj.isConstant())
+ constants.append(ind);
+
+ if (!tensor_builder->isRegistered(ind))
+ {
+ // These tensors do not appear in any operation (no use and no def)
+ const auto info = obj.info();
+ // NOTE Currently we only support the NHWC layout for cpu-common tensors.
+ // There is no way to get the layout info from the backend context for now.
+ // When we support NCHW tensors as well, we also need to change the tensor info
+ // to have the permuted shape.
+ assert(ctx.operand_layouts().at(ind) == ir::Layout::NHWC);
+ tensor_builder->registerTensorInfo(ind, info, ir::Layout::NHWC);
+ }
+ });
+
+ // Start scanning to do notify{First|Last}Use for each tensor
+
+ // If a tensor is a constant, increase the use of the tensor and allocate it first.
+ // Increasing the use count here means the tensor is never deallocated early, i.e. it
+ // will be deallocated last.
+ for (const auto &ind : constants)
+ {
+ uses_map[ind]++;
+ tensor_builder->notifyFirstUse(ind);
+ }
+
+ for (auto &pair : def_map)
+ {
+ if (pair.second == 0)
+ tensor_builder->notifyFirstUse(pair.first);
+ }
+
+ // This is a workaround to keep the operands alive for the whole execution
+ // (these operands look like they are unused)
+ std::vector<ir::OperandIndex> operands_last_until_end;
+ for (auto &pair : uses_map)
+ {
+ if (pair.second == 0)
+ operands_last_until_end.push_back(pair.first);
+ }
+
+ // At each operation,
+ // 1. Scan DEF of outputs. If the DEF, allocate it
+ // 2. Scan DEF of inputs. If variable tensor, allocate it
+ // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0
+ for (const auto op_ind : order)
+ {
+ const auto &op = graph.operations().at(op_ind);
+ auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+ auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+
+ // Define outputs
+ for (const auto &ind : op_outputs)
+ {
+ if (ctx.external_operands().contains(ind))
+ continue;
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ assert(def_map.find(ind) != def_map.end());
+ if (def_map[ind])
+ {
+ def_map[ind] = 0;
+ tensor_builder->notifyFirstUse(ind);
+ }
+ }
+
+ // Scan variable tensors
+ // These tensors behave like constants, but OperandInfo and LowerInfo treat them as
+ // non-constant so that memory planning here can reduce memory usage
+ for (const auto &ind : op_inputs)
+ {
+ if (ctx.external_operands().contains(ind))
+ continue;
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ const auto &operand = graph.operands().at(ind);
+ if (operand.info().isVariable())
+ {
+ // The variable tensor with buffer is not supported yet
+ assert(operand.data() == nullptr);
+ assert(operand.getUses().size() == 1 && !operand.getDef().valid());
+ assert(uses_map[ind] == 1 && def_map[ind] == 0);
+ tensor_builder->notifyFirstUse(ind);
+ }
+ }
+
+ for (const auto &ind : op_inputs)
+ {
+ if (ctx.external_operands().contains(ind))
+ continue;
+ if (!tensor_builder->isRegistered(ind))
+ continue;
+ assert(uses_map.find(ind) != uses_map.end());
+ assert(uses_map[ind] > 0);
+ uses_map[ind]--;
+ if (uses_map[ind] == 0)
+ {
+ // plan for deallocation of static tensor
+ tensor_builder->notifyLastUse(ind);
+ }
+ }
+ }
+
+ for (auto ind : operands_last_until_end)
+ {
+ tensor_builder->notifyLastUse(ind);
+ }
+
+ // Dispose and validate
+ for (const auto &ind : constants)
+ {
+ --uses_map[ind];
+ if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice
+ {
+ tensor_builder->notifyLastUse(ind);
+ }
+ }
+
+ assert(
+ std::all_of(uses_map.begin(), uses_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+
+ assert(
+ std::all_of(def_map.begin(), def_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+}
+
+template <typename T_BackendContext> ITensorRegistry *genTensors(T_BackendContext &ctx)
+{
+ const ir::Graph &graph = *ctx.graph();
+ auto tensor_builder = ctx.tensor_builder;
+
+ auto model_io =
+ (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+ graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ if (ctx.external_operands().contains(ind))
+ return;
+ // NOTE Assuming there are no layout changes (always assume NHWC or UNKNOWN)
+ assert(graph.layout() != ir::Layout::NCHW);
+ ir::OperandInfo backend_info{obj.shape(), obj.typeInfo(), obj.info().memAllocType(),
+ obj.isConstant()};
+ tensor_builder->registerTensorInfo(ind, backend_info, ir::Layout::NHWC);
+ });
+
+ // TODO Get compiler options from the compiler and use them rather than reading Env
+ if (util::getConfigString(util::config::EXECUTOR) == "Linear")
+ {
+ basic::planTensors(ctx);
+ }
+ else
+ {
+ // For executors that do not have a fixed linear execution order:
+ // as a workaround, use the static memory planner so that tensors are never deallocated
+ graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
+ if (tensor_builder->isRegistered(ind))
+ tensor_builder->notifyFirstUse(ind);
+ });
+ }
+
+ tensor_builder->allocate();
+
+ return ctx.tensor_registry.get();
+}
+
+inline void initConsts(BackendContext &ctx)
+{
+ ctx.graph()->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
+ if (ctx.external_operands().contains(ind) || !operand.isConstant())
+ return;
+
+ auto tensor = ctx.tensor_registry->getNativeITensor(ind);
+ assert(tensor != nullptr);
+
+ VERBOSE(FillOperandData) << "Fill data for " << ind << std::endl;
+
+ auto data = operand.shareData();
+ assert(data && data->base());
+ ExternalTensor *ext_tensor = dynamic_cast<ExternalTensor *>(tensor);
+
+ if (ext_tensor == nullptr)
+ throw std::runtime_error{"This tensor is not an external tensor"};
+
+ ext_tensor->setData(data);
+ });
+}
+
+} // namespace basic
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_BASIC_BACKEND_CONTEXT_HELPERS_H__
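The reference counting in planTensors() reduces to a simple rule set: notifyFirstUse fires when an operand's def is reached (immediately for constants and def-less operands), notifyLastUse fires when its use count reaches zero, and constants carry one extra use so they are only released in the epilogue. A toy trace under those rules (operand and operation indices are illustrative):

// Graph: %0 (const) --Op#0--> %1 --Op#1--> %2
//
// Prepare : uses_map{%0:1, %1:1, %2:0}   def_map{%1:1, %2:1}
// Consts  : uses_map[%0] -> 2, notifyFirstUse(%0)   // pinned until the epilogue
// Op#0    : def %1 -> notifyFirstUse(%1); use %0 -> uses_map[%0] 2->1
// Op#1    : def %2 -> notifyFirstUse(%2); use %1 -> uses_map[%1] 1->0 -> notifyLastUse(%1)
// Tail    : %2 has no uses, so it is released via operands_last_until_end
// Epilogue: uses_map[%0] 1->0 -> notifyLastUse(%0); both maps end at all zeros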
diff --git a/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h b/runtime/onert/core/include/backend/basic/DynamicTensorManager.h
index c4e06aa82..92d8ee3ab 100644
--- a/runtime/onert/core/include/backend/cpu_common/DynamicTensorManager.h
+++ b/runtime/onert/core/include/backend/basic/DynamicTensorManager.h
@@ -14,13 +14,12 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CPU_COMMON_DYNAMICTENSOR_MANAGER_H__
-#define __ONERT_BACKEND_CPU_COMMON_DYNAMICTENSOR_MANAGER_H__
+#ifndef __ONERT_BACKEND_BASIC_DYNAMICTENSOR_MANAGER_H__
+#define __ONERT_BACKEND_BASIC_DYNAMICTENSOR_MANAGER_H__
#include "MemoryManager.h"
#include "TensorRegistry.h"
-#include <backend/IDynamicTensorManager.h>
#include <ir/OperandInfo.h>
#include <ir/Operation.h>
#include <ir/Index.h>
@@ -29,7 +28,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
// TODO Find optimized algorithm to manage memory.
@@ -37,7 +36,7 @@ namespace cpu_common
/**
* @brief Class to manage dynamic tensor and its memory
*/
-class DynamicTensorManager : public backend::IDynamicTensorManager
+class DynamicTensorManager
{
public:
DynamicTensorManager(const std::shared_ptr<TensorRegistry> &reg);
@@ -47,9 +46,6 @@ public:
void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info,
ir::Layout backend_layout);
- void planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor) override;
- void deallocInput(ir::OperationIndex op_ind) override;
-
std::shared_ptr<DynamicMemoryManager> dynamic_mem_mgr() { return _dynamic_mem_mgr; }
private:
@@ -66,11 +62,11 @@ private:
// contains list of dynamic tensor index, which can be deallocated after running operation
// note: this map could contain static tensor index too. Careful use is required.
std::unordered_map<ir::OperationIndex, std::unordered_set<backend::ITensor *>>
- _dealloc_tensor_map;
+ _dealloc_tensor_map;
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CPU_COMMON_DYNAMICTENSOR_MANAGER_H__
+#endif // __ONERT_BACKEND_BASIC_DYNAMICTENSOR_MANAGER_H__
diff --git a/runtime/onert/core/include/backend/cpu_common/IMemoryPlanner.h b/runtime/onert/core/include/backend/basic/IMemoryPlanner.h
index 335f8f5c0..5ca2d953f 100644
--- a/runtime/onert/core/include/backend/cpu_common/IMemoryPlanner.h
+++ b/runtime/onert/core/include/backend/basic/IMemoryPlanner.h
@@ -23,7 +23,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
/**
@@ -67,7 +67,7 @@ struct IMemoryPlanner
virtual ~IMemoryPlanner() = default;
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/include/backend/cpu_common/KernelGeneratorBase.h b/runtime/onert/core/include/backend/basic/KernelGeneratorBase.h
index 49a589768..6e123e81d 100644
--- a/runtime/onert/core/include/backend/cpu_common/KernelGeneratorBase.h
+++ b/runtime/onert/core/include/backend/basic/KernelGeneratorBase.h
@@ -14,16 +14,15 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CPU_COMMON_KERNEL_GENERATOR_BASE_H__
-#define __ONERT_BACKEND_CPU_COMMON_KERNEL_GENERATOR_BASE_H__
+#ifndef __ONERT_BACKEND_BASIC_KERNEL_GENERATOR_BASE_H__
+#define __ONERT_BACKEND_BASIC_KERNEL_GENERATOR_BASE_H__
#include <assert.h>
#include <memory>
#include <functional>
+#include "ir/Graph.h"
#include "ir/OperationVisitor.h"
-#include "ir/OpSequence.h"
-#include <memory>
#include "exec/FunctionSequence.h"
#include "backend/ITensorRegistry.h"
@@ -31,34 +30,20 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
class KernelGeneratorBase : public ir::OperationVisitor
{
public:
virtual ~KernelGeneratorBase() = default;
+ KernelGeneratorBase(const ir::Graph &graph) : _graph{graph} {}
- std::unique_ptr<exec::IFunction> releaseFunction()
- {
- assert(_return_fn);
- return std::move(_return_fn);
- }
-
- std::unique_ptr<exec::FunctionSequence> generate(const ir::OpSequence &op_seq)
- {
- op_seq.accept(*this);
- return std::move(_return_fn_seq);
- }
+ virtual std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) = 0;
protected:
using OperationVisitor::visit;
- void visit(const ir::OpSequence &) override
- {
- throw std::runtime_error("KernelGenerator: NYI for operation 'OpSequence'");
- }
-
#define OP(InternalName) \
void visit(const ir::operation::InternalName &) override \
{ \
@@ -68,12 +53,19 @@ protected:
#undef OP
protected:
+ std::unique_ptr<exec::IFunction> releaseFunction()
+ {
+ assert(_return_fn);
+ return std::move(_return_fn);
+ }
+
+protected:
+ const ir::Graph &_graph;
std::unique_ptr<exec::IFunction> _return_fn;
- std::unique_ptr<exec::FunctionSequence> _return_fn_seq; // TODO Extract this out
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CPU_COMMON_KERNEL_GENERATOR_BASE_H__
+#endif // __ONERT_BACKEND_BASIC_KERNEL_GENERATOR_BASE_H__
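Under the new contract a concrete generator emits one FunctionSequence per operation index rather than per OpSequence. A minimal sketch of a derived generator (the class is hypothetical; _graph, releaseFunction() and the visitor dispatch come from the base above, and FunctionSequence::append is assumed from the exec interface):

#include <backend/basic/KernelGeneratorBase.h>

namespace onert
{
namespace backend
{
namespace basic
{

class MyKernelGenerator : public KernelGeneratorBase
{
public:
  MyKernelGenerator(const ir::Graph &graph) : KernelGeneratorBase{graph} {}

  std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) override
  {
    auto ret = std::make_unique<exec::FunctionSequence>();
    _graph.operations().at(ind).accept(*this); // dispatch to the matching visit()
    ret->append(releaseFunction());            // collect the kernel that visit() built
    return ret;
  }

  // visit() overrides for the supported operations would go here.
};

} // namespace basic
} // namespace backend
} // namespace onert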
diff --git a/runtime/onert/core/include/backend/cpu_common/MemoryManager.h b/runtime/onert/core/include/backend/basic/MemoryManager.h
index 28ec6b803..62618359a 100644
--- a/runtime/onert/core/include/backend/cpu_common/MemoryManager.h
+++ b/runtime/onert/core/include/backend/basic/MemoryManager.h
@@ -18,7 +18,6 @@
#define __ONERT_BACKEND_CPU_MEMORY_MANAGER_H__
#include "Allocator.h"
-#include "backend/IMemoryManager.h"
#include "IMemoryPlanner.h"
namespace onert
@@ -28,19 +27,19 @@ namespace backend
class ITensor;
-namespace cpu_common
+namespace basic
{
-class MemoryManager : public backend::IMemoryManager
+class MemoryManager
{
public:
MemoryManager();
MemoryManager(const std::string);
virtual ~MemoryManager() = default;
- void allocate(void) override;
+ void allocate(void);
uint8_t *getBuffer(const ir::OperandIndex &ind) const;
- void deallocate(void) override { _mem_alloc->release(); }
+ void deallocate(void) { _mem_alloc->release(); }
void claimPlan(const ir::OperandIndex &ind, uint32_t size);
void releasePlan(const ir::OperandIndex &ind);
@@ -69,7 +68,7 @@ private:
std::unordered_map<const ITensor *, std::shared_ptr<Allocator>> _mem_alloc_map;
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/basic/StaticTensorManager.h
index 850bcf2f2..f35dbdfe4 100644
--- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
+++ b/runtime/onert/core/include/backend/basic/StaticTensorManager.h
@@ -14,14 +14,12 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CPU_COMMON_STATICTENSOR_MANAGER_H__
-#define __ONERT_BACKEND_CPU_COMMON_STATICTENSOR_MANAGER_H__
+#ifndef __ONERT_BACKEND_BASIC_STATICTENSOR_MANAGER_H__
+#define __ONERT_BACKEND_BASIC_STATICTENSOR_MANAGER_H__
-#include "backend/IStaticTensorManager.h"
-#include "backend/cpu_common/DynamicTensorManager.h"
-#include "backend/cpu_common/MemoryManager.h"
-#include "backend/cpu_common/TensorRegistry.h"
-#include "backend/ITensorManager.h"
+#include "backend/basic/DynamicTensorManager.h"
+#include "backend/basic/MemoryManager.h"
+#include "backend/basic/TensorRegistry.h"
#include "ir/OperandIndexMap.h"
#include "ir/OperandInfo.h"
#include "TensorRegistry.h"
@@ -30,12 +28,12 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
class DynamicTensorManager;
-class StaticTensorManager : public backend::IStaticTensorManager
+class StaticTensorManager
{
public:
StaticTensorManager(const std::shared_ptr<TensorRegistry> &reg,
@@ -60,8 +58,8 @@ private:
DynamicTensorManager *_dynamic_tensor_manager;
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CPU_COMMON_STATICTENSOR_MANAGER_H__
+#endif // __ONERT_BACKEND_BASIC_STATICTENSOR_MANAGER_H__
diff --git a/runtime/onert/core/include/backend/cpu_common/Tensor.h b/runtime/onert/core/include/backend/basic/Tensor.h
index 5fbf4e729..da5103885 100644
--- a/runtime/onert/core/include/backend/cpu_common/Tensor.h
+++ b/runtime/onert/core/include/backend/basic/Tensor.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CPU_COMMON_TENSOR_H__
-#define __ONERT_BACKEND_CPU_COMMON_TENSOR_H__
+#ifndef __ONERT_BACKEND_BASIC_TENSOR_H__
+#define __ONERT_BACKEND_BASIC_TENSOR_H__
#include "Allocator.h"
@@ -27,7 +27,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
class DynamicMemoryManager;
@@ -41,8 +41,8 @@ public:
public:
Tensor(const ir::OperandInfo &info, const ir::Layout layout,
DynamicMemoryManager *dynamic_mem_mgr)
- : IPortableTensor(info), _layout(layout), _buffer(nullptr), _num_references(0),
- _dynamic_mem_mgr(dynamic_mem_mgr), _allocator(nullptr)
+ : IPortableTensor(info), _layout(layout), _buffer(nullptr), _num_references(0),
+ _dynamic_mem_mgr(dynamic_mem_mgr), _allocator(nullptr)
{
// DO NOTHING
}
@@ -53,38 +53,21 @@ public:
/**
* @brief Set the Buffer object. This method is called for static and non-const tensors
*/
- void setBuffer(uint8_t *buffer)
- {
- assert(_buffer == nullptr);
- _buffer = buffer;
- }
+ void setBuffer(uint8_t *buffer) { _buffer = buffer; }
/**
* @brief Set the Buffer object. This method is called for dynamic or const tensors
*/
void setBuffer(const std::shared_ptr<Allocator> &alloc)
{
- assert(_buffer == nullptr);
- _allocator = alloc;
- _buffer = alloc->base();
- }
-
- // This works just as setBuffer but it simply overwrite existing Allocator without nullptr check
- void overwriteBuffer(const std::shared_ptr<Allocator> &alloc)
- {
_allocator = alloc;
_buffer = alloc->base();
}
/**
- * @brief Mark this tensor does not have memory.
- * Real memory deallocation should be done by caller.
+ * @brief Reset the buffer and deallocate its memory if this tensor manages its own allocation
*/
- void resetBuffer()
- {
- _allocator.reset();
- _buffer = nullptr;
- }
+ void deallocBuffer() override;
public:
uint8_t *buffer() const override { return _buffer; }
@@ -98,14 +81,10 @@ public:
* W : dimension(2)
* C : dimension(3)
*/
- size_t dimension(size_t index) const final override { return _info.shape().dim(index); }
- size_t num_dimensions() const override { return _info.shape().rank(); }
size_t total_size() const override { return _info.total_size(); }
size_t calcOffset(const ir::Coordinates &coords) const override;
ir::Layout layout() const override { return _layout; }
ir::DataType data_type() const override { return _info.typeInfo().type(); }
- float data_scale() const override { return _info.typeInfo().scale(); }
- int32_t data_offset() const override { return _info.typeInfo().offset(); }
bool is_constant() const override { return _info.isConstant(); }
bool is_dynamic() const override { return _info.isDynamic(); }
void set_dynamic() override { _info.setDynamic(); }
@@ -161,6 +140,7 @@ public:
virtual int32_t num_references() { return _num_references; }
void setShape(const ir::Shape &new_shape) override;
+ ir::Shape getShape() const override;
protected:
ir::Layout _layout;
@@ -194,7 +174,7 @@ public:
public:
ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout)
- : Tensor(info, layout, nullptr)
+ : Tensor(info, layout, nullptr)
{
assert(_layout == ir::Layout::NHWC);
assert(_info.isConstant());
@@ -263,8 +243,8 @@ public:
private:
std::shared_ptr<const ir::Data> _data;
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CPU_COMMON_TENSOR_H__
+#endif // __ONERT_BACKEND_BASIC_TENSOR_H__
diff --git a/runtime/onert/core/include/backend/basic/TensorBuilder.h b/runtime/onert/core/include/backend/basic/TensorBuilder.h
new file mode 100644
index 000000000..a8014e55d
--- /dev/null
+++ b/runtime/onert/core/include/backend/basic/TensorBuilder.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_BASIC_TENSOR_BUILDER_H__
+#define __ONERT_BACKEND_BASIC_TENSOR_BUILDER_H__
+
+#include <backend/basic/DynamicTensorManager.h>
+#include <backend/basic/TensorRegistry.h>
+#include <backend/basic/StaticTensorManager.h>
+
+#include <ir/OperandIndexMap.h>
+
+#include "Tensor.h"
+
+#include <unordered_map>
+
+namespace onert
+{
+namespace backend
+{
+namespace basic
+{
+
+class TensorBuilder
+{
+public:
+ TensorBuilder(const std::shared_ptr<TensorRegistry> &tensor_reg);
+
+ /**
+ * @brief Register tensor information to allocate on the backend
+ * @param[in] ind Operand index
+ * @param[in] info Operand information
+ * @param[in] backend_layout Operand data layout
+ */
+ void registerTensorInfo(const ir::OperandIndex &ind, const ir::OperandInfo &info,
+ ir::Layout backend_layout);
+
+ void notifyFirstUse(const ir::OperandIndex &);
+ void notifyLastUse(const ir::OperandIndex &);
+
+ bool isRegistered(const ir::OperandIndex &) const;
+
+ void allocate(void);
+
+ DynamicTensorManager *dynamicTensorManager(void) { return _dynamic_tensor_mgr.get(); }
+
+private:
+ const std::shared_ptr<TensorRegistry> _tensor_reg;
+ std::unique_ptr<DynamicTensorManager> _dynamic_tensor_mgr;
+ std::unique_ptr<StaticTensorManager> _static_tensor_mgr;
+ ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
+};
+
+} // namespace basic
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_BASIC_TENSOR_BUILDER_H__
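This TensorBuilder is what planTensors() and genTensors() in BackendContextHelpers.h drive. A hedged sketch of the call order for a single operand (the indices and info are placeholders; real code registers every operand before planning and calls allocate() exactly once at the end):

#include <backend/basic/TensorBuilder.h>

void planOne(onert::backend::basic::TensorBuilder &builder, const onert::ir::OperandIndex &ind,
             const onert::ir::OperandInfo &info)
{
  builder.registerTensorInfo(ind, info, onert::ir::Layout::NHWC); // declare the tensor
  builder.notifyFirstUse(ind); // claim its memory in the plan
  builder.notifyLastUse(ind);  // release its memory in the plan
  builder.allocate();          // materialize all planned buffers
}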
diff --git a/runtime/onert/core/include/backend/cpu_common/TensorRegistry.h b/runtime/onert/core/include/backend/basic/TensorRegistry.h
index 5896fb7ad..bfff45e37 100644
--- a/runtime/onert/core/include/backend/cpu_common/TensorRegistry.h
+++ b/runtime/onert/core/include/backend/basic/TensorRegistry.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CPU_COMMON_TENSOR_REGISTRY__
-#define __ONERT_BACKEND_CPU_COMMON_TENSOR_REGISTRY__
+#ifndef __ONERT_BACKEND_BASIC_TENSOR_REGISTRY__
+#define __ONERT_BACKEND_BASIC_TENSOR_REGISTRY__
#include "backend/ITensorRegistry.h"
#include "Tensor.h"
@@ -24,13 +24,13 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
-using TensorRegistry = PortableTensorRegistryTemplate<cpu_common::Tensor>;
+using TensorRegistry = PortableTensorRegistryTemplate<basic::Tensor>;
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CPU_COMMON_TENSOR_REGISTRY__
+#endif // __ONERT_BACKEND_BASIC_TENSOR_REGISTRY__
diff --git a/runtime/onert/core/include/backend/cpu_common/BackendContextHelpers.h b/runtime/onert/core/include/backend/cpu_common/BackendContextHelpers.h
deleted file mode 100644
index 19e7b7c99..000000000
--- a/runtime/onert/core/include/backend/cpu_common/BackendContextHelpers.h
+++ /dev/null
@@ -1,193 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_CPU_COMMON_BACKEND_CONTEXT_HELPERS_H__
-#define __ONERT_BACKEND_CPU_COMMON_BACKEND_CONTEXT_HELPERS_H__
-
-#include <vector>
-
-#include "ir/Index.h"
-#include "ir/OpSequences.h"
-#include "ir/LowerInfoMap.h"
-#include "util/logging.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace cpu_common
-{
-
-// TODO Remove the template param BackendContext once unification of cpu backend context is done
-template <typename T_BackendContext>
-void planTensors(const T_BackendContext &ctx, const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info)
-{
- auto graph = ctx.graph();
- auto tensor_builder = ctx.tensor_builder;
-
- ir::OperandIndexMap<uint32_t> uses_map;
- ir::OperandIndexMap<uint32_t> def_map;
- ir::OperandIndexSequence constants;
-
- auto model_io =
- (graph->getInputs() + graph->getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
-
- // Prepare scanning
- for (auto ind : ctx.operand_list())
- {
- if (model_io.contains(ind))
- continue;
- const auto &obj = graph->operands().at(ind);
- const auto &li = lower_info.operand.at(ind);
- if (li->def_factors().getOnlyElement().backend() != ctx.backend())
- continue;
-
- // Ignore unused tensor
- if (li->def_factors().size() == 0 && li->use_factors().size() == 0)
- {
- VERBOSE_F() << "Operand #" << ind.value() << " will not be used. no more process."
- << std::endl;
- return;
- }
-
- uses_map[ind] = obj.getUses().size();
- def_map[ind] = obj.getDef().valid() ? 1 : 0;
-
- if (obj.isConstant())
- constants.append(ind);
-
- auto factor = li->def_factors().getOnlyElement();
- if (!tensor_builder->isRegistered(ind))
- {
- // These tensors do not exist in any op_seq (No use and def)
- const auto info = obj.info();
- const auto backend_layout = factor.layout();
- // TODO Change tensor info to have permuted shape
- tensor_builder->registerTensorInfo(ind, info, backend_layout);
- }
- }
-
- // Start scanning to do notify{First|Last}Use for each tensor
-
- // If a tensor is a constant, increase the use of the tensor and allocate it first.
- // Increasing use count here makes the tensor never be deallocated, i.e it they will be
- // deallocated last.
- for (const auto &ind : constants)
- {
- uses_map[ind]++;
- tensor_builder->notifyFirstUse(ind);
- }
-
- // At each operation,
- // 1. Scan DEF of outputs. If the DEF, allocate it
- // 2. Scan DEF of inputs. If variable tensor, allocate it
- // 3. Scan USE of inputs. Decrease the USE and deallocate if the USE is 0
- for (const auto op_seq_ind : order)
- {
- const auto &op_seq = op_seqs.at(op_seq_ind);
- for (const auto &op_idx : op_seq.operations())
- {
- auto op_inputs = graph->operations().at(op_idx).getInputs() | ir::Remove::DUPLICATED |
- ir::Remove::UNDEFINED;
- auto op_outputs = graph->operations().at(op_idx).getOutputs() | ir::Remove::DUPLICATED |
- ir::Remove::UNDEFINED;
-
- // Define outputs
- for (const auto &ind : op_outputs)
- {
- if (model_io.contains(ind))
- continue;
- if (!tensor_builder->isRegistered(ind))
- continue;
- assert(def_map.find(ind) != def_map.end());
- if (def_map[ind])
- {
- def_map[ind] = 0;
- tensor_builder->notifyFirstUse(ind);
- }
- }
-
- // Scan variable tensors
- // This tensor has features like constant. But OperandInfo and LowerInfo treat them as
- // non-constant because of less memory usage by memory planning in here
- for (const auto &ind : op_inputs)
- {
- if (model_io.contains(ind))
- continue;
- if (!tensor_builder->isRegistered(ind))
- continue;
- const auto &operand = graph->operands().at(ind);
- if (operand.info().isVariable())
- {
- // The variable tensor with buffer is not supported yet
- assert(operand.data() == nullptr);
- assert(operand.getUses().size() == 1 && !operand.getDef().valid());
- assert(lower_info.operand.at(ind)->def_factors().size() == 1 &&
- lower_info.operand.at(ind)->use_factors().size() == 1);
- assert(uses_map[ind] == 1 && def_map[ind] == 0);
- tensor_builder->notifyFirstUse(ind);
- }
- }
-
- for (const auto &ind : op_inputs)
- {
- if (model_io.contains(ind))
- continue;
- if (!tensor_builder->isRegistered(ind))
- continue;
- assert(uses_map.find(ind) != uses_map.end());
- assert(uses_map[ind] > 0);
- uses_map[ind]--;
- if (uses_map[ind] == 0)
- {
- // plan for deallocation of static tensornode
- tensor_builder->notifyLastUse(ind);
-
- // plan for deallocation of dynamic tensor
- auto dyn_tensor_manager = tensor_builder->dynamicTensorManager();
- auto *tensor = ctx.tensor_registry->getITensor(ind);
- assert(tensor);
- dyn_tensor_manager->planDealloc(op_idx, tensor);
- }
- }
- }
- }
-
- // Dispose and validate
- for (const auto &ind : constants)
- {
- --uses_map[ind];
- if (uses_map[ind] == 0) // To prevent notifyLastUse from being called twice
- {
- tensor_builder->notifyLastUse(ind);
- }
- }
-
- assert(
- std::all_of(uses_map.begin(), uses_map.end(),
- [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
-
- assert(
- std::all_of(def_map.begin(), def_map.end(),
- [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
-}
-
-} // namespace cpu_common
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_CPU_COMMON_BACKEND_CONTEXT_HELPERS_H__
diff --git a/runtime/onert/core/include/backend/cpu_common/ConstantInitializer.h b/runtime/onert/core/include/backend/cpu_common/ConstantInitializer.h
deleted file mode 100644
index 679355599..000000000
--- a/runtime/onert/core/include/backend/cpu_common/ConstantInitializer.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_H__
-#define __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_H__
-
-#include "TensorRegistry.h"
-
-#include "ConstantInitializerBase.h"
-#include <ir/Operands.h>
-
-namespace onert
-{
-namespace backend
-{
-namespace cpu_common
-{
-
-class ConstantInitializer : public ConstantInitializerBase
-{
-public:
- ConstantInitializer(const ir::Operands &operands,
- const std::shared_ptr<ITensorRegistry> &tensor_reg);
-
-public:
- void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override;
-
- // TODO: For now the only cpu backend supports constant tensor to use data from external
- // If the other backend supports (to do this,
- // ExternalTensor should be abstract such as IExternal, maybe),
- // this can be an interface of cpu_common::ConstantInitializerBase
- void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &);
-
-private:
- std::shared_ptr<ITensorRegistry> tensor_registry() const override { return _tensor_reg; }
-
-private:
- std::shared_ptr<ITensorRegistry> _tensor_reg;
-};
-
-} // namespace cpu_common
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_H__
diff --git a/runtime/onert/core/include/backend/cpu_common/ConstantInitializerBase.h b/runtime/onert/core/include/backend/cpu_common/ConstantInitializerBase.h
deleted file mode 100644
index d4c65de38..000000000
--- a/runtime/onert/core/include/backend/cpu_common/ConstantInitializerBase.h
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_BASE_H__
-#define __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_BASE_H__
-
-#include <unordered_map>
-#include <functional>
-
-#include "ir/Coordinates.h"
-#include "ir/Layout.h"
-#include "ir/Operand.h"
-#include "ir/Operands.h"
-#include "ir/OperationVisitor.h"
-#include "ir/OpSequence.h"
-#include "backend/ITensorRegistry.h"
-#include "util/logging.h"
-#include "backend/ITensorRegistry.h"
-
-namespace
-{
-template <typename T>
-static void Init(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj, const bool copy,
- const onert::ir::Layout frontend_layout = onert::ir::Layout::UNKNOWN)
-{
- const auto shape = model_obj.shape();
- assert(model_obj.data());
- auto base = reinterpret_cast<const T *>(model_obj.data()->base());
-
- obj.access([&](::onert::backend::ITensor &tensor) {
- switch (shape.rank())
- {
- case 0:
- {
- assert(model_obj.data()->size() == sizeof(T));
- const auto value = *reinterpret_cast<const T *>(base);
- T *into = reinterpret_cast<T *>(tensor.buffer());
- *into = value;
- break;
- }
- case 1:
- {
- auto vec_size = shape.dim(0);
- for (int32_t n = 0; n < vec_size; ++n)
- {
- const T *from = reinterpret_cast<const T *>(base) + n;
- const auto value = *from;
-
- T *into = reinterpret_cast<T *>(tensor.buffer()) + n;
-
- *into = value;
- }
- break;
- }
- case 2:
- {
- const int32_t copy_len = shape.dim(1);
-
- for (auto i = 0; i < shape.dim(0); ++i)
- {
- ::onert::ir::Coordinates coords{i, 0};
- memcpy(tensor.buffer() + tensor.calcOffset(coords), base + i * copy_len,
- copy_len * sizeof(T));
- }
- break;
- }
- case 3:
- {
- const int32_t width = shape.dim(1);
- const int32_t copy_len = shape.dim(2);
-
- for (auto i = 0; i < shape.dim(0); ++i)
- {
- for (auto j = 0; j < shape.dim(1); ++j)
- {
- ::onert::ir::Coordinates coords{i, j, 0};
- memcpy(tensor.buffer() + tensor.calcOffset(coords),
- base + i * width * copy_len + j * copy_len, copy_len * sizeof(T));
- }
- }
- break;
- }
- case 4:
- {
- const int32_t height = shape.dim(1);
- const int32_t width = shape.dim(2);
- const int32_t copy_len = shape.dim(3);
- for (auto i = 0; i < shape.dim(0); ++i)
- {
- for (auto j = 0; j < shape.dim(1); ++j)
- {
- for (auto k = 0; k < shape.dim(2); ++k)
- {
- if (copy)
- {
- ::onert::ir::Coordinates coords{i, j, k, 0};
- memcpy(tensor.buffer() + tensor.calcOffset(coords),
- base + i * height * width * copy_len + j * width * copy_len + k * copy_len,
- copy_len * sizeof(T));
- }
- else
- {
- for (auto l = 0; l < shape.dim(3); ++l)
- {
- const auto coords = ::onert::ir::convertCoordinates({i, j, k, l}, frontend_layout,
- tensor.layout());
- T *into = reinterpret_cast<T *>(tensor.buffer() + tensor.calcOffset(coords));
- T value = *(base + i * height * width * copy_len + j * width * copy_len +
- k * copy_len + l);
- *into = value;
- }
- }
- }
- }
- }
- break;
- }
- default:
- throw std::runtime_error{"Not yet supported"};
- }
- });
-}
-
-template <typename T>
-void copyInit(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj)
-{
- Init<T>(model_obj, obj, true);
-}
-
-template <typename T>
-void permuteInit(const onert::ir::Operand &model_obj, onert::backend::ITensor &obj,
- const onert::ir::Layout frontend_layout)
-{
- const bool copy = frontend_layout == obj.layout();
- Init<T>(model_obj, obj, copy, frontend_layout);
-}
-
-} // namespace
-
-namespace onert
-{
-namespace backend
-{
-namespace cpu_common
-{
-
-class ConstantInitializerBase : public ir::OperationVisitor
-{
-public:
- virtual ~ConstantInitializerBase() = default;
-
-public:
- void run()
- {
- assert(tensor_registry());
- for (const auto &it : _init_map)
- {
- const auto &ind = it.first;
- const auto &fn = it.second;
-
- const auto &model_obj = _operands.at(ind);
- auto tensor_obj = tensor_registry()->getNativeITensor(ind);
- assert(tensor_obj != nullptr);
- fn(model_obj, *tensor_obj);
- VERBOSE(FillOperandData) << "Fill data for operand " << ind.value() << std::endl;
- }
- _init_map.clear();
- }
-
-public:
- ConstantInitializerBase(const ir::Operands &operands)
- : _operands{operands}, _current_layout{ir::Layout::UNKNOWN}
- {
- }
-
-public:
- using Initializer = std::function<void(const ir::Operand &, backend::ITensor &)>;
-
- void setLayout(ir::Layout layout) { _current_layout = layout; }
-
-protected:
- virtual std::shared_ptr<ITensorRegistry> tensor_registry() const = 0;
-
-public:
- virtual void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj)
- {
- registerPermuteInitializer(index, obj); // as default
- }
-
-public:
- void registerCopyInitializer(const ir::OperandIndex &index, const ir::Operand &obj);
- void registerPermuteInitializer(const ir::OperandIndex &index, const ir::Operand &obj);
-
-public:
- void registerCustomInitializer(const ir::OperandIndex &index, const ir::Operand &obj,
- void (*customInit)(const onert::ir::Operand &model_obj,
- onert::backend::ITensor &obj))
- {
- // For only CONSTANTS
- // TODO Add to check if tensor has been allocated
- if (!obj.isConstant())
- return;
-
- using namespace std::placeholders;
- _init_map[index] = std::bind(customInit, _1, _2);
- }
-
-public:
- bool exist(const ir::OperandIndex &ind) { return _init_map.find(ind) != _init_map.end(); }
-
-protected:
- const ir::Operands &_operands;
- std::unordered_map<ir::OperandIndex, Initializer> _init_map;
- ir::Layout _current_layout;
-};
-
-} // namespace cpu_common
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_CPU_COMMON_CONSTANT_INITIALIZER_BASE_H__
diff --git a/runtime/onert/core/include/compiler/BackendManager.h b/runtime/onert/core/include/compiler/BackendManager.h
index 7850e21eb..befe40022 100644
--- a/runtime/onert/core/include/compiler/BackendManager.h
+++ b/runtime/onert/core/include/compiler/BackendManager.h
@@ -22,7 +22,7 @@
#include "ir/Operands.h"
#include "backend/Backend.h"
-#include "backend/controlflow/Backend.h"
+#include "backend/builtin/Backend.h"
namespace onert
{
@@ -41,7 +41,7 @@ public:
public:
backend::Backend *get(const std::string &key);
const backend::Backend *get(const std::string &key) const;
- const backend::controlflow::Backend *getControlflow() const;
+ const backend::builtin::Backend *getBuiltin() const;
const std::vector<const backend::Backend *> getAll() const
{
std::vector<const backend::Backend *> v;
@@ -65,15 +65,15 @@ private:
private:
std::map<std::string, std::unique_ptr<void, dlhandle_destroy_t>> _handle_map;
std::map<std::string, std::unique_ptr<backend::Backend, backend_destroy_t>> _gen_map;
- backend::controlflow::Backend *_controlflow{nullptr};
+ backend::builtin::Backend *_builtin{nullptr};
/**
- * @brief load controlflow backend
+ * @brief load builtin backend
*
* @param backend backend to be loaded
*
* @return
*/
- void loadControlflowBackend();
+ void loadBuiltinBackend();
};
} // namespace compiler
diff --git a/runtime/onert/core/include/compiler/CodeMap.h b/runtime/onert/core/include/compiler/CodeMap.h
index e13d3334c..b1d861cf8 100644
--- a/runtime/onert/core/include/compiler/CodeMap.h
+++ b/runtime/onert/core/include/compiler/CodeMap.h
@@ -18,6 +18,10 @@
#define __ONERT_COMPILER_CODE_MAP_H__
#include <unordered_map>
+#include "ir/Index.h"
+#include "ir/Operation.h"
+#include "exec/FunctionSequence.h"
+#include "OperationLowerInfo.h"
namespace onert
{
@@ -26,18 +30,20 @@ namespace compiler
struct CodeAndInfo
{
- const ir::OpSequence *op_seq;
- const ir::operation::LowerInfo *lower_info;
+ ir::OperationIndex op_ind;
+ const ir::Operation *op;
+ const OperationLowerInfo *lower_info;
std::unique_ptr<exec::FunctionSequence> fn_seq;
- CodeAndInfo(const ir::OpSequence *op_seq, const ir::operation::LowerInfo *lower_info,
+ CodeAndInfo(const ir::OperationIndex op_ind, const ir::Operation *op,
+ const OperationLowerInfo *lower_info,
std::unique_ptr<exec::FunctionSequence> &&fn_seq)
- : op_seq{op_seq}, lower_info{lower_info}, fn_seq{std::move(fn_seq)}
+ : op_ind{op_ind}, op{op}, lower_info{lower_info}, fn_seq{std::move(fn_seq)}
{
}
};
-using CodeMap = std::unordered_map<ir::OpSequenceIndex, CodeAndInfo>;
+using CodeMap = std::unordered_map<ir::OperationIndex, CodeAndInfo>;
} // namespace compiler
} // namespace onert
diff --git a/runtime/onert/core/include/compiler/Compiler.h b/runtime/onert/core/include/compiler/Compiler.h
index 68b862d58..c2589f6d5 100644
--- a/runtime/onert/core/include/compiler/Compiler.h
+++ b/runtime/onert/core/include/compiler/Compiler.h
@@ -53,7 +53,6 @@ struct CompilerOptions
// OPTIONS ONLY FOR DEBUGGING/PROFILING
std::string trace_filepath; //< File path to save trace records
int graph_dump_level; //< Graph dump level, values between 0 and 2 are valid
- int op_seq_max_node; //< Number of nodes that can be
std::string executor; //< Executor name to use
ManualSchedulerOptions manual_scheduler_options; //< Options for ManualScheduler
bool he_scheduler; //< HEScheduler if true, ManualScheduler otherwise
@@ -89,13 +88,6 @@ public:
State state(void) const { return _state; }
- /**
- * @brief Check if model can compile
- * @return @c true if model can compile, otherwise @c false
- * @note This method don't check model correctness,\n
- * so model verification should be done before calling this method
- */
- bool checkCompilable();
CompilerOptions &options() { return _options; }
/**
diff --git a/runtime/onert/core/include/compiler/ExecutionBuilder.h b/runtime/onert/core/include/compiler/ExecutionBuilder.h
index d54d9d046..e36ad6d24 100644
--- a/runtime/onert/core/include/compiler/ExecutionBuilder.h
+++ b/runtime/onert/core/include/compiler/ExecutionBuilder.h
@@ -19,8 +19,7 @@
#include <memory>
-#include "ir/operation/LowerInfo.h"
-#include "ir/OpSequence.h"
+#include "ir/Index.h"
#include "exec/FunctionSequence.h"
#include "CodeMap.h"
@@ -32,7 +31,7 @@ namespace compiler
class ExecutionBuilder
{
public:
- void append(const ir::OpSequenceIndex index, CodeAndInfo &&code_and_info)
+ void append(const ir::OperationIndex index, CodeAndInfo &&code_and_info)
{
_code_map.emplace(index, std::move(code_and_info));
}
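Together with the reworked CodeAndInfo, compiled code is now appended per operation. A sketch of what a code-generation loop might hand to the builder (all variables are placeholders; the constructor signature comes from CodeMap.h above):

#include <compiler/ExecutionBuilder.h>

void emit(onert::compiler::ExecutionBuilder &builder, onert::ir::OperationIndex idx,
          const onert::ir::Operation *op, const onert::compiler::OperationLowerInfo *lower_info,
          std::unique_ptr<onert::exec::FunctionSequence> fn_seq)
{
  builder.append(idx, onert::compiler::CodeAndInfo{idx, op, lower_info, std::move(fn_seq)});
}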
diff --git a/runtime/onert/core/include/ir/LowerInfoMap.h b/runtime/onert/core/include/compiler/GraphLowerInfo.h
index fbabaf39d..b679891d6 100644
--- a/runtime/onert/core/include/ir/LowerInfoMap.h
+++ b/runtime/onert/core/include/compiler/GraphLowerInfo.h
@@ -14,29 +14,29 @@
* limitations under the License.
*/
-#ifndef __ONERT_IR_LOWER_INFO_MAP_H__
-#define __ONERT_IR_LOWER_INFO_MAP_H__
+#ifndef __ONERT_COMPILER_GRAPH_LOWER_INFO_H__
+#define __ONERT_COMPILER_GRAPH_LOWER_INFO_H__
#include <memory>
#include <unordered_map>
-#include "ir/operand/LowerInfo.h"
-#include "ir/operation/LowerInfo.h"
-#include "ir/OperandIndexMap.h"
+#include "compiler/OperandLowerInfo.h"
+#include "compiler/OperationLowerInfo.h"
+#include "util/ObjectManager.h"
#include "ir/Index.h"
namespace onert
{
-namespace ir
+namespace compiler
{
-struct LowerInfoMap
+struct GraphLowerInfo
{
- std::unordered_map<OpSequenceIndex, std::unique_ptr<operation::LowerInfo>> op_seq;
- OperandIndexMap<std::unique_ptr<operand::LowerInfo>> operand;
+ util::ObjectManager<ir::OperationIndex, OperationLowerInfo> operation;
+ util::ObjectManager<ir::OperandIndex, OperandLowerInfo> operand;
};
-} // namespace ir
+} // namespace compiler
} // namespace onert
-#endif // __ONERT_IR_LOWER_INFO_MAP_H__
+#endif // __ONERT_COMPILER_GRAPH_LOWER_INFO_H__
diff --git a/runtime/onert/core/include/compiler/LoweredGraph.h b/runtime/onert/core/include/compiler/LoweredGraph.h
index f115ab9a8..925de3485 100644
--- a/runtime/onert/core/include/compiler/LoweredGraph.h
+++ b/runtime/onert/core/include/compiler/LoweredGraph.h
@@ -14,12 +14,11 @@
* limitations under the License.
*/
-#ifndef __ONERT_IR_LOWERED_GRAPH_H__
-#define __ONERT_IR_LOWERED_GRAPH_H__
+#ifndef __ONERT_COMPILER_LOWERED_GRAPH_H__
+#define __ONERT_COMPILER_LOWERED_GRAPH_H__
#include "ir/Graph.h"
-#include "ir/LowerInfoMap.h"
-#include "ir/OpSequences.h"
+#include "compiler/GraphLowerInfo.h"
#include "compiler/BackendResolver.h"
#include "compiler/Compiler.h"
@@ -40,50 +39,32 @@ public:
ir::Graph &graph() { return _graph; }
const ir::Graph &graph() const { return _graph; }
- const ir::LowerInfoMap *getLowerInfo() const { return &_lower_info_map; }
- const ir::operation::LowerInfo *getLowerInfo(const ir::OpSequenceIndex &op_seq_index) const;
- void setLowerInfo(const ir::OpSequenceIndex &op_seq_index,
- std::unique_ptr<ir::operation::LowerInfo> &&lower_info);
- void removeLowerInfo(const ir::OpSequenceIndex &op_seq_index);
- const ir::operand::LowerInfo *getLowerInfo(const ir::OperandIndex &index) const;
- ir::operand::LowerInfo *getLowerInfo(const ir::OperandIndex &index);
- void setLowerInfo(const ir::OperandIndex &index,
- std::unique_ptr<ir::operand::LowerInfo> &&lower_info);
- void removeLowerInfo(const ir::OperandIndex &index);
- ir::OpSequences &op_seqs() { return _op_seqs; }
- const ir::OpSequences &op_seqs() const { return _op_seqs; }
- void iterateTopolOpSeqs(
- const std::function<void(const ir::OpSequenceIndex &, const ir::OpSequence &)> &fn) const;
- void
- iterateTopolOpSeqs(const std::function<void(const ir::OpSequenceIndex &, ir::OpSequence &)> &fn);
- const backend::BackendContexts &backend_contexts() { return _backend_contexts; }
- const backend::BackendContexts &backend_contexts() const { return _backend_contexts; }
+ const compiler::GraphLowerInfo &lower_info() const { return _lower_info_map; }
+ compiler::GraphLowerInfo &lower_info() { return _lower_info_map; }
std::shared_ptr<ir::OperationIndexMap<int64_t>> indexed_ranks() { return _indexed_ranks; }
-private:
- void
- makeOpSequences(ir::OperandIndexMap<std::unique_ptr<ir::operand::LowerInfo>> &operands_lower_info,
- const compiler::CompilerOptions &options,
- const compiler::BackendResolver &backend_resolver);
+ void setHasDynamicTensor(ir::OperationIndex ind, bool val)
+ {
+ _has_dynamic_tensor_map.emplace(ind, val);
+ }
+ bool getHasDynamicTensor(ir::OperationIndex ind) const
+ {
+ auto itr = _has_dynamic_tensor_map.find(ind);
+ return (itr == _has_dynamic_tensor_map.end()) ? false : itr->second;
+ }
- void manipulateLowerInfo(
- ir::OperandIndexMap<std::unique_ptr<ir::operand::LowerInfo>> &operands_lower_info);
+private:
+ void makeLowerInfo(const compiler::BackendResolver &backend_resolver);
void dumpLowerInfo();
- bool mergeable(const ir::OpSequenceIndex &op_seq_index, const ir::OperationIndex &node_index,
- ir::Layout layout, const compiler::BackendResolver &backend_resolver);
- ir::OpSequenceIndex appendFreshSingleOpSequence(const ir::OperationIndex &node_index,
- const ir::Operation &node);
private:
ir::Graph _graph;
- backend::BackendContexts _backend_contexts;
std::shared_ptr<ir::OperationIndexMap<int64_t>> _indexed_ranks;
- ir::LowerInfoMap _lower_info_map;
- // Pass(for Perm) can accept only graph so that Graph has OpSequences as a member
- ir::OpSequences _op_seqs;
+ compiler::GraphLowerInfo _lower_info_map;
+ ir::OperationIndexMap<bool> _has_dynamic_tensor_map;
};
} // namespace compiler
} // namespace onert
-#endif // __ONERT_IR_LOWERED_GRAPH_H__
+#endif // __ONERT_COMPILER_LOWERED_GRAPH_H__
diff --git a/runtime/onert/core/include/ir/operand/LowerInfo.h b/runtime/onert/core/include/compiler/OperandLowerInfo.h
index b7f032b02..340b9cef1 100644
--- a/runtime/onert/core/include/ir/operand/LowerInfo.h
+++ b/runtime/onert/core/include/compiler/OperandLowerInfo.h
@@ -14,13 +14,13 @@
* limitations under the License.
*/
-#ifndef __ONERT_IR_OPERAND_LOWER_INFO_H__
-#define __ONERT_IR_OPERAND_LOWER_INFO_H__
+#ifndef __ONERT_COMPILER_OPERAND_LOWER_INFO_H__
+#define __ONERT_COMPILER_OPERAND_LOWER_INFO_H__
#include <functional>
#include <stdint.h>
-#include "ir/operand/PermuteFactor.h"
+#include "compiler/PermuteFactor.h"
#include "util/Set.h"
namespace onert
@@ -33,16 +33,15 @@ class Backend;
namespace onert
{
-namespace ir
-{
-namespace operand
+namespace compiler
{
+
using PermuteFactorSet = util::Set<PermuteFactor>;
-class LowerInfo
+class OperandLowerInfo
{
public:
- LowerInfo()
+ OperandLowerInfo()
{
// DO NOTHING
}
@@ -62,8 +61,7 @@ private:
PermuteFactorSet _use_factors;
};
-} // namespace operand
-} // namespace ir
+} // namespace compiler
} // namespace onert
-#endif // __ONERT_IR_OPERAND_LOWER_INFO_H__
+#endif // __ONERT_COMPILER_OPERAND_LOWER_INFO_H__
diff --git a/runtime/onert/core/include/ir/operation/LowerInfo.h b/runtime/onert/core/include/compiler/OperationLowerInfo.h
index 7ef53b8c7..20ca12952 100644
--- a/runtime/onert/core/include/ir/operation/LowerInfo.h
+++ b/runtime/onert/core/include/compiler/OperationLowerInfo.h
@@ -14,12 +14,13 @@
* limitations under the License.
*/
-#ifndef __ONERT_IR_OPERATION_LOWER_INFO_H__
-#define __ONERT_IR_OPERATION_LOWER_INFO_H__
+#ifndef __ONERT_COMPILER_OP_SEQUENCE_LOWER_INFO_H__
+#define __ONERT_COMPILER_OP_SEQUENCE_LOWER_INFO_H__
#include <string>
-#include <ir/operand/PermuteFactor.h>
+#include <compiler/PermuteFactor.h>
+#include <ir/Layout.h>
namespace onert
{
@@ -31,24 +32,21 @@ class Backend;
namespace onert
{
-namespace ir
-{
-namespace operation
+namespace compiler
{
-class LowerInfo
+class OperationLowerInfo
{
public:
- LowerInfo(const backend::Backend *backend, Layout layout);
+ OperationLowerInfo(const backend::Backend *backend, ir::Layout layout);
const backend::Backend *backend() const { return _permute_factor.backend(); }
- Layout layout() const { return _permute_factor.layout(); }
+ ir::Layout layout() const { return _permute_factor.layout(); }
private:
- operand::PermuteFactor _permute_factor;
+ PermuteFactor _permute_factor;
};
-} // namespace operation
-} // namespace ir
+} // namespace compiler
} // namespace onert
-#endif // __ONERT_IR_OPERATION_LOWER_INFO_H__
+#endif // __ONERT_COMPILER_OP_SEQUENCE_LOWER_INFO_H__
diff --git a/runtime/onert/core/include/ir/operand/PermuteFactor.h b/runtime/onert/core/include/compiler/PermuteFactor.h
index d0bfed337..67ce957bb 100644
--- a/runtime/onert/core/include/ir/operand/PermuteFactor.h
+++ b/runtime/onert/core/include/compiler/PermuteFactor.h
@@ -16,12 +16,12 @@
/**
* @file PermuteFactor.h
- * @brief This file contains onert::ir::operand::PermuteFactor class
+ * @brief This file contains PermuteFactor class
* @ingroup COM_AI_RUNTIME
*/
-#ifndef __ONERT_IR_OPERAND_PERMUTE_FACTOR_H__
-#define __ONERT_IR_OPERAND_PERMUTE_FACTOR_H__
+#ifndef __ONERT_COMPILER_OPERAND_PERMUTE_FACTOR_H__
+#define __ONERT_COMPILER_OPERAND_PERMUTE_FACTOR_H__
#include <functional>
@@ -37,9 +37,7 @@ class Backend;
namespace onert
{
-namespace ir
-{
-namespace operand
+namespace compiler
{
/**
@@ -53,7 +51,8 @@ public:
* @param backend The backend factor
* @param layout The layout factor
*/
- PermuteFactor(const backend::Backend *backend, Layout layout) : _backend{backend}, _layout{layout}
+ PermuteFactor(const backend::Backend *backend, ir::Layout layout)
+ : _backend{backend}, _layout{layout}
{
// DO NOTHING
}
@@ -81,7 +80,7 @@ public:
*
* @return Layout factor
*/
- Layout layout() const { return _layout; }
+ ir::Layout layout() const { return _layout; }
public:
/**
@@ -102,11 +101,10 @@ public:
private:
const backend::Backend *_backend{nullptr};
- Layout _layout{Layout::UNKNOWN};
+ ir::Layout _layout{ir::Layout::UNKNOWN};
};
-} // namespace operand
-} // namespace ir
+} // namespace compiler
} // namespace onert
namespace std
@@ -115,9 +113,9 @@ namespace std
/**
* @brief Structure that provides hash value of PermuteFactor
*/
-template <> struct hash<onert::ir::operand::PermuteFactor>
+template <> struct hash<onert::compiler::PermuteFactor>
{
- size_t operator()(const onert::ir::operand::PermuteFactor &factor) const noexcept
+ size_t operator()(const onert::compiler::PermuteFactor &factor) const noexcept
{
hash<const onert::backend::Backend *> b_hash{};
hash<onert::ir::Layout> l_hash{};
@@ -127,4 +125,6 @@ template <> struct hash<onert::ir::operand::PermuteFactor>
} // namespace std
-#endif // __ONERT_IR_OPERAND_PERMUTE_FACTOR_H__
+std::ostream &operator<<(std::ostream &os, const onert::compiler::PermuteFactor &obj);
+
+#endif // __ONERT_COMPILER_OPERAND_PERMUTE_FACTOR_H__
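
With the `std::hash` specialization above, `PermuteFactor` can key unordered containers directly; this is what `PermuteFactorSet` in OperandLowerInfo relies on. A sketch, assuming `PermuteFactor`'s equality operator from this header and a backend pointer obtained elsewhere:

#include <unordered_set>
#include "compiler/PermuteFactor.h"

void factorSetExample(const onert::backend::Backend *backend)
{
  std::unordered_set<onert::compiler::PermuteFactor> factors;
  factors.emplace(backend, onert::ir::Layout::NHWC);
  factors.emplace(backend, onert::ir::Layout::NHWC); // duplicate: the set keeps one entry
  factors.emplace(backend, onert::ir::Layout::NCHW); // different layout: a second entry
}
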
diff --git a/runtime/onert/core/include/compiler/StaticShapeInferer.h b/runtime/onert/core/include/compiler/StaticShapeInferer.h
index 33a2f62d9..2e484e649 100644
--- a/runtime/onert/core/include/compiler/StaticShapeInferer.h
+++ b/runtime/onert/core/include/compiler/StaticShapeInferer.h
@@ -18,7 +18,6 @@
#define __ONERT_COMPILER_STATIC_SHAPE_INFERER_H__
#include "ir/OperationVisitor.h"
-#include "ir/OpSequence.h"
#include "compiler/LoweredGraph.h"
#include "ir/Index.h"
@@ -40,12 +39,12 @@ class StaticShapeInferer : public ir::OperationVisitor
{
public:
StaticShapeInferer(
- const ir::SubgraphIndex &subg_idx,
- const std::unordered_map<ir::SubgraphIndex, std::unique_ptr<compiler::LoweredGraph>>
- &lowered_subgs)
- : _lowered_subgs(lowered_subgs), _operands(lowered_subgs.at(subg_idx)->graph().operands()),
- _operations(lowered_subgs.at(subg_idx)->graph().operations()),
- _return_has_dynamic_tensor(false)
+ const ir::SubgraphIndex &subg_idx,
+ const std::unordered_map<ir::SubgraphIndex, std::unique_ptr<compiler::LoweredGraph>>
+ &lowered_subgs)
+ : _lowered_subgs(lowered_subgs), _operands(lowered_subgs.at(subg_idx)->graph().operands()),
+ _operations(lowered_subgs.at(subg_idx)->graph().operations()),
+ _return_has_dynamic_tensor(false)
{ /* empty */
}
virtual ~StaticShapeInferer() = default;
@@ -55,14 +54,15 @@ public:
* @brief Infer shape of operands belonging to ops and set the output shape.
* If the output shape cannot be known without running the op, mark it so that it can be
* allocated when running the kernel.
- * @param op_seq sequence of operations
- * @return @c true if op_seq's input or output has any dynamic tensor; @c false otherwise.
+ * @param op Operation
+ * @return @c true if op's input or output has any dynamic tensor; @c false otherwise.
*/
- bool infer(const ir::OpSequence &op_seq);
+ bool infer(const ir::Operation &op);
void dump();
private:
+ void inferSubgraph(ir::SubgraphIndex subg_ind);
bool checkDynamicInput(const ir::Operation &op);
void setDynamicOutput(const ir::Operation &op);
@@ -128,7 +128,7 @@ private:
private:
const std::unordered_map<ir::SubgraphIndex, std::unique_ptr<compiler::LoweredGraph>>
- &_lowered_subgs;
+ &_lowered_subgs;
// _operands and _operations can be changed by controlflow operation
ir::Operands &_operands; // operands of current subgraph
ir::Operations &_operations; // operations of current subgraph
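
The signature change above (per-`Operation` instead of per-`OpSequence`) implies a driving loop roughly like the following sketch; the real call site lives in the compiler, and the wiring shown here is assumed:

#include <memory>
#include <unordered_map>
#include "compiler/StaticShapeInferer.h"

void inferStatically(
  const onert::ir::SubgraphIndex &subg_idx,
  const std::unordered_map<onert::ir::SubgraphIndex,
                           std::unique_ptr<onert::compiler::LoweredGraph>> &lowered_subgs)
{
  onert::compiler::StaticShapeInferer inferer(subg_idx, lowered_subgs);
  auto &lgraph = *lowered_subgs.at(subg_idx);
  for (const auto op_ind : lgraph.graph().topolSortOperations())
  {
    const auto &op = lgraph.graph().operations().at(op_ind);
    bool has_dynamic = inferer.infer(op); // true if any input/output of `op` is dynamic
    lgraph.setHasDynamicTensor(op_ind, has_dynamic);
  }
}
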
diff --git a/runtime/onert/core/include/exec/DynamicShapeInferer.h b/runtime/onert/core/include/exec/DynamicShapeInferer.h
index 1f3a13b06..3d040e2cc 100644
--- a/runtime/onert/core/include/exec/DynamicShapeInferer.h
+++ b/runtime/onert/core/include/exec/DynamicShapeInferer.h
@@ -20,8 +20,6 @@
#include "ir/Operands.h"
#include "ir/OperationVisitor.h"
#include "ir/Index.h"
-#include "backend/IDynamicTensorManager.h"
-#include "backend/ITensorManager.h"
#include "backend/ITensorRegistry.h"
#include <map>
@@ -40,7 +38,7 @@ class DynamicShapeInferer : public ir::OperationVisitor
public:
DynamicShapeInferer(const ir::Operands &operands,
const std::shared_ptr<backend::ITensorRegistry> &tensor_registry)
- : _operands(operands), _tensor_registry(tensor_registry)
+ : _operands(operands), _tensor_registry(tensor_registry)
{
UNUSED_RELEASE(_operands);
UNUSED_RELEASE(_tensor_registry);
@@ -106,6 +104,19 @@ private:
*/
void handleSimpleUnaryOp(const ir::Operation &op, const ir::OperandIndex input_idx);
+ // In case of an op's output tensor, it is possible that
+ // the output becomes dynamic although it had been static before.
+ // Once a tensor becomes dynamic, it loses the memory that was allocated for the static case.
+ // Therefore, once an output is dynamic, it must be treated as a dynamic tensor (memory is
+ // allocated at runtime). Here `previously` means that `dynamic` or `static` was set in a
+ // previous loop iteration of a WHILE op or in a previous call of `nnfw_run()`.
+ bool previously_static(backend::ITensor *op_output) { return !op_output->is_dynamic(); }
+
+ // Helper function that checks if an op's input is static.
+ // Note that the input of the n-th op has been set to static or dynamic by the (n-1)-th op.
+ // That's why it is called `currently_static`.
+ bool currently_static(backend::ITensor *op_input) { return !op_input->is_dynamic(); }
+
private:
/**
* @brief To get operand-level info, e.g., ir::Operand::isConstant()
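
As a sketch, the decision these two helpers support looks like this in free-function form (in the class they are private members; the combined check is the editor's illustration of the rule described in the comments above):

#include "backend/ITensor.h"

// An output still marked static (state left over from a previous run or a previous
// WHILE iteration) whose input has meanwhile become dynamic must be switched to the
// dynamic, runtime-allocated path.
inline bool mustSwitchToDynamic(onert::backend::ITensor *op_input,
                                onert::backend::ITensor *op_output)
{
  bool output_previously_static = !op_output->is_dynamic();
  bool input_currently_static = !op_input->is_dynamic(); // set by the previous op in this run
  return output_previously_static && !input_currently_static;
}
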
diff --git a/runtime/onert/core/include/exec/FunctionSequence.h b/runtime/onert/core/include/exec/FunctionSequence.h
index 6ec6e60ad..cf3f2a882 100644
--- a/runtime/onert/core/include/exec/FunctionSequence.h
+++ b/runtime/onert/core/include/exec/FunctionSequence.h
@@ -26,7 +26,6 @@
#include "exec/DynamicShapeInferer.h"
#include "ir/Operations.h"
#include "backend/ITensorRegistry.h"
-#include "backend/IDynamicTensorManager.h"
namespace onert
{
@@ -76,10 +75,9 @@ public:
public: // methods related to dynamic tensor
struct DynamicTensorCtx
{
- const ir::OpSequence *op_seq = nullptr;
+ ir::OperationIndex op_ind;
const ir::Operations *operations = nullptr;
std::shared_ptr<exec::DynamicShapeInferer> dynamic_shape_inferer = nullptr;
- backend::IDynamicTensorManager *dynamic_tensor_manager = nullptr;
};
/**
diff --git a/runtime/onert/core/include/exec/IExecutor.h b/runtime/onert/core/include/exec/IExecutor.h
index 345bec8eb..51fc67af4 100644
--- a/runtime/onert/core/include/exec/IExecutor.h
+++ b/runtime/onert/core/include/exec/IExecutor.h
@@ -36,12 +36,12 @@ namespace onert
namespace backend
{
class IPortableTensor;
-namespace controlflow
+namespace builtin
{
class IOTensor;
}
-}
-}
+} // namespace backend
+} // namespace onert
namespace onert
{
namespace exec
@@ -97,7 +97,7 @@ struct IExecutor
*
* @return Vector of @c IOTensor
*/
- virtual const std::vector<backend::controlflow::IOTensor *> &getOutputTensors() const = 0;
+ virtual const std::vector<backend::builtin::IOTensor *> &getOutputTensors() const = 0;
};
using ExecutorMap = std::unordered_map<ir::SubgraphIndex, std::unique_ptr<IExecutor>>;
diff --git a/runtime/onert/core/include/exec/IODescription.h b/runtime/onert/core/include/exec/IODescription.h
index d1810ec3b..43d4015d5 100644
--- a/runtime/onert/core/include/exec/IODescription.h
+++ b/runtime/onert/core/include/exec/IODescription.h
@@ -37,7 +37,7 @@ struct InputDesc
InputDesc(void) = delete;
InputDesc(const ir::OperandInfo &info, const void *buffer, const size_t size, ir::Layout layout)
- : info(info), buffer(buffer), size(size), layout(layout)
+ : info(info), buffer(buffer), size(size), layout(layout)
{
}
};
@@ -53,7 +53,7 @@ struct OutputDesc
OutputDesc(void) = delete;
OutputDesc(const ir::OperandInfo &info, void *buffer, const size_t size, ir::Layout layout)
- : info(info), buffer(buffer), size(size), layout(layout)
+ : info(info), buffer(buffer), size(size), layout(layout)
{
}
};
diff --git a/runtime/onert/core/include/ir/Data.h b/runtime/onert/core/include/ir/Data.h
index d31191b4f..bd0d87cae 100644
--- a/runtime/onert/core/include/ir/Data.h
+++ b/runtime/onert/core/include/ir/Data.h
@@ -75,10 +75,10 @@ class MMapedData final : public ExternalData
public:
MMapedData(int fd, const std::ptrdiff_t mmap_offset, const size_t mmap_size,
const std::ptrdiff_t data_offset, const size_t data_size)
- : ExternalData(nullptr, data_size),
- _mmap_base(
- static_cast<uint8_t *>(mmap(NULL, mmap_size, PROT_READ, MAP_PRIVATE, fd, mmap_offset))),
- _mmap_size(mmap_size), _offset(data_offset - mmap_offset)
+ : ExternalData(nullptr, data_size),
+ _mmap_base(
+ static_cast<uint8_t *>(mmap(NULL, mmap_size, PROT_READ, MAP_PRIVATE, fd, mmap_offset))),
+ _mmap_size(mmap_size), _offset(data_offset - mmap_offset)
{
// DO NOTHING
}
diff --git a/runtime/onert/core/include/ir/Graph.h b/runtime/onert/core/include/ir/Graph.h
index 2103e6e64..5543d9559 100644
--- a/runtime/onert/core/include/ir/Graph.h
+++ b/runtime/onert/core/include/ir/Graph.h
@@ -22,8 +22,6 @@
#include "ir/Operands.h"
#include "ir/Operations.h"
-#include "ir/OpSequence.h"
-#include "ir/OpSequences.h"
#include "ir/Subgraphs.h"
namespace onert
@@ -58,18 +56,45 @@ public:
// Graph Building
public:
OperandIndex addOperand(const Shape &shape, const TypeInfo &type);
+ /**
+ * @brief Add an operand to the graph with the given index and object
+ *
+ * If the given index is available, the call succeeds and @c operand is moved, which
+ * invalidates the caller's pointer. If the given index is already taken, the call fails and
+ * @c operand is not moved, so the caller's pointer remains valid.
+ *
+ * @param[in] index Index to be added
+ * @param[in] operand Operand to be added
+ * @return OperandIndex @c index if successful, Undefined otherwise
+ */
+ OperandIndex addOperand(OperandIndex index, std::unique_ptr<Operand> &&operand);
OperationIndex addOperation(std::unique_ptr<Operation> &&node);
+ /**
+ * @brief Add an operation to the graph with the given index and object
+ *
+ * If the given index is available, the call succeeds and @c operation is moved, which
+ * invalidates the caller's pointer. If the given index is already taken, the call fails and
+ * @c operation is not moved, so the caller's pointer remains valid.
+ *
+ * @param index Index to be added
+ * @param operation Operation to be added
+ * @return OperationIndex @c index if successful, Undefined otherwise
+ */
+ OperationIndex addOperation(OperationIndex index, std::unique_ptr<Operation> &&operation);
void setOperandValue(const OperandIndex &ind, std::shared_ptr<Data> data);
void addInput(const OperandIndex &ind, const std::string &name = "");
void addOutput(const OperandIndex &ind, const std::string &name = "");
- void finishBuilding(void);
+ void verify(void);
void removeOperand(const OperandIndex &ind) { _operands.remove(ind); }
- bool isBuildingPhase(void) const { return _phase == Phase::BUILDING; }
void setLayout(Layout layout) { _layout = layout; }
void setSubgraphs(const std::shared_ptr<Subgraphs> &subgs) { _subgraphs = subgs; }
private:
+ bool checkOperandsForOperation(const Operation &operation);
+ void linkOperandToOperation(OperationIndex index, const Operation &operation);
void initializeUseDef();
+ // TODO Rename to `sweepUnusedOperands`
+ // TODO Make this public
void sweepGarbageOperands();
// Custom operations support
@@ -104,8 +129,11 @@ public:
std::shared_ptr<Subgraphs> &subgraphs() { return _subgraphs; }
Layout layout() const { return _layout; }
+ // Topological sort
+public:
+ std::vector<ir::OperationIndex> topolSortOperations() const;
+
private:
- Phase _phase{Phase::BUILDING};
Operations _operations;
Operands _operands;
OperandIndexSequence _inputs;
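
A sketch of the index-preserving construction the new overloads enable, e.g. when cloning or partitioning a graph; the shape and type arguments are placeholders:

#include <memory>
#include "ir/Graph.h"

void cloneOperandAt(onert::ir::Graph &dst, onert::ir::OperandIndex src_ind,
                    const onert::ir::Shape &shape, const onert::ir::TypeInfo &type)
{
  auto operand = std::make_unique<onert::ir::Operand>(shape, type);
  auto ind = dst.addOperand(src_ind, std::move(operand));
  if (ind.undefined())
  {
    // src_ind was already taken. Per the contract above, `operand` was not moved
    // from, so the caller still owns it and may retry with another index or with
    // the auto-indexing addOperand(shape, type) overload.
  }
}

Separately, note that `finishBuilding()` has become an explicit `verify()` step: the graph no longer tracks a building phase.
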
diff --git a/runtime/onert/core/include/ir/Index.h b/runtime/onert/core/include/ir/Index.h
index 2538301a4..e01b090f3 100644
--- a/runtime/onert/core/include/ir/Index.h
+++ b/runtime/onert/core/include/ir/Index.h
@@ -19,6 +19,8 @@
#include "util/Index.h"
+#include <ostream>
+
namespace onert
{
namespace ir
@@ -33,12 +35,38 @@ using OperandIndex = ::onert::util::Index<uint32_t, OperandIndexTag>;
struct IOIndexTag;
using IOIndex = ::onert::util::Index<uint32_t, IOIndexTag>;
-struct OpSequenceIndexTag;
-using OpSequenceIndex = ::onert::util::Index<uint32_t, OpSequenceIndexTag>;
-
struct SubgraphIndexTag;
using SubgraphIndex = ::onert::util::Index<uint32_t, SubgraphIndexTag>;
+template <typename IndexType>
+std::ostream &_index_print_impl(std::ostream &o, const std::string &prefix, IndexType index)
+{
+ if (index.undefined())
+ return o << prefix << std::string("?");
+ else
+ return o << prefix << index.value();
+}
+
+inline std::ostream &operator<<(std::ostream &o, const OperationIndex &i)
+{
+ return _index_print_impl(o, "@", i);
+}
+
+inline std::ostream &operator<<(std::ostream &o, const OperandIndex &i)
+{
+ return _index_print_impl(o, "%", i);
+}
+
+inline std::ostream &operator<<(std::ostream &o, const IOIndex &i)
+{
+ return _index_print_impl(o, "IO", i);
+}
+
+inline std::ostream &operator<<(std::ostream &o, const SubgraphIndex &i)
+{
+ return _index_print_impl(o, "SUBGRAPH", i); // $ubgraph
+}
+
} // namespace ir
} // namespace onert
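
A quick illustration of the new printers: the prefixes make operation, operand, IO and subgraph indices distinguishable in logs, and an undefined index prints as `?` (a default-constructed `util::Index` is undefined):

#include <iostream>
#include "ir/Index.h"

void printIndexExample()
{
  onert::ir::OperationIndex op{3};
  onert::ir::OperandIndex operand{7};
  onert::ir::OperationIndex undef; // default-constructed, i.e. undefined
  std::cout << op << " " << operand << " " << undef << std::endl; // prints "@3 %7 @?"
}
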
diff --git a/runtime/onert/core/include/ir/OpSequence.h b/runtime/onert/core/include/ir/OpSequence.h
deleted file mode 100644
index 754cf3b34..000000000
--- a/runtime/onert/core/include/ir/OpSequence.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_IR_OP_SEQUENCE_H__
-#define __ONERT_IR_OP_SEQUENCE_H__
-
-#include <vector>
-#include <string>
-#include <memory>
-
-#include "ir/Layout.h"
-#include "ir/Index.h"
-#include "ir/Operation.h"
-
-namespace onert
-{
-namespace ir
-{
-
-class Operations;
-
-class OpSequence
-{
-public:
- explicit OpSequence(Layout layout);
- OpSequence(const OpSequence &) = delete;
-
-public:
- void accept(OperationVisitor &v) const;
-
-public:
- const OperandIndexSequence &getInputs() const { return _inputs; }
- const OperandIndexSequence &getOutputs() const { return _outputs; }
- void setInputs(const OperandIndexSequence &indexes) { _inputs = indexes; }
- void setOutputs(const OperandIndexSequence &indexes) { _outputs = indexes; }
- void replaceInputs(const OperandIndex &from, const OperandIndex &to)
- {
- _inputs.replace(from, to);
- }
- void replaceOutputs(const OperandIndex &from, const OperandIndex &to)
- {
- _outputs.replace(from, to);
- }
-
- void appendOperation(const OperationIndex &index) { _operations.emplace_back(index); }
-
- std::vector<OperationIndex> &operations(void) { return _operations; }
-
- const std::vector<OperationIndex> &operations(void) const { return _operations; }
-
- uint32_t size(void) const { return _operations.size(); }
-
-public:
- void remove(const OperationIndex &index);
-
- bool exist(const OperationIndex &index) const;
-
-public:
- Layout getLayout() const { return _layout; }
-
-public:
- std::vector<OperationIndex>::const_iterator begin() const { return _operations.begin(); }
- std::vector<OperationIndex>::const_iterator end() const { return _operations.end(); }
-
-public:
- /**
- * @brief Set @c true if any operation in this opSequence has dynamic input
- * or dynamic output;
- * @c false if all operations' inputs and outputs are static tensors
- */
- void has_dynamic_tensor(bool has_dynamic_tensor) { _has_dynamic_tensor = has_dynamic_tensor; }
- bool has_dynamic_tensor() const { return _has_dynamic_tensor; }
-
-private:
- OperandIndexSequence _inputs;
- OperandIndexSequence _outputs;
- std::vector<OperationIndex> _operations;
-
-private:
- Layout _layout;
- bool _has_dynamic_tensor;
-};
-
-std::string getStrFromOpSeq(const OpSequence &op_seq, const Operations &operations);
-
-} // namespace ir
-} // namespace onert
-
-#endif // __ONERT_IR_OP_SEQUENCE_H__
diff --git a/runtime/onert/core/include/ir/OpSequences.h b/runtime/onert/core/include/ir/OpSequences.h
deleted file mode 100644
index ab258f395..000000000
--- a/runtime/onert/core/include/ir/OpSequences.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_IR_OP_SEQUENCES_H__
-#define __ONERT_IR_OP_SEQUENCES_H__
-
-#include "ir/Index.h"
-#include "ir/OpSequence.h"
-#include "util/ObjectManager.h"
-
-namespace onert
-{
-namespace ir
-{
-
-/**
- * @brief Class that manages OpSequence objects
- */
-class OpSequences : public util::ObjectManager<OpSequenceIndex, OpSequence>
-{
-public:
- /**
- * @brief Create an instance of OpSequence with given op and push it to objects
- *
- * @param[in] op_idx Operation index that is emplaced
- * @param[in] layout OpSequence's layout
- * @return OpSequenceIndex
- */
- OpSequenceIndex emplace(const OperationIndex &op_index, Layout layout);
-
- /**
- * @brief Push an instance of OpSequence to objects
- *
- * @param[in] op_seq An instance of OpSequence
- * @return OpSequenceIndex
- */
- OpSequenceIndex emplace(std::unique_ptr<OpSequence> &&op_seq);
- /**
- * @brief Check if an operation does exist in any OpSequences
- *
- * @param operation_index Operation index to find
- * @return true If such operation exists in any OpSequences otherwise false
- */
- bool containsOperation(const OperationIndex &operation_index) const;
- /**
- * @brief Find an operation from all OpSequences
- *
- * @param operation_index Operation index to find
- * @return OpSequenceIndex Index of OpSequence that contains given operation index
- */
- OpSequenceIndex getOperation(const OperationIndex &operation_index) const;
- /**
- * @brief Remove an operation from OpSequence
- *
- * @param operation_index Operation index to be removed
- */
- void removeFromOpSequence(const OperationIndex &operation_index);
-
-private:
- void cacheSequenceIndex(const OpSequenceIndex &seq_index, const OperationIndex &op_index) const;
- OpSequenceIndex *findSequenceIndex(const OperationIndex &operation_index) const;
-
- OpSequenceIndex findOperation(const OperationIndex &operation_index) const;
- mutable std::unordered_map<OperationIndex, OpSequenceIndex> _seq_indexes;
-};
-
-/**
- * @brief Dump OpSequences
- *
- * @param op_seqs Operation Sequences
- * @param operations Operation context
- */
-void dumpOpSequences(const OpSequences &op_seqs, const Operations &operations);
-
-} // namespace ir
-} // namespace onert
-
-#endif // __ONERT_IR_OP_SEQUENCES_H__
diff --git a/runtime/onert/core/include/ir/Operand.h b/runtime/onert/core/include/ir/Operand.h
index f149a744b..e4a91579a 100644
--- a/runtime/onert/core/include/ir/Operand.h
+++ b/runtime/onert/core/include/ir/Operand.h
@@ -36,7 +36,7 @@ class Operand
{
public:
explicit Operand(const Shape &shape, const TypeInfo &type)
- : _info{shape, type, MemAllocType::STATIC}
+ : _info{shape, type, MemAllocType::STATIC}
{
// DO NOTHING
}
@@ -55,6 +55,7 @@ public:
void removeUse(const OperationIndex &idx);
void setDef(const OperationIndex &idx);
void unsetDef();
+ void clearDefUse();
public:
void type(const DataType type) { _info.type(type); };
diff --git a/runtime/onert/core/include/ir/OperandIndexSequence.h b/runtime/onert/core/include/ir/OperandIndexSequence.h
index 2f78cc832..846c3f950 100644
--- a/runtime/onert/core/include/ir/OperandIndexSequence.h
+++ b/runtime/onert/core/include/ir/OperandIndexSequence.h
@@ -77,7 +77,7 @@ public:
public:
OperandIndexSequence operator+(const OperandIndexSequence &other) const;
- friend std::ostream &operator<<(std::ostream &o, const OperandIndexSequence &op_seq);
+ friend std::ostream &operator<<(std::ostream &o, const OperandIndexSequence &operand_seq);
public:
std::vector<OperandIndex>::const_iterator begin(void) const { return _vec.begin(); }
diff --git a/runtime/onert/core/include/ir/OperandInfo.h b/runtime/onert/core/include/ir/OperandInfo.h
index dc89f8726..11aeb4920 100644
--- a/runtime/onert/core/include/ir/OperandInfo.h
+++ b/runtime/onert/core/include/ir/OperandInfo.h
@@ -67,8 +67,8 @@ public:
*/
OperandInfo(const Shape &shape, const TypeInfo &typeInfo, MemAllocType alloc_type,
bool is_const = false, bool is_variable = false)
- : _shape(shape), _typeInfo(typeInfo), _alloc_type(alloc_type), _const(is_const),
- _variable(is_variable)
+ : _shape(shape), _typeInfo(typeInfo), _alloc_type(alloc_type), _const(is_const),
+ _variable(is_variable)
{
// DO NOTHING
}
diff --git a/runtime/onert/core/include/ir/OperationVisitor.h b/runtime/onert/core/include/ir/OperationVisitor.h
index a27770744..4d08a5c71 100644
--- a/runtime/onert/core/include/ir/OperationVisitor.h
+++ b/runtime/onert/core/include/ir/OperationVisitor.h
@@ -18,7 +18,6 @@
#define __ONERT_IR_OPERATION_VISITOR_H__
#include "ir/Operations.Include.h"
-#include "ir/OpSequence.h"
namespace onert
{
@@ -33,15 +32,6 @@ struct OperationVisitor
virtual void visit(const operation::InternalName &) {}
#include "ir/Operations.lst"
#undef OP
-
- // This OpSequence node should be handled specially so that
- // Op.lst doesn't have OpSequence
- // TODO Remove by pushing it down to derived classes.
- virtual void visit(const OpSequence &)
- {
- throw std::runtime_error{
- "OperationVisitor: This does not privide visit function in OpSequence"};
- }
};
} // namespace ir
diff --git a/runtime/onert/core/include/ir/Shape.h b/runtime/onert/core/include/ir/Shape.h
index a0b4bb196..ec6dd07af 100644
--- a/runtime/onert/core/include/ir/Shape.h
+++ b/runtime/onert/core/include/ir/Shape.h
@@ -12,7 +12,7 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
-*/
+ */
#ifndef __ONERT_IR_SHAPE_H__
#define __ONERT_IR_SHAPE_H__
@@ -61,7 +61,7 @@ struct FeatureShape
* @param[in] width The width value
*/
FeatureShape(int32_t batch, int32_t depth, int32_t height, int32_t width)
- : N{batch}, C{depth}, H{height}, W{width}
+ : N{batch}, C{depth}, H{height}, W{width}
{
// DO NOTHING
}
@@ -89,6 +89,7 @@ public:
return rank() == 0 ? 1 : _dimensions.at(i);
}
+ // TODO Fix different behavior with const version
int32_t &dim(int i) { return _dimensions.at(i); }
/**
@@ -139,10 +140,10 @@ inline bool operator!=(const Shape &lhs, const Shape &rhs) { return lhs.dims() !
Shape permuteShape(const Shape &shape, Layout frontend_layout, Layout backend_layout);
/**
-* @brief Find out if tha rank in this shape is "maybe" unspecified.
-* Note that when rank == 0, shape could represent scalar or unspecified rank
-* \see https://developer.android.com/ndk/reference/struct/a-neural-networks-operand-type
-*/
+ * @brief Find out if the rank in this shape is "maybe" unspecified.
+ * Note that when rank == 0, shape could represent scalar or unspecified rank
+ * \see https://developer.android.com/ndk/reference/struct/a-neural-networks-operand-type
+ */
inline bool rankMaybeUnspecified(const ir::Shape &shape) { return (shape.rank() == 0); }
} // namespace ir
diff --git a/runtime/onert/core/include/ir/Sparsity.h b/runtime/onert/core/include/ir/Sparsity.h
index ad4d8259b..690304ad2 100644
--- a/runtime/onert/core/include/ir/Sparsity.h
+++ b/runtime/onert/core/include/ir/Sparsity.h
@@ -12,7 +12,7 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
-*/
+ */
#ifndef __ONERT_IR_SPARSITY_H__
#define __ONERT_IR_SPARSITY_H__
@@ -35,7 +35,7 @@ public:
Sparsity() = default;
Sparsity(std::vector<uint16_t> &&w1_segments, std::vector<uint16_t> &&w1_indices,
std::vector<int32_t> &&block_size)
- : _w1_segments(w1_segments), _w1_indices(w1_indices), _block_size(block_size)
+ : _w1_segments(w1_segments), _w1_indices(w1_indices), _block_size(block_size)
{
}
diff --git a/runtime/onert/core/include/ir/TypeInfo.h b/runtime/onert/core/include/ir/TypeInfo.h
index a1ae4d2e4..0a00da5fd 100644
--- a/runtime/onert/core/include/ir/TypeInfo.h
+++ b/runtime/onert/core/include/ir/TypeInfo.h
@@ -17,6 +17,7 @@
#ifndef __ONERT_IR_TYPEINFO_H__
#define __ONERT_IR_TYPEINFO_H__
+#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>
@@ -29,21 +30,51 @@ namespace onert
namespace ir
{
+struct Quantization
+{
+ std::vector<float> scales;
+ std::vector<int32_t> zero_points;
+};
+
class TypeInfo
{
public:
TypeInfo() = delete;
- explicit TypeInfo(DataType type, float scale = 0, int32_t offset = 0)
- : _type(type), _scale(scale), _offset(offset), _sparsity(nullptr)
+ explicit TypeInfo(DataType type) : _type{type}, _sparsity{nullptr} {}
+
+ TypeInfo(DataType type, float scale, int32_t zero_point) : _type{type}, _sparsity{nullptr}
{
+ quantization(scale, zero_point);
}
public:
DataType type() const { return _type; }
- float scale() const { return _scale; }
- int32_t offset() const { return _offset; }
+ float scale() const
+ {
+ assert(_quant.scales.size() == 1);
+ return _quant.scales[0];
+ }
+ const std::vector<float> &scales() const { return _quant.scales; }
+ int32_t zero_point() const
+ {
+ assert(_quant.zero_points.size() == 1);
+ return _quant.zero_points[0];
+ }
+ const std::vector<int32_t> &zero_points() const { return _quant.zero_points; }
const ir::Sparsity *sparsity() const { return _sparsity.get(); }
+ void quantization(float scale, int32_t zero_point)
+ {
+ _quant.scales.resize(1);
+ _quant.scales[0] = scale;
+ _quant.zero_points.resize(1);
+ _quant.zero_points[0] = zero_point;
+ }
+ void quantization(std::vector<float> &&scales, std::vector<int32_t> &&zero_points)
+ {
+ _quant.scales = scales;
+ _quant.zero_points = zero_points;
+ }
void sparsity(std::shared_ptr<ir::Sparsity> sparsity) { _sparsity = sparsity; }
public:
@@ -51,10 +82,7 @@ public:
private:
DataType _type;
- // for quantization
- float _scale;
- int32_t _offset;
- // for sparsity
+ ir::Quantization _quant;
std::shared_ptr<ir::Sparsity> _sparsity;
};
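
The `Quantization` struct generalizes the old scalar scale/offset pair to per-channel vectors. A sketch of both uses; the `DataType` enumerator names are assumed from `ir::DataType`:

#include "ir/TypeInfo.h"

void quantExample()
{
  using onert::ir::DataType;
  using onert::ir::TypeInfo;

  // Per-tensor quantization: one scale and one zero-point.
  TypeInfo per_tensor(DataType::QUANT_UINT8_ASYMM, 0.5f, 128);
  float s = per_tensor.scale();         // fine: exactly one scale
  int32_t zp = per_tensor.zero_point(); // fine: exactly one zero-point

  // Per-channel quantization: vectors of scales/zero-points.
  TypeInfo per_channel(DataType::QUANT_INT8_SYMM);
  per_channel.quantization({0.5f, 0.25f, 0.125f}, {0, 0, 0});
  const auto &scales = per_channel.scales(); // use the vector accessors here;
                                             // scale() would assert (size != 1)
  (void)s;
  (void)zp;
  (void)scales;
}
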
diff --git a/runtime/onert/core/include/util/Config.lst b/runtime/onert/core/include/util/Config.lst
index 5944f8344..d501345c1 100644
--- a/runtime/onert/core/include/util/Config.lst
+++ b/runtime/onert/core/include/util/Config.lst
@@ -31,7 +31,6 @@ CONFIG(ACL_LAYOUT , std::string , "none")
CONFIG(NCNN_LAYOUT , std::string , "NCHW")
CONFIG(PROFILING_MODE , bool , "0")
CONFIG(USE_SCHEDULER , bool , "0")
-CONFIG(OP_SEQ_MAX_NODE , int , "0")
CONFIG(TRACE_FILEPATH , std::string , "")
CONFIG(FP16_ENABLE , bool , "0")
CONFIG(RUY_THREADS , int , "-1")
@@ -44,4 +43,3 @@ CONFIG(USE_MMAPED_DATA , bool , "0")
CONFIG(OP_BACKEND_ ## InternalName, std::string, "")
#include "ir/Operations.lst"
#undef OP
-
diff --git a/runtime/onert/core/include/util/Exceptions.h b/runtime/onert/core/include/util/Exceptions.h
index fc3fa0f64..e77686593 100644
--- a/runtime/onert/core/include/util/Exceptions.h
+++ b/runtime/onert/core/include/util/Exceptions.h
@@ -38,7 +38,7 @@ class InsufficientBufferSizeException : public OnertException
{
public:
InsufficientBufferSizeException(const std::string &msg)
- : OnertException{"InsufficientBufferSize", msg}
+ : OnertException{"InsufficientBufferSize", msg}
{
}
};
diff --git a/runtime/onert/core/include/util/ITimer.h b/runtime/onert/core/include/util/ITimer.h
index d5a4e1eb0..f63a3f220 100644
--- a/runtime/onert/core/include/util/ITimer.h
+++ b/runtime/onert/core/include/util/ITimer.h
@@ -46,7 +46,7 @@ public:
{
const auto end_time = std::chrono::steady_clock::now();
_timer_res =
- std::chrono::duration_cast<std::chrono::microseconds>(end_time - _start_time).count();
+ std::chrono::duration_cast<std::chrono::microseconds>(end_time - _start_time).count();
};
private:
diff --git a/runtime/onert/core/include/util/Index.h b/runtime/onert/core/include/util/Index.h
index e8f59282d..d3f3dcb46 100644
--- a/runtime/onert/core/include/util/Index.h
+++ b/runtime/onert/core/include/util/Index.h
@@ -138,14 +138,6 @@ public:
*/
T value() const { return _index; }
- friend std::ostream &operator<<(std::ostream &o, const Index &t)
- {
- if (t.undefined())
- return o << std::string("undefined");
- else
- return o << t.value();
- }
-
private:
T _index;
};
diff --git a/runtime/onert/core/include/util/ObjectManager.h b/runtime/onert/core/include/util/ObjectManager.h
index d2dd881a8..a493789fa 100644
--- a/runtime/onert/core/include/util/ObjectManager.h
+++ b/runtime/onert/core/include/util/ObjectManager.h
@@ -24,6 +24,8 @@
#include <memory>
+#include "util/logging.h"
+
namespace onert
{
namespace util
@@ -36,35 +38,71 @@ namespace util
template <typename Index, typename Object> class ObjectManager
{
public:
- ObjectManager() : _index_count{0u} {}
+ ObjectManager() : _next_index{0u} {}
public:
/**
- * @brief Create an object with args and put it in the container with a new Index for that
+ * @brief Create an object with args and put it in the container with a newly assigned @c Index
*
* @param[in] args Arguments for creating Operand object
- * @return Created index that is associated to the object
+ * @return Created index that is associated with the object if successful, an Undefined index otherwise
*/
template <class... Args> Index emplace(Args &&... args)
{
auto index = generateIndex();
+ if (!index.valid())
+ return index;
_objects.emplace(index, std::make_unique<Object>(std::forward<Args>(args)...));
return index;
}
/**
- * @brief Put object in the container with a new Index for that
+ * @brief Put the object in the container with given index.
+ *
+ * It fails when the given index is already taken or @c index is Undefined.
*
* @param[in] object Object to be pushed
- * @return Created index that is associated to the object
+ * @param[in] index Index associated with the object
+ * @return @c index if successful, an Undefined index otherwise
+ */
+ Index push(std::unique_ptr<Object> &&object, Index index)
+ {
+ auto gen_index = tryIndex(index);
+ if (gen_index.valid())
+ _objects.emplace(gen_index, std::move(object));
+ return gen_index;
+ }
+ /**
+ * @brief Put the object in the container with a newly assigned index.
+ *
+ * It fails when it cannot generate a valid index.
+ *
+ * @param[in] object Object to be pushed
+ * @return The newly assigned index if successful, an Undefined index otherwise
*/
Index push(std::unique_ptr<Object> &&object)
{
- auto index = generateIndex();
- _objects.emplace(index, std::move(object));
+ auto gen_index = generateIndex();
+ if (gen_index.valid())
+ _objects.emplace(gen_index, std::move(object));
+ return gen_index;
+ }
+ /**
+ * @brief Set the object in the container with given index.
+ *
+ * If the index is Undefined, it will fail.
+ * If the index is already taken, it will overwrite the content.
+ *
+ * @param[in] object Object to be pushed
+ * @param[in] index Index associated with the object
+ * @return @c index if successful, an Undefined index otherwise
+ */
+ Index set(Index index, std::unique_ptr<Object> &&object)
+ {
+ if (index.valid())
+ _objects[index] = std::move(object);
return index;
}
-
/**
* @brief Remove the object that is associated with the given index
*
@@ -76,6 +114,8 @@ public:
/**
* @brief Get the object that is associated with the given index
*
+ * If such object does not exist, it will throw @c std::out_of_range
+ *
* @param[in] index Index of the object to be returned
* @return Object
*/
@@ -83,6 +123,8 @@ public:
/**
* @brief Get the object that is associated with the given index
*
+ * If such object does not exist, it will throw @c std::out_of_range
+ *
* @param[in] index Index of the object to be returned
* @return Object
*/
@@ -90,6 +132,38 @@ public:
/**
* @brief Get the object that is associated with the given index
*
+ * If such object does not exist, it will return `nullptr`
+ *
+ * @param[in] index Index of the object to be returned
+ * @return Object
+ */
+ const Object *getRawPtr(const Index &index) const
+ {
+ auto itr = _objects.find(index);
+ if (itr == _objects.end())
+ return nullptr;
+ else
+ {
+ assert(itr->second != nullptr);
+ return itr->second.get();
+ }
+ }
+ /**
+ * @brief Get the object that is associated with the given index
+ *
+ * If such object does not exist, it will return `nullptr`
+ *
+ * @param[in] index Index of the object to be returned
+ * @return Object The found object
+ */
+ Object *getRawPtr(const Index &index)
+ {
+ return const_cast<Object *>(
+ const_cast<const ObjectManager<Index, Object> *>(this)->getRawPtr(index));
+ }
+ /**
+ * @brief Get the object that is associated with the given index
+ *
* @param[in] index Index of the object to be returned
* @return true if such entry exists otherwise false
*/
@@ -99,6 +173,12 @@ public:
return it != _objects.end();
}
/**
+ * @brief Return the number of objects that the manager contains
+ *
+ * @return size_t Number of objects
+ */
+ size_t size() const { return _objects.size(); }
+ /**
* @brief Iterate over the container with given function
*
* @param[in] fn Function to be run for every container entry
@@ -135,11 +215,39 @@ public:
}
private:
- Index generateIndex() { return Index{_index_count++}; }
+ // Try assigning the given index
+ Index tryIndex(Index index)
+ {
+ if (!index.valid())
+ return index;
+ if (_objects.find(index) == _objects.end())
+ {
+ // If the given index does not exist, update the next index and return the index
+ if (index.value() >= _next_index)
+ _next_index = index.value() + 1;
+ return index;
+ }
+ else
+ {
+ // If the given index exists already, return a non-valid index
+ return Index{};
+ }
+ }
+
+ // Generate a new index with `_next_index`
+ Index generateIndex()
+ {
+ // No need to check if there is an entry with _next_index since
+ // _next_index is always ("the highest index in the object map" + 1)
+ if (Index{_next_index}.valid())
+ return Index{_next_index++};
+ else
+ return Index{};
+ }
protected:
std::unordered_map<Index, std::unique_ptr<Object>> _objects;
- uint32_t _index_count;
+ uint32_t _next_index;
};
} // namespace util
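
To make the new semantics concrete, a small sketch: `push` with an explicit index refuses to overwrite and returns an undefined index, `set` overwrites, and automatically generated indices continue after the highest index seen so far.

#include <memory>
#include "ir/Index.h"
#include "util/ObjectManager.h"

void managerExample()
{
  using Idx = onert::ir::OperandIndex;
  onert::util::ObjectManager<Idx, int> mgr;

  auto a = mgr.push(std::make_unique<int>(1), Idx{5}); // ok: a == 5
  auto b = mgr.push(std::make_unique<int>(2), Idx{5}); // slot taken: b.undefined()
  mgr.set(Idx{5}, std::make_unique<int>(3));           // set() may overwrite
  auto c = mgr.push(std::make_unique<int>(4));         // auto index: 6 (= highest + 1)
  (void)a;
  (void)b;
  (void)c;
}
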
diff --git a/runtime/onert/core/include/util/ShapeInference.h b/runtime/onert/core/include/util/ShapeInference.h
index b11da90ce..d859378c6 100644
--- a/runtime/onert/core/include/util/ShapeInference.h
+++ b/runtime/onert/core/include/util/ShapeInference.h
@@ -29,7 +29,6 @@
#include "ir/Index.h"
#include "ir/Layout.h"
#include "ir/OperationVisitor.h"
-#include "backend/IDynamicTensorManager.h"
#include "backend/ITensor.h"
#include "backend/ITensorRegistry.h"
@@ -104,8 +103,8 @@ ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t outp
ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &input_true_shape,
const ir::Shape &input_false_shape);
-ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf,
- const int32_t *sizes_buf);
+template <typename T>
+ir::Shape inferSliceShape(const ir::Shape &input_shape, const T *begins_buf, const T *sizes_buf);
ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape &block_shape_shape,
const ir::Shape &padding_shape, const int32_t *block_shape_buf,
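
The template lets slice shape inference accept either int32 or int64 begin/size buffers. A usage sketch with illustrative values (namespace assumed to be `onert::shape_inference`, as for the other declarations in this header; a size of -1 takes the remainder of the axis, per the usual NNAPI/TFLite convention):

#include "util/ShapeInference.h"

onert::ir::Shape sliceExample(const onert::ir::Shape &in_shape)
{
  const int64_t begins[] = {0, 1};
  const int64_t sizes[] = {2, -1}; // -1: take the rest of that axis
  return onert::shape_inference::inferSliceShape(in_shape, begins, sizes);
}
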
diff --git a/runtime/onert/core/include/util/TracingCtx.h b/runtime/onert/core/include/util/TracingCtx.h
index a82704cf0..334257d87 100644
--- a/runtime/onert/core/include/util/TracingCtx.h
+++ b/runtime/onert/core/include/util/TracingCtx.h
@@ -64,6 +64,13 @@ public:
uint32_t getSessionId() const { return _session_id; }
/**
+ * @brief Return true if more than one session exists
+ *
+ * @note This method is NOT thread-safe. Call it only in a thread-safe situation.
+ */
+ bool hasMultipleSessions() const { return _next_session_id > 1; }
+
+ /**
* @brief Set subgraph index of a graph
*/
void setSubgraphIndex(const ir::Graph *g, uint32_t index) { _subgraph_indices.emplace(g, index); }
@@ -78,14 +85,14 @@ private:
{
std::unique_lock<std::mutex> lock{_session_id_mutex};
- static uint32_t next_session_id = 0;
- _session_id = next_session_id++;
+ _session_id = _next_session_id++;
}
private:
std::unordered_map<const ir::Graph *, ir::SubgraphIndex> _subgraph_indices;
uint32_t _session_id;
static std::mutex _session_id_mutex;
+ static uint32_t _next_session_id;
};
} // namespace util
diff --git a/runtime/onert/core/include/util/logging.h b/runtime/onert/core/include/util/logging.h
index 65c375077..fe255f8ff 100644
--- a/runtime/onert/core/include/util/logging.h
+++ b/runtime/onert/core/include/util/logging.h
@@ -18,6 +18,7 @@
#define __ONERT_UTIL_LOGGING_H__
#include <iostream>
+#include <cstring>
#include "util/ConfigSource.h"
@@ -52,17 +53,28 @@ private:
static Context &ctx = Context::get();
+inline std::string decorated_name(const char *input)
+{
+ const int min_prefix = 16;
+ std::string prefix(input);
+ auto len_prefix = prefix.size();
+ if (len_prefix > min_prefix)
+ return "[" + prefix + "] ";
+ std::string spaces((min_prefix - len_prefix) / 2, ' ');
+ return (len_prefix % 2 ? "[ " : "[") + spaces + prefix + spaces + "] ";
+}
+
} // namespace logging
} // namespace util
} // namespace onert
#define VERBOSE(name) \
if (::onert::util::logging::ctx.enabled()) \
- std::cout << "[" << #name << "] "
+ std::cout << ::onert::util::logging::decorated_name(#name)
#define VERBOSE_F() \
if (::onert::util::logging::ctx.enabled()) \
- std::cout << "[" << __func__ << "] "
+ std::cout << ::onert::util::logging::decorated_name(__func__)
#define WHEN_LOG_ENABLED(METHOD) \
if (::onert::util::logging::ctx.enabled()) \
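
For illustration, the effect of `decorated_name` on the VERBOSE prefix: tags up to 16 characters are centered in a fixed-width bracket so mixed-tag logs align, while longer tags fall back to the plain `[name] ` form. Expected output is shown as comments (editor's sketch):

#include <iostream>
#include "util/logging.h"

void loggingExample()
{
  // Assuming verbose logging is enabled via the usual config:
  VERBOSE(ALLOC) << "base pointer" << std::endl;
  // -> "[      ALLOC     ] base pointer"   (short tag, centered and padded)
  VERBOSE(StaticShapeInferer) << "done" << std::endl;
  // -> "[StaticShapeInferer] done"         (tag longer than 16: no padding)
}
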
diff --git a/runtime/onert/core/src/backend/BackendContext.cc b/runtime/onert/core/src/backend/BackendContext.cc
index 404c3b155..b9aab7994 100644
--- a/runtime/onert/core/src/backend/BackendContext.cc
+++ b/runtime/onert/core/src/backend/BackendContext.cc
@@ -23,12 +23,5 @@ namespace onert
namespace backend
{
-void BackendContext::initialize(const std::vector<OperationInfo> &operation_list,
- const std::vector<ir::OperandIndex> &operand_list)
-{
- _operation_list = operation_list;
- _operand_list = operand_list;
-}
-
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/ITensor.cc b/runtime/onert/core/src/backend/ITensor.cc
index 7127ed93d..1339cb409 100644
--- a/runtime/onert/core/src/backend/ITensor.cc
+++ b/runtime/onert/core/src/backend/ITensor.cc
@@ -21,14 +21,9 @@ namespace onert
namespace backend
{
-ir::Shape ITensor::getShape() const
-{
- onert::ir::Shape shape(num_dimensions());
- for (uint32_t d = 0; d < num_dimensions(); d++)
- shape.dim(d) = dimension(d);
-
- return shape;
-}
+// `dynamic_cast` not working across library boundaries on NDK
+// With this as a key function, `dynamic_cast` works across dl
+ITensor::~ITensor() {}
} // namespace backend
} // namespace onert
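
The replacement comment refers to the "key function" idiom: defining the first out-of-line virtual member pins the class's vtable and `type_info` to one translation unit, so `dynamic_cast` across dlopen'ed backend libraries compares a single `type_info` instance instead of weak per-DSO copies. A generic sketch of the pattern (names are illustrative):

// Widget.h -- visible to every backend
struct Widget
{
  virtual ~Widget(); // declared but not defined inline: this is the key function
  virtual void draw() = 0;
};

// Widget.cc -- compiled once, into the core library
Widget::~Widget() {} // vtable + type_info emitted here and only here
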
diff --git a/runtime/onert/core/src/backend/cpu_common/Allocator.cc b/runtime/onert/core/src/backend/basic/Allocator.cc
index 0ba444ee6..61214dfad 100644
--- a/runtime/onert/core/src/backend/cpu_common/Allocator.cc
+++ b/runtime/onert/core/src/backend/basic/Allocator.cc
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "backend/cpu_common/Allocator.h"
+#include "backend/basic/Allocator.h"
#include "util/logging.h"
@@ -22,7 +22,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
Allocator::Allocator(uint32_t capacity)
@@ -33,6 +33,6 @@ Allocator::Allocator(uint32_t capacity)
VERBOSE(ALLOC) << "base pointer: " << static_cast<void *>(_base.get()) << std::endl;
}
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/cpu_common/BackendContextHelpers.cc b/runtime/onert/core/src/backend/basic/BackendContextHelpers.cc
index 732b03ce8..c02cc0cf2 100644
--- a/runtime/onert/core/src/backend/cpu_common/BackendContextHelpers.cc
+++ b/runtime/onert/core/src/backend/basic/BackendContextHelpers.cc
@@ -14,4 +14,4 @@
* limitations under the License.
*/
-#include "backend/cpu_common/BackendContextHelpers.h"
+#include "backend/basic/BackendContextHelpers.h"
diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/basic/DynamicTensorManager.cc
index 740248ccd..07bcb09ee 100644
--- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
+++ b/runtime/onert/core/src/backend/basic/DynamicTensorManager.cc
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "backend/cpu_common/DynamicTensorManager.h"
+#include "backend/basic/DynamicTensorManager.h"
#include "util/logging.h"
#include "misc/polymorphic_downcast.h"
@@ -23,11 +23,11 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<TensorRegistry> &reg)
- : _dynamic_mem_mgr{new DynamicMemoryManager()}, _tensors{reg}
+ : _dynamic_mem_mgr{new DynamicMemoryManager()}, _tensors{reg}
{
// DO NOTHING
}
@@ -41,33 +41,6 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
_tensors->setNativeTensor(ind, std::move(tensor));
}
-void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, backend::ITensor *tensor)
-{
- _dealloc_tensor_map[op_ind].emplace(tensor);
-}
-
-void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
-{
- auto find = _dealloc_tensor_map.find(op_ind);
- if (find == _dealloc_tensor_map.end())
- return;
-
- auto &input_set = find->second;
- for (auto *tensor : input_set)
- {
- if (!tensor->is_dynamic())
- continue;
-
- _dynamic_mem_mgr->deallocate(tensor);
-
- auto *cpu_tensor = nnfw::misc::polymorphic_downcast<cpu_common::Tensor *>(tensor);
- cpu_tensor->resetBuffer();
-
- VERBOSE(DynamicTensorManager) << "Deallocating tensor " << (void *)cpu_tensor
- << " (input of op_ind: " << op_ind.value() << ")" << std::endl;
- }
-}
-
const ITensor *DynamicTensorManager::getRawITensor(ir::OperandIndex ind)
{
auto ptr = _tensors->getITensor(ind);
@@ -75,6 +48,6 @@ const ITensor *DynamicTensorManager::getRawITensor(ir::OperandIndex ind)
return ptr;
}
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc b/runtime/onert/core/src/backend/basic/MemoryManager.cc
index 9f179d9ee..c468ee458 100644
--- a/runtime/onert/core/src/backend/cpu_common/MemoryManager.cc
+++ b/runtime/onert/core/src/backend/basic/MemoryManager.cc
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include <backend/cpu_common/MemoryManager.h>
+#include <backend/basic/MemoryManager.h>
#include <cassert>
@@ -26,7 +26,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
MemoryManager::MemoryManager() : _mem_planner{createMemoryPlanner()}
@@ -35,20 +35,20 @@ MemoryManager::MemoryManager() : _mem_planner{createMemoryPlanner()}
}
MemoryManager::MemoryManager(const std::string planner_id)
- : _mem_planner{createMemoryPlanner(planner_id)}
+ : _mem_planner{createMemoryPlanner(planner_id)}
{
// DO NOTHING
}
-cpu_common::IMemoryPlanner *MemoryManager::createMemoryPlanner()
+basic::IMemoryPlanner *MemoryManager::createMemoryPlanner()
{
auto planner_id = util::getConfigString(util::config::CPU_MEMORY_PLANNER);
- return cpu_common::MemoryPlannerFactory::get().create(planner_id);
+ return basic::MemoryPlannerFactory::get().create(planner_id);
}
-cpu_common::IMemoryPlanner *MemoryManager::createMemoryPlanner(const std::string planner_id)
+basic::IMemoryPlanner *MemoryManager::createMemoryPlanner(const std::string planner_id)
{
- return cpu_common::MemoryPlannerFactory::get().create(planner_id);
+ return basic::MemoryPlannerFactory::get().create(planner_id);
}
void MemoryManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
@@ -60,7 +60,7 @@ void MemoryManager::releasePlan(const ir::OperandIndex &ind) { _mem_planner->rel
void MemoryManager::allocate(void)
{
- _mem_alloc = std::make_shared<cpu_common::Allocator>(_mem_planner->capacity());
+ _mem_alloc = std::make_shared<basic::Allocator>(_mem_planner->capacity());
assert(_mem_alloc->base());
}
@@ -71,14 +71,14 @@ uint8_t *MemoryManager::getBuffer(const ir::OperandIndex &ind) const
return _mem_alloc->base() + mem_blk.offset;
}
-std::shared_ptr<cpu_common::Allocator> DynamicMemoryManager::allocate(const ITensor *tensor,
- uint32_t capacity)
+std::shared_ptr<basic::Allocator> DynamicMemoryManager::allocate(const ITensor *tensor,
+ uint32_t capacity)
{
auto find = _mem_alloc_map.find(tensor);
if (find != _mem_alloc_map.end())
throw std::runtime_error("Cannot allocate memory for a tensor. It was already allocated.");
- _mem_alloc_map[tensor] = std::make_shared<cpu_common::Allocator>(capacity);
+ _mem_alloc_map[tensor] = std::make_shared<basic::Allocator>(capacity);
return _mem_alloc_map[tensor];
}
@@ -103,6 +103,6 @@ void DynamicMemoryManager::deallocate(void)
_mem_alloc_map.clear();
}
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryPlanner.cc b/runtime/onert/core/src/backend/basic/MemoryPlanner.cc
index 01cd1a0fe..1fda57b3d 100644
--- a/runtime/onert/core/src/backend/cpu_common/MemoryPlanner.cc
+++ b/runtime/onert/core/src/backend/basic/MemoryPlanner.cc
@@ -22,7 +22,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
void BumpPlanner::claim(const ir::OperandIndex &ind, size_t size)
@@ -31,13 +31,12 @@ void BumpPlanner::claim(const ir::OperandIndex &ind, size_t size)
_mem_plans[ind] = blk;
_capacity += size;
- VERBOSE(BP_PLANNER) << "CLAIM(#" << ind.value() << "): " << blk.offset << ", " << blk.size
- << std::endl;
+ VERBOSE(BP_PLANNER) << "CLAIM(" << ind << "): " << blk.offset << ", " << blk.size << std::endl;
}
void BumpPlanner::release(const ir::OperandIndex &ind)
{
- VERBOSE(BP_PLANNER) << "RELEASE(#" << ind.value() << "): "
+ VERBOSE(BP_PLANNER) << "RELEASE(" << ind << "): "
<< "NOTHING does" << std::endl;
}
@@ -77,7 +76,7 @@ void FirstFitPlanner::claim(const ir::OperandIndex &ind, size_t size)
_claim_table[next_offset] = ind;
_mem_plans[ind] = {next_offset, size};
- VERBOSE(FF_PLANNER) << "claim(#" << ind.value() << "): [+" << next_offset << ", " << size << "sz]"
+ VERBOSE(FF_PLANNER) << "claim(" << ind << "): [+" << next_offset << ", " << size << "sz]"
<< std::endl;
if (_capacity < next_offset + size)
@@ -98,7 +97,7 @@ void FirstFitPlanner::release(const ir::OperandIndex &ind)
_claim_table.erase(it);
- VERBOSE(FF_PLANNER) << "release(#" << index << "): [+" << offset << ", " << size << "sz]"
+ VERBOSE(FF_PLANNER) << "release(" << index << "): [+" << offset << ", " << size << "sz]"
<< std::endl;
return;
}
@@ -107,8 +106,8 @@ void FirstFitPlanner::release(const ir::OperandIndex &ind)
}
WICPlanner::WICPlanner()
- : _initialized(false), _capacity(0), _mem_plans(), _live_operands(), _interference_graph(),
- _operands()
+ : _initialized(false), _capacity(0), _mem_plans(), _live_operands(), _interference_graph(),
+ _operands()
{
// DO NOTHING
}
@@ -124,13 +123,13 @@ void WICPlanner::claim(const ir::OperandIndex &ind, size_t size)
}
_live_operands.emplace(ind);
- VERBOSE(WIC_PLANNER) << "claim(#" << ind.value() << "): [" << size << "sz]" << std::endl;
+ VERBOSE(WIC_PLANNER) << "claim(" << ind << "): [" << size << "sz]" << std::endl;
}
void WICPlanner::release(const ir::OperandIndex &ind)
{
_live_operands.erase(ind);
- VERBOSE(WIC_PLANNER) << "release(#" << ind.value() << ")" << std::endl;
+ VERBOSE(WIC_PLANNER) << "release(" << ind << ")" << std::endl;
}
/*
@@ -148,7 +147,7 @@ void WICPlanner::buildMemoryPlans()
{
uint32_t size = operand.first;
const ir::OperandIndex &ind = operand.second;
- VERBOSE(WIC_PLANNER) << "build_plan(#" << ind.value() << "): [" << size << "sz]" << std::endl;
+ VERBOSE(WIC_PLANNER) << "build_plan(" << ind << "): [" << size << "sz]" << std::endl;
uint32_t next_offset = 0;
if (_interference_graph.count(ind))
@@ -184,8 +183,8 @@ void WICPlanner::buildMemoryPlans()
}
_mem_plans[ind] = {next_offset, size};
- VERBOSE(WIC_PLANNER) << "alloc(#" << ind.value() << "): [+" << next_offset << ", " << size
- << "sz]" << std::endl;
+ VERBOSE(WIC_PLANNER) << "alloc(" << ind << "): [+" << next_offset << ", " << size << "sz]"
+ << std::endl;
if (_capacity < next_offset + size)
{
@@ -204,6 +203,6 @@ WICPlanner::MemoryPlans &WICPlanner::memory_plans()
return _mem_plans;
}
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryPlanner.h b/runtime/onert/core/src/backend/basic/MemoryPlanner.h
index 7c387e542..661d0b5d9 100644
--- a/runtime/onert/core/src/backend/cpu_common/MemoryPlanner.h
+++ b/runtime/onert/core/src/backend/basic/MemoryPlanner.h
@@ -19,23 +19,23 @@
* @brief This file contains Memory Planning related classes
*/
-#ifndef __ONERT_BACKEND_CPU_COMMON_MEMORY_PLANNER_H__
-#define __ONERT_BACKEND_CPU_COMMON_MEMORY_PLANNER_H__
+#ifndef __ONERT_BACKEND_BASIC_MEMORY_PLANNER_H__
+#define __ONERT_BACKEND_BASIC_MEMORY_PLANNER_H__
#include <map>
#include <vector>
#include <unordered_set>
#include <memory>
-#include "backend/cpu_common/Allocator.h"
-#include "backend/cpu_common/IMemoryPlanner.h"
+#include "backend/basic/Allocator.h"
+#include "backend/basic/IMemoryPlanner.h"
#include "ir/OperandIndexMap.h"
namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
/**
@@ -153,8 +153,8 @@ private:
std::multimap<uint32_t, ir::OperandIndex, std::greater<uint32_t>> _operands;
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CPU_COMMON_MEMORY_PLANNER_H__
+#endif // __ONERT_BACKEND_BASIC_MEMORY_PLANNER_H__
diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryPlanner.test.cc b/runtime/onert/core/src/backend/basic/MemoryPlanner.test.cc
index 5208a94d4..a32228cbe 100644
--- a/runtime/onert/core/src/backend/cpu_common/MemoryPlanner.test.cc
+++ b/runtime/onert/core/src/backend/basic/MemoryPlanner.test.cc
@@ -21,13 +21,13 @@
TEST(Allocator, allocate_test)
{
- ::onert::backend::cpu_common::Allocator allocator(1024);
+ ::onert::backend::basic::Allocator allocator(1024);
ASSERT_NE(allocator.base(), nullptr);
}
TEST(BumpPlanner, claim_test)
{
- ::onert::backend::cpu_common::BumpPlanner planner;
+ ::onert::backend::basic::BumpPlanner planner;
auto claim = [&planner](uint32_t index, size_t size, uint32_t expected_offset) {
onert::ir::OperandIndex mem_idx(index);
@@ -44,7 +44,7 @@ TEST(BumpPlanner, claim_test)
TEST(FirstFitPlanner, claim_release_test)
{
- ::onert::backend::cpu_common::FirstFitPlanner planner;
+ ::onert::backend::basic::FirstFitPlanner planner;
auto claim = [&planner](uint32_t index, size_t size, uint32_t expected_offset) {
onert::ir::OperandIndex mem_idx(index);
@@ -128,7 +128,7 @@ TEST(FirstFitPlanner, claim_release_test)
TEST(WICPlanner, claim_release_test)
{
- ::onert::backend::cpu_common::WICPlanner planner;
+ ::onert::backend::basic::WICPlanner planner;
auto claim = [&planner](uint32_t index, size_t size) {
onert::ir::OperandIndex mem_idx(index);
diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryPlannerFactory.cc b/runtime/onert/core/src/backend/basic/MemoryPlannerFactory.cc
index ead4f3294..e12635359 100644
--- a/runtime/onert/core/src/backend/cpu_common/MemoryPlannerFactory.cc
+++ b/runtime/onert/core/src/backend/basic/MemoryPlannerFactory.cc
@@ -22,7 +22,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
MemoryPlannerFactory &MemoryPlannerFactory::get()
@@ -48,6 +48,6 @@ IMemoryPlanner *MemoryPlannerFactory::create(const std::string &key)
return new FirstFitPlanner; // Default Planner
}
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/cpu_common/MemoryPlannerFactory.h b/runtime/onert/core/src/backend/basic/MemoryPlannerFactory.h
index d14ec13ca..fe32f4c99 100644
--- a/runtime/onert/core/src/backend/cpu_common/MemoryPlannerFactory.h
+++ b/runtime/onert/core/src/backend/basic/MemoryPlannerFactory.h
@@ -14,10 +14,10 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CPU_COMMON_MEMORY_PLANNER_FACTORY_H__
-#define __ONERT_BACKEND_CPU_COMMON_MEMORY_PLANNER_FACTORY_H__
+#ifndef __ONERT_BACKEND_BASIC_MEMORY_PLANNER_FACTORY_H__
+#define __ONERT_BACKEND_BASIC_MEMORY_PLANNER_FACTORY_H__
-#include "backend/cpu_common/IMemoryPlanner.h"
+#include "backend/basic/IMemoryPlanner.h"
#include <string>
@@ -25,7 +25,7 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
class MemoryPlannerFactory
@@ -40,8 +40,8 @@ public:
IMemoryPlanner *create(const std::string &key);
};
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CPU_COMMON_MEMORY_PLANNER_FACTORY_H__
+#endif // __ONERT_BACKEND_BASIC_MEMORY_PLANNER_FACTORY_H__
diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/basic/StaticTensorManager.cc
index 8c5c46a08..d891814fa 100644
--- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
+++ b/runtime/onert/core/src/backend/basic/StaticTensorManager.cc
@@ -14,23 +14,23 @@
* limitations under the License.
*/
-#include "backend/cpu_common/StaticTensorManager.h"
+#include "backend/basic/StaticTensorManager.h"
-#include "backend/cpu_common/DynamicTensorManager.h"
-#include "backend/cpu_common/Tensor.h"
+#include "backend/basic/DynamicTensorManager.h"
+#include "backend/basic/Tensor.h"
#include <util/logging.h>
namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
StaticTensorManager::StaticTensorManager(const std::shared_ptr<TensorRegistry> &reg,
DynamicTensorManager *dynamic_tensor_manager)
- : _nonconst_mgr{new MemoryManager()}, _tensors{reg},
- _dynamic_tensor_manager{dynamic_tensor_manager}
+ : _nonconst_mgr{new MemoryManager()}, _tensors{reg}, _dynamic_tensor_manager{
+ dynamic_tensor_manager}
{
// DO NOTHING
}
@@ -48,8 +48,8 @@ void StaticTensorManager::allocateNonconsts(void)
auto *buffer = _nonconst_mgr->getBuffer(ind);
tensor->setBuffer(buffer);
- VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value()
- << "): " << static_cast<void *>(buffer) << std::endl;
+ VERBOSE(CPU_StaticTensorManager)
+ << "TENSOR " << ind << " : " << static_cast<void *>(buffer) << std::endl;
}
}
}
@@ -103,6 +103,6 @@ void StaticTensorManager::iterate(const std::function<void(const ir::OperandInde
fn(it.first);
}
-} // namespace cpu_common
+} // namespace basic
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/cpu_common/Tensor.cc b/runtime/onert/core/src/backend/basic/Tensor.cc
index e412cb775..c2bbc5a66 100644
--- a/runtime/onert/core/src/backend/cpu_common/Tensor.cc
+++ b/runtime/onert/core/src/backend/basic/Tensor.cc
@@ -14,28 +14,30 @@
* limitations under the License.
*/
-#include "backend/cpu_common/Tensor.h"
+#include "backend/basic/Tensor.h"
#include "ir/DataType.h"
-#include "backend/cpu_common/MemoryManager.h"
+#include "backend/basic/MemoryManager.h"
namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
Tensor::~Tensor() {}
size_t Tensor::calcOffset(const ir::Coordinates &coords) const
{
- size_t rank = num_dimensions();
+ auto shape = getShape();
+ size_t rank = shape.rank();
rank = rank == 0 ? 1 : rank;
size_t offset = 0;
for (size_t i = 0; i < rank; ++i)
{
- offset = offset * dimension(i) + coords[i];
+ auto dim = shape.rank() == 0 ? 1 : shape.dim(i);
+ offset = offset * dim + coords[i];
}
offset *= sizeOfDataType(data_type());
return offset;
@@ -47,31 +49,19 @@ bool Tensor::applyShape(const ir::Shape &new_shape)
{
bool previously_dynamic = is_dynamic();
- auto allocTensorMem = [&](bool overwrite = false) {
+ auto allocTensorMem = [&]() {
auto capacity = total_size();
auto alloc = _dynamic_mem_mgr->allocate(this, capacity);
-
- if (overwrite)
- overwriteBuffer(alloc);
- else
- setBuffer(alloc);
+ setBuffer(alloc);
};
- if (!previously_dynamic)
- {
- // TODO deallocate tensor->buffer()
- // issue is that staticTensorManager might have allocate this memory
- setShape(new_shape);
- set_dynamic();
- allocTensorMem(true);
- }
- else if (buffer() == nullptr)
+ if (!previously_dynamic || buffer() == nullptr)
{
+ // Always set shape - when a buffer of the same size was already allocated, the shape could differ
setShape(new_shape);
set_dynamic();
allocTensorMem();
}
- // when buffer was already allocated and new_shape requires different size
else
{
auto previous_size = total_size();
@@ -82,7 +72,7 @@ bool Tensor::applyShape(const ir::Shape &new_shape)
setShape(new_shape);
set_dynamic();
- allocTensorMem(true);
+ allocTensorMem();
}
else
{ // when buffer with same size was already allocated, shape could differ
@@ -92,7 +82,22 @@ bool Tensor::applyShape(const ir::Shape &new_shape)
return true;
}
-} // namespace cpu_common
+ir::Shape Tensor::getShape() const { return _info.shape(); }
+
+void Tensor::deallocBuffer()
+{
+ if (_allocator)
+ {
+ _buffer = nullptr;
+ _allocator.reset();
+ if (_dynamic_mem_mgr)
+ {
+ _dynamic_mem_mgr->deallocate(this);
+ }
+ }
+}
+
+} // namespace basic
} // namespace backend
} // namespace onert
@@ -102,13 +107,13 @@ namespace onert
{
namespace backend
{
-namespace cpu_common
+namespace basic
{
// `dynamic_cast` does not work across library boundaries on NDK
// With this as a key function, `dynamic_cast` works across dynamic library (dl) boundaries
ExternalTensor::~ExternalTensor() {}
-} // namespace cpu
+} // namespace basic
} // namespace backend
} // namespace onert
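
The rewritten calcOffset above folds coordinates into a row-major element offset and then scales by the element size. The same arithmetic in a self-contained form (illustrative names, not the runtime's API):

#include <cstddef>
#include <vector>

// Row-major offset: for shape {2, 3, 4} and coords {1, 2, 3},
// offset = ((1) * 3 + 2) * 4 + 3 = 23 elements, times the element size.
std::size_t row_major_offset(const std::vector<std::size_t> &shape,
                             const std::vector<std::size_t> &coords,
                             std::size_t elem_size)
{
  // Rank-0 (scalar) tensors are treated as rank-1 with extent 1, as in the diff.
  const std::size_t rank = shape.empty() ? 1 : shape.size();
  std::size_t offset = 0;
  for (std::size_t i = 0; i < rank; ++i)
  {
    const std::size_t dim = shape.empty() ? 1 : shape[i];
    const std::size_t coord = coords.empty() ? 0 : coords[i];
    offset = offset * dim + coord;
  }
  return offset * elem_size;
}
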
diff --git a/runtime/onert/backend/ruy/TensorBuilder.cc b/runtime/onert/core/src/backend/basic/TensorBuilder.cc
index c77defc30..a10cc2bf9 100644
--- a/runtime/onert/backend/ruy/TensorBuilder.cc
+++ b/runtime/onert/core/src/backend/basic/TensorBuilder.cc
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "TensorBuilder.h"
+#include <backend/basic/TensorBuilder.h>
#include <util/logging.h>
@@ -24,13 +24,12 @@ namespace onert
{
namespace backend
{
-namespace ruy
+namespace basic
{
-TensorBuilder::TensorBuilder(const std::shared_ptr<cpu_common::TensorRegistry> &tensor_reg)
- : _tensor_reg{tensor_reg},
- _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)},
- _static_tensor_mgr{new StaticTensorManager(_tensor_reg, _dynamic_tensor_mgr.get())}
+TensorBuilder::TensorBuilder(const std::shared_ptr<TensorRegistry> &tensor_reg)
+ : _tensor_reg{tensor_reg}, _dynamic_tensor_mgr{new DynamicTensorManager(_tensor_reg)},
+ _static_tensor_mgr{new StaticTensorManager(_tensor_reg, _dynamic_tensor_mgr.get())}
{
/* empty */
}
@@ -77,14 +76,8 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const
return _tensor_info_map.find(ind) != _tensor_info_map.end();
}
-void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); }
+void TensorBuilder::allocate(void) { _static_tensor_mgr->allocateNonconsts(); }
-void TensorBuilder::allocate()
-{
- // NOTE For now nothing to do. Allocation is done in prepare stage, which is not appropriate
- // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation.
-}
-
-} // namespace ruy
+} // namespace basic
} // namespace backend
} // namespace onert
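
This move also collapses the old two-phase prepare()/allocate() pair into a single allocate(), since the prepare-time allocation workaround described in the deleted comment is no longer needed. A sketch of the resulting builder lifecycle — the notifyFirstUse/notifyLastUse steps are taken from elsewhere in this patch, so treat the exact sequence as illustrative:

// Hypothetical driver showing the post-patch basic::TensorBuilder flow.
void build_tensors_sketch(onert::backend::basic::TensorBuilder &builder,
                          const onert::ir::OperandIndex &ind,
                          const onert::ir::OperandInfo &info)
{
  // 1. Register operand metadata (static tensors go to the memory planner).
  builder.registerTensorInfo(ind, info, onert::ir::Layout::NHWC);

  // 2. Liveness notifications let the planner compute offsets.
  builder.notifyFirstUse(ind);
  builder.notifyLastUse(ind);

  // 3. Single allocation step; the former prepare() no-op stage is gone.
  builder.allocate();
}
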
diff --git a/runtime/onert/core/src/backend/controlflow/Backend.h b/runtime/onert/core/src/backend/builtin/Backend.h
index 3323cf5cb..3791f3ffa 100644
--- a/runtime/onert/core/src/backend/controlflow/Backend.h
+++ b/runtime/onert/core/src/backend/builtin/Backend.h
@@ -14,12 +14,11 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_BACKEND_H__
-#define __ONERT_BACKEND_CONTROLFLOW_BACKEND_H__
+#ifndef __ONERT_BACKEND_BUILTIN_BACKEND_H__
+#define __ONERT_BACKEND_BUILTIN_BACKEND_H__
#include "BackendContext.h"
#include "Config.h"
-#include "ConstantInitializer.h"
#include "KernelGenerator.h"
#include "TensorBuilder.h"
#include "Tensor.h"
@@ -32,7 +31,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
class Backend : public ::onert::backend::Backend
@@ -42,12 +41,9 @@ public:
std::shared_ptr<IConfig> config() const override { return _config; }
- std::unique_ptr<onert::backend::BackendContext>
- newContext(const ir::Graph &graph, const std::shared_ptr<custom::IKernelBuilder> &,
- bool) const override
+ std::unique_ptr<onert::backend::BackendContext> newContext(ContextData &&data) const override
{
- const auto &operands = graph.operands();
- auto context = std::make_unique<BackendContext>(this, &graph);
+ auto context = std::make_unique<BackendContext>(this, std::move(data));
// ControlFlow backend may not build tensors for itself because the backend's operation uses
// tensors of other backends instead
// But the backend builds tensors in case the controlflow operation may have constant
@@ -69,9 +65,8 @@ public:
auto tb = std::make_shared<TensorBuilder>(tr);
context->tensor_registry = tr;
context->tensor_builder = tb;
- context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tr);
- context->kernel_gen = std::make_shared<KernelGenerator>(graph, tb->dynamicTensorManager(), tr,
- context->external_context());
+ context->kernel_gen = std::make_shared<KernelGenerator>(
+ *context->graph(), tb->dynamicTensorManager(), tr, context->external_context());
return context;
}
@@ -79,8 +74,8 @@ private:
std::shared_ptr<IConfig> _config;
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_BACKEND_H__
+#endif // __ONERT_BACKEND_BUILTIN_BACKEND_H__
diff --git a/runtime/onert/core/src/backend/builtin/BackendContext.cc b/runtime/onert/core/src/backend/builtin/BackendContext.cc
new file mode 100644
index 000000000..8a6cddcfb
--- /dev/null
+++ b/runtime/onert/core/src/backend/builtin/BackendContext.cc
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BackendContext.h"
+
+#include "KernelGenerator.h"
+#include "backend/basic/BackendContextHelpers.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace builtin
+{
+
+ITensorRegistry *BackendContext::genTensors() { return basic::genTensors(*this); }
+
+FunctionMap BackendContext::genKernels()
+{
+ FunctionMap ret;
+
+ for (auto op_ind : _data.op_order)
+ {
+ auto fn_seq = kernel_gen->generate(op_ind);
+ ret.emplace_back(op_ind, std::move(fn_seq));
+ }
+
+ basic::initConsts(*this);
+
+ // NOTE For memory optimization, we want to free some operand data
+ const_cast<ir::Graph *>(graph())->operands().iterate(
+ [&](const ir::OperandIndex &, ir::Operand &obj) { obj.releaseData(); });
+
+ for (auto &it : ret)
+ {
+ auto &fn_seq = it.second;
+ fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); });
+ }
+
+ return ret;
+}
+
+} // namespace builtin
+} // namespace backend
+} // namespace onert
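
genKernels() above returns one prepared FunctionSequence per operation, keyed by OperationIndex and ordered by _data.op_order. A sketch of how a linear executor might consume the result, assuming FunctionSequence is runnable through the IFunction interface its prepare() loop implies:

void run_function_map_sketch(onert::backend::FunctionMap &fmap)
{
  // FunctionMap is built with emplace_back, so iteration preserves op_order.
  for (auto &entry : fmap)
  {
    auto &fn_seq = entry.second; // std::unique_ptr<exec::FunctionSequence>
    fn_seq->run();               // honors the dynamic tensor ctx set in generate()
  }
}
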
diff --git a/runtime/onert/core/src/backend/controlflow/BackendContext.h b/runtime/onert/core/src/backend/builtin/BackendContext.h
index a768d5d61..93e825239 100644
--- a/runtime/onert/core/src/backend/controlflow/BackendContext.h
+++ b/runtime/onert/core/src/backend/builtin/BackendContext.h
@@ -14,12 +14,11 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_BACKEND_CONTEXT_H__
-#define __ONERT_BACKEND_CONTROLFLOW_BACKEND_CONTEXT_H__
+#ifndef __ONERT_BACKEND_BUILTIN_BACKEND_CONTEXT_H__
+#define __ONERT_BACKEND_BUILTIN_BACKEND_CONTEXT_H__
#include <backend/BackendContext.h>
#include "TensorBuilder.h"
-#include "ConstantInitializer.h"
#include "KernelGenerator.h"
#include "ExternalContext.h"
@@ -27,41 +26,35 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
class BackendContext : public onert::backend::BackendContext
{
public:
- BackendContext(const Backend *backend, const ir::Graph *graph,
+ BackendContext(const Backend *backend, ContextData &&data,
std::shared_ptr<ITensorRegistry> tensor_registry = nullptr,
std::shared_ptr<TensorBuilder> tensor_builder = nullptr,
- std::shared_ptr<ConstantInitializer> constant_initializer = nullptr,
std::shared_ptr<KernelGenerator> kernel_gen = nullptr)
- : onert::backend::BackendContext(backend, graph, tensor_registry),
- tensor_builder{tensor_builder}, constant_initializer{constant_initializer},
- kernel_gen{kernel_gen}, _external_context(std::make_shared<ExternalContext>())
+ : onert::backend::BackendContext(backend, std::move(data), tensor_registry),
+ tensor_builder{tensor_builder}, kernel_gen{kernel_gen},
+ _external_context(std::make_shared<ExternalContext>())
{
}
- ITensorRegistry *genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info) override;
+ ITensorRegistry *genTensors() override;
- FunctionMap genKernels(const std::vector<ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs) override;
+ FunctionMap genKernels() override;
std::shared_ptr<ExternalContext> external_context() { return _external_context; }
private:
- void initConsts();
- void planTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs, const ir::LowerInfoMap &lower_info);
+ void planTensors(const std::vector<onert::ir::OperationIndex> &order,
+ const compiler::GraphLowerInfo &lower_info);
public:
// TODO Make it private
std::shared_ptr<TensorBuilder> tensor_builder;
- std::shared_ptr<ConstantInitializer> constant_initializer;
std::shared_ptr<KernelGenerator> kernel_gen;
private:
@@ -71,8 +64,8 @@ private:
std::shared_ptr<ExternalContext> _external_context;
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_BACKEND_CONTEXT_H__
+#endif // __ONERT_BACKEND_BUILTIN_BACKEND_CONTEXT_H__
diff --git a/runtime/onert/core/src/backend/controlflow/Config.cc b/runtime/onert/core/src/backend/builtin/Config.cc
index 5ec01fe11..f792c0c36 100644
--- a/runtime/onert/core/src/backend/controlflow/Config.cc
+++ b/runtime/onert/core/src/backend/builtin/Config.cc
@@ -20,10 +20,10 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
-std::string Config::ID = "controlflow";
+std::string Config::ID = "builtin";
bool Config::initialize() { return true; }
@@ -32,6 +32,6 @@ ir::Layout Config::supportLayout(const ir::Operation &, ir::Layout frontend_layo
return frontend_layout;
}
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/controlflow/Config.h b/runtime/onert/core/src/backend/builtin/Config.h
index 6645ed59d..5226eba69 100644
--- a/runtime/onert/core/src/backend/controlflow/Config.h
+++ b/runtime/onert/core/src/backend/builtin/Config.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_CONFIG_H__
-#define __ONERT_BACKEND_CONTROLFLOW_CONFIG_H__
+#ifndef __ONERT_BACKEND_BUILTIN_CONFIG_H__
+#define __ONERT_BACKEND_BUILTIN_CONFIG_H__
#include <backend/IConfig.h>
#include <memory>
@@ -25,7 +25,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
class Config : public IConfig
@@ -46,8 +46,8 @@ public:
std::unique_ptr<util::ITimer> timer() override { return std::make_unique<util::CPUTimer>(); }
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_CONFIG_H__
+#endif // __ONERT_BACKEND_BUILTIN_CONFIG_H__
diff --git a/runtime/onert/backend/ruy/ConstantInitializer.h b/runtime/onert/core/src/backend/builtin/ConstantInitializer.h
index 24b4d924d..6b8eb3e9d 100644
--- a/runtime/onert/backend/ruy/ConstantInitializer.h
+++ b/runtime/onert/core/src/backend/builtin/ConstantInitializer.h
@@ -14,22 +14,22 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_RUY_CONSTANT_INITIALIZER_H__
-#define __ONERT_BACKEND_RUY_CONSTANT_INITIALIZER_H__
+#ifndef __ONERT_COMPILER_BUILTIN_CONSTANT_INITIALIZER_H__
+#define __ONERT_COMPILER_BUILTIN_CONSTANT_INITIALIZER_H__
-#include <backend/cpu_common/ConstantInitializer.h>
+#include <backend/basic/ConstantInitializer.h>
namespace onert
{
namespace backend
{
-namespace ruy
+namespace builtin
{
-using ConstantInitializer = cpu_common::ConstantInitializer;
+using ConstantInitializer = basic::ConstantInitializer;
-} // namespace ruy
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_RUY_CONSTANT_INITIALIZER_H__
+#endif // __ONERT_COMPILER_BUILTIN_CONSTANT_INITIALIZER_H__
diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h b/runtime/onert/core/src/backend/builtin/DynamicTensorManager.h
index c962d6ef1..148948a9c 100644
--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h
+++ b/runtime/onert/core/src/backend/builtin/DynamicTensorManager.h
@@ -14,25 +14,25 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_DYNAMICTENSOR_MANAGER_H__
-#define __ONERT_BACKEND_CONTROLFLOW_DYNAMICTENSOR_MANAGER_H__
+#ifndef __ONERT_BACKEND_BUILTIN_DYNAMICTENSOR_MANAGER_H__
+#define __ONERT_BACKEND_BUILTIN_DYNAMICTENSOR_MANAGER_H__
#include "TensorRegistry.h"
#include "Tensor.h"
-#include <backend/cpu_common/DynamicTensorManager.h>
+#include <backend/basic/DynamicTensorManager.h>
namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
-using DynamicTensorManager = cpu_common::DynamicTensorManager;
+using DynamicTensorManager = basic::DynamicTensorManager;
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_DYNAMICTENSOR_MANAGER_H__
+#endif // __ONERT_BACKEND_BUILTIN_DYNAMICTENSOR_MANAGER_H__
diff --git a/runtime/onert/core/src/backend/controlflow/ExternalContext.h b/runtime/onert/core/src/backend/builtin/ExternalContext.h
index cfb983136..e67be988d 100644
--- a/runtime/onert/core/src/backend/controlflow/ExternalContext.h
+++ b/runtime/onert/core/src/backend/builtin/ExternalContext.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__
-#define __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__
+#ifndef __ONERT_BACKEND_BUILTIN_EXTERNAL_CONTEXT_H__
+#define __ONERT_BACKEND_BUILTIN_EXTERNAL_CONTEXT_H__
#include <util/ConfigSource.h>
@@ -24,21 +24,19 @@
#include <ruy/ctx.h>
#include <ruy/tune.h>
-namespace
-{
-const int kDefaultNumThreadpoolThreads = 1;
-}
-
namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
// TODO Unify this with cpu::ExternalContext
class ExternalContext
{
+private:
+ static const int kDefaultNumThreadpoolThreads = 1;
+
public:
ExternalContext() : _ruy_context(std::make_unique<ruy::Context>())
{
@@ -49,7 +47,7 @@ public:
void setMaxNumThreads(int max_num_threads)
{
const int target_num_threads =
- max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
+ max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
_ruy_context->set_max_num_threads(target_num_threads);
}
@@ -72,8 +70,8 @@ private:
const std::unique_ptr<ruy::Context> _ruy_context;
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_EXTERNAL_CONTEXT_H__
+#endif // __ONERT_BACKEND_BUILTIN_EXTERNAL_CONTEXT_H__
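
One small but deliberate change above: kDefaultNumThreadpoolThreads moves from an unnamed namespace in the header into the class. An unnamed namespace in a header gives every including translation unit its own distinct copy (and invites unused-variable warnings); a class-scope static const int has one well-defined home. The pattern in isolation:

// Before: each TU including the header gets its own internal constant.
//   namespace { const int kDefaultNumThreadpoolThreads = 1; }
//
// After: one logical constant, scoped to its only user.
class ExternalContextSketch
{
private:
  static const int kDefaultNumThreadpoolThreads = 1; // in-class init is fine for const int

public:
  static int effective_threads(int requested)
  {
    return requested > -1 ? requested : kDefaultNumThreadpoolThreads;
  }
};
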
diff --git a/runtime/onert/core/src/backend/controlflow/IOTensor.cc b/runtime/onert/core/src/backend/builtin/IOTensor.cc
index 47405ac9e..f7f4a6977 100644
--- a/runtime/onert/core/src/backend/controlflow/IOTensor.cc
+++ b/runtime/onert/core/src/backend/builtin/IOTensor.cc
@@ -22,11 +22,15 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
+// `dynamic_cast` does not work across library boundaries on NDK
+// With this as a key function, `dynamic_cast` works across dynamic library (dl) boundaries
+IOTensor::~IOTensor() {}
+
IOTensor::IOTensor(const ir::OperandInfo &info, ir::Layout layout)
- : IPortableTensor{info}, _orig_info{info}, _orig_layout{layout}
+ : IPortableTensor{info}, _orig_info{info}, _orig_layout{layout}
{
setUserTensor(nullptr, 0);
}
@@ -47,6 +51,6 @@ void IOTensor::setUserTensor(uint8_t *buffer, size_t size)
_tensor = _user_tensor.get();
}
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
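
The new out-of-line destructor is the "key function" the comment refers to: defining the first non-inline virtual member in exactly one .cc file makes the compiler emit the vtable and type_info in that translation unit alone. Without it, every shared library instantiates its own copies, and dynamic_cast/typeid comparisons can fail across dlopen boundaries on the Android NDK. The idiom in isolation:

// header (sketch):
struct BaseSketch
{
  virtual ~BaseSketch(); // declared but NOT defined inline
  virtual void run() {}
};

// exactly one source file:
BaseSketch::~BaseSketch() {} // key function: vtable + type_info emitted here only
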
diff --git a/runtime/onert/core/src/backend/controlflow/IOTensor.h b/runtime/onert/core/src/backend/builtin/IOTensor.h
index a7ed84b6d..a1b2064a1 100644
--- a/runtime/onert/core/src/backend/controlflow/IOTensor.h
+++ b/runtime/onert/core/src/backend/builtin/IOTensor.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_IO_TENSOR_H__
-#define __ONERT_BACKEND_CONTROLFLOW_IO_TENSOR_H__
+#ifndef __ONERT_BACKEND_BUILTIN_IO_TENSOR_H__
+#define __ONERT_BACKEND_BUILTIN_IO_TENSOR_H__
#include "backend/IPortableTensor.h"
#include "UserTensor.h"
@@ -24,7 +24,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
/**
@@ -42,6 +42,7 @@ class IOTensor : public IPortableTensor
{
public:
IOTensor(const ir::OperandInfo &info, ir::Layout layout);
+ ~IOTensor();
public:
void setTensor(IPortableTensor *tensor);
@@ -52,17 +53,16 @@ public:
public:
uint8_t *buffer() const override { return _tensor->buffer(); }
size_t total_size() const override { return _tensor->total_size(); }
- size_t dimension(size_t index) const override { return _tensor->dimension(index); }
- size_t num_dimensions() const override { return _tensor->num_dimensions(); }
size_t calcOffset(const ir::Coordinates &coords) const override
{
return _tensor->calcOffset(coords);
}
ir::Layout layout() const override { return _tensor->layout(); }
ir::DataType data_type() const override { return _tensor->data_type(); }
- float data_scale() const override { return _tensor->data_scale(); }
- int32_t data_offset() const override { return _tensor->data_offset(); }
- bool is_dynamic() const override { return _is_dynamic || (_tensor && _tensor->is_dynamic()); }
+ bool is_dynamic() const override
+ {
+ return _is_dynamic || _orig_info.isDynamic() || (_tensor && _tensor->is_dynamic());
+ }
void set_dynamic() override { _is_dynamic = true; }
ir::Shape getShape() const override { return _tensor->getShape(); }
void setShape(const ir::Shape &shape) override
@@ -79,6 +79,9 @@ public:
return _tensor->applyShape(shape);
}
+public:
+ void setShapeOfIPortableTensor(const ir::Shape &shape) { _info.shape(shape); }
+
private:
const ir::OperandInfo _orig_info;
const ir::Layout _orig_layout;
@@ -87,8 +90,8 @@ private:
std::unique_ptr<UserTensor> _user_tensor; //< If it is a user tensor, it is managed by this object
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_IO_TENSOR_H__
+#endif // __ONERT_BACKEND_BUILTIN_IO_TENSOR_H__
diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/builtin/KernelGenerator.cc
index 2606f044e..3d6358d9d 100644
--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
+++ b/runtime/onert/core/src/backend/builtin/KernelGenerator.cc
@@ -28,48 +28,47 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
KernelGenerator::KernelGenerator(const ir::Graph &graph, DynamicTensorManager *dyn_tensor_manager,
const std::shared_ptr<TensorRegistry> &tensor_reg,
const std::shared_ptr<ExternalContext> &external_context)
- : _graph{graph}, _dyn_tensor_manager{dyn_tensor_manager}, _tensor_reg{tensor_reg},
- _tensor_registries{}, _executor_map{nullptr}, _external_context{external_context}
+ : basic::KernelGeneratorBase{graph}, _dyn_tensor_manager{dyn_tensor_manager},
+ _tensor_reg{tensor_reg}, _tensor_registries{}, _executor_map{nullptr}, _external_context{
+ external_context}
{
UNUSED_RELEASE(_graph);
UNUSED_RELEASE(_tensor_registries);
UNUSED_RELEASE(_executor_map);
}
-void KernelGenerator::visit(const ir::OpSequence &op_seq)
+std::unique_ptr<exec::FunctionSequence> KernelGenerator::generate(ir::OperationIndex ind)
{
- assert(!_return_fn_seq);
assert(_dyn_tensor_manager);
assert(_tensor_reg);
auto dyn_shape_inferer =
- std::make_unique<exec::DynamicShapeInferer>(_graph.operands(), _tensor_reg);
+ std::make_unique<exec::DynamicShapeInferer>(_graph.operands(), _tensor_reg);
- _return_fn_seq = std::make_unique<exec::FunctionSequence>();
+ auto ret = std::make_unique<exec::FunctionSequence>();
// Prepare to handle dynamic tensors later
auto dyn_ctx = std::make_shared<exec::FunctionSequence::DynamicTensorCtx>();
{
- dyn_ctx->op_seq = &op_seq;
+ dyn_ctx->op_ind = ind;
dyn_ctx->operations = &_graph.operations();
dyn_ctx->dynamic_shape_inferer = std::move(dyn_shape_inferer);
- dyn_ctx->dynamic_tensor_manager = _dyn_tensor_manager;
- _return_fn_seq->dynamic_tensor_ctx(dyn_ctx);
+ ret->dynamic_tensor_ctx(dyn_ctx);
}
- for (const auto &op_idx : op_seq.operations())
- {
- const auto &node = _graph.operations().at(op_idx);
- node.accept(*this);
- _return_fn_seq->append(releaseFunction());
- }
+ auto &op = _graph.operations().at(ind);
+ op.accept(*this);
+ assert(_return_fn); // _return_fn must have been generated
+ ret->append(std::move(_return_fn));
+
+ return ret;
}
void KernelGenerator::visit(const ir::operation::If &node)
@@ -95,9 +94,9 @@ void KernelGenerator::visit(const ir::operation::If &node)
// creating executors recursively
const auto cond_tensor = input_tensors.front();
input_tensors.erase(input_tensors.begin());
- auto fn = std::make_unique<::onert::backend::controlflow::kernel::IfLayer>(
- cond_tensor, input_tensors, output_tensors, then_subg_index, else_subg_index, _executor_map,
- _external_context);
+ auto fn = std::make_unique<::onert::backend::builtin::kernel::IfLayer>(
+ cond_tensor, input_tensors, output_tensors, then_subg_index, else_subg_index, _executor_map,
+ _external_context);
_return_fn = std::move(fn);
}
@@ -112,7 +111,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
std::vector<ITensor *> input_tensors{getTensor(input_index)};
auto fn =
- std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors, _external_context);
+ std::make_unique<kernel::PermuteLayer>(input_tensors, output_tensors, _external_context);
_return_fn = std::move(fn);
}
@@ -121,7 +120,7 @@ void KernelGenerator::visit(const ir::operation::While &node)
const auto cond_subg_index = node.param().cond_subg_index;
const auto body_subg_index = node.param().body_subg_index;
- // This op does not support input as a constant, because controlflow backend does not have
+ // This op does not support input as a constant, because the builtin backend does not have
// TensorBuilder
std::vector<backend::IPortableTensor *> input_tensors;
for (const auto input_index : node.getInputs())
@@ -139,9 +138,9 @@ void KernelGenerator::visit(const ir::operation::While &node)
// WhileLayer just sets ExecutorMap instead of cond and body executors to avoid the complexity of
// creating executors recursively
- auto fn = std::make_unique<::onert::backend::controlflow::kernel::WhileLayer>(
- input_tensors, output_tensors, cond_subg_index, body_subg_index, _executor_map,
- _dyn_tensor_manager->dynamic_mem_mgr().get(), _external_context);
+ auto fn = std::make_unique<::onert::backend::builtin::kernel::WhileLayer>(
+ input_tensors, output_tensors, cond_subg_index, body_subg_index, _executor_map,
+ _dyn_tensor_manager->dynamic_mem_mgr().get(), _external_context);
_return_fn = std::move(fn);
}
@@ -161,6 +160,6 @@ backend::IPortableTensor *KernelGenerator::getPortableTensor(const ir::OperandIn
return ret;
}
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h b/runtime/onert/core/src/backend/builtin/KernelGenerator.h
index 7b395d186..00ad962b9 100644
--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.h
+++ b/runtime/onert/core/src/backend/builtin/KernelGenerator.h
@@ -14,25 +14,25 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_KERNEL_GENERATOR_H__
-#define __ONERT_BACKEND_CONTROLFLOW_KERNEL_GENERATOR_H__
+#ifndef __ONERT_BACKEND_BUILTIN_KERNEL_GENERATOR_H__
+#define __ONERT_BACKEND_BUILTIN_KERNEL_GENERATOR_H__
-#include <exec/IExecutor.h>
+#include "exec/IExecutor.h"
#include "ExternalContext.h"
-#include <ir/Graph.h>
+#include "ir/Graph.h"
#include "TensorBuilder.h"
#include "compiler/TensorRegistries.h"
-#include "backend/cpu_common/KernelGeneratorBase.h"
+#include "backend/basic/KernelGeneratorBase.h"
#include "TensorRegistry.h"
namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
-class KernelGenerator : public cpu_common::KernelGeneratorBase
+class KernelGenerator : public basic::KernelGeneratorBase
{
public:
KernelGenerator(const ir::Graph &graph, DynamicTensorManager *dyn_tensor_manager,
@@ -49,7 +49,9 @@ public:
_executor_map = executor_map.get();
}
- void visit(const ir::OpSequence &) override;
+ std::unique_ptr<exec::FunctionSequence> generate(ir::OperationIndex ind) override;
+
+private:
void visit(const ir::operation::If &) override;
void visit(const ir::operation::Permute &) override;
void visit(const ir::operation::While &) override;
@@ -59,7 +61,6 @@ private:
backend::IPortableTensor *getPortableTensor(const ir::OperandIndex &index);
private:
- const ir::Graph &_graph;
DynamicTensorManager *_dyn_tensor_manager;
std::shared_ptr<TensorRegistry> _tensor_reg;
compiler::TensorRegistries _tensor_registries;
@@ -67,8 +68,8 @@ private:
const std::shared_ptr<ExternalContext> _external_context;
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_KERNEL_GENERATOR_H__
+#endif // __ONERT_BACKEND_BUILTIN_KERNEL_GENERATOR_H__
diff --git a/runtime/onert/core/include/backend/IStaticTensorManager.h b/runtime/onert/core/src/backend/builtin/Tensor.h
index cef1f8a0a..d55e64161 100644
--- a/runtime/onert/core/include/backend/IStaticTensorManager.h
+++ b/runtime/onert/core/src/backend/builtin/Tensor.h
@@ -14,22 +14,23 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_ISTATICTENSOR_MANAGER_H__
-#define __ONERT_BACKEND_ISTATICTENSOR_MANAGER_H__
+#ifndef __ONERT_BACKEND_BUILTIN_TENSOR_H__
+#define __ONERT_BACKEND_BUILTIN_TENSOR_H__
-#include "ITensorManager.h"
+#include <backend/basic/Tensor.h>
namespace onert
{
namespace backend
{
-
-struct IStaticTensorManager : public ITensorManager
+namespace builtin
{
- virtual ~IStaticTensorManager() = default;
-};
+using Tensor = basic::Tensor;
+using ExternalTensor = basic::ExternalTensor;
+
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_ISTATICTENSOR_MANAGER_H__
+#endif // __ONERT_BACKEND_BUILTIN_TENSOR_H__
diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/builtin/TensorBuilder.cc
index a767f0eca..fefae40d8 100644
--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
+++ b/runtime/onert/core/src/backend/builtin/TensorBuilder.cc
@@ -24,14 +24,13 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
TensorBuilder::TensorBuilder(const std::shared_ptr<TensorRegistry> &tensor_reg)
- : _tensor_reg{tensor_reg},
- _dynamic_tensor_mgr{new DynamicTensorManager(_tensor_reg->base_reg())},
- _static_tensor_mgr{
- new cpu_common::StaticTensorManager(_tensor_reg->base_reg(), _dynamic_tensor_mgr.get())}
+ : _tensor_reg{tensor_reg}, _dynamic_tensor_mgr{new DynamicTensorManager(_tensor_reg->base_reg())},
+ _static_tensor_mgr{
+ new basic::StaticTensorManager(_tensor_reg->base_reg(), _dynamic_tensor_mgr.get())}
{
/* empty */
}
@@ -41,15 +40,14 @@ void TensorBuilder::registerTensorInfo(const ir::OperandIndex &ind, const ir::Op
{
_tensor_info_map.emplace(ind, info);
- _tensor_layout_map.insert({ind, backend_layout});
-
+ VERBOSE_F() << "cpucommon REGISTER!! " << ind << std::endl;
if (info.isDynamic())
{
- _dynamic_tensor_mgr->buildTensor(ind, info, _tensor_layout_map[ind]);
+ _dynamic_tensor_mgr->buildTensor(ind, info, backend_layout);
}
else
{
- _static_tensor_mgr->buildTensor(ind, info, _tensor_layout_map[ind], info.isConstant());
+ _static_tensor_mgr->buildTensor(ind, info, backend_layout, info.isConstant());
}
}
@@ -90,24 +88,18 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const
return _tensor_info_map.find(ind) != _tensor_info_map.end();
}
-void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); }
-
-void TensorBuilder::allocate()
-{
- // NOTE For now nothing to do. Allocation is done in prepare stage, which is not appropriate
- // This is because CPU kernels require `ITensor`s to be allocated before Kernel Generation.
-}
+void TensorBuilder::allocate(void) { _static_tensor_mgr->allocateNonconsts(); }
DynamicTensorManager *TensorBuilder::dynamicTensorManager(void)
{
return _dynamic_tensor_mgr.get();
}
-cpu_common::Tensor *TensorBuilder::nativeOwnTensorAt(const ir::OperandIndex &ind)
+basic::Tensor *TensorBuilder::nativeOwnTensorAt(const ir::OperandIndex &ind)
{
return _tensor_reg->getNativeOwnTensor(ind);
}
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h b/runtime/onert/core/src/backend/builtin/TensorBuilder.h
index d2e3076fd..1e364c927 100644
--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.h
+++ b/runtime/onert/core/src/backend/builtin/TensorBuilder.h
@@ -14,12 +14,12 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_TENSOR_BUILDER_H__
-#define __ONERT_BACKEND_CONTROLFLOW_TENSOR_BUILDER_H__
+#ifndef __ONERT_BACKEND_BUILTIN_TENSOR_BUILDER_H__
+#define __ONERT_BACKEND_BUILTIN_TENSOR_BUILDER_H__
-#include <backend/cpu_common/StaticTensorManager.h>
-#include <backend/cpu_common/TensorRegistry.h>
-#include <backend/cpu_common/Tensor.h>
+#include <backend/basic/StaticTensorManager.h>
+#include <backend/basic/TensorRegistry.h>
+#include <backend/basic/Tensor.h>
#include <ir/OperandIndexMap.h>
@@ -31,7 +31,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
class TensorBuilder
@@ -53,9 +53,7 @@ public:
bool isRegistered(const ir::OperandIndex &) const;
- void prepare(void);
- void allocate();
- void postFunctionPrepare() { /* DO NOTHING */}
+ void allocate(void);
DynamicTensorManager *dynamicTensorManager(void);
@@ -65,18 +63,17 @@ public:
* If not, program will crash with assert or exception.
* @return operand::Tensor *
*/
- cpu_common::Tensor *nativeOwnTensorAt(const ir::OperandIndex &ind);
+ basic::Tensor *nativeOwnTensorAt(const ir::OperandIndex &ind);
private:
const std::shared_ptr<TensorRegistry> _tensor_reg;
std::unique_ptr<DynamicTensorManager> _dynamic_tensor_mgr;
- std::unique_ptr<cpu_common::StaticTensorManager> _static_tensor_mgr;
+ std::unique_ptr<basic::StaticTensorManager> _static_tensor_mgr;
ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
- ir::OperandIndexMap<ir::Layout> _tensor_layout_map;
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_TENSOR_BUILDER_H__
+#endif // __ONERT_BACKEND_BUILTIN_TENSOR_BUILDER_H__
diff --git a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h b/runtime/onert/core/src/backend/builtin/TensorRegistry.h
index 901f0aebb..ae68b1318 100644
--- a/runtime/onert/core/src/backend/controlflow/TensorRegistry.h
+++ b/runtime/onert/core/src/backend/builtin/TensorRegistry.h
@@ -14,10 +14,10 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_TENSOR_REGISTRY_H__
-#define __ONERT_BACKEND_CONTROLFLOW_TENSOR_REGISTRY_H__
+#ifndef __ONERT_BACKEND_BUILTIN_TENSOR_REGISTRY_H__
+#define __ONERT_BACKEND_BUILTIN_TENSOR_REGISTRY_H__
-#include "backend/cpu_common/TensorRegistry.h"
+#include "backend/basic/TensorRegistry.h"
#include "backend/ITensorRegistry.h"
#include "Tensor.h"
#include "IOTensor.h"
@@ -27,27 +27,27 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
/**
- * @brief Tensor registry class for controlflow backend
+ * @brief Tensor registry class for builtin backend
*
* This class contains three types of tensors. Two are native tensors (tensors that are managed by this
* backend) and the other is a migrant tensor.
*
* - NativeIOTensor - @c IOTensor managed by this backend ( in @c _base_reg )
* - NOTE The tensor it actually points to can be from another backend
- * - NativeOwnTensor - @c cpu_common::Tensor managed by this backend ( in @c _base_reg )
+ * - NativeOwnTensor - @c basic::Tensor managed by this backend ( in @c _base_reg )
* - MigrantTensor - @c IPortableTensor managed by other backends
*
- * @note @c _base_reg is used in implementation to reuse @c cpu_common::StaticTensorManager
+ * @note @c _base_reg is used in implementation to reuse @c basic::StaticTensorManager
*
*/
class TensorRegistry : public ITensorRegistry
{
public:
- TensorRegistry() : _base_reg{new cpu_common::TensorRegistry} {}
+ TensorRegistry() : _base_reg{new basic::TensorRegistry} {}
ITensor *getITensor(const ir::OperandIndex &ind) override
{
@@ -120,15 +120,15 @@ public:
{
return _native_io_tensors;
}
- std::shared_ptr<cpu_common::TensorRegistry> base_reg() { return _base_reg; }
+ std::shared_ptr<basic::TensorRegistry> base_reg() { return _base_reg; }
private:
- std::shared_ptr<cpu_common::TensorRegistry> _base_reg;
+ std::shared_ptr<basic::TensorRegistry> _base_reg;
ir::OperandIndexMap<std::unique_ptr<IOTensor>> _native_io_tensors;
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // ifndef __ONERT_BACKEND_CONTROLFLOW_TENSOR_REGISTRY_H__
+#endif // ifndef __ONERT_BACKEND_BUILTIN_TENSOR_REGISTRY_H__
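
A simplified, self-contained model of the lookup the class comment above describes, with IO tensors consulted before the base registry; the precedence order is an assumption, since the real getITensor() body is elided in this hunk:

#include <map>
#include <memory>

struct TensorSketch {}; // stand-in for ITensor

struct TensorRegistrySketch
{
  // NativeIOTensor entries, owned here (mirrors _native_io_tensors).
  std::map<int, std::unique_ptr<TensorSketch>> native_io;
  // NativeOwnTensor + MigrantTensor entries (mirrors _base_reg).
  std::map<int, std::unique_ptr<TensorSketch>> base_reg;

  TensorSketch *getITensor(int ind)
  {
    auto io = native_io.find(ind);
    if (io != native_io.end())
      return io->second.get();
    auto base = base_reg.find(ind);
    return base != base_reg.end() ? base->second.get() : nullptr;
  }
};
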
diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.cc b/runtime/onert/core/src/backend/builtin/UserTensor.cc
index 5081a90ea..f0b00b928 100644
--- a/runtime/onert/core/src/backend/controlflow/UserTensor.cc
+++ b/runtime/onert/core/src/backend/builtin/UserTensor.cc
@@ -23,16 +23,16 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
size_t UserTensor::calcOffset(const ir::Coordinates &coords) const
{
- size_t rank = num_dimensions();
+ size_t rank = getShape().rank();
size_t offset = 0;
for (size_t i = 0; i < rank; ++i)
{
- offset = offset * dimension(i) + coords[i];
+ offset = offset * getShape().dim(i) + coords[i];
}
offset *= sizeOfDataType(data_type());
return offset;
@@ -48,6 +48,6 @@ bool UserTensor::applyShape(const ir::Shape &new_shape)
return true;
}
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/builtin/UserTensor.h
index 7aa62a8a9..0d0ed73c5 100644
--- a/runtime/onert/core/src/backend/controlflow/UserTensor.h
+++ b/runtime/onert/core/src/backend/builtin/UserTensor.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_H__
-#define __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_H__
+#ifndef __ONERT_BACKEND_BUILTIN_USER_TENSOR_H__
+#define __ONERT_BACKEND_BUILTIN_USER_TENSOR_H__
#include "ir/OperandInfo.h"
#include "backend/IPortableTensor.h"
@@ -24,7 +24,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
/**
@@ -39,7 +39,7 @@ class UserTensor : public IPortableTensor
{
public:
UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size)
- : IPortableTensor{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false}
+ : IPortableTensor{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false}
{
}
@@ -57,13 +57,9 @@ public:
public:
uint8_t *buffer() const override { return _buffer; }
size_t total_size() const override { return _size; }
- size_t dimension(size_t index) const override { return _info.shape().dim(index); }
- size_t num_dimensions() const override { return _info.shape().rank(); }
size_t calcOffset(const ir::Coordinates &coords) const override;
ir::Layout layout() const override { return _layout; }
ir::DataType data_type() const override { return _info.typeInfo().type(); }
- float data_scale() const override { return _info.typeInfo().scale(); }
- int32_t data_offset() const override { return _info.typeInfo().offset(); }
bool is_dynamic() const override { return _dynamic; }
void set_dynamic() override { _dynamic = true; }
ir::Shape getShape() const override { return _info.shape(); }
@@ -78,8 +74,8 @@ private:
bool _dynamic;
};
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_USER_TENSOR_H__
+#endif // __ONERT_BACKEND_BUILTIN_USER_TENSOR_H__
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc
index 1d786c4dd..fdd9d9d14 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.cc
+++ b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.cc
@@ -24,7 +24,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
namespace kernel
{
@@ -35,9 +35,9 @@ IfLayer::IfLayer(backend::IPortableTensor *cond_tensor,
const ir::SubgraphIndex &then_subg_index, const ir::SubgraphIndex &else_subg_index,
exec::ExecutorMap *executor_map,
const std::shared_ptr<ExternalContext> &external_context)
- : _cond_tensor{cond_tensor}, _input_tensors{input_tensors}, _output_tensors{output_tensors},
- _then_subg_index{then_subg_index}, _else_subg_index{else_subg_index},
- _executor_map{executor_map}, _external_context{external_context}
+ : _cond_tensor{cond_tensor}, _input_tensors{input_tensors}, _output_tensors{output_tensors},
+ _then_subg_index{then_subg_index}, _else_subg_index{else_subg_index},
+ _executor_map{executor_map}, _external_context{external_context}
{
// At this point, executor_map may not have executors of then subg and else subg
}
@@ -79,6 +79,6 @@ void IfLayer::run()
}
} // namespace kernel
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h
index 967552fc3..f12ef3605 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/IfLayer.h
+++ b/runtime/onert/core/src/backend/builtin/kernel/IfLayer.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_KERNEL_IF_LAYER_H__
-#define __ONERT_BACKEND_CONTROLFLOW_KERNEL_IF_LAYER_H__
+#ifndef __ONERT_BACKEND_BUILTIN_KERNEL_IF_LAYER_H__
+#define __ONERT_BACKEND_BUILTIN_KERNEL_IF_LAYER_H__
#include <backend/IPortableTensor.h>
#include <exec/IExecutor.h>
@@ -25,7 +25,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
namespace kernel
{
@@ -54,8 +54,8 @@ private:
};
} // namespace kernel
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_KERNEL_IF_LAYER_H__
+#endif // __ONERT_BACKEND_BUILTIN_KERNEL_IF_LAYER_H__
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc
index 8b79ea070..20cd87ad1 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.cc
+++ b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.cc
@@ -24,7 +24,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
namespace kernel
{
@@ -32,7 +32,7 @@ namespace kernel
PermuteLayer::PermuteLayer(const std::vector<ITensor *> &src_tensors,
const std::vector<ITensor *> &dst_tensors,
const std::shared_ptr<ExternalContext> &external_context)
- : _external_context{external_context}, _tasks_map{}
+ : _external_context{external_context}, _tasks_map{}
{
assert(src_tensors.size() == dst_tensors.size());
_src_tensors = src_tensors;
@@ -66,12 +66,12 @@ void PermuteLayer::optimize()
if (underlying_type(src->data_type()) != underlying_type(dst->data_type()))
throw std::runtime_error("data type does not match");
const auto permute_type = [&]() -> PermuteType {
- if (src->num_dimensions() == 4 && src->layout() == ir::Layout::NHWC &&
+ if (src->getShape().rank() == 4 && src->layout() == ir::Layout::NHWC &&
dst->layout() == ir::Layout::NCHW)
{
return PermuteType::NHWC_TO_NCHW;
}
- else if (src->num_dimensions() == 4 && src->layout() == ir::Layout::NCHW &&
+ else if (src->getShape().rank() == 4 && src->layout() == ir::Layout::NCHW &&
dst->layout() == ir::Layout::NHWC)
{
return PermuteType::NCHW_TO_NHWC;
@@ -91,10 +91,10 @@ void PermuteLayer::optimize()
if ((!src_tensor.has_padding() && !dst_tensor.has_padding()))
{
const auto num_elements = src_tensor.getShape().num_elements();
- const int thread_count = _external_context->ruy_context()->max_num_threads() <
- static_cast<int>(num_elements)
- ? _external_context->ruy_context()->max_num_threads()
- : num_elements;
+ const int thread_count =
+ _external_context->ruy_context()->max_num_threads() < static_cast<int>(num_elements)
+ ? _external_context->ruy_context()->max_num_threads()
+ : num_elements;
std::vector<PermuteWorkerTask> tasks;
auto start = 0;
@@ -122,8 +122,9 @@ void PermuteLayer::optimize()
}
else
{
- assert(src_tensor.num_dimensions() == 4 && (permute_type == PermuteType::NHWC_TO_NCHW ||
- permute_type == PermuteType::NCHW_TO_NHWC));
+ assert(src_tensor.getShape().rank() == 4 &&
+ (permute_type == PermuteType::NHWC_TO_NCHW ||
+ permute_type == PermuteType::NCHW_TO_NHWC));
const auto loop_shape = src_tensor.getShape();
const auto copy_len = data_size;
@@ -144,19 +145,19 @@ void PermuteLayer::appendPermuteTasks(const ITensor *src_tensor, ITensor *dst_te
const ir::Shape &loop_shape, size_t size)
{
size_t distributed_dim = 0;
+ auto src_shape = src_tensor->getShape();
if (src_tensor->layout() == dst_tensor->layout())
{
- for (size_t i = 1; i < src_tensor->num_dimensions() - 1; ++i)
+ for (int i = 1; i < src_shape.rank() - 1; ++i)
{
- distributed_dim =
- src_tensor->dimension(distributed_dim) < src_tensor->dimension(i) ? i : distributed_dim;
+ distributed_dim = src_shape.dim(distributed_dim) < src_shape.dim(i) ? i : distributed_dim;
}
}
- const auto distributed_dim_val = src_tensor->dimension(distributed_dim);
+ const auto distributed_dim_val = src_shape.dim(distributed_dim);
const int thread_count =
- _external_context->ruy_context()->max_num_threads() < static_cast<int>(distributed_dim_val)
- ? _external_context->ruy_context()->max_num_threads()
- : distributed_dim_val;
+ _external_context->ruy_context()->max_num_threads() < static_cast<int>(distributed_dim_val)
+ ? _external_context->ruy_context()->max_num_threads()
+ : distributed_dim_val;
// NOTE Do not remove this assertion. Removing it would degrade performance by letting new
// threads be created in the context's thread pool
assert(thread_count <= _external_context->ruy_context()->max_num_threads());
@@ -213,13 +214,13 @@ void PermuteLayer::run()
// set output shape and output buffer
ir::Shape new_shape =
- exec::convertShape(src_shape, src_tensor->layout(), dst_tensor->layout());
+ exec::convertShape(src_shape, src_tensor->layout(), dst_tensor->layout());
try
{
if (!dst_tensor->applyShape(new_shape))
throw std::runtime_error{
- "Error: PermuteLayer: output's TensorManager does not support dynamic tensor"};
+ "Error: PermuteLayer: output's TensorManager does not support dynamic tensor"};
assert(dst_tensor->buffer() != nullptr);
}
catch (const std::out_of_range &e)
@@ -262,7 +263,7 @@ void PermuteLayer::run()
if (_tasks_map.find(src) == _tasks_map.end() || _tasks_map.at(src).size() == 1 ||
src->is_dynamic() || dst->is_dynamic())
{
- permute(src, dst, src->num_dimensions(), src_offsets, dst_offsets);
+ permute(src, dst, src->getShape().rank(), src_offsets, dst_offsets);
}
// If dst is a subtensor, we have to use clEnqueueMapBuffer instead of clEnqueueWriteBuffer
else if (dst->needMemoryMap() && !dst->is_subtensor())
@@ -306,6 +307,6 @@ void PermuteLayer::run()
}
} // namespace kernel
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
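
The re-indented thread_count expressions above are all the same clamp: never request more worker tasks than there are units of work, so ruy's pool is not grown needlessly (see the NOTE beside the assertion). Equivalently:

#include <algorithm>
#include <cstdint>

int clamp_thread_count(int max_num_threads, std::int64_t work_items)
{
  return static_cast<int>(
      std::min<std::int64_t>(max_num_threads, work_items));
}
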
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.h
index 6fb69b65c..ac5470e85 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/PermuteLayer.h
+++ b/runtime/onert/core/src/backend/builtin/kernel/PermuteLayer.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_KERNEL_PERMUTELAYER_H__
-#define __ONERT_BACKEND_CONTROLFLOW_KERNEL_PERMUTELAYER_H__
+#ifndef __ONERT_BACKEND_BUILTIN_KERNEL_PERMUTELAYER_H__
+#define __ONERT_BACKEND_BUILTIN_KERNEL_PERMUTELAYER_H__
#include "exec/IPermuteFunction.h"
#include "exec/IExecutor.h"
@@ -26,7 +26,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
namespace kernel
{
@@ -56,11 +56,11 @@ private:
PermuteWorkerTask(const ITensor &src_tensor, ITensor &dst_tensor,
const ir::Coordinates &start_coords, const ir::Shape &loop_shape, size_t size)
- : _src_buffer{src_tensor.buffer()}, _dst_buffer{dst_tensor.buffer()},
- _src_start_offset{src_tensor.calcOffset(start_coords)},
- _dst_start_offset{dst_tensor.calcOffset(start_coords)}, _src_strides{}, _dst_strides{},
- _loop_shape{loop_shape}, _size{size}, _src_layout{src_tensor.layout()},
- _dst_layout{dst_tensor.layout()}, _is_permutation{true}
+ : _src_buffer{src_tensor.buffer()}, _dst_buffer{dst_tensor.buffer()},
+ _src_start_offset{src_tensor.calcOffset(start_coords)},
+ _dst_start_offset{dst_tensor.calcOffset(start_coords)}, _src_strides{}, _dst_strides{},
+ _loop_shape{loop_shape}, _size{size}, _src_layout{src_tensor.layout()},
+ _dst_layout{dst_tensor.layout()}, _is_permutation{true}
{
// Set strides
setStrides(src_tensor, &_src_strides);
@@ -71,9 +71,9 @@ private:
// Constructor for a copy
PermuteWorkerTask(const uint8_t *src_buffer, uint8_t *dst_buffer, uint32_t src_start_offset,
uint32_t dst_start_offset, size_t size)
- : _src_buffer{src_buffer}, _dst_buffer{dst_buffer}, _src_start_offset{src_start_offset},
- _dst_start_offset{dst_start_offset}, _src_strides{0}, _dst_strides{0}, _loop_shape{1},
- _size{size}, _src_layout{}, _dst_layout{}, _is_permutation{false}
+ : _src_buffer{src_buffer}, _dst_buffer{dst_buffer}, _src_start_offset{src_start_offset},
+ _dst_start_offset{dst_start_offset}, _src_strides{0}, _dst_strides{0},
+ _loop_shape{1}, _size{size}, _src_layout{}, _dst_layout{}, _is_permutation{false}
{
// DO NOTHING
}
@@ -106,12 +106,13 @@ private:
private:
void setStrides(const ITensor &tensor, Strides *strides)
{
- const size_t rank = tensor.num_dimensions();
+ auto shape = tensor.getShape();
+ const size_t rank = shape.rank();
for (size_t i = 0; i < rank; ++i)
{
ir::Coordinates no_step(rank), one_step(rank);
one_step.set(i, 1);
- if (tensor.dimension(i) > 1)
+ if (shape.dim(i) > 1)
{
strides->set(i, tensor.calcOffset(one_step) - tensor.calcOffset(no_step));
}
@@ -142,8 +143,8 @@ private:
};
} // namespace kernel
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_KERNEL_PERMUTELAYER_H__
+#endif // __ONERT_BACKEND_BUILTIN_KERNEL_PERMUTELAYER_H__
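
setStrides() above derives each dimension's byte stride empirically, as calcOffset(one step along i) minus calcOffset(origin), which stays correct even for padded layouts. For a dense row-major tensor the same numbers can be computed directly; note the diff leaves extent-1 dimensions at stride 0, mirrored here:

#include <cstddef>
#include <vector>

std::vector<std::size_t> dense_strides(const std::vector<std::size_t> &shape,
                                       std::size_t elem_size)
{
  std::vector<std::size_t> strides(shape.size(), 0);
  std::size_t acc = elem_size;
  for (std::size_t i = shape.size(); i-- > 0;)
  {
    if (shape[i] > 1)  // extent-1 dims keep stride 0, as in the diff
      strides[i] = acc;
    acc *= shape[i];
  }
  return strides;
}
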
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc
index a4b5aa5ca..81b4a6378 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.cc
+++ b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.cc
@@ -26,7 +26,7 @@ namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
namespace kernel
{
@@ -35,11 +35,11 @@ WhileLayer::WhileLayer(const std::vector<backend::IPortableTensor *> input_tenso
const std::vector<backend::IPortableTensor *> output_tensors,
const ir::SubgraphIndex &cond_subg_index,
const ir::SubgraphIndex &body_subg_index, exec::ExecutorMap *executor_map,
- cpu_common::DynamicMemoryManager *dyn_memory_manager,
+ basic::DynamicMemoryManager *dyn_memory_manager,
const std::shared_ptr<ExternalContext> &external_context)
- : _cond_subg_index{cond_subg_index}, _body_subg_index{body_subg_index},
- _input_tensors{input_tensors}, _output_tensors{output_tensors}, _executor_map{executor_map},
- _dyn_memory_manager{dyn_memory_manager}, _external_context{external_context}
+ : _cond_subg_index{cond_subg_index}, _body_subg_index{body_subg_index},
+ _input_tensors{input_tensors}, _output_tensors{output_tensors}, _executor_map{executor_map},
+ _dyn_memory_manager{dyn_memory_manager}, _external_context{external_context}
{
// At this point, executor_map may not have executors of cond subg and body subg
}
@@ -143,6 +143,6 @@ void WhileLayer::run()
}
} // namespace kernel
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
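
IfLayer and WhileLayer store subgraph indices plus a pointer to the shared ExecutorMap and resolve the actual executors only inside run(); as the comments note, this avoids constructing executors recursively while subgraphs are still being compiled. A stripped-down model of that pattern (the map/functor types here are stand-ins, not the runtime's types):

#include <functional>
#include <map>

using SubgraphIndexSketch = int;
// Stand-in executor: runs a subgraph, returns its boolean output (for cond).
using ExecutorSketch = std::function<bool()>;
using ExecutorMapSketch = std::map<SubgraphIndexSketch, ExecutorSketch>;

struct WhileLayerSketch
{
  SubgraphIndexSketch cond_subg, body_subg;
  ExecutorMapSketch *executors; // may still be incomplete at construction time

  void run()
  {
    // Lookup is deferred to run(), after all subgraph executors exist.
    while (executors->at(cond_subg)())
      executors->at(body_subg)();
  }
};
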
diff --git a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h
index d3924c843..912102781 100644
--- a/runtime/onert/core/src/backend/controlflow/kernel/WhileLayer.h
+++ b/runtime/onert/core/src/backend/builtin/kernel/WhileLayer.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __ONERT_BACKEND_CONTROLFLOW_KERNEL_WHILE_LAYER_H__
-#define __ONERT_BACKEND_CONTROLFLOW_KERNEL_WHILE_LAYER_H__
+#ifndef __ONERT_BACKEND_BUILTIN_KERNEL_WHILE_LAYER_H__
+#define __ONERT_BACKEND_BUILTIN_KERNEL_WHILE_LAYER_H__
#include <backend/IPortableTensor.h>
#include <exec/IExecutor.h>
@@ -24,13 +24,13 @@
#include <ir/Graph.h>
#include "../ExternalContext.h"
-#include "backend/cpu_common/MemoryManager.h"
+#include "backend/basic/MemoryManager.h"
namespace onert
{
namespace backend
{
-namespace controlflow
+namespace builtin
{
namespace kernel
{
@@ -41,7 +41,7 @@ public:
WhileLayer(const std::vector<backend::IPortableTensor *> input_tensors,
const std::vector<backend::IPortableTensor *> output_tensors,
const ir::SubgraphIndex &cond_subg_index, const ir::SubgraphIndex &body_subg_index,
- exec::ExecutorMap *executor_map, cpu_common::DynamicMemoryManager *dyn_memory_manager,
+ exec::ExecutorMap *executor_map, basic::DynamicMemoryManager *dyn_memory_manager,
const std::shared_ptr<ExternalContext> &external_context);
public:
@@ -53,13 +53,13 @@ private:
const std::vector<backend::IPortableTensor *> _input_tensors;
const std::vector<backend::IPortableTensor *> _output_tensors;
exec::ExecutorMap *_executor_map;
- cpu_common::DynamicMemoryManager *_dyn_memory_manager; // For generating temp tensors
+ basic::DynamicMemoryManager *_dyn_memory_manager; // For generating temp tensors
const std::shared_ptr<ExternalContext> _external_context;
};
} // namespace kernel
-} // namespace controlflow
+} // namespace builtin
} // namespace backend
} // namespace onert
-#endif // __ONERT_BACKEND_CONTROLFLOW_KERNEL_WHILE_LAYER_H__
+#endif // __ONERT_BACKEND_BUILTIN_KERNEL_WHILE_LAYER_H__
diff --git a/runtime/onert/core/src/backend/controlflow/BackendContext.cc b/runtime/onert/core/src/backend/controlflow/BackendContext.cc
deleted file mode 100644
index 366377edf..000000000
--- a/runtime/onert/core/src/backend/controlflow/BackendContext.cc
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "BackendContext.h"
-
-#include "KernelGenerator.h"
-#include "backend/cpu_common/BackendContextHelpers.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace controlflow
-{
-
-void BackendContext::initConsts()
-{
- for (auto &op : operation_list())
- {
- constant_initializer->setLayout(op.layout);
- graph()->operations().at(op.index).accept(*constant_initializer);
- }
-
- for (auto ind : operand_list())
- {
- const auto &obj = graph()->operands().at(ind);
- if (obj.isConstant() && !constant_initializer->exist(ind))
- {
- constant_initializer->registerDefaultInitializer(ind, obj);
- }
- }
-
- constant_initializer->run();
-}
-
-ITensorRegistry *BackendContext::genTensors(const std::vector<onert::ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs,
- const ir::LowerInfoMap &lower_info)
-{
- auto model_io = (graph()->getInputs() + graph()->getOutputs()) | ir::Remove::UNDEFINED |
- ir::Remove::DUPLICATED;
- for (auto index : operand_list())
- {
- if (model_io.contains(index))
- continue;
- const auto &obj = graph()->operands().at(index);
- const auto frontend_layout = [&]() {
- if (obj.getUses().size() == 0)
- return ir::Layout::UNKNOWN;
- auto use_op_ind = *obj.getUses().begin(); // FIXME What if it has two or more uses?
- for (auto &operation_info : operation_list())
- {
- if (operation_info.index == use_op_ind)
- return operation_info.layout;
- }
- return ir::Layout::UNKNOWN;
- }();
- const auto &permute_factor = lower_info.operand.at(index)->def_factors().getOnlyElement();
- if (permute_factor.backend() != backend())
- continue;
- const auto backend_layout = permute_factor.layout();
- ir::OperandInfo backend_info{permuteShape(obj.shape(), frontend_layout, backend_layout),
- obj.typeInfo(), obj.info().memAllocType(), obj.isConstant()};
- tensor_builder->registerTensorInfo(index, backend_info, backend_layout);
- }
-
- // TODO Get compiler options from compiler, and use it rather than getting it from Env
- if (util::getConfigString(util::config::EXECUTOR) == "Linear")
- {
- cpu_common::planTensors(*this, order, op_seqs, lower_info);
- }
- else
- {
- // For the executors that does not have fixed linear execution order:
- // To make tensors never be deallocated, this is a workaround to use static memory planner
- for (auto ind : operand_list())
- {
- if (tensor_builder->isRegistered(ind))
- tensor_builder->notifyFirstUse(ind);
- }
- }
-
- tensor_builder->prepare();
-
- return tensor_registry.get();
-}
-
-FunctionMap BackendContext::genKernels(const std::vector<ir::OpSequenceIndex> &order,
- const ir::OpSequences &op_seqs)
-{
- FunctionMap ret;
-
- for (auto op_seq_ind : order)
- {
- const auto &op_seq = op_seqs.at(op_seq_ind);
- bool assigned = [&]() {
- for (auto op_info : operation_list())
- if (op_seq.exist(op_info.index))
- return true;
- return false;
- }();
- if (!assigned)
- continue;
- auto fn_seq = kernel_gen->generate(op_seqs.at(op_seq_ind));
- ret.emplace_back(op_seq_ind, std::move(fn_seq));
- }
-
- initConsts();
-
- // NOTE For memory optimization, we want to free some operand data
- for (auto ind : operand_list())
- {
- // TODO Remove const_cast
- auto &obj = const_cast<ir::Graph *>(graph())->operands().at(ind);
- obj.releaseData();
- }
-
- for (auto &it : ret)
- {
- auto &fn_seq = it.second;
- fn_seq->iterate([&](exec::IFunction &ifunc) { ifunc.prepare(); });
- }
-
- return ret;
-}
-
-} // namespace controlflow
-} // namespace backend
-} // namespace onert
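
(Aside, not part of the patch.) The genTensors removed above had two planning paths: with a fixed Linear execution order, tensor lifetimes are planned from first/last use via cpu_common::planTensors; any other executor got the workaround of claiming every tensor up front and never releasing it. A minimal standalone sketch of that split, with stub types (TensorBuilder, planTensors, and planAll here are illustrative stand-ins, not onert API):

#include <vector>

struct TensorBuilder // stand-in for onert's tensor builder interface
{
  bool isRegistered(int) const { return true; } // stub
  void notifyFirstUse(int) {}                   // stub: lifetime begins here
  void prepare() {}                             // stub: run the memory planner
};

void planTensors(TensorBuilder &) {} // stub: first/last-use lifetime planning

void planAll(TensorBuilder &builder, const std::vector<int> &operands, bool linear)
{
  if (linear)
    planTensors(builder); // lifetimes derived from the fixed execution order
  else
    for (int ind : operands) // workaround: claim everything, release nothing
      if (builder.isRegistered(ind))
        builder.notifyFirstUse(ind);
  builder.prepare();
}
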
diff --git a/runtime/onert/core/src/backend/controlflow/ConstantInitializer.h b/runtime/onert/core/src/backend/controlflow/ConstantInitializer.h
deleted file mode 100644
index ac97ef91c..000000000
--- a/runtime/onert/core/src/backend/controlflow/ConstantInitializer.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_COMPILER_CONTROLFLOW_CONSTANT_INITIALIZER_H__
-#define __ONERT_COMPILER_CONTROLFLOW_CONSTANT_INITIALIZER_H__
-
-#include <backend/cpu_common/ConstantInitializer.h>
-
-namespace onert
-{
-namespace backend
-{
-namespace controlflow
-{
-
-using ConstantInitializer = cpu_common::ConstantInitializer;
-
-} // namespace controlflow
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_COMPILER_CONTROLFLOW_CONSTANT_INITIALIZER_H__
diff --git a/runtime/onert/core/src/backend/controlflow/Tensor.h b/runtime/onert/core/src/backend/controlflow/Tensor.h
deleted file mode 100644
index 87951a9b3..000000000
--- a/runtime/onert/core/src/backend/controlflow/Tensor.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_BACKEND_CONTROLFLOW_TENSOR_H__
-#define __ONERT_BACKEND_CONTROLFLOW_TENSOR_H__
-
-#include <backend/cpu_common/Tensor.h>
-
-namespace onert
-{
-namespace backend
-{
-namespace controlflow
-{
-
-using Tensor = cpu_common::Tensor;
-using ExternalTensor = cpu_common::ExternalTensor;
-
-} // namespace controlflow
-} // namespace backend
-} // namespace onert
-
-#endif // __ONERT_BACKEND_CONTROLFLOW_TENSOR_H__
diff --git a/runtime/onert/core/src/backend/cpu_common/ConstantInitializer.cc b/runtime/onert/core/src/backend/cpu_common/ConstantInitializer.cc
deleted file mode 100644
index 610ba5ffc..000000000
--- a/runtime/onert/core/src/backend/cpu_common/ConstantInitializer.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "backend/cpu_common/ConstantInitializer.h"
-#include "backend/cpu_common/Tensor.h"
-
-namespace onert
-{
-namespace backend
-{
-namespace cpu_common
-{
-
-ConstantInitializer::ConstantInitializer(const ir::Operands &operands,
- const std::shared_ptr<ITensorRegistry> &tensor_reg)
- : ConstantInitializerBase{operands}, _tensor_reg{tensor_reg}
-{
- // DO NOTHING
-}
-
-void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index,
- const ir::Operand &obj)
-{
- registerExternalInitializer(index, obj);
-}
-
-void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index,
- const ir::Operand &obj)
-{
- // For only CONSTANTS
- // TODO Add to check if tensor has been allocated
- if (!obj.isConstant())
- return;
-
- _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) {
- auto data = model_obj.shareData();
- assert(data && data->base());
- ExternalTensor &tensor = dynamic_cast<ExternalTensor &>(itensor);
- tensor.setData(data);
- };
-}
-
-} // namespace cpu_common
-} // namespace backend
-} // namespace onert
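
(Aside, not part of the patch.) The deleted ConstantInitializer defers all work by storing one lambda per constant operand in _init_map; the "external" flavour shares the model's buffer instead of copying it into a freshly allocated tensor. A minimal sketch of that pattern, with stand-in types (Tensor, the int keys, and runAll are illustrative, not onert API):

#include <functional>
#include <memory>
#include <unordered_map>
#include <vector>

struct Tensor
{
  std::shared_ptr<const std::vector<char>> data; // backing buffer, possibly shared
};
using Init = std::function<void(Tensor &)>;

std::unordered_map<int, Init> init_map; // operand index -> deferred initializer

void registerExternalInitializer(int index,
                                 std::shared_ptr<const std::vector<char>> model_data)
{
  // Share the model's constant buffer instead of copying it (zero copy).
  init_map[index] = [model_data](Tensor &t) { t.data = model_data; };
}

void runAll(std::unordered_map<int, Tensor> &tensors)
{
  for (auto &entry : init_map)
    entry.second(tensors[entry.first]);
}
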
diff --git a/runtime/onert/core/src/backend/cpu_common/ConstantInitializerBase.cc b/runtime/onert/core/src/backend/cpu_common/ConstantInitializerBase.cc
deleted file mode 100644
index 15c2dfeb1..000000000
--- a/runtime/onert/core/src/backend/cpu_common/ConstantInitializerBase.cc
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "backend/cpu_common/ConstantInitializerBase.h"
-
-#include <Half.h>
-
-using float16 = Half;
-
-namespace onert
-{
-namespace backend
-{
-namespace cpu_common
-{
-
-void ConstantInitializerBase::registerCopyInitializer(const ir::OperandIndex &index,
- const ir::Operand &obj)
-{
- // For only CONSTANTS
- // TODO Add to check if tensor has been allocated
- if (!obj.isConstant())
- return;
-
- const auto type = obj.typeInfo().type();
- using ir::DataType;
-
- switch (type)
- {
- case DataType::FLOAT32:
- _init_map[index] = copyInit<float>;
- break;
- case DataType::INT32:
- _init_map[index] = copyInit<int32_t>;
- break;
- case DataType::UINT32:
- _init_map[index] = copyInit<uint32_t>;
- break;
- case DataType::BOOL8:
- case DataType::QUANT_UINT8_ASYMM:
- _init_map[index] = copyInit<uint8_t>;
- break;
- case DataType::QUANT_INT8_SYMM:
- case DataType::QUANT_INT8_ASYMM:
- _init_map[index] = copyInit<int8_t>;
- break;
- case DataType::FLOAT16:
- _init_map[index] = copyInit<float16>;
- break;
- case DataType::INT64:
- _init_map[index] = copyInit<int64_t>;
- break;
- default:
- throw std::runtime_error("Not supported, yet");
- break;
- }
-}
-
-void ConstantInitializerBase::registerPermuteInitializer(const ir::OperandIndex &index,
- const ir::Operand &obj)
-{
- // For only CONSTANTS
- // TODO Add to check if tensor has been allocated
- if (!obj.isConstant())
- return;
-
- const auto type = obj.typeInfo().type();
- using ir::DataType;
- using namespace std::placeholders;
-
- switch (type)
- {
- case DataType::FLOAT32:
- _init_map[index] = std::bind(permuteInit<float>, _1, _2, _current_layout);
- break;
- case DataType::INT32:
- _init_map[index] = std::bind(permuteInit<int32_t>, _1, _2, _current_layout);
- break;
- case DataType::UINT32:
- _init_map[index] = std::bind(permuteInit<uint32_t>, _1, _2, _current_layout);
- break;
- case DataType::BOOL8:
- case DataType::QUANT_UINT8_ASYMM:
- _init_map[index] = std::bind(permuteInit<uint8_t>, _1, _2, _current_layout);
- break;
- case DataType::QUANT_INT8_SYMM:
- case DataType::QUANT_INT8_ASYMM:
- _init_map[index] = std::bind(permuteInit<int8_t>, _1, _2, _current_layout);
- break;
- case DataType::FLOAT16:
- _init_map[index] = std::bind(permuteInit<float16>, _1, _2, _current_layout);
- break;
- case DataType::INT64:
- _init_map[index] = std::bind(permuteInit<int64_t>, _1, _2, _current_layout);
- break;
- default:
- throw std::runtime_error("Not supported, yet");
- break;
- }
-}
-
-} // namespace cpu_common
-} // namespace backend
-} // namespace onert
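
(Aside, not part of the patch.) The switch in the deleted file maps each runtime DataType tag onto one instantiation of a templated copy routine; registerPermuteInitializer does the same but binds an extra layout argument via std::bind. A reduced sketch of the dispatch (the three-value enum and makeCopyInit are illustrative):

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <functional>
#include <stdexcept>
#include <vector>

enum class DataType { FLOAT32, INT32, UINT8 };

template <typename T>
void copyInit(const void *src, std::vector<char> &dst, std::size_t count)
{
  dst.resize(count * sizeof(T));
  std::memcpy(dst.data(), src, count * sizeof(T)); // raw byte copy of T elements
}

using Init = std::function<void(const void *, std::vector<char> &, std::size_t)>;

Init makeCopyInit(DataType type)
{
  switch (type)
  {
    case DataType::FLOAT32: return copyInit<float>;
    case DataType::INT32:   return copyInit<int32_t>;
    case DataType::UINT8:   return copyInit<uint8_t>;
    default: throw std::runtime_error("Not supported, yet");
  }
}
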
diff --git a/runtime/onert/core/src/compiler/BackendManager.cc b/runtime/onert/core/src/compiler/BackendManager.cc
index ea45cbeb7..0d6051b21 100644
--- a/runtime/onert/core/src/compiler/BackendManager.cc
+++ b/runtime/onert/core/src/compiler/BackendManager.cc
@@ -20,8 +20,8 @@
#include <dlfcn.h>
#include "backend/Backend.h"
-#include "backend/controlflow/Backend.h"
-#include "backend/controlflow/Config.h"
+#include "backend/builtin/Backend.h"
+#include "backend/builtin/Config.h"
#include "backend/IConfig.h"
#include "util/logging.h"
#include "util/ConfigSource.h"
@@ -29,9 +29,9 @@
static const char *SHARED_LIB_EXT =
#if defined(__APPLE__) && defined(__MACH__)
- ".dylib";
+ ".dylib";
#else
- ".so";
+ ".so";
#endif
namespace onert
@@ -45,20 +45,20 @@ BackendManager &BackendManager::get()
return object;
}
-BackendManager::BackendManager() { loadControlflowBackend(); }
+BackendManager::BackendManager() { loadBuiltinBackend(); }
-void BackendManager::loadControlflowBackend()
+void BackendManager::loadBuiltinBackend()
{
- auto backend_object = std::unique_ptr<backend::controlflow::Backend, backend_destroy_t>(
- new backend::controlflow::Backend, [](backend::Backend *backend) { delete backend; });
+ auto backend_object = std::unique_ptr<backend::builtin::Backend, backend_destroy_t>(
+ new backend::builtin::Backend, [](backend::Backend *backend) { delete backend; });
bool initialized = backend_object->config()->initialize(); // Call initialize here?
if (!initialized)
{
- throw std::runtime_error(backend::controlflow::Config::ID + " backend initialization failed");
+ throw std::runtime_error(backend::builtin::Config::ID + " backend initialization failed");
}
- _controlflow = backend_object.get(); // Save the controlflow backend implementation pointer
- assert(_controlflow);
+ _builtin = backend_object.get(); // Save the builtin backend implementation pointer
+ assert(_builtin);
_gen_map.emplace(backend_object->config()->id(), std::move(backend_object));
}
@@ -104,7 +104,7 @@ void BackendManager::loadBackend(const std::string &backend)
}
auto backend_object =
- std::unique_ptr<backend::Backend, backend_destroy_t>(backend_create(), backend_destroy);
+ std::unique_ptr<backend::Backend, backend_destroy_t>(backend_create(), backend_destroy);
bool initialized = backend_object->config()->initialize(); // Call initialize here?
if (!initialized)
{
@@ -118,24 +118,18 @@ void BackendManager::loadBackend(const std::string &backend)
}
// Save backend handle (avoid warning by handle lost without dlclose())
-
- // NOTE This is a workaround for clang-format3.9 (seems like it does not understand
- // "by-copy capture with an initializer"
- // clang-format off
auto u_handle = std::unique_ptr<void, dlhandle_destroy_t>{
- handle, [id = backend, filename = backend_so](void *h) {
- if (dlclose(h) == 0)
- {
- VERBOSE(BackendManager) << "Successfully unloaded '" << id << "'(" << filename << ")\n";
- }
- else
- {
- VERBOSE(BackendManager)
- << "Failed to unload backend '" << id << "'- " << dlerror() << "\n";
- }
- }};
-// clang-format on
-_handle_map.emplace(backend, std::move(u_handle));
+ handle, [id = backend, filename = backend_so](void *h) {
+ if (dlclose(h) == 0)
+ {
+ VERBOSE(BackendManager) << "Successfully unloaded '" << id << "'(" << filename << ")\n";
+ }
+ else
+ {
+ VERBOSE(BackendManager) << "Failed to unload backend '" << id << "'- " << dlerror() << "\n";
+ }
+ }};
+ _handle_map.emplace(backend, std::move(u_handle));
}
backend::Backend *BackendManager::get(const std::string &key)
@@ -158,7 +152,7 @@ const backend::Backend *BackendManager::get(const std::string &key) const
return nullptr;
}
-const backend::controlflow::Backend *BackendManager::getControlflow() const { return _controlflow; }
+const backend::builtin::Backend *BackendManager::getBuiltin() const { return _builtin; }
} // namespace compiler
} // namespace onert
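
(Aside, not part of the patch.) loadBackend above keeps each dlopen handle alive in a unique_ptr whose deleter both closes the library and logs the outcome; the by-copy lambda capture is what lets the id and filename survive until unload. A standalone sketch of the same idiom using only POSIX and standard C++ (open_plugin and the log text are illustrative):

#include <dlfcn.h>

#include <functional>
#include <iostream>
#include <memory>
#include <stdexcept>
#include <string>

using dlhandle_destroy_t = std::function<void(void *)>;

std::unique_ptr<void, dlhandle_destroy_t> open_plugin(const std::string &path)
{
  void *handle = dlopen(path.c_str(), RTLD_LAZY | RTLD_LOCAL);
  if (handle == nullptr)
    throw std::runtime_error(dlerror());
  // The by-copy capture keeps the path alive until unload time, the same
  // idiom as the [id = backend, filename = backend_so] capture above.
  return {handle, [path](void *h) {
            if (dlclose(h) == 0)
              std::cerr << "unloaded " << path << "\n";
            else
              std::cerr << "failed to unload " << path << ": " << dlerror() << "\n";
          }};
}
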
diff --git a/runtime/onert/core/src/compiler/Compiler.cc b/runtime/onert/core/src/compiler/Compiler.cc
index 7eeb14ad3..082bdc9d0 100644
--- a/runtime/onert/core/src/compiler/Compiler.cc
+++ b/runtime/onert/core/src/compiler/Compiler.cc
@@ -16,22 +16,21 @@
#include "compiler/Compiler.h"
-#include "ParamChecker.h"
#include "ExecutorFactory.h"
#include "ShapeValidator.h"
-#include "Fp32ToFp16Converter.h"
-#include <backend/controlflow/Config.h>
+#include <backend/builtin/Config.h>
#include "compiler/BackendManager.h"
#include "compiler/IScheduler.h"
#include "compiler/ManualScheduler.h"
#include "compiler/HEScheduler.h"
#include "compiler/StaticShapeInferer.h"
+#include "compiler/OperationLowerInfo.h"
#include "compiler/pass/ConstantOutputPass.h"
#include "compiler/pass/OddOutputPass.h"
#include "compiler/pass/PassRunner.h"
+#include "compiler/pass/UnusedOperandEliminationPass.h"
#include "exec/ExecTime.h"
-#include "ir/operation/LowerInfo.h"
#include "ir/verifier/Verifier.h"
#include "dumper/dot/DotDumper.h"
#include "compiler/Linear.h"
@@ -77,15 +76,11 @@ CompilerOptions fetchCompilerOptionsFromGlobalConfig(const ir::Subgraphs &subgs)
options.backend_list = nnfw::misc::split(util::getConfigString(util::config::BACKENDS), ';');
options.trace_filepath = util::getConfigString(util::config::TRACE_FILEPATH);
options.graph_dump_level = util::getConfigInt(util::config::GRAPH_DOT_DUMP);
- options.op_seq_max_node = util::getConfigInt(util::config::OP_SEQ_MAX_NODE);
options.executor = util::getConfigString(util::config::EXECUTOR);
options.he_scheduler = util::getConfigBool(util::config::USE_SCHEDULER);
options.he_profiling_mode = util::getConfigBool(util::config::PROFILING_MODE);
options.disable_compile = util::getConfigBool(util::config::DISABLE_COMPILE);
options.fp16_enable = util::getConfigBool(util::config::FP16_ENABLE);
-#ifdef RUY_PROFILER
- options.op_seq_max_node = 1;
-#endif
{
// Backend for all
@@ -123,8 +118,8 @@ CompilerOptions fetchCompilerOptionsFromGlobalConfig(const ir::Subgraphs &subgs)
auto key = static_cast<uint32_t>(std::stoi(key_str));
subgs.at(ir::SubgraphIndex{0})
- ->operations()
- .at(ir::OperationIndex{key}); // Check if exist, or this wil throw
+ ->operations()
+ .at(ir::OperationIndex{key}); // Check if it exists, or this will throw
ms_options.index_to_backend.emplace(ir::OperationIndex{key}, val);
}
}
@@ -132,7 +127,7 @@ CompilerOptions fetchCompilerOptionsFromGlobalConfig(const ir::Subgraphs &subgs)
}
Compiler::Compiler(const std::shared_ptr<ir::Subgraphs> &subgs, util::TracingCtx *tracing_ctx)
- : _subgraphs{subgs}, _state{State::CREATED}
+ : _subgraphs{subgs}, _state{State::CREATED}
{
// Set default values for CompilerOptions
// All these default values should not be fetched from Env, when we stop supporting Android NN
@@ -157,10 +152,10 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
{
// Set control flow backend for control flow operators
{
- auto &cfid = backend::controlflow::Config::ID;
- _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = cfid;
- _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = cfid;
- _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = cfid;
+ auto &builtin_id = backend::builtin::Config::ID;
+ _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::If] = builtin_id;
+ _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::While] = builtin_id;
+ _options.manual_scheduler_options.opcode_to_backend[ir::OpCode::Permute] = builtin_id;
}
// FIXME This is a workaround for bcq operations, should remove it
@@ -170,15 +165,13 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
}
{
- VERBOSE(Compiler) << std::boolalpha;
- VERBOSE(Compiler) << "==== Compiler Options ====" << std::endl;
+ VERBOSE(Compiler) << std::boolalpha << "==== Compiler Options ====" << std::endl;
VERBOSE(Compiler) << "backend_list : "
<< nnfw::misc::join(_options.backend_list.begin(),
_options.backend_list.end(), "/")
<< std::endl;
VERBOSE(Compiler) << "trace_filepath : " << _options.trace_filepath << std::endl;
VERBOSE(Compiler) << "graph_dump_level : " << _options.graph_dump_level << std::endl;
- VERBOSE(Compiler) << "op_seq_max_node : " << _options.op_seq_max_node << std::endl;
VERBOSE(Compiler) << "executor : " << _options.executor << std::endl;
VERBOSE(Compiler) << "manual backend_for_all : "
<< _options.manual_scheduler_options.backend_for_all << std::endl;
@@ -188,16 +181,19 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
VERBOSE(Compiler) << "he_scheduler : " << _options.he_scheduler << std::endl;
VERBOSE(Compiler) << "he_profiling_mode : " << _options.he_profiling_mode << std::endl;
VERBOSE(Compiler) << "disable_compile : " << _options.disable_compile << std::endl;
- VERBOSE(Compiler) << "fp16_enable : " << _options.fp16_enable << std::endl;
- VERBOSE(Compiler) << std::noboolalpha;
+ VERBOSE(Compiler) << "fp16_enable : " << _options.fp16_enable << std::endl
+ << std::noboolalpha;
}
_subgraphs->iterate([&](const ir::SubgraphIndex &, ir::Graph &subg) {
// Mandatory passes
pass::PassRunner{}
- .append(std::make_unique<pass::ConstantOutputPass>(subg))
- .append(std::make_unique<pass::OddOutputPass>(subg))
- .run();
+ .append(std::make_unique<pass::ConstantOutputPass>(subg))
+ .append(std::make_unique<pass::OddOutputPass>(subg))
+ .run();
+
+ // Optimizations
+ pass::PassRunner{}.append(std::make_unique<pass::UnusedOperandEliminationPass>(subg)).run();
});
/***************************************************
@@ -208,7 +204,7 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
// Compilable check
// TODO: Support hybrid execution -
// execution between interpreter and compiled executor (including control flow)
- if (!checkCompilable())
+ if (_options.disable_compile)
{
_subgraphs->iterate([&](const ir::SubgraphIndex &index, ir::Graph &subg) {
executors->emplace(index, std::make_unique<interp::InterpExecutor>(subg));
@@ -235,22 +231,6 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
// Lower: Assign backend
lowered_subgs[index] = std::make_unique<compiler::LoweredGraph>(subg, _options);
- // Check backend(s) for subgraph support FP16
- bool backends_support_fp16 = true;
- auto &contexts = (*lowered_subgs[index]).backend_contexts();
- for (auto it = contexts.begin(); it != contexts.end(); it++)
- {
- // Controlflow backend is not for actual computaion of operations so it is an exception
- if (it->first->config()->id() != backend::controlflow::Config::ID)
- backends_support_fp16 &= it->first->config()->supportFP16();
- }
-
- if (_options.fp16_enable && backends_support_fp16)
- {
- // NOTE: the only acl_cl backend enables fp16 mode
- Fp32ToFp16Converter(*lowered_subgs[index]).run();
- }
-
subg.setSubgraphs(nullptr);
});
@@ -268,11 +248,14 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
{
const auto primary_subg_idx = ir::SubgraphIndex{0};
StaticShapeInferer inferer(primary_subg_idx, lowered_subgs);
- lowered_subgs.at(primary_subg_idx)
- ->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- auto has_dynamic_tensor = inferer.infer(op_seq);
- op_seq.has_dynamic_tensor(has_dynamic_tensor);
- });
+ auto &lowered_subg = lowered_subgs.at(primary_subg_idx);
+ auto ordered_ops = lowered_subg->graph().topolSortOperations();
+ for (auto op_ind : ordered_ops)
+ {
+ const auto &op = lowered_subg->graph().operations().at(op_ind);
+ bool has_dynamic_tensor = inferer.infer(op);
+ lowered_subg->setHasDynamicTensor(op_ind, has_dynamic_tensor);
+ }
inferer.dump();
}
@@ -303,9 +286,9 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
ir::OperationDumper dumper("Executor generation of Subgraph " +
std::to_string(subg_index.value()));
lowered_subg->graph().operations().iterate(
- [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); });
+ [&](const ir::OperationIndex &, const ir::Operation &op) { op.accept(dumper); });
auto executor = std::unique_ptr<exec::IExecutor>{
- ExecutorFactory::get().create(std::move(lowered_subg), _options, executors)};
+ ExecutorFactory::get().create(std::move(lowered_subg), _options, executors)};
executor->setIndexedRanks(indexed_ranks);
executors->insert(std::make_pair(subg_index, std::move(executor)));
}
@@ -317,32 +300,6 @@ std::shared_ptr<exec::ExecutorMap> Compiler::compile(void)
return executors;
}
-bool Compiler::checkCompilable()
-{
- // Disable compile phase
- // When ready to use interpreter backend, remove this config and use backend setting
- if (_options.disable_compile)
- {
- return false;
- }
-
- // TODO check unspecified operand shape
-
- // Check compilable parameter
- for (uint32_t i = 0; i < _subgraphs->count(); ++i)
- {
- auto graph = _subgraphs->at(ir::SubgraphIndex{i});
- ParamChecker paramChecker{graph};
- paramChecker();
- if (paramChecker.haveNoneConstParam())
- {
- return false;
- }
- }
-
- return true;
-}
-
} // namespace compiler
} // namespace onert
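
(Aside, not part of the patch.) The new inference loop in compile() walks operations in topological order, so every producer's shape is already inferred when its consumers are visited; any operation whose output shape depends on runtime values is flagged as dynamic so the executor can fall back to dynamic allocation for it. In miniature, with illustrative stand-in types:

#include <functional>
#include <optional>
#include <unordered_map>
#include <vector>

using OpId = int;
using Shape = std::vector<int>;

// infer_one returns nullopt when an output shape depends on runtime values.
bool inferAll(const std::vector<OpId> &topo_order,
              const std::function<std::optional<Shape>(OpId)> &infer_one,
              std::unordered_map<OpId, bool> &has_dynamic_tensor)
{
  bool all_static = true;
  for (OpId op : topo_order)
  {
    // Topological order guarantees producers were visited before consumers.
    bool dynamic = !infer_one(op).has_value();
    has_dynamic_tensor[op] = dynamic;
    all_static = all_static && !dynamic;
  }
  return all_static;
}
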
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc
index 356feed7c..ba038e935 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc
@@ -18,6 +18,7 @@
#include <deque>
#include <functional>
+#include "ir/OperationCloner.h"
#include "exec/ExecutionObservers.h"
#include "exec/LinearExecutor.h"
#include "exec/DataflowExecutor.h"
@@ -26,12 +27,14 @@
#include "compiler/ExecutionBuilder.h"
#include "exec/ExecTime.h"
#include "compiler/Linear.h"
+#include "compiler/BackendManager.h"
#include "backend/IPortableTensor.h"
-#include "backend/controlflow/Config.h"
-#include "backend/controlflow/KernelGenerator.h"
-#include "backend/controlflow/UserTensor.h"
-#include "backend/controlflow/TensorBuilder.h"
+#include "backend/builtin/Config.h"
+#include "backend/builtin/KernelGenerator.h"
+#include "backend/builtin/UserTensor.h"
+#include "backend/builtin/TensorBuilder.h"
#include "util/TracingCtx.h"
+#include "dumper/text/GraphDumper.h"
#include <memory>
@@ -45,7 +48,7 @@ class SyncFunction final : public exec::IFunction
public:
virtual ~SyncFunction() = default;
SyncFunction(std::unique_ptr<exec::IFunction> fn, const std::shared_ptr<backend::IConfig> config)
- : _fn{std::move(fn)}, _config{config}
+ : _fn{std::move(fn)}, _config{config}
{
assert(_fn);
assert(_config);
@@ -64,36 +67,164 @@ private:
std::shared_ptr<backend::IConfig> _config;
};
+using DeallocList = std::vector<backend::ITensor *>;
+// Deallocation after execution of an operation used by Linear Executor
+class DeallocFunction final : public exec::IFunction
+{
+public:
+ DeallocFunction(const DeallocList &tensors) : _dealloc_list{tensors} {}
+
+ void run() override
+ {
+ for (auto tensor : _dealloc_list)
+ {
+ if (!tensor->is_dynamic())
+ continue;
+ tensor->deallocBuffer();
+ }
+ }
+
+private:
+ DeallocList _dealloc_list;
+};
+
void initializeSubgraphIOTensors(compiler::LoweredGraph &lowered_graph,
+ const backend::BackendContexts &backend_contexts,
const ir::OperandIndexSequence &indices)
{
- // TODO Store controlflow backend in BackendContext
- std::shared_ptr<backend::controlflow::TensorRegistry> cf_tensor_reg;
- for (const auto &e : lowered_graph.backend_contexts())
+ // TODO Store builtin backend in BackendContext
+ std::shared_ptr<backend::builtin::TensorRegistry> builtin_tensor_reg;
+ for (const auto &e : backend_contexts)
{
auto backend = e.first;
auto &context = e.second;
- if (backend->config()->id() == backend::controlflow::Config::ID)
+ if (backend->config()->id() == backend::builtin::Config::ID)
{
- cf_tensor_reg =
- std::dynamic_pointer_cast<backend::controlflow::TensorRegistry>(context->tensor_registry);
+ builtin_tensor_reg =
+ std::dynamic_pointer_cast<backend::builtin::TensorRegistry>(context->tensor_registry);
}
}
- assert(cf_tensor_reg);
+ assert(builtin_tensor_reg);
for (auto ind : indices)
{
const auto &operand = lowered_graph.graph().operands().at(ind);
- auto tensor = std::make_unique<backend::controlflow::IOTensor>(
- operand.info(),
- ir::Layout::NHWC /* FIXME find op_seq for this operand and use frontend_layout */
- );
+ auto tensor = std::make_unique<backend::builtin::IOTensor>(
+ operand.info(),
+ ir::Layout::NHWC /* FIXME find operation for this operand and use frontend_layout */
+ );
- // Add tensor to controlflow TensorRegistry.
- cf_tensor_reg->setNativeIOTensor(ind, std::move(tensor));
+ // Add tensor to builtin TensorRegistry.
+ builtin_tensor_reg->setNativeIOTensor(ind, std::move(tensor));
}
}
+backend::BackendContexts createBackendContexts(compiler::LoweredGraph &lgraph, bool linear_executor)
+{
+ backend::BackendContexts contexts;
+ auto &backend_manager = compiler::BackendManager::get();
+
+ std::unordered_map<const backend::Backend *, backend::ContextData> context_data_map;
+
+ // Generate partial graphs for each backend
+ for (auto backend : backend_manager.getAll())
+ {
+ auto &data = context_data_map[backend];
+ auto graph = std::make_unique<ir::Graph>();
+ graph->setLayout(lgraph.graph().layout());
+ data.graph = std::move(graph);
+ }
+
+ auto &whole_graph = lgraph.graph();
+ // Separate operands into partial graphs
+ whole_graph.operands().iterate([&](const ir::OperandIndex &operand_ind, ir::Operand &operand) {
+ auto &operand_li = lgraph.lower_info().operand;
+ const auto &def_factors = operand_li.at(operand_ind).def_factors();
+ if (def_factors.size() == 0) // Ignore unused tensor
+ return;
+ const auto &def_factor = def_factors.getOnlyElement();
+ const auto backend = def_factor.backend();
+ auto &partial_graph = *context_data_map[backend].graph;
+ auto &operand_layouts = context_data_map[backend].operand_layouts;
+ assert(operand_layouts.find(operand_ind) == operand_layouts.end());
+ operand_layouts[operand_ind] = def_factor.layout();
+
+ // Copy the operand and insert it to the partial graph
+ auto new_operand = std::make_unique<ir::Operand>(operand);
+ new_operand->clearDefUse();
+ operand.releaseData(); // Deref data of LoweredGraph
+ auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
+ UNUSED_RELEASE(new_operand_ind);
+ assert(new_operand_ind == operand_ind);
+ });
+ // Separate operations into partial graphs
+ whole_graph.operations().iterate(
+ [&](const ir::OperationIndex &op_ind, const ir::Operation &operation) {
+ auto &op_li = lgraph.lower_info().operation;
+ auto backend = op_li.at(op_ind).backend();
+ auto &partial_graph = *context_data_map[backend].graph;
+ auto &external_operands = context_data_map[backend].external_operands;
+ auto &operand_layouts = context_data_map[backend].operand_layouts;
+
+ {
+ // Add missing operands (externals)
+ auto io_list = (operation.getInputs() + operation.getOutputs()) | ir::Remove::DUPLICATED |
+ ir::Remove::UNDEFINED;
+ for (auto operand_ind : io_list)
+ {
+ if (partial_graph.operands().exist(operand_ind))
+ continue;
+
+ // Copy the operand and insert it to the partial graph
+ const auto &operand = whole_graph.operands().at(operand_ind);
+ auto new_operand = std::make_unique<ir::Operand>(operand);
+ new_operand->clearDefUse();
+ auto new_operand_ind = partial_graph.addOperand(operand_ind, std::move(new_operand));
+ UNUSED_RELEASE(new_operand_ind);
+ assert(new_operand_ind == operand_ind);
+
+ auto layout =
+ lgraph.lower_info().operand.at(operand_ind).def_factors().getOnlyElement().layout();
+ assert(operand_layouts.find(operand_ind) == operand_layouts.end());
+ operand_layouts[operand_ind] = layout;
+ external_operands.add(operand_ind);
+ }
+
+ auto new_op_ind = partial_graph.addOperation(op_ind, clone(operation));
+ UNUSED_RELEASE(new_op_ind);
+ assert(new_op_ind == op_ind);
+ }
+ });
+
+ // Create contexts
+ auto whole_op_order = lgraph.graph().topolSortOperations();
+ for (auto &pair : context_data_map)
+ {
+ auto backend = pair.first;
+ auto &data = pair.second;
+ // Handle graph input/outputs or external tensors
+ data.graph->operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &operand) {
+ if (whole_graph.getInputs().contains(ind) || whole_graph.getOutputs().contains(ind))
+ data.external_operands.add(ind);
+ // Inputs are either "graph input" or "no def op and non-constant"
+ if (whole_graph.getInputs().contains(ind) ||
+ (!operand.getDef().valid() && !operand.isConstant()))
+ data.graph->addInput(ind);
+ // Outputs are either "graph output" or "no uses"
+ if (whole_graph.getOutputs().contains(ind) || operand.getUses().size() == 0)
+ data.graph->addOutput(ind);
+ });
+ dumper::text::dumpGraph(*data.graph);
+
+ std::copy_if(whole_op_order.begin(), whole_op_order.end(), std::back_inserter(data.op_order),
+ [&](const auto &ind) { return data.graph->operations().exist(ind); });
+ data.is_linear_executor = linear_executor;
+ data.custom_kernel_builder = lgraph.graph().getKernelBuilder();
+ contexts.emplace(backend, backend->newContext(std::move(data)));
+ }
+ return contexts;
+}
+
} // namespace
} // namespace onert
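
(Aside, not part of the patch.) createBackendContexts above splits the lowered graph into one partial graph per backend: operands go to the backend that defines them, and an operand an operation needs but does not define locally is copied in and marked external, which is what later drives migrant-tensor registration. A reduced sketch of that bucketing with std containers (PartialGraph and the string backend ids are illustrative):

#include <set>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using OpId = int;
using OperandId = int;

struct PartialGraph
{
  std::vector<OpId> ops;
  std::set<OperandId> owned;    // operands defined by this backend
  std::set<OperandId> external; // operands living in another backend's registry
};

void partition(const std::vector<std::pair<OpId, std::string>> &assignment,
               const std::unordered_map<OpId, std::vector<OperandId>> &op_io,
               const std::unordered_map<OperandId, std::string> &def_backend,
               std::unordered_map<std::string, PartialGraph> &out)
{
  for (const auto &entry : assignment)
  {
    auto &pg = out[entry.second];
    pg.ops.push_back(entry.first);
    for (OperandId ind : op_io.at(entry.first))
    {
      if (def_backend.at(ind) == entry.second)
        pg.owned.insert(ind);
      else
        pg.external.insert(ind); // resolved later via migrant-tensor registration
    }
  }
}
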
@@ -124,68 +255,31 @@ exec::IExecutor *ExecutorFactory::create(std::unique_ptr<compiler::LoweredGraph>
return _map.at(options.executor)(std::move(lowered_graph), options, executor_map);
}
-void ExecutorFactory::initializeBackendContext(compiler::LoweredGraph *lowered_graph)
-{
- struct Entry
- {
- std::vector<backend::BackendContext::OperationInfo> operation_list;
- std::vector<ir::OperandIndex> operand_list;
- };
- std::unordered_map<const backend::Backend *, Entry> backend_assets;
-
- // Build lists for operations
- lowered_graph->op_seqs().iterate(
- [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
- auto &op_seq_li = lowered_graph->getLowerInfo()->op_seq;
- auto backend = op_seq_li.at(op_seq_index)->backend();
- for (auto &operation_idx : op_seq.operations())
- {
- backend_assets[backend].operation_list.emplace_back(operation_idx, op_seq.getLayout());
- }
- });
-
- // Build lists for operands
- lowered_graph->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
- const auto lower_info = lowered_graph->getLowerInfo(ind);
- for (auto factor : lower_info->def_factors())
- {
- auto backend = factor.backend();
- backend_assets[backend].operand_list.emplace_back(ind);
- }
- });
-
- for (auto &pair : backend_assets)
- {
- auto backend = pair.first;
- auto &arg = pair.second;
- lowered_graph->backend_contexts().at(backend)->initialize(arg.operation_list, arg.operand_list);
- }
-}
-
-void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph)
+void ExecutorFactory::prepareMigrantTensors(compiler::LoweredGraph &lowered_graph,
+ const backend::BackendContexts &backend_contexts)
{
- TensorRegistries tensor_regs{lowered_graph.backend_contexts(), true};
-
- lowered_graph.op_seqs().iterate(
- [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
- auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
- auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
- for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
- ir::Remove::UNDEFINED)
+ TensorRegistries tensor_regs{backend_contexts, true};
+
+ lowered_graph.graph().operations().iterate(
+ [&](const ir::OperationIndex &op_ind, const ir::Operation &op) {
+ auto lower_info = lowered_graph.lower_info().operation.getRawPtr(op_ind);
+ auto &backend_ctx = backend_contexts.at(lower_info->backend());
+ for (auto ind :
+ (op.getInputs() + op.getOutputs()) | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
+ {
+ // If an Operation's input/output tensor does not have its own tensor object,
+ // it must be using migrant tensors, so find the tensor from other tensor registries and
+ // register it to the current tensor registry if it is portable
+ if (!backend_ctx->tensor_registry->getITensor(ind))
{
- // If an OpSequence input/output tensor does not have a own tensor object,
- // it must be using migrant tensors, so find the tensor from other tensor builders and
- // set the tensor to this tensor builder if portable
- if (!backend_ctx->tensor_registry->getITensor(ind))
- {
- auto tensor = tensor_regs.getITensor(ind);
- assert(tensor); // The tensor must have been registered
- auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
- if (ptensor)
- backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
- }
+ auto tensor = tensor_regs.getITensor(ind);
+ assert(tensor); // The tensor must have been registered
+ auto ptensor = dynamic_cast<backend::IPortableTensor *>(tensor);
+ if (ptensor)
+ backend_ctx->tensor_registry->setMigrantTensor(ind, ptensor);
}
- });
+ }
+ });
}
exec::IExecutor *
@@ -193,17 +287,17 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
const compiler::CompilerOptions &options,
const std::shared_ptr<exec::ExecutorMap> &executor_map)
{
- const auto &backend_contexts = lowered_graph->backend_contexts();
-
- initializeBackendContext(lowered_graph.get());
+ auto graph = lowered_graph->graph();
- TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};
+ backend::BackendContexts backend_contexts =
+ createBackendContexts(*lowered_graph, options.executor == "Linear");
- assert(!lowered_graph->graph().isBuildingPhase());
+ TensorRegistries tensor_regs{backend_contexts, true};
initializeSubgraphIOTensors(
- *lowered_graph, (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
- ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
+ *lowered_graph, backend_contexts,
+ (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
+ ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
// linearize
auto order = Linear::linearize(*lowered_graph);
@@ -211,20 +305,20 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
for (auto &pair : backend_contexts)
{
- pair.second->genTensors(order, lowered_graph->op_seqs(), *lowered_graph->getLowerInfo());
+ pair.second->genTensors();
}
- prepareMigrantTensors(*lowered_graph);
+ prepareMigrantTensors(*lowered_graph, backend_contexts);
- // Give some runtime objects to controlflow KernelGenerator
+ // Give some runtime objects to builtin KernelGenerator
for (auto &pair : backend_contexts)
{
- auto cf_context = dynamic_cast<backend::controlflow::BackendContext *>(pair.second.get());
- if (cf_context != nullptr)
+ auto builtin_context = dynamic_cast<backend::builtin::BackendContext *>(pair.second.get());
+ if (builtin_context != nullptr)
{
- auto cf_kernel_gen = cf_context->kernel_gen;
- cf_kernel_gen->setTensorRegistries(tensor_regs);
- cf_kernel_gen->setExecutorMap(executor_map);
+ auto builtin_kernel_gen = builtin_context->kernel_gen;
+ builtin_kernel_gen->setTensorRegistries(tensor_regs);
+ builtin_kernel_gen->setExecutorMap(executor_map);
}
}
@@ -234,41 +328,97 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
std::deque<std::pair<const backend::Backend *, backend::BackendContext *>> ordered_contexts;
for (auto &pair : backend_contexts)
{
- // NOTE controlflow backend must be processed lastly.
+ // NOTE builtin backend must be processed last.
// This is because of Permute layer's specialty which is the only operation that could have
// different ITensor objects for the input and the output. And it requires all other backends'
// tensors are ready to use.
- if (pair.first->config()->id() == "controlflow")
+ if (pair.first->config()->id() == "builtin")
ordered_contexts.emplace_back(pair.first, pair.second.get());
else
ordered_contexts.emplace_front(pair.first, pair.second.get());
}
+ // Simulate the execution for deallocation of tensors
+ std::unordered_map<ir::OperationIndex, DeallocList> dealloc_list_map;
+ {
+ ir::OperandIndexMap<uint32_t> uses_map;
+ ir::OperandIndexSequence constants;
+
+ auto model_io =
+ (graph.getInputs() + graph.getOutputs()) | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+
+ // Prepare scanning
+ graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
+ uses_map[ind] = obj.getUses().size();
+
+ if (obj.isConstant())
+ constants.append(ind);
+ });
+
+ // A trick to consider constants as an exception
+ for (const auto &ind : constants)
+ {
+ uses_map[ind]++;
+ }
+
+ for (const auto op_ind : order)
+ {
+ const auto &op = graph.operations().at(op_ind);
+ auto op_inputs = op.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+ auto op_outputs = op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED;
+
+ for (const auto &ind : op_inputs)
+ {
+ const auto &operand = graph.operands().at(ind);
+ assert(uses_map.find(ind) != uses_map.end());
+ assert(uses_map[ind] > 0);
+ uses_map[ind]--;
+ if (uses_map[ind] == 0 && !operand.info().isVariable() && !model_io.contains(ind))
+ {
+ dealloc_list_map[op_ind].emplace_back(tensor_regs.getITensor(ind));
+ }
+ }
+ }
+
+ // Dispose and validate
+ for (const auto &ind : constants)
+ {
+ --uses_map[ind];
+ }
+
+ assert(
+ std::all_of(uses_map.begin(), uses_map.end(),
+ [](std::pair<const ir::OperandIndex, uint32_t> it) { return it.second == 0; }));
+ }
+
// Generate kernels
for (auto &pair : ordered_contexts)
{
- auto codes = pair.second->genKernels(order, lowered_graph->op_seqs());
+ auto codes = pair.second->genKernels();
for (auto &pair : codes)
{
- auto &op_seq_ind = pair.first;
+ auto &op_ind = pair.first;
auto &fn_seq = pair.second;
- auto &op_seq = lowered_graph->op_seqs().at(op_seq_ind);
- auto lower_info = lowered_graph->getLowerInfo(op_seq_ind);
+ auto &op = lowered_graph->graph().operations().at(op_ind);
+ auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
if (options.he_profiling_mode)
fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
- builder.append(op_seq_ind, {&op_seq, lower_info, std::move(fn_seq)});
+ if (!dealloc_list_map[op_ind].empty())
+ fn_seq->append(std::make_unique<DeallocFunction>(dealloc_list_map[op_ind]));
+ builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
}
}
auto code_map = builder.releaseCodeMap();
- auto exec = new exec::LinearExecutor{std::move(lowered_graph), tensor_regs, std::move(code_map),
- order, options.tracing_ctx};
+ auto exec = new exec::LinearExecutor{
+ std::move(lowered_graph), std::move(backend_contexts), tensor_regs, std::move(code_map), order,
+ options.tracing_ctx};
if (!options.trace_filepath.empty())
{
std::unique_ptr<exec::IExecutionObserver> ctp = std::make_unique<exec::TracingObserver>(
- options.trace_filepath, exec->graph(), options.tracing_ctx);
+ options.trace_filepath, exec->graph(), options.tracing_ctx);
exec->addObserver(std::move(ctp));
}
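
(Aside, not part of the patch.) The dealloc simulation added in the hunk above is a use-count liveness scan: each operand starts with its number of uses, the scan walks the linear order decrementing, and the operation that drops a count to zero is the operand's last reader, so the tensor can be freed right after it. The real scan additionally exempts constants, variables, and model inputs/outputs; this standalone sketch keeps only the counting core, with int indices standing in for ir::OperandIndex / ir::OperationIndex:

#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

using OperandId = int;
struct Op
{
  std::vector<OperandId> inputs, outputs;
};

// uses is taken by value: the scan consumes the counts.
// Returns, per operation position, the operands whose last use is that operation.
std::vector<std::vector<OperandId>> lastUses(const std::vector<Op> &order,
                                             std::unordered_map<OperandId, int> uses)
{
  std::vector<std::vector<OperandId>> dealloc(order.size());
  for (std::size_t i = 0; i < order.size(); ++i)
  {
    for (OperandId in : order[i].inputs)
    {
      assert(uses[in] > 0);
      if (--uses[in] == 0) // this operation is the operand's final reader
        dealloc[i].push_back(in);
    }
  }
  return dealloc;
}
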
@@ -276,41 +426,35 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lo
}
exec::IExecutor *ExecutorFactory::createDataflowExecutor(
- std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
- const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
+ std::unique_ptr<compiler::LoweredGraph> lowered_graph, const compiler::CompilerOptions &options,
+ const std::shared_ptr<exec::ExecutorMap> &executor_map, bool parallel)
{
- const auto &backend_contexts = lowered_graph->backend_contexts();
+ backend::BackendContexts backend_contexts =
+ createBackendContexts(*lowered_graph, options.executor == "Linear");
- initializeBackendContext(lowered_graph.get());
-
- TensorRegistries tensor_regs{lowered_graph->backend_contexts(), true};
-
- assert(!lowered_graph->graph().isBuildingPhase());
+ TensorRegistries tensor_regs{backend_contexts, true};
initializeSubgraphIOTensors(
- *lowered_graph, (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
- ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
+ *lowered_graph, backend_contexts,
+ (lowered_graph->graph().getInputs() + lowered_graph->graph().getOutputs()) |
+ ir::Remove::DUPLICATED | ir::Remove::UNDEFINED);
- // linearize
- // This order is just for giving topological order info to the backens
- // TODO When we pass a partial graph to a backend, we can remove this
- auto order = Linear::linearize(*lowered_graph);
for (auto &pair : backend_contexts)
{
- pair.second->genTensors(order, lowered_graph->op_seqs(), *lowered_graph->getLowerInfo());
+ pair.second->genTensors();
}
- prepareMigrantTensors(*lowered_graph);
+ prepareMigrantTensors(*lowered_graph, backend_contexts);
- // Give some runtime objects to controlflow KernelGenerator
+ // Give some runtime objects to builtin KernelGenerator
for (auto &pair : backend_contexts)
{
- auto cf_context = dynamic_cast<backend::controlflow::BackendContext *>(pair.second.get());
- if (cf_context != nullptr)
+ auto builtin_context = dynamic_cast<backend::builtin::BackendContext *>(pair.second.get());
+ if (builtin_context != nullptr)
{
- auto cf_kernel_gen = cf_context->kernel_gen;
- cf_kernel_gen->setTensorRegistries(tensor_regs);
- cf_kernel_gen->setExecutorMap(executor_map);
+ auto builtin_kernel_gen = builtin_context->kernel_gen;
+ builtin_kernel_gen->setTensorRegistries(tensor_regs);
+ builtin_kernel_gen->setExecutorMap(executor_map);
}
}
@@ -320,11 +464,11 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
std::deque<std::pair<const backend::Backend *, backend::BackendContext *>> ordered_contexts;
for (auto &pair : backend_contexts)
{
- // NOTE controlflow backend must be processed lastly.
+ // NOTE builtin backend must be processed last.
// This is because of Permute layer's specialty which is the only operation that could have
// different ITensor objects for the input and the output. And it requires all other backends'
// tensors are ready to use.
- if (pair.first->config()->id() == "controlflow")
+ if (pair.first->config()->id() == "builtin")
ordered_contexts.emplace_back(pair.first, pair.second.get());
else
ordered_contexts.emplace_front(pair.first, pair.second.get());
@@ -333,16 +477,16 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
// Generate kernels
for (auto &pair : ordered_contexts)
{
- auto codes = pair.second->genKernels(order, lowered_graph->op_seqs());
+ auto codes = pair.second->genKernels();
for (auto &pair : codes)
{
- auto &op_seq_ind = pair.first;
+ auto &op_ind = pair.first;
auto &fn_seq = pair.second;
- auto &op_seq = lowered_graph->op_seqs().at(op_seq_ind);
- auto lower_info = lowered_graph->getLowerInfo(op_seq_ind);
+ auto &op = lowered_graph->graph().operations().at(op_ind);
+ auto lower_info = lowered_graph->lower_info().operation.getRawPtr(op_ind);
if (options.he_profiling_mode)
fn_seq->wrap<SyncFunction>(lower_info->backend()->config());
- builder.append(op_seq_ind, {&op_seq, lower_info, std::move(fn_seq)});
+ builder.append(op_ind, {op_ind, &op, lower_info, std::move(fn_seq)});
}
}
@@ -351,13 +495,14 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
exec::ExecutorBase *exec = nullptr;
if (parallel)
{
- exec = new exec::ParallelExecutor{std::move(lowered_graph), tensor_regs, std::move(code_map),
- options.tracing_ctx};
+ exec = new exec::ParallelExecutor{std::move(lowered_graph), std::move(backend_contexts),
+ tensor_regs, std::move(code_map), options.tracing_ctx};
}
else
{
- auto dataflow_exec = new exec::DataflowExecutor{std::move(lowered_graph), tensor_regs,
- std::move(code_map), options.tracing_ctx};
+ auto dataflow_exec =
+ new exec::DataflowExecutor{std::move(lowered_graph), std::move(backend_contexts), tensor_regs,
+ std::move(code_map), options.tracing_ctx};
if (options.he_profiling_mode)
{
std::vector<const backend::Backend *> backends;
@@ -367,7 +512,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
}
auto et = std::make_shared<exec::ExecTime>(backends);
std::unique_ptr<exec::IExecutionObserver> obs =
- std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
+ std::make_unique<exec::ProfileObserver>(et, dataflow_exec->graph());
dataflow_exec->addObserver(std::move(obs));
}
exec = dataflow_exec;
@@ -376,7 +521,7 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
if (!options.trace_filepath.empty())
{
std::unique_ptr<exec::IExecutionObserver> ctp = std::make_unique<exec::TracingObserver>(
- options.trace_filepath, exec->graph(), options.tracing_ctx);
+ options.trace_filepath, exec->graph(), options.tracing_ctx);
exec->addObserver(std::move(ctp));
}
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h
index 06dc691db..5fe1617a6 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.h
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.h
@@ -43,10 +43,8 @@ private:
ExecutorFactory();
private:
- static void initializeBackendContext(compiler::LoweredGraph *lowered_graph);
- static void runTensorRegistration(compiler::LoweredGraph *lowered_graph,
- const std::vector<ir::OpSequenceIndex> &order);
- static void prepareMigrantTensors(compiler::LoweredGraph &lowered_graph);
+ static void prepareMigrantTensors(compiler::LoweredGraph &lowered_graph,
+ const backend::BackendContexts &backend_contexts);
static exec::IExecutor *
createLinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
const compiler::CompilerOptions &options,
@@ -58,10 +56,10 @@ private:
private:
std::unordered_map<std::string, std::function<exec::IExecutor *(
- std::unique_ptr<compiler::LoweredGraph>,
- const compiler::CompilerOptions &options,
- const std::shared_ptr<exec::ExecutorMap> &executor_map)>>
- _map;
+ std::unique_ptr<compiler::LoweredGraph>,
+ const compiler::CompilerOptions &options,
+ const std::shared_ptr<exec::ExecutorMap> &executor_map)>>
+ _map;
};
} // namespace compiler
diff --git a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc
index 23a6a253d..5c1cef1ab 100644
--- a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc
+++ b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#if 0 // This file is temporarily unused
+
#include "Fp32ToFp16Converter.h"
#include "ir/operation/ConvertFp32ToFp16.h"
#include "ir/operation/ConvertFp16ToFp32.h"
@@ -45,7 +47,7 @@ namespace compiler
{
Fp32ToFp16Converter::Fp32ToFp16Converter(compiler::LoweredGraph &lowered_graph)
- : _lowered_graph{lowered_graph}
+ : _lowered_graph{lowered_graph}
{
VERBOSE(Fp32ToFp16Converter) << "Fp16 Enable on" << std::endl;
}
@@ -177,26 +179,26 @@ void Fp32ToFp16Converter::run()
void Fp32ToFp16Converter::appendOpSequences()
{
_lowered_graph.op_seqs().iterate(
- [&](const ir::OpSequenceIndex &op_seq_ind, ir::OpSequence &op_seq) {
- const auto lower_info = _lowered_graph.getLowerInfo(op_seq_ind);
- assert(lower_info != nullptr);
-
- // For now, the only acl_cl supports fully fp16 type
- // TODO Support fp16 on acl_neon. Current acl_neon supports the only reshape and concat
- // operations.
- // To do this, we could check the support by `operation by operation`. After that, we
- // would partition an op_seq if it contains unsupported operations.
- if (lower_info->backend()->config()->id() != kAclClBackendConfigId)
- return;
-
- // OpSeq's input set should be included in the first operation's input set or
- // OpSeq's output set should be included in the last operation's output set
- assert(checkOperandsOfOpSequence(op_seq));
-
- // Append converting OpSequence for fp16 but all operands' types are not fp16 still.
- appendNewOpSeqForConvertFp32ToFp16(op_seq_ind, op_seq);
- appendNewOpSeqForConvertFp16ToFp32(op_seq_ind, op_seq);
- });
+ [&](const ir::OpSequenceIndex &op_seq_ind, ir::OpSequence &op_seq) {
+ const auto lower_info = _lowered_graph.getLowerInfo(op_seq_ind);
+ assert(lower_info != nullptr);
+
+ // For now, only the acl_cl backend fully supports the fp16 type
+ // TODO Support fp16 on acl_neon. acl_neon currently supports only the reshape and concat
+ // operations.
+ // To do this, we could check the support by `operation by operation`. After that, we
+ // would partition an op_seq if it contains unsupported operations.
+ if (lower_info->backend()->config()->id() != kAclClBackendConfigId)
+ return;
+
+ // OpSeq's input set should be included in the first operation's input set or
+ // OpSeq's output set should be included in the last operation's output set
+ assert(checkOperandsOfOpSequence(op_seq));
+
+ // Append converting OpSequences for fp16; the operands' types are not converted to fp16 yet.
+ appendNewOpSeqForConvertFp32ToFp16(op_seq_ind, op_seq);
+ appendNewOpSeqForConvertFp16ToFp32(op_seq_ind, op_seq);
+ });
}
//
@@ -253,7 +255,7 @@ void Fp32ToFp16Converter::appendNewOpSeqForConvertFp32ToFp16(const ir::OpSequenc
const auto new_op_seq_ind = newOpSequence(op_seq_ind, new_node_ind);
// set new lower_info for op_seq
- setNewOpSequenceLowerInfo(op_seq_ind, new_op_seq_ind);
+ setNewOperationLowerInfo(op_seq_ind, new_op_seq_ind);
_list_fp32_to_fp16.insert(new_op_seq_ind);
@@ -326,7 +328,7 @@ void Fp32ToFp16Converter::appendNewOpSeqForConvertFp16ToFp32(const ir::OpSequenc
auto new_op_seq_ind = newOpSequence(op_seq_ind, new_node_ind);
// set new lower_info for op_seq
- setNewOpSequenceLowerInfo(op_seq_ind, new_op_seq_ind);
+ setNewOperationLowerInfo(op_seq_ind, new_op_seq_ind);
_list_fp16_to_fp32.insert(new_op_seq_ind);
@@ -372,16 +374,16 @@ void Fp32ToFp16Converter::optimize()
void Fp32ToFp16Converter::convertOperands()
{
_lowered_graph.op_seqs().iterate(
- [&](const ir::OpSequenceIndex &op_seq_ind, ir::OpSequence &op_seq) {
- const auto lower_info = _lowered_graph.getLowerInfo(op_seq_ind);
- assert(lower_info != nullptr);
- // For now, the only acl_cl supports fully fp16
- if (lower_info->backend()->config()->id() != kAclClBackendConfigId)
- return;
-
- // Convert input,output operands' type to fp16
- convertOperandsOfOpSequence(op_seq);
- });
+ [&](const ir::OpSequenceIndex &op_seq_ind, ir::OpSequence &op_seq) {
+ const auto lower_info = _lowered_graph.getLowerInfo(op_seq_ind);
+ assert(lower_info != nullptr);
+ // For now, only the acl_cl backend fully supports fp16
+ if (lower_info->backend()->config()->id() != kAclClBackendConfigId)
+ return;
+
+ // Convert input/output operands' types to fp16
+ convertOperandsOfOpSequence(op_seq);
+ });
}
void Fp32ToFp16Converter::convertOperandsOfOpSequence(ir::OpSequence &op_seq)
@@ -405,7 +407,7 @@ void Fp32ToFp16Converter::convertOperandsOfOpSequence(ir::OpSequence &op_seq)
obj.type(ir::DataType::FLOAT16);
- VERBOSE(Fp32ToFp16Converter) << "Input Operand #" << ind.value() << ": fp16" << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "Input Operand " << ind << ": fp16" << std::endl;
}
for (auto &ind : node.getOutputs())
@@ -419,7 +421,7 @@ void Fp32ToFp16Converter::convertOperandsOfOpSequence(ir::OpSequence &op_seq)
obj.type(ir::DataType::FLOAT16);
- VERBOSE(Fp32ToFp16Converter) << "Output Operand #" << ind.value() << ": fp16" << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "Output Operand " << ind << ": fp16" << std::endl;
}
}
}
@@ -444,7 +446,7 @@ void Fp32ToFp16Converter::convertDatas()
obj.data(std::move(new_data));
obj.type(ir::DataType::FLOAT16);
- VERBOSE(Fp32ToFp16Converter) << "Constant Operand #" << ind.value() << ": fp16" << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "Constant Operand " << ind << ": fp16" << std::endl;
}
});
}
@@ -515,21 +517,21 @@ void Fp32ToFp16Converter::setNewOperandLowerInfo(const ir::OpSequenceIndex &op_s
{
const auto lower_info = _lowered_graph.getLowerInfo(op_seq_ind);
assert(lower_info != nullptr);
- auto new_lower_info = std::make_unique<ir::operand::LowerInfo>();
- auto permute_factor = ir::operand::PermuteFactor(lower_info->backend(), lower_info->layout());
+ auto new_lower_info = std::make_unique<compiler::OperandLowerInfo>();
+ auto permute_factor = compiler::PermuteFactor(lower_info->backend(), lower_info->layout());
new_lower_info->addDefPermuteFactor(permute_factor);
new_lower_info->addUsePermuteFactor(permute_factor);
_lowered_graph.setLowerInfo(new_op_ind, std::move(new_lower_info));
}
-void Fp32ToFp16Converter::setNewOpSequenceLowerInfo(const ir::OpSequenceIndex &op_seq_ind,
- const ir::OpSequenceIndex &new_op_seq_ind)
+void Fp32ToFp16Converter::setNewOperationLowerInfo(const ir::OpSequenceIndex &op_seq_ind,
+ const ir::OpSequenceIndex &new_op_seq_ind)
{
const auto lower_info = _lowered_graph.getLowerInfo(op_seq_ind);
assert(lower_info != nullptr);
auto new_lower_info =
- std::make_unique<ir::operation::LowerInfo>(lower_info->backend(), lower_info->layout());
+ std::make_unique<compiler::OperationLowerInfo>(lower_info->backend(), lower_info->layout());
_lowered_graph.setLowerInfo(new_op_seq_ind, std::move(new_lower_info));
}
@@ -600,7 +602,7 @@ Fp32ToFp16Converter::newOperationConvertFp32ToFp16(const ir::OperandIndex &op_se
auto &new_op_obj = operands.at(new_op_ind);
std::unique_ptr<ir::Operation> new_node(
- new ir::operation::ConvertFp32ToFp16({op_seq_input_ind}, {new_op_ind}));
+ new ir::operation::ConvertFp32ToFp16({op_seq_input_ind}, {new_op_ind}));
const auto new_node_ind = operations.push(std::move(new_node));
input_obj.insertUse(new_node_ind);
@@ -620,7 +622,7 @@ Fp32ToFp16Converter::newOperationConvertFp16ToFp32(const ir::OperandIndex &op_se
auto &new_op_obj = operands.at(new_op_ind);
std::unique_ptr<ir::Operation> new_node(
- new ir::operation::ConvertFp16ToFp32({new_op_ind}, {op_seq_output_ind}));
+ new ir::operation::ConvertFp16ToFp32({new_op_ind}, {op_seq_output_ind}));
const auto new_node_ind = operations.push(std::move(new_node));
new_op_obj.insertUse(new_node_ind);
@@ -759,9 +761,8 @@ Fp32ToFp16Converter::findOpSequencesContiguous(const InputToOpSeqs &input_to_op_
opseq_map_to_delete[op_seq_ind_fp16_to_fp32].insert(op_seq_ind);
}
- VERBOSE(Fp32ToFp16Converter)
- << "Contiguous from OpSeq#" << op_seq_ind_fp16_to_fp32.value() << "(ToFp32)"
- << " to OpSeq#" << op_seq_ind.value() << "(ToFp16)" << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "Contiguous from " << op_seq_ind_fp16_to_fp32 << "(ToFp32)"
+ << " to " << op_seq_ind << "(ToFp16)" << std::endl;
}
}
}
@@ -842,7 +843,7 @@ Fp32ToFp16Converter::findOperationsToDelete(const OpSeqIndexList &list_to_delete
}
void Fp32ToFp16Converter::manipulateContiguousOpSequences(
- const InputToOpSeqs &input_to_op_seqs, const OpSeqIndexToOpSeqIndexList &opseq_map_to_delete)
+ const InputToOpSeqs &input_to_op_seqs, const OpSeqIndexToOpSeqIndexList &opseq_map_to_delete)
{
auto &op_seqs = _lowered_graph.op_seqs();
@@ -894,8 +895,7 @@ void Fp32ToFp16Converter::manipulateContiguousOpSequences(
}
void Fp32ToFp16Converter::deleteContiguousOpSequences(
- const OpSeqIndexList &list_to_delete_op_seqs,
- const ir::OperandIndexSequence &list_to_delete_ops)
+ const OpSeqIndexList &list_to_delete_op_seqs, const ir::OperandIndexSequence &list_to_delete_ops)
{
auto &operands = _lowered_graph.graph().operands();
auto &operations = _lowered_graph.graph().operations();
@@ -905,21 +905,21 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences(
{
auto &op_seq = op_seqs.at(op_seq_ind);
assert(op_seq.size() == 1);
- VERBOSE(Fp32ToFp16Converter) << "Delete OpSeq #" << op_seq_ind.value() << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "Delete OpSeq " << op_seq_ind << std::endl;
auto &first_node_ind = op_seq.operations().at(0);
auto &first_node = operations.at(first_node_ind);
assert(first_node.opcode() == ir::OpCode::ConvertFp32ToFp16 ||
first_node.opcode() == ir::OpCode::ConvertFp16ToFp32);
- VERBOSE(Fp32ToFp16Converter) << "Delete Node #" << first_node_ind.value() << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "Delete Node " << first_node_ind << std::endl;
// Uses
for (auto &ind : first_node.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
{
auto &obj = operands.at(ind);
obj.removeUse(first_node_ind);
- VERBOSE(Fp32ToFp16Converter) << "Operand #" << ind.value() << "'s Use(Node#"
- << first_node_ind.value() << ") is removed" << std::endl;
+ VERBOSE(Fp32ToFp16Converter)
+ << "Operand " << ind << "'s Use(Node" << first_node_ind << ") is removed" << std::endl;
}
// Def
@@ -928,27 +928,29 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences(
auto &obj = operands.at(ind);
assert(obj.getDef() == first_node_ind);
obj.unsetDef();
- VERBOSE(Fp32ToFp16Converter) << "Operand #" << ind.value() << "'s Def(Node#"
- << first_node_ind.value() << ") is removed" << std::endl;
+ VERBOSE(Fp32ToFp16Converter)
+ << "Operand " << ind << "'s Def(Node" << first_node_ind << ") is removed" << std::endl;
}
// Operation
operations.remove(first_node_ind);
- VERBOSE(Fp32ToFp16Converter) << "Node#" << first_node_ind.value() << " is removed" << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "Node" << first_node_ind << " is removed" << std::endl;
// OpSequence
op_seqs.remove(op_seq_ind);
- VERBOSE(Fp32ToFp16Converter) << "OpSeq#" << op_seq_ind.value() << " is removed" << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "OpSeq" << op_seq_ind << " is removed" << std::endl;
}
// Operand
for (auto &ind : list_to_delete_ops)
{
operands.remove(ind);
- VERBOSE(Fp32ToFp16Converter) << "Operand #" << ind.value() << " is removed" << std::endl;
+ VERBOSE(Fp32ToFp16Converter) << "Operand " << ind << " is removed" << std::endl;
}
}
} // namespace compiler
} // namespace onert
+
+#endif
diff --git a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.h b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.h
index eeecb9846..87751ceb4 100644
--- a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.h
+++ b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.h
@@ -14,6 +14,8 @@
* limitations under the License.
*/
+#if 0 // This file is temporarily unused
+
#ifndef __ONERT_COMPILER_FP32_TO_FP16_CONVERTER_H__
#define __ONERT_COMPILER_FP32_TO_FP16_CONVERTER_H__
@@ -64,8 +66,8 @@ private:
void setNewOperandLowerInfo(const ir::OpSequenceIndex &op_seq_ind,
const ir::OperandIndex &new_op_ind);
- void setNewOpSequenceLowerInfo(const ir::OpSequenceIndex &op_seq_ind,
- const ir::OpSequenceIndex &new_op_seq_ind);
+ void setNewOperationLowerInfo(const ir::OpSequenceIndex &op_seq_ind,
+ const ir::OpSequenceIndex &new_op_seq_ind);
void manipulateInput(const ir::OpSequenceIndex &op_seq_ind,
const ir::OperandIndex &op_seq_input_ind,
@@ -99,3 +101,5 @@ private:
} // namespace onert
#endif // __ONERT_COMPILER_FP32_TO_FP16_CONVERTER_H__
+
+#endif
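
These Fp32ToFp16Converter hunks compile the pass out rather than deleting it: the entire translation unit, include guards and all, is wrapped in an outer #if 0 / #endif pair so the file stays in the tree but contributes no code until re-enabled. A minimal sketch of the resulting header layout, using an illustrative guard name rather than the real one:

#if 0 // This file is temporarily unused

#ifndef __EXAMPLE_CONVERTER_H__
#define __EXAMPLE_CONVERTER_H__

// ... the existing declarations remain here, untouched ...

#endif // __EXAMPLE_CONVERTER_H__

#endif
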
diff --git a/runtime/onert/core/src/compiler/HEScheduler.cc b/runtime/onert/core/src/compiler/HEScheduler.cc
index 349b1c221..2f996c8e8 100644
--- a/runtime/onert/core/src/compiler/HEScheduler.cc
+++ b/runtime/onert/core/src/compiler/HEScheduler.cc
@@ -96,12 +96,12 @@ namespace compiler
void HEScheduler::scheduleShufflingBackends()
{
VERBOSE(HEScheduler::schedule)
- << "Started task scheduling: uses all backends to get more metrics for data transfer"
- << std::endl;
+ << "Started task scheduling: uses all backends to get more metrics for data transfer"
+ << std::endl;
size_t backend_ind = 0;
for (const auto &rank : _rank_to_op)
{
- VERBOSE(HEScheduler::schedule) << "scheduling (" << rank.second.value() << ")" << std::endl;
+ VERBOSE(HEScheduler::schedule) << "scheduling (" << rank.second << ")" << std::endl;
const auto &node = _graph->operations().at(rank.second);
const bool quant = isQuant(*_graph, node);
const auto size = getOperationsFlattenedIOSize(*_graph, node);
@@ -123,7 +123,7 @@ void HEScheduler::scheduleShufflingBackends()
continue;
}
const auto exec_time =
- _exec_time->getOperationExecTime(_all_backends[backend_ind], node.name(), quant, size);
+ _exec_time->getOperationExecTime(_all_backends[backend_ind], node.name(), quant, size);
// Scheduling to measure data transfer must be done after measuring all backends separately
assert(exec_time != _exec_time->NOT_FOUND);
if (exec_time == _exec_time->getMax())
@@ -227,7 +227,7 @@ std::unique_ptr<compiler::BackendResolver> HEScheduler::schedule(const ir::Graph
ir::OperationIndexMap<bool> visited;
graph.operations().iterate(
- [&](const ir::OperationIndex &index, const ir::Operation &) { visited[index] = false; });
+ [&](const ir::OperationIndex &index, const ir::Operation &) { visited[index] = false; });
// for each task select the backend with the smallest earliest finishing time(eft)
for (const auto &rank : _rank_to_op)
{
@@ -267,9 +267,9 @@ int64_t HEScheduler::tryBackend(const ir::Operation &node, const backend::Backen
if (!_is_profiling_mode)
{
VERBOSE(HEScheduler::tryBackend)
- << "Trying to HE schedule while there is no profiling info for " << node.name()
- << " on backend " << backend->config()->id() << ". So this backend won't be used. "
- << std::endl;
+ << "Trying to HE schedule while there is no profiling info for " << node.name()
+ << " on backend " << backend->config()->id() << ". So this backend won't be used. "
+ << std::endl;
_is_supported[backend][node.name()] = false;
return _exec_time->getMax();
}
@@ -300,7 +300,7 @@ void HEScheduler::makeRank()
VERBOSE(HEScheduler::makeRank) << "task prioritizing" << std::endl;
_graph->operations().iterate(
- [&](const ir::OperationIndex &index, const ir::Operation &) { DFSMaxRank(index); });
+ [&](const ir::OperationIndex &index, const ir::Operation &) { DFSMaxRank(index); });
// Check that ranks are calculated for all operations(nodes)
_graph->operations().iterate([&](const ir::OperationIndex &index, const ir::Operation &) {
@@ -369,8 +369,8 @@ int64_t HEScheduler::DFSMaxRank(const ir::OperationIndex &index)
assert(rank >= 0);
_rank_to_op.emplace(rank, index);
_op_to_rank->emplace(index, rank);
- VERBOSE(HEScheduler::DFSMaxRank) << "rank of operation (" << index.value() << ")" << node.name()
- << " is " << rank << std::endl;
+ VERBOSE(HEScheduler::DFSMaxRank)
+ << "rank of operation (" << index << ")" << node.name() << " is " << rank << std::endl;
return rank;
}
@@ -393,9 +393,9 @@ int64_t HEScheduler::DFSChildrenMaxRank(const ir::OperationIndex &index)
{
continue;
}
- // TODO Change it to controlflow backend
+ // TODO Change it to builtin backend
auto transfer_cost =
- getPermuteTime(backend, other_backend, quant, operand.info().total_size());
+ getPermuteTime(backend, other_backend, quant, operand.info().total_size());
avg_transfer_cost += transfer_cost;
}
}
@@ -428,7 +428,7 @@ int64_t HEScheduler::backendAvailableTime(const backend::Backend *backend,
bool HEScheduler::schedule(const ir::OperationIndex &index, const backend::Backend *parent_backend)
{
- VERBOSE(HEScheduler::schedule) << "scheduling (" << index.value() << ")" << std::endl;
+ VERBOSE(HEScheduler::schedule) << "scheduling (" << index << ")" << std::endl;
int64_t eft = std::numeric_limits<int64_t>::max(), selected_exec_time = 0;
const auto &node = _graph->operations().at(index);
@@ -551,18 +551,18 @@ HEScheduler::ESTAndExecTime(const backend::Backend *backend, const ir::Operation
if (!_is_parallel_exec)
{
VERBOSE(HEScheduler::ESTAndExecTime)
- << "exec_time of (" << index.value() << ") " << node.name() << " quant==" << quant << " on "
- << backend->config()->id() << " is " << exec_time
- << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl;
+ << "exec_time of (" << index << ") " << node.name() << " quant==" << quant << " on "
+ << backend->config()->id() << " is " << exec_time
+ << " microseconds. Data transfer cost: " << total_transfer_cost << std::endl;
return {total_transfer_cost, exec_time};
}
VERBOSE(HEScheduler::ESTAndExecTime)
- << "exec_time of (" << index.value() << ") " << node.name() << " quant==" << quant << " on "
- << backend->config()->id() << ": " << exec_time
- << " microseconds. Backend available time: " << prev_op_ft
- << " Parent's max eft: " << max_pred_eft - total_transfer_cost
- << " data transfer cost: " << total_transfer_cost << std::endl;
+ << "exec_time of (" << index << ") " << node.name() << " quant==" << quant << " on "
+ << backend->config()->id() << ": " << exec_time
+ << " microseconds. Backend available time: " << prev_op_ft
+ << " Parent's max eft: " << max_pred_eft - total_transfer_cost
+ << " data transfer cost: " << total_transfer_cost << std::endl;
return {prev_op_ft, exec_time};
}
@@ -587,7 +587,7 @@ int64_t HEScheduler::predMaxEFT(const backend::Backend *backend, const ir::Opera
{
// Multiply operand size by 2 because size must describe input+output size
int64_t transfer_cost =
- getPermuteTime(parent_backend, backend, quant, input_operand.info().total_size() * 2);
+ getPermuteTime(parent_backend, backend, quant, input_operand.info().total_size() * 2);
transfer_st_exec_time.emplace(_ops_eft.at(input_node_idx), transfer_cost);
}
}
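
The ESTAndExecTime and predMaxEFT hunks above implement a HEFT-style earliest-finish-time rule: an operation can start no earlier than its backend becomes free, and no earlier than its last input arrives, where an input's arrival is its producer's finish time plus the permute (data transfer) cost. A self-contained sketch of that rule, with illustrative names rather than onert's API:

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Earliest start time: the backend must be free and every input must have
// arrived (each predecessor's EFT plus its data-transfer cost).
int64_t earliestStartTime(int64_t backend_available_time,
                          const std::vector<int64_t> &pred_eft_plus_transfer)
{
  int64_t max_pred = 0;
  for (int64_t t : pred_eft_plus_transfer)
    max_pred = std::max(max_pred, t);
  return std::max(backend_available_time, max_pred);
}

int main()
{
  // Backend free at t=100us; inputs arrive at t=80us and t=120us.
  int64_t est = earliestStartTime(100, {80, 120});
  int64_t exec_time = 50; // measured execution time on this backend
  std::cout << "EFT = " << (est + exec_time) << " us" << std::endl; // 170 us
}
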
diff --git a/runtime/onert/core/src/compiler/HEScheduler.h b/runtime/onert/core/src/compiler/HEScheduler.h
index b9cee5881..1a95b9881 100644
--- a/runtime/onert/core/src/compiler/HEScheduler.h
+++ b/runtime/onert/core/src/compiler/HEScheduler.h
@@ -50,26 +50,26 @@ public:
* @param[in] model Graph model
* @param[in] backend_resolver backend resolver
*/
- HEScheduler(const backend::BackendContexts &backend_contexts, const CompilerOptions &options)
- : _is_supported{}, _backends_avail_time{}, _ops_eft{},
- _op_to_rank{std::make_shared<ir::OperationIndexMap<int64_t>>()},
- _is_profiling_mode{options.he_profiling_mode},
- _is_linear_exec{options.executor == "Linear"},
- _is_parallel_exec{options.executor == "Parallel"}
+ HEScheduler(const std::vector<const backend::Backend *> &backends, const CompilerOptions &options)
+ : _is_supported{}, _backends_avail_time{}, _ops_eft{},
+ _op_to_rank{std::make_shared<ir::OperationIndexMap<int64_t>>()},
+ _is_profiling_mode{options.he_profiling_mode}, _is_linear_exec{options.executor == "Linear"},
+ _is_parallel_exec{options.executor == "Parallel"}
{
- for (auto &entry : backend_contexts)
+ for (auto entry : backends)
{
- if (entry.first->config()->id() == backend::controlflow::Config::ID)
+ if (entry->config()->id() == backend::builtin::Config::ID)
continue;
- _all_backends.push_back(entry.first);
+ _all_backends.push_back(entry);
}
_backend_resolver = std::make_unique<compiler::BackendResolver>();
_exec_time = std::make_unique<exec::ExecTime>(_all_backends);
// Find cpu backend
- auto cpu_backend_it = std::find_if(
- _all_backends.begin(), _all_backends.end(),
- [](const backend::Backend *backend) { return backend->config()->id() == "cpu"; });
+ auto cpu_backend_it =
+ std::find_if(_all_backends.begin(), _all_backends.end(), [](const backend::Backend *backend) {
+ return backend->config()->id() == "cpu";
+ });
if (cpu_backend_it == _all_backends.end())
throw std::runtime_error("HEScheduler could be used only if 'cpu' backend is available");
_cpu_backend = *cpu_backend_it;
@@ -173,7 +173,7 @@ private:
std::unique_ptr<exec::ExecTime> _exec_time;
const ir::Graph *_graph{nullptr};
std::vector<const backend::Backend *> _all_backends;
- const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to controlflow_backend
+ const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to _builtin_backend
bool _is_profiling_mode;
bool _is_linear_exec;
bool _is_parallel_exec;
diff --git a/runtime/onert/core/src/compiler/Linear.cc b/runtime/onert/core/src/compiler/Linear.cc
index fdd2a7653..73ba96238 100644
--- a/runtime/onert/core/src/compiler/Linear.cc
+++ b/runtime/onert/core/src/compiler/Linear.cc
@@ -15,48 +15,37 @@
*/
#include <algorithm>
+#include <sstream>
#include "Linear.h"
#include "backend/IConfig.h"
#include "backend/Backend.h"
#include "util/logging.h"
+#include "dumper/text/GraphDumper.h"
namespace onert
{
namespace compiler
{
-std::vector<ir::OpSequenceIndex> Linear::linearize(const compiler::LoweredGraph &lowered_graph)
+// TODO(easy) Change the LoweredGraph param to Graph
+std::vector<ir::OperationIndex> Linear::linearize(const compiler::LoweredGraph &lowered_graph)
{
- std::vector<ir::OpSequenceIndex> order;
- lowered_graph.iterateTopolOpSeqs(
- [&](const ir::OpSequenceIndex &index, const ir::OpSequence &) -> void {
- order.emplace_back(index);
- });
- return order;
+ return lowered_graph.graph().topolSortOperations();
}
+// TODO(easy) Change the LoweredGraph param to Graph
void Linear::dump(const compiler::LoweredGraph &lowered_graph,
- const std::vector<ir::OpSequenceIndex> &order)
+ const std::vector<ir::OperationIndex> &order)
{
+ for (const auto ind : order)
{
- const auto &toString = [](const onert::backend::Backend *backend) {
- assert(backend);
- std::string str;
- str += backend->config()->id();
- return "{" + str + "}";
- };
-
- VERBOSE(Linear) << "Final OpSequence" << std::endl;
- for (const auto index : order)
- {
- const auto &op_seq = lowered_graph.op_seqs().at(index);
- const auto lower_info = lowered_graph.getLowerInfo(index);
- const auto &operations = lowered_graph.graph().operations();
- VERBOSE(Linear) << "* OP_SEQ " << toString(lower_info->backend()) << " "
- << ir::getStrFromOpSeq(op_seq, operations) << std::endl;
- }
+    // TODO Could the logging system handle this? (inserting a prefix for each line)
+ std::istringstream iss{dumper::text::formatOperation(lowered_graph.graph(), ind)};
+ std::string line;
+ while (std::getline(iss, line))
+ VERBOSE(GraphDumper) << line << std::endl;
}
}
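
With this change, linearize is a thin wrapper over Graph::topolSortOperations, so the execution order is a plain topological order of operations rather than an OpSequence traversal. For reference, a self-contained Kahn-style topological sort over a DAG given as adjacency lists; this is illustrative code, not the onert implementation:

#include <cstddef>
#include <iostream>
#include <queue>
#include <vector>

std::vector<int> topoSort(const std::vector<std::vector<int>> &succ)
{
  std::vector<int> indeg(succ.size(), 0);
  std::vector<int> order;
  for (const auto &outs : succ)
    for (int v : outs)
      indeg[v] += 1;
  std::queue<int> ready;
  for (std::size_t v = 0; v < succ.size(); ++v)
    if (indeg[v] == 0)
      ready.push(static_cast<int>(v)); // nodes with no unprocessed predecessors
  while (!ready.empty())
  {
    int u = ready.front();
    ready.pop();
    order.push_back(u);
    for (int v : succ[u])
    {
      indeg[v] -= 1;
      if (indeg[v] == 0)
        ready.push(v);
    }
  }
  return order; // each node appears after all of its predecessors
}

int main()
{
  // 0 -> 1 -> 3 and 0 -> 2 -> 3
  std::vector<std::vector<int>> succ{{1, 2}, {3}, {3}, {}};
  for (int v : topoSort(succ))
    std::cout << v << " "; // e.g. "0 1 2 3"
  std::cout << std::endl;
}
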
diff --git a/runtime/onert/core/src/compiler/Linear.h b/runtime/onert/core/src/compiler/Linear.h
index 56b42ccb0..9ac9a0139 100644
--- a/runtime/onert/core/src/compiler/Linear.h
+++ b/runtime/onert/core/src/compiler/Linear.h
@@ -20,29 +20,20 @@
#include <vector>
#include <memory>
-#include "ir/OpSequences.h"
#include "ir/Index.h"
#include "compiler/LoweredGraph.h"
namespace onert
{
-namespace ir
-{
-struct OperationVisitor;
-} // namespace ir
-} // namespace onert
-
-namespace onert
-{
namespace compiler
{
class Linear
{
public:
- static std::vector<ir::OpSequenceIndex> linearize(const compiler::LoweredGraph &lowered_graph);
+ static std::vector<ir::OperationIndex> linearize(const compiler::LoweredGraph &lowered_graph);
static void dump(const compiler::LoweredGraph &lowered_graph,
- const std::vector<ir::OpSequenceIndex> &order);
+ const std::vector<ir::OperationIndex> &order);
};
} // namespace compiler
diff --git a/runtime/onert/core/src/compiler/LoweredGraph.cc b/runtime/onert/core/src/compiler/LoweredGraph.cc
index 6d5210dc5..b469b991b 100644
--- a/runtime/onert/core/src/compiler/LoweredGraph.cc
+++ b/runtime/onert/core/src/compiler/LoweredGraph.cc
@@ -17,6 +17,7 @@
#include "compiler/LoweredGraph.h"
#include <assert.h>
+#include <algorithm>
#include <sstream>
#include "util/logging.h"
#include "compiler/pass/ConstantInsertionPass.h"
@@ -25,7 +26,7 @@
#include "compiler/pass/PermutationOperationPass.h"
#include "compiler/pass/PermutationInsertionPass.h"
#include "compiler/pass/PermutationEliminationPass.h"
-#include "ir/GraphIterator.h"
+#include "dumper/text/GraphDumper.h"
#include "ir/verifier/Verifier.h"
#include "backend/Backend.h"
#include "backend/IConfig.h"
@@ -48,16 +49,8 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option
options.tracing_ctx->setSubgraphIndex(&_graph, subgraph_index.value());
}
- bool linear_executor = (options.executor == "Linear");
-
// Build backend contexts
auto &backend_manager = BackendManager::get();
-
- // Always create Controlflow backend context
- auto cf_backend = backend_manager.getControlflow();
- _backend_contexts.emplace(
- cf_backend, cf_backend->newContext(_graph, _graph.getKernelBuilder(), linear_executor));
-
// Create contexts for other backends
for (auto backend_str : options.backend_list)
{
@@ -72,9 +65,6 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option
VERBOSE(LoweredGraph) << "Cannot load backend - " << backend_str << std::endl;
continue;
}
-
- _backend_contexts.emplace(
- backend, backend->newContext(_graph, _graph.getKernelBuilder(), linear_executor));
}
if (backend_manager.num_backends() == 0)
throw std::runtime_error{"No available backends loaded."};
@@ -82,305 +72,115 @@ LoweredGraph::LoweredGraph(const ir::Graph &graph, const CompilerOptions &option
// TODO Move "schedule" phase out of here
// Schedule
std::unique_ptr<BackendResolver> backend_resolver;
+ auto all_backends = backend_manager.getAll();
if (options.he_scheduler)
{
- auto scheduler = HEScheduler(_backend_contexts, options);
+ auto scheduler = HEScheduler(all_backends, options);
backend_resolver = scheduler.schedule(_graph);
_indexed_ranks = scheduler.getIndexedRanks();
}
else
{
- auto scheduler = ManualScheduler(_backend_contexts, options);
+ auto scheduler = ManualScheduler(all_backends, options);
backend_resolver = scheduler.schedule(_graph);
}
- {
- // operand::LowerInfo holder
- ir::OperandIndexMap<std::unique_ptr<ir::operand::LowerInfo>> operands_lower_info;
-
- _graph.operands().iterate([&](const ir::OperandIndex &index, const ir::Operand &) {
- operands_lower_info[index] = std::make_unique<ir::operand::LowerInfo>();
- });
-
- // Make op_seqs while checking whether a node can be merged into a op_seq.
- makeOpSequences(operands_lower_info, options, *backend_resolver);
-
- _op_seqs.iterate([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- assert(op_seq.operations().size() > 0);
- std::reverse(std::begin(op_seq.operations()), std::end(op_seq.operations()));
- });
+ makeLowerInfo(*backend_resolver);
+ VERBOSE(LoweredGraph) << "dump before mandatory passes" << std::endl;
+ dumper::text::dumpLoweredGraph(*this);
- VERBOSE(OpSequences) << "dump before permutation insertion" << std::endl;
- dumpOpSequences(_op_seqs, _graph.operations());
-
- // Mandatory passes
- pass::PassRunner{}
- .append(std::make_unique<pass::ConstantInsertionPass>(*this))
- .append(std::make_unique<pass::ConstantLoweringPass>(*this))
- .run();
-
- // Set LowerInfo for each operand from the operand::LowerInfo holder
- manipulateLowerInfo(operands_lower_info);
-
- dumpLowerInfo();
- }
-
- // Mandatory passes
+ // Mandatory passes - kind of legalization(?)
pass::PassRunner{}
- .append(std::make_unique<pass::PermutationOperationPass>(*this))
- .append(std::make_unique<pass::PermutationInsertionPass>(*this))
- .run();
+ .append(std::make_unique<pass::ConstantInsertionPass>(*this))
+ .append(std::make_unique<pass::ConstantLoweringPass>(*this))
+ .append(std::make_unique<pass::PermutationOperationPass>(*this))
+ .append(std::make_unique<pass::PermutationInsertionPass>(*this))
+ .run();
+
+ dumpLowerInfo();
- // Optimization passes
+ // Optimization passes (optional)
pass::PassRunner{}.append(std::make_unique<pass::PermutationEliminationPass>(*this)).run();
- VERBOSE(LoweredGraph) << "Dump after permutation insertion" << std::endl;
+ VERBOSE(LoweredGraph) << "Dump after all the passes" << std::endl;
for (auto operand : _graph.getInputs())
VERBOSE(LoweredGraph) << "Graph Input : " << operand << std::endl;
for (auto operand : _graph.getOutputs())
VERBOSE(LoweredGraph) << "Graph Output : " << operand << std::endl;
- dumpOpSequences(_op_seqs, _graph.operations());
+ dumper::text::dumpLoweredGraph(*this);
// Graph verifications
{
assert(ir::verifier::InputOutputChecker().verify(_graph));
assert(ir::verifier::DAGChecker().verify(_graph));
- assert(ir::verifier::EdgeConsistencyChecker().verify(_graph));
+ assert(ir::verifier::EdgeChecker().verify(_graph));
}
}
-const ir::operation::LowerInfo *
-LoweredGraph::getLowerInfo(const ir::OpSequenceIndex &op_seq_index) const
-{
- auto itr = _lower_info_map.op_seq.find(op_seq_index);
- if (itr == _lower_info_map.op_seq.end())
- return nullptr;
- return itr->second.get();
-}
-
-void LoweredGraph::setLowerInfo(const ir::OpSequenceIndex &op_seq_index,
- std::unique_ptr<ir::operation::LowerInfo> &&lower_info)
+void LoweredGraph::makeLowerInfo(const compiler::BackendResolver &backend_resolver)
{
- _lower_info_map.op_seq.insert(std::make_pair(op_seq_index, std::move(lower_info)));
-}
+ _graph.operands().iterate([&](const ir::OperandIndex &index, const ir::Operand &) {
+ lower_info().operand.set(index, std::make_unique<OperandLowerInfo>());
+ });
-void LoweredGraph::removeLowerInfo(const ir::OpSequenceIndex &op_seq_index)
-{
- auto &op_seq_lower_info = _lower_info_map.op_seq;
- assert(op_seq_lower_info.find(op_seq_index) != op_seq_lower_info.end());
- for (auto it = op_seq_lower_info.begin(); it != op_seq_lower_info.end(); ++it)
- {
- if (it->first == op_seq_index)
+  // Set operand lower info using the backends assigned to each operation
+ _graph.operations().iterate([&](const ir::OperationIndex &op_ind, const ir::Operation &) {
+ const ir::Operation &op = _graph.operations().at(op_ind);
+ auto backend = backend_resolver.getBackend(op_ind);
+ if (!backend)
{
- op_seq_lower_info.erase(it);
- break;
+      throw std::runtime_error{"Failed to find backend for " + op.name() + " operation"};
}
- }
-}
-
-const ir::operand::LowerInfo *LoweredGraph::getLowerInfo(const ir::OperandIndex &index) const
-{
- auto itr = _lower_info_map.operand.find(index);
- if (itr == _lower_info_map.operand.end())
- return nullptr;
- return itr->second.get();
-}
-
-ir::operand::LowerInfo *LoweredGraph::getLowerInfo(const ir::OperandIndex &index)
-{
- auto itr = _lower_info_map.operand.find(index);
- if (itr == _lower_info_map.operand.end())
- return nullptr;
- return itr->second.get();
-}
-
-void LoweredGraph::setLowerInfo(const ir::OperandIndex &index,
- std::unique_ptr<ir::operand::LowerInfo> &&lower_info)
-{
- _lower_info_map.operand.insert(std::make_pair(index, std::move(lower_info)));
-}
-
-void LoweredGraph::removeLowerInfo(const ir::OperandIndex &index)
-{
- _lower_info_map.operand.erase(index);
-}
-
-void LoweredGraph::iterateTopolOpSeqs(
- const std::function<void(const ir::OpSequenceIndex &, const ir::OpSequence &)> &fn) const
-{
- // Topological Sorting for ir::OpSequences
- std::vector<ir::OpSequenceIndex> topol_sorted;
- ir::PostDfsIterator<true>{}.iterateOpSeqs(
- *this, [&](const ir::OpSequenceIndex &index, const ir::OpSequence &) {
- topol_sorted.emplace_back(index);
- });
- std::reverse(topol_sorted.begin(), topol_sorted.end());
- for (const auto op_seq_idx : topol_sorted)
- {
- const auto &op_seq = _op_seqs.at(op_seq_idx);
- fn(op_seq_idx, op_seq);
- }
-}
-
-void LoweredGraph::iterateTopolOpSeqs(
- const std::function<void(const ir::OpSequenceIndex &, ir::OpSequence &)> &fn)
-{
- // Topological Sorting for ir::OpSequences
- std::vector<ir::OpSequenceIndex> topol_sorted;
- ir::PostDfsIterator<false>{}.iterateOpSeqs(
- *this, [&](const ir::OpSequenceIndex &index, ir::OpSequence &) {
- topol_sorted.emplace_back(index);
- });
- std::reverse(topol_sorted.begin(), topol_sorted.end());
- for (const auto op_seq_idx : topol_sorted)
- {
- auto &op_seq = _op_seqs.at(op_seq_idx);
- fn(op_seq_idx, op_seq);
- }
-}
-
-ir::OpSequenceIndex LoweredGraph::appendFreshSingleOpSequence(const ir::OperationIndex &node_index,
- const ir::Operation &node)
-{
- // Create a fresh op_seq with one operation, and append it to op_seqs
- // Create a fresh op_seq
- auto op_seq = std::make_unique<ir::OpSequence>(_graph.layout());
-
- // Add an operation
- op_seq->appendOperation(node_index);
-
- // Update input/output
- op_seq->setOutputs(node.getOutputs());
- op_seq->setInputs(node.getInputs());
-
- return _op_seqs.emplace(std::move(op_seq));
-}
-
-void LoweredGraph::makeOpSequences(
- ir::OperandIndexMap<std::unique_ptr<ir::operand::LowerInfo>> &operands_lower_info,
- const CompilerOptions &options, const BackendResolver &backend_resolver)
-{
- // if SUBG_MAX_NODE == 0, no limit on nodes of a op_seq
- const int op_seq_max_node = options.op_seq_max_node;
- assert(op_seq_max_node >= 0);
-
- bool is_profiling = options.he_profiling_mode;
- ir::OpSequence *op_seq = nullptr;
- ir::OpSequenceIndex op_seq_index;
-
- // NOTE: The below method appends nodes while making one op_seq if needed. If something better
- // ways, happy to update this code.
- ir::PostDfsConstIterator{}.iterate(
- _graph, [&](const ir::OperationIndex &node_index, const ir::Operation &node) {
- // LowerInfo for in/output operands
- auto backend = backend_resolver.getBackend(node_index);
-
- // Get frontend's layout
- auto frontend_layout = _graph.layout();
-
- // The layout of each backend should be set at another place
- // TODO Change setting layout of each backend at another place
- auto backend_layout = backend->config()->supportLayout(node, frontend_layout);
-
- for (auto operand : node.getInputs() | ir::Remove::UNDEFINED)
- {
- auto &&lower_info = operands_lower_info.at(operand);
- lower_info->addUsePermuteFactor(ir::operand::PermuteFactor{backend, backend_layout});
- }
- for (auto operand : node.getOutputs() | ir::Remove::UNDEFINED)
- {
- auto &&lower_info = operands_lower_info.at(operand);
- lower_info->addDefPermuteFactor(ir::operand::PermuteFactor{backend, backend_layout});
- }
-
- bool new_op_seq = (op_seq == nullptr ||
- (op_seq_max_node != 0 &&
- op_seq->operations().size() >= static_cast<size_t>(op_seq_max_node)));
- // for profiling each op_seq must contain just one node,
- // so that we can measure a node separately
- if (new_op_seq || is_profiling ||
- !mergeable(op_seq_index, node_index, backend_layout, backend_resolver))
- {
- auto new_op_seq_index = appendFreshSingleOpSequence(node_index, node);
-
- // ir::OpSequence LowerInfo
- setLowerInfo(new_op_seq_index,
- std::make_unique<ir::operation::LowerInfo>(backend, backend_layout));
-
- op_seq_index = new_op_seq_index;
- op_seq = &(_op_seqs.at(new_op_seq_index));
-
- VERBOSE(Lower) << "OpSequence#" << op_seq_index.value() << " is created for "
- << "NODE#" << node_index.value() << "(" << node.name() << ")" << std::endl;
- }
- else
- {
- op_seq->appendOperation(node_index);
- // Set inputs
- auto new_inputs = node.getInputs();
- // Add inputs except outputs of the previous node
- for (auto ind : op_seq->getInputs())
- {
- if (!node.getOutputs().contains(ind))
- new_inputs.append(ind);
- }
- op_seq->setInputs(new_inputs);
+ auto frontend_layout = _graph.layout();
- VERBOSE(Lower) << "OpSequence#" << op_seq_index.value() << " merges "
- << "NODE#" << node_index.value() << "(" << node.name() << ")" << std::endl;
- }
- });
-}
+    // The layout of each backend should be set in another place
+    // TODO Move setting the layout of each backend to another place
+ auto backend_layout = backend->config()->supportLayout(op, frontend_layout);
-void LoweredGraph::manipulateLowerInfo(
- ir::OperandIndexMap<std::unique_ptr<ir::operand::LowerInfo>> &operands_lower_info)
-{
- const auto controlflow_backend = BackendManager::get().getControlflow();
+ for (auto ind : op.getInputs() | ir::Remove::UNDEFINED)
+ {
+ auto &operand_li = lower_info().operand.at(ind);
+ operand_li.addUsePermuteFactor(PermuteFactor{backend, backend_layout});
+ }
+ for (auto ind : op.getOutputs() | ir::Remove::UNDEFINED)
+ {
+ auto &operand_li = lower_info().operand.at(ind);
+ operand_li.addDefPermuteFactor(PermuteFactor{backend, backend_layout});
+ }
+ lower_info().operation.set(
+ op_ind, std::make_unique<compiler::OperationLowerInfo>(backend, backend_layout));
+ });
- // TODO Rather than using NHWC Get frontend layout of this node from IR
- auto factor = ir::operand::PermuteFactor{controlflow_backend, ir::Layout::NHWC};
+ // Handle graph inputs and outputs
+ const auto builtin_backend = BackendManager::get().getBuiltin();
+ auto factor = PermuteFactor{builtin_backend, _graph.layout()};
for (auto index : _graph.getInputs() | ir::Remove::UNDEFINED)
{
- auto &&lower_info = operands_lower_info.at(index);
- assert(lower_info->def_factors().empty());
- lower_info->addDefPermuteFactor(factor);
- }
- for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED)
- {
- auto &&lower_info = operands_lower_info.at(index);
- lower_info->addUsePermuteFactor(factor);
+ auto &operand_li = lower_info().operand.at(index);
+ assert(operand_li.def_factors().empty());
+ operand_li.addDefPermuteFactor(factor);
}
for (auto index : _graph.getOutputs() | ir::Remove::UNDEFINED)
{
- auto &&lower_info = operands_lower_info.at(index);
- if (lower_info->def_factors().size() == 0)
- {
- // In case of that an operand is Graph's output and not input or output of any operation
- lower_info->addDefPermuteFactor(ir::operand::PermuteFactor{
- controlflow_backend,
- ir::Layout::NHWC // TODO Get frontend layout of this node from IR
- });
- }
+ auto &operand_li = lower_info().operand.at(index);
+ operand_li.addUsePermuteFactor(factor);
}
- // 1. Add def of variable operand
- // 2. Set LowerInfo for each operand from the operand::LowerInfo holder
+ // Handle variable tensors
_graph.operands().iterate([&](const ir::OperandIndex &index, ir::Operand &operand) {
// Some inputs of an operation could be non-constant, but not exist in graph inputs/outputs
- // and not undefined operand. Those inputs must have exist as a Tensor. For example,
- // UnidirectionalSequenceLSTM operation could have state inputs such as it.
+    // and not be undefined operands - these are variable tensors. For example,
+ // UnidirectionalSequenceLSTM has such inputs.
if (operand.info().isVariable())
{
// The variable operand with buffer is not supported yet
assert(operand.data() == nullptr);
assert(operand.getUses().size() == 1 && !operand.getDef().valid());
- auto &lowered_info = operands_lower_info[index];
- assert(lowered_info->def_factors().empty());
- lowered_info->addDefPermuteFactor(lowered_info->use_factors().getOnlyElement());
+      auto &operand_li = lower_info().operand.at(index); // bind by reference so the new factor is recorded in place
+ assert(operand_li.def_factors().empty());
+ operand_li.addDefPermuteFactor(operand_li.use_factors().getOnlyElement());
}
-
- setLowerInfo(index, std::move(operands_lower_info[index]));
});
}
@@ -392,10 +192,20 @@ void LoweredGraph::dumpLowerInfo()
std::map<uint32_t, std::string> dumps;
_graph.operands().iterate([&](const ir::OperandIndex &index, ir::Operand &object) {
- std::stringstream sstream;
- if (!getLowerInfo(index)->def_factors().empty() || !getLowerInfo(index)->use_factors().empty())
+ const auto operand_lower_info = lower_info().operand.getRawPtr(index);
+ assert(operand_lower_info);
+ if (!operand_lower_info->def_factors().empty() || !operand_lower_info->use_factors().empty())
{
- auto factors_to_string = [](const ir::operand::PermuteFactorSet &factors) {
+ auto shape_to_string = [](const ir::Shape &shape) {
+ std::stringstream sstream;
+ sstream << "{ ";
+ for (auto i = 0; i < shape.rank(); ++i)
+ sstream << (shape.dim(i)) << " ";
+ sstream << "}";
+ return sstream.str();
+ };
+
+ auto factors_to_string = [](const PermuteFactorSet &factors) {
std::string str;
for (auto factor : factors)
{
@@ -406,161 +216,44 @@ void LoweredGraph::dumpLowerInfo()
return "{ " + str + "}";
};
- auto operation_index_to_string = [](const ir::OperationIndexSet &operations) {
- std::string str;
+ auto operation_index_set_to_string = [](const ir::OperationIndexSet &operations) {
+ std::stringstream sstream;
+ sstream << "{ ";
for (auto op : operations)
- {
- str += std::to_string(op.value());
- str += " ";
- }
- return "{ " + str + "}";
+ sstream << op << " ";
+ sstream << "}";
+ return sstream.str();
+ };
+
+ auto data_to_str = [](const ir::Data *data) {
+ return (data ? (std::to_string(data->size()) + " bytes") : "N/A");
};
- const auto lower_info = getLowerInfo(index);
- const auto &shape = object.shape();
- std::string def_ops =
- object.getDef().valid() ? std::to_string(object.getDef().value()) : "N/A";
- std::string use_ops = operation_index_to_string(object.getUses());
- std::string def_layouts = factors_to_string(lower_info->def_factors());
- std::string use_layouts = factors_to_string(lower_info->use_factors());
- sstream << "Operand #" << index.value() << " LowerInfo" << std::endl;
- sstream << " - Shape : { ";
- for (auto i = 0; i < shape.rank(); ++i)
- {
- sstream << (shape.dim(i)) << " ";
- }
- sstream << "}" << std::endl;
- sstream << " - Def Operations : " << def_ops << std::endl;
- sstream << " - Use Operations : " << use_ops << std::endl;
- sstream << " - Data : "
- << (object.data() ? (std::to_string(object.data()->size()) + " bytes") : "N/A")
- << std::endl;
- sstream << " - Lower Info" << std::endl;
- sstream << " - Def Backends : " << def_layouts << std::endl;
- sstream << " - Use Backends : " << use_layouts << std::endl;
+ std::string shape_str = shape_to_string(object.shape());
+ std::string def_op = operation_index_set_to_string({object.getDef()});
+ std::string use_ops = operation_index_set_to_string(object.getUses());
+ std::string def_factors = factors_to_string(operand_lower_info->def_factors());
+ std::string use_factors = factors_to_string(operand_lower_info->use_factors());
+ std::stringstream sstream;
+ sstream << "Operand " << index << " Info" << std::endl;
+ sstream << " - Shape : " << shape_str << std::endl;
+ sstream << " - Def/Uses : Def " << def_op << " Uses " << use_ops << std::endl;
+ sstream << " - Data : " << data_to_str(object.data()) << std::endl;
+ sstream << " - LowerInfo : Def " << def_factors << " Uses " << use_factors << std::endl;
+ dumps.emplace(index.value(), sstream.str());
}
- dumps.emplace(index.value(), sstream.str());
});
for (const auto &e : dumps)
{
if (!e.second.empty())
{
- VERBOSE(Lower) << e.second;
- }
- }
-}
-
-bool LoweredGraph::mergeable(const ir::OpSequenceIndex &op_seq_index,
- const ir::OperationIndex &node_index, ir::Layout layout,
- const BackendResolver &backend_resolver)
-{
- // Are they mergeable?
- // 1. the same backend id and layout?
- // 2. Is op_seq or node branched?
- // 3. if 1 is true, the op_seq and a node are connected?
- const auto &op_seq = _op_seqs.at(op_seq_index);
- const auto &node = _graph.operations().at(node_index);
-
- // The same backend id and layout?
- {
- const auto op_seq_backend_layout = getLowerInfo(op_seq_index)->layout();
- const auto &op_seq_backend_id = getLowerInfo(op_seq_index)->backend()->config()->id();
- const auto &node_backend_id = backend_resolver.getBackend(node_index)->config()->id();
- VERBOSE(Lower) << "OpSequence#" << op_seq_index.value() << " { " << op_seq_backend_id << "("
- << to_string(op_seq_backend_layout) << ") } "
- << " NODE#" << node_index.value() << " (" << node.name() << ") { "
- << node_backend_id << "(" << to_string(layout) << ") } " << std::endl;
- if (op_seq_backend_id != node_backend_id || op_seq_backend_layout != layout)
- return false;
- }
-
- // Branched?
- {
- std::unordered_set<ir::OperationIndex> branched_set;
-
- // Check for branching up
- for (const auto &input : op_seq.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
- {
- const auto &input_obj = _graph.operands().at(input);
- auto def = input_obj.getDef();
- if (def.valid())
- {
- branched_set.insert(def);
- if (branched_set.size() > 1)
- {
- return false;
- }
- }
- }
- branched_set.clear();
-
- // Check for branching down
- for (const auto &output : node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
- {
- // TODO Fix this workaround for the case of model outputs that are used by another operation
- // This is needed since the branching is decided by operation, but for model outputs,
- // there is controlflow backen(use backend) but no actual use operation exists
- if (_graph.getOutputs().contains(output))
- return false;
-
- const auto &output_obj = _graph.operands().at(output);
- for (const auto &use : output_obj.getUses())
- {
- branched_set.insert(use);
- if (branched_set.size() > 1)
- {
- return false;
- }
- }
- }
- }
-
- // Connected?
- // an input of one node is an output of the other node? or vice-versa?
- {
- const auto &node_inputs = node.getInputs();
- const auto &node_outputs = node.getOutputs();
-
- // op_seq's operations are in order so that we just check the first and the last
- std::vector<ir::OperationIndex> op_seq_ops{op_seq.operations()[0]};
- if (op_seq.operations().size() > 1)
- op_seq_ops.emplace_back(op_seq.operations()[op_seq.operations().size() - 1]);
-
- for (const auto &n_index : op_seq_ops)
- {
- const auto &n = _graph.operations().at(n_index);
-
- // node's output == op_seq's input?
- for (const auto input : n.getInputs() | ir::Remove::UNDEFINED)
- {
- if (node_outputs.contains(input))
- {
- VERBOSE(Lower) << "OpSequence#" << op_seq_index.value() << " 's NODE#" << n_index.value()
- << "(" << n.name() << ") is connected to NODE#" << node_index.value()
- << "(" << node.name() << ")" << std::endl;
- return true;
- }
- }
-
- // node's input == op_seq's output?
- for (const auto output : n.getOutputs() | ir::Remove::UNDEFINED)
- {
- if (node_inputs.contains(output))
- {
- VERBOSE(Lower) << "OpSequence#" << op_seq_index.value() << " 's NODE#" << n_index.value()
- << " (" << n.name() << ") is connected to NODE#" << node_index.value()
- << std::endl;
- return true;
- }
- }
+ std::istringstream iss(e.second);
+ std::string line;
+ while (std::getline(iss, line))
+ VERBOSE(Lower) << line << std::endl;
}
-
- VERBOSE(Lower) << "OpSequence#" << op_seq_index.value() << " is not connected to NODE#"
- << node_index.value() << "(" << node.name() << ")" << std::endl;
}
-
- return false;
}
} // namespace compiler
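
makeLowerInfo above replaces the old op-sequence bookkeeping with per-operand permute factors: each operand records a def factor (the backend/layout that produces it) and use factors (the backends/layouts that consume it), with graph inputs and outputs pinned to the builtin backend. A simplified model of that bookkeeping, and of the mismatch test that PermutationInsertionPass presumably applies later, using stand-in types rather than onert's:

#include <iostream>
#include <set>
#include <string>
#include <utility>

// (backend id, layout) stand-in for onert's PermuteFactor.
using Factor = std::pair<std::string, std::string>;

struct OperandLI
{
  std::set<Factor> def_factors; // who produces this operand, in which layout
  std::set<Factor> use_factors; // who consumes it, in which layout
};

// A Permute operation is needed whenever an operand is produced with one
// factor but consumed with a different one.
bool needsPermute(const OperandLI &li)
{
  for (const auto &use : li.use_factors)
    if (li.def_factors.count(use) == 0)
      return true;
  return false;
}

int main()
{
  OperandLI li;
  li.def_factors.insert({"acl_cl", "NCHW"});
  li.use_factors.insert({"cpu", "NHWC"});
  std::cout << std::boolalpha << needsPermute(li) << std::endl; // true
}
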
diff --git a/runtime/onert/core/src/compiler/ManualScheduler.cc b/runtime/onert/core/src/compiler/ManualScheduler.cc
index 1f4a47864..af2d84cd9 100644
--- a/runtime/onert/core/src/compiler/ManualScheduler.cc
+++ b/runtime/onert/core/src/compiler/ManualScheduler.cc
@@ -29,9 +29,9 @@ namespace onert
namespace compiler
{
-ManualScheduler::ManualScheduler(const backend::BackendContexts &backend_contexts,
+ManualScheduler::ManualScheduler(const std::vector<const backend::Backend *> &backends,
const compiler::CompilerOptions &options)
- : _backend_contexts{backend_contexts}, _options{options}
+ : _backends{backends}, _options{options}
{
}
@@ -88,23 +88,21 @@ std::unique_ptr<BackendResolver> ManualScheduler::schedule(const ir::Graph &grap
try
{
graph.operations().at(key); // Check if exist, or this will throw
- backend_resolver->setBackend(
- key, BackendManager::get().get(
- val)); // TODO Ensure this backend is available in backend contexts
+ backend_resolver->setBackend(key, BackendManager::get().get(val));
}
catch (...)
{
- VERBOSE(ManualScheduler) << "Invalid value while OperationIndex to Backend mapping : @"
- << key.value() << " -> \"" << val << "\"" << std::endl;
+ VERBOSE(ManualScheduler) << "Invalid value while OperationIndex to Backend mapping : @" << key
+ << " -> \"" << val << "\"" << std::endl;
}
}
// Dump final assignment
WHEN_LOG_ENABLED(backend_resolver->iterate(
- [&](const ir::OperationIndex &index, const backend::Backend &backend) {
- VERBOSE(ManualScheduler) << "backend for operation #" << index.value() << ": "
- << backend.config()->id() << std::endl;
- }));
+ [&](const ir::OperationIndex &index, const backend::Backend &backend) {
+ VERBOSE(ManualScheduler) << "backend for " << index << ": " << backend.config()->id()
+ << std::endl;
+ }));
return backend_resolver;
}
@@ -114,7 +112,7 @@ const backend::Backend *ManualScheduler::resolveBackend(const std::string &id,
{
// Ensure if the backend is available in the current backend context
const backend::Backend *backend = BackendManager::get().get(id);
- if (!backend || _backend_contexts.find(backend) == _backend_contexts.end())
+ if (!backend || std::find(_backends.begin(), _backends.end(), backend) == _backends.end())
{
backend = fallback;
}
diff --git a/runtime/onert/core/src/compiler/ManualScheduler.h b/runtime/onert/core/src/compiler/ManualScheduler.h
index 41503f7ff..18732d744 100644
--- a/runtime/onert/core/src/compiler/ManualScheduler.h
+++ b/runtime/onert/core/src/compiler/ManualScheduler.h
@@ -28,7 +28,7 @@ namespace compiler
class ManualScheduler : public IScheduler
{
public:
- ManualScheduler(const backend::BackendContexts &backend_contexts,
+ ManualScheduler(const std::vector<const backend::Backend *> &backends,
const compiler::CompilerOptions &options);
std::unique_ptr<BackendResolver> schedule(const ir::Graph &graph) override;
@@ -37,7 +37,7 @@ private:
const backend::Backend *fallback = nullptr);
private:
- const backend::BackendContexts &_backend_contexts;
+ std::vector<const backend::Backend *> _backends;
compiler::CompilerOptions _options;
};
diff --git a/runtime/onert/core/src/ir/operation/LowerInfo.cc b/runtime/onert/core/src/compiler/OperationLowerInfo.cc
index 249918bd6..e8a438130 100644
--- a/runtime/onert/core/src/ir/operation/LowerInfo.cc
+++ b/runtime/onert/core/src/compiler/OperationLowerInfo.cc
@@ -14,21 +14,18 @@
* limitations under the License.
*/
-#include "ir/operation/LowerInfo.h"
+#include "compiler/OperationLowerInfo.h"
namespace onert
{
-namespace ir
-{
-namespace operation
+namespace compiler
{
-LowerInfo::LowerInfo(const backend::Backend *backend, Layout layout)
- : _permute_factor{backend, layout}
+OperationLowerInfo::OperationLowerInfo(const backend::Backend *backend, ir::Layout layout)
+ : _permute_factor{backend, layout}
{
// DO NOTHING
}
-} // namespace operation
-} // namespace ir
+} // namespace compiler
} // namespace onert
diff --git a/runtime/onert/core/src/compiler/ParamChecker.h b/runtime/onert/core/src/compiler/ParamChecker.h
deleted file mode 100644
index 61429d521..000000000
--- a/runtime/onert/core/src/compiler/ParamChecker.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * @file ParamChecker.h
- * @brief This file contains ParamChecker to check\n
- * operations' parameters are compilable at machine independent phase\n
- * ex) Check param is constant
- */
-#ifndef __ONERT_COMPILER_PARAM_CHECKER_H__
-#define __ONERT_COMPILER_PARAM_CHECKER_H__
-
-#include "ir/OperationVisitor.h"
-
-namespace onert
-{
-namespace ir
-{
-class Graph;
-} // namespace ir
-} // namespace onert
-
-namespace onert
-{
-namespace compiler
-{
-
-class ParamChecker : public ir::OperationVisitor
-{
-public:
- /**
- * @brief Construct a new Param Checker object (deleted)
- */
- ParamChecker(void) = delete;
- /**
- * @brief Construct a new Param Checker object
- * @param[in] model Graph model to check
- */
- ParamChecker(std::shared_ptr<ir::Graph> model) : _model{model} {}
-
-public:
- /**
- * @brief Run parameter analysis
- */
- void operator()();
- /**
- * @brief Return analysis result if model have non-const parameter
- * @return @c true if there is non-const parameter, otherwise @c false
- */
- bool haveNoneConstParam(void) { return _nonConstParam; }
-
-private:
- const std::shared_ptr<ir::Graph> _model;
- bool _nonConstParam{false};
-};
-
-} // namespace compiler
-} // namespace onert
-
-#endif // __ONERT_COMPILER_OPERATION_VALIDATOR_H__
diff --git a/runtime/onert/core/src/compiler/PermuteFactor.cc b/runtime/onert/core/src/compiler/PermuteFactor.cc
new file mode 100644
index 000000000..f0081a2a4
--- /dev/null
+++ b/runtime/onert/core/src/compiler/PermuteFactor.cc
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compiler/PermuteFactor.h"
+
+#include <assert.h>
+#include <ostream>
+
+#include "backend/Backend.h"
+
+std::ostream &operator<<(std::ostream &os, const onert::compiler::PermuteFactor &obj)
+{
+ assert(obj.backend() && obj.backend()->config());
+ return os << "(" << obj.backend()->config()->id() << "/" << to_string(obj.layout()) << ")";
+}
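
This new operator is what lets the VERBOSE statements throughout the commit stream a PermuteFactor (and, via similar overloads, the Index types) directly instead of unwrapping .value(). A self-contained analogue of the idiom, not onert code:

#include <iostream>

struct OperationIndex
{
  unsigned value;
};

// Define the textual form once; every log site can then stream the wrapper
// directly instead of unwrapping the raw value.
std::ostream &operator<<(std::ostream &os, const OperationIndex &idx)
{
  return os << "@" << idx.value;
}

int main()
{
  OperationIndex idx{42};
  std::cout << "backend for " << idx << ": cpu" << std::endl; // backend for @42: cpu
}
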
diff --git a/runtime/onert/core/src/compiler/ShapeValidator.cc b/runtime/onert/core/src/compiler/ShapeValidator.cc
index e0c9f5283..1c7000986 100644
--- a/runtime/onert/core/src/compiler/ShapeValidator.cc
+++ b/runtime/onert/core/src/compiler/ShapeValidator.cc
@@ -19,8 +19,6 @@
#include <typeinfo>
#include "ir/Graph.h"
-#include "ir/operation/LowerInfo.h"
-
#include "util/logging.h"
#include "util/Utils.h"
@@ -37,7 +35,7 @@ namespace compiler
{
ShapeValidator::ShapeValidator(const ir::Graph &graph)
- : _graph{graph}, _ctx{graph.operands()}, _current_layout{ir::Layout::UNKNOWN}
+ : _graph{graph}, _ctx{graph.operands()}, _current_layout{ir::Layout::UNKNOWN}
{
}
@@ -62,7 +60,7 @@ void ShapeValidator::operator()()
_current_layout = _graph.layout();
_graph.operations().iterate(
- [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); });
+ [&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); });
}
void ShapeValidator::visit(const ir::operation::BatchMatMul &node)
@@ -88,7 +86,7 @@ void ShapeValidator::visit(const ir::operation::BatchToSpaceND &node)
const auto ifm_index{node.getInputs().at(ir::operation::BatchToSpaceND::Input::INPUT)};
const auto block_size_index{
- node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
+ node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
const auto frontend_layout = _current_layout;
const auto input_shape = _ctx.at(ifm_index).shape().asFeature(frontend_layout);
@@ -120,11 +118,11 @@ void ShapeValidator::visit(const ir::operation::BCQFullyConnected &node)
const auto ifm_index{node.getInputs().at(ir::operation::BCQFullyConnected::Input::INPUT)};
const auto weight_scales_index{
- node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_SCALES)};
+ node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_SCALES)};
const auto weight_binary_index{
- node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_BINARY)};
+ node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_BINARY)};
const auto weight_cluster_index{
- node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
+ node.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
// const auto bias_index{node.getInputs().at(ir::operation::BCQFullyConnected::Input::BIAS)};
OP_REQUIRES(_ctx.at(ifm_index).shape().rank() == 2);
@@ -153,7 +151,7 @@ void ShapeValidator::visit(const ir::operation::BCQGather &node)
const auto input_binary_index{node.getInputs().at(ir::operation::BCQGather::Input::INPUT_BINARY)};
const auto input_scales_index{node.getInputs().at(ir::operation::BCQGather::Input::INPUT_SCALES)};
const auto input_clusters_index{
- node.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)};
+ node.getInputs().at(ir::operation::BCQGather::Input::INPUT_CLUSTERS)};
OP_REQUIRES(_ctx.at(indices_index).shape().rank() <= 2); // TODO : support rank up to 4 or more
OP_REQUIRES(_ctx.at(input_binary_index).shape().rank() == 2);
@@ -256,13 +254,12 @@ void ShapeValidator::visit(const ir::operation::Reduce &node)
{
// Reducing C or
// (Reducing H and C(input and output) == 1) or (Reducing W and C(input and output) == 1)
- OP_REQUIRES((input_shape.dim(0) == output_shape.dim(0) &&
- input_shape.dim(1) == output_shape.dim(1) &&
- input_shape.dim(2) == output_shape.dim(2)) ||
- (input_shape.dim(0) == output_shape.dim(0) &&
- (input_shape.dim(1) == output_shape.dim(1) ||
- input_shape.dim(2) == output_shape.dim(1)) &&
- input_shape.dim(3) == 1 && output_shape.dim(2) == 1));
+ OP_REQUIRES(
+ (input_shape.dim(0) == output_shape.dim(0) && input_shape.dim(1) == output_shape.dim(1) &&
+ input_shape.dim(2) == output_shape.dim(2)) ||
+ (input_shape.dim(0) == output_shape.dim(0) &&
+ (input_shape.dim(1) == output_shape.dim(1) || input_shape.dim(2) == output_shape.dim(1)) &&
+ input_shape.dim(3) == 1 && output_shape.dim(2) == 1));
}
}
}
@@ -293,12 +290,12 @@ void ShapeValidator::visit(const ir::operation::RNN &node)
return;
const auto hidden_state_out_index{
- node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
+ node.getOutputs().at(ir::operation::RNN::Output::HIDDEN_STATE_OUT)};
const auto input_index{node.getInputs().at(ir::operation::RNN::Input::INPUT)};
const auto weights_index{node.getInputs().at(ir::operation::RNN::Input::WEIGHTS)};
const auto recurrent_weights_index{
- node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
+ node.getInputs().at(ir::operation::RNN::Input::RECURRENT_WEIGHTS)};
const auto bias_index{node.getInputs().at(ir::operation::RNN::Input::BIAS)};
const auto hidden_state_in_index{node.getInputs().at(ir::operation::RNN::Input::HIDDEN_STATE_IN)};
@@ -335,7 +332,7 @@ void ShapeValidator::visit(const ir::operation::SpaceToBatchND &node)
const auto ifm_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::INPUT)};
const auto block_size_index{
- node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
+ node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
const auto frontend_layout = _current_layout;
@@ -570,48 +567,48 @@ void ShapeValidator::visit(const ir::operation::LSTM &node)
return;
const auto scratch_buffer_index{
- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; // Optional
+ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)}; // Optional
const auto output_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; // Optional
+ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)}; // Optional
const auto cell_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; // Optional
+ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)}; // Optional
const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
const auto input_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // Optional
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // Optional
const auto input_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
const auto input_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
const auto input_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
const auto recurrent_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // Optional
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // Optional
const auto recurrent_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
const auto recurrent_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
const auto recurrent_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
const auto cell_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // Optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // Optional
const auto cell_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // Optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // Optional
const auto cell_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // Optional
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // Optional
const auto input_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; // Optional
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)}; // Optional
const auto forget_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+ node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
const auto output_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
const auto projection_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // Optional
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // Optional
const auto projection_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // Optional
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // Optional
const auto output_state_in_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
OP_REQUIRES(_ctx.at(input_index).shape().rank() == _ctx.at(output_index).shape().rank());
@@ -620,54 +617,54 @@ void ShapeValidator::visit(const ir::operation::LSTM &node)
OP_REQUIRES(_ctx.at(input_index).shape().dim(i) == _ctx.at(output_index).shape().dim(i));
}
OP_REQUIRES(
- (_ctx.at(output_index).shape().rank() == 2 || _ctx.at(output_index).shape().rank() == 3) &&
- (_ctx.at(input_index).shape().rank() == 2 || _ctx.at(input_index).shape().rank() == 3) &&
- (!_ctx.exist(input_to_input_weights_index) ||
- _ctx.at(input_to_input_weights_index).shape().rank() == 2) &&
- _ctx.at(input_to_forget_weights_index).shape().rank() == 2 &&
- _ctx.at(input_to_cell_weights_index).shape().rank() == 2 &&
- _ctx.at(input_to_output_weights_index).shape().rank() == 2 &&
- (!_ctx.exist(recurrent_to_input_weights_index) ||
- _ctx.at(recurrent_to_input_weights_index).shape().rank() == 2) &&
- _ctx.at(recurrent_to_forget_weights_index).shape().rank() == 2 &&
- _ctx.at(recurrent_to_cell_weights_index).shape().rank() == 2 &&
- _ctx.at(recurrent_to_output_weights_index).shape().rank() == 2 &&
- (!_ctx.exist(projection_weights_index) ||
- _ctx.at(projection_weights_index).shape().rank() == 2) &&
- _ctx.at(output_state_in_index).shape().rank() == 2 &&
- _ctx.at(cell_state_in_index).shape().rank() == 2);
+ (_ctx.at(output_index).shape().rank() == 2 || _ctx.at(output_index).shape().rank() == 3) &&
+ (_ctx.at(input_index).shape().rank() == 2 || _ctx.at(input_index).shape().rank() == 3) &&
+ (!_ctx.exist(input_to_input_weights_index) ||
+ _ctx.at(input_to_input_weights_index).shape().rank() == 2) &&
+ _ctx.at(input_to_forget_weights_index).shape().rank() == 2 &&
+ _ctx.at(input_to_cell_weights_index).shape().rank() == 2 &&
+ _ctx.at(input_to_output_weights_index).shape().rank() == 2 &&
+ (!_ctx.exist(recurrent_to_input_weights_index) ||
+ _ctx.at(recurrent_to_input_weights_index).shape().rank() == 2) &&
+ _ctx.at(recurrent_to_forget_weights_index).shape().rank() == 2 &&
+ _ctx.at(recurrent_to_cell_weights_index).shape().rank() == 2 &&
+ _ctx.at(recurrent_to_output_weights_index).shape().rank() == 2 &&
+ (!_ctx.exist(projection_weights_index) ||
+ _ctx.at(projection_weights_index).shape().rank() == 2) &&
+ _ctx.at(output_state_in_index).shape().rank() == 2 &&
+ _ctx.at(cell_state_in_index).shape().rank() == 2);
OP_REQUIRES(
- (!_ctx.exist(cell_to_input_weights_index) ||
- _ctx.at(cell_to_input_weights_index).shape().rank() == 1) &&
- (!_ctx.exist(cell_to_forget_weights_index) ||
- _ctx.at(cell_to_forget_weights_index).shape().rank() == 1) &&
- (!_ctx.exist(cell_to_output_weights_index) ||
- _ctx.at(cell_to_output_weights_index).shape().rank() == 1) &&
- (!_ctx.exist(input_gate_bias_index) || _ctx.at(input_gate_bias_index).shape().rank() == 1) &&
- _ctx.at(forget_gate_bias_index).shape().rank() == 1 &&
- _ctx.at(cell_bias_index).shape().rank() == 1 &&
- _ctx.at(output_gate_bias_index).shape().rank() == 1 &&
- (!_ctx.exist(projection_bias_index) || _ctx.at(projection_bias_index).shape().rank() == 1));
+ (!_ctx.exist(cell_to_input_weights_index) ||
+ _ctx.at(cell_to_input_weights_index).shape().rank() == 1) &&
+ (!_ctx.exist(cell_to_forget_weights_index) ||
+ _ctx.at(cell_to_forget_weights_index).shape().rank() == 1) &&
+ (!_ctx.exist(cell_to_output_weights_index) ||
+ _ctx.at(cell_to_output_weights_index).shape().rank() == 1) &&
+ (!_ctx.exist(input_gate_bias_index) || _ctx.at(input_gate_bias_index).shape().rank() == 1) &&
+ _ctx.at(forget_gate_bias_index).shape().rank() == 1 &&
+ _ctx.at(cell_bias_index).shape().rank() == 1 &&
+ _ctx.at(output_gate_bias_index).shape().rank() == 1 &&
+ (!_ctx.exist(projection_bias_index) || _ctx.at(projection_bias_index).shape().rank() == 1));
// CIFG assertion
OP_REQUIRES(
- ((!_ctx.exist(input_to_input_weights_index) ||
- (_ctx.at(input_to_input_weights_index).shape().dim(0) == 0 &&
- _ctx.at(input_to_input_weights_index).shape().dim(1) == 0)) &&
- (!_ctx.exist(recurrent_to_input_weights_index) ||
- (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) == 0 &&
- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) == 0)) &&
- (!_ctx.exist(input_gate_bias_index) || _ctx.at(input_gate_bias_index).shape().dim(0) == 0) &&
- (!_ctx.exist(cell_to_input_weights_index) ||
- _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0)) ||
- ((_ctx.exist(input_to_input_weights_index) &&
- (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0)) &&
- (_ctx.exist(recurrent_to_input_weights_index) &&
- (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0)) &&
- (_ctx.exist(input_gate_bias_index) && _ctx.at(input_gate_bias_index).shape().dim(0) != 0)));
+ ((!_ctx.exist(input_to_input_weights_index) ||
+ (_ctx.at(input_to_input_weights_index).shape().dim(0) == 0 &&
+ _ctx.at(input_to_input_weights_index).shape().dim(1) == 0)) &&
+ (!_ctx.exist(recurrent_to_input_weights_index) ||
+ (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) == 0 &&
+ _ctx.at(recurrent_to_input_weights_index).shape().dim(1) == 0)) &&
+ (!_ctx.exist(input_gate_bias_index) || _ctx.at(input_gate_bias_index).shape().dim(0) == 0) &&
+ (!_ctx.exist(cell_to_input_weights_index) ||
+ _ctx.at(cell_to_input_weights_index).shape().dim(0) == 0)) ||
+ ((_ctx.exist(input_to_input_weights_index) &&
+ (_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+ _ctx.at(input_to_input_weights_index).shape().dim(1) != 0)) &&
+ (_ctx.exist(recurrent_to_input_weights_index) &&
+ (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+ _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0)) &&
+ (_ctx.exist(input_gate_bias_index) && _ctx.at(input_gate_bias_index).shape().dim(0) != 0)));
// Peephole assertion
OP_REQUIRES(((!_ctx.exist(cell_to_forget_weights_index) ||
@@ -683,11 +680,11 @@ void ShapeValidator::visit(const ir::operation::LSTM &node)
(_ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
_ctx.at(input_to_input_weights_index).shape().dim(1) != 0);
bool has_recurrent_to_input_weights =
- _ctx.exist(recurrent_to_input_weights_index) &&
- (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0);
+ _ctx.exist(recurrent_to_input_weights_index) &&
+ (_ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+ _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0);
bool has_input_gate_bias =
- _ctx.exist(input_gate_bias_index) && _ctx.at(input_gate_bias_index).shape().dim(0) != 0;
+ _ctx.exist(input_gate_bias_index) && _ctx.at(input_gate_bias_index).shape().dim(0) != 0;
bool has_cell_to_input_weights = _ctx.exist(cell_to_input_weights_index) &&
_ctx.at(cell_to_input_weights_index).shape().dim(0) != 0;
bool has_cell_to_forget_weights = _ctx.exist(cell_to_forget_weights_index) &&
@@ -698,7 +695,7 @@ void ShapeValidator::visit(const ir::operation::LSTM &node)
(_ctx.at(projection_weights_index).shape().dim(0) != 0 &&
_ctx.at(projection_weights_index).shape().dim(1) != 0);
bool has_projection_bias =
- _ctx.exist(projection_bias_index) && _ctx.at(projection_bias_index).shape().dim(0) != 0;
+ _ctx.exist(projection_bias_index) && _ctx.at(projection_bias_index).shape().dim(0) != 0;
  // NOTE The cell_to_input_weights do not exist in non-peephole mode even for a regular (non-CIFG) LSTM.
// true: no CIFG
@@ -714,8 +711,8 @@ void ShapeValidator::visit(const ir::operation::LSTM &node)
bool has_projection_param = has_projection_weights;
const auto batch_size = (_ctx.at(input_index).shape().rank() == 3 && node.param().time_major)
- ? _ctx.at(input_index).shape().dim(1)
- : _ctx.at(input_index).shape().dim(0);
+ ? _ctx.at(input_index).shape().dim(1)
+ : _ctx.at(input_index).shape().dim(0);
OP_REQUIRES(batch_size == _ctx.at(output_state_in_index).shape().dim(0) &&
batch_size == _ctx.at(cell_state_in_index).shape().dim(0));
@@ -736,7 +733,7 @@ void ShapeValidator::visit(const ir::operation::LSTM &node)
num_units == _ctx.at(cell_state_in_index).shape().dim(1));
const auto output_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
+ _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
OP_REQUIRES(output_size == _ctx.at(recurrent_to_forget_weights_index).shape().dim(1) &&
output_size == _ctx.at(recurrent_to_cell_weights_index).shape().dim(1) &&
output_size == _ctx.at(recurrent_to_output_weights_index).shape().dim(1) &&
@@ -1018,9 +1015,9 @@ void ShapeValidator::visit(const ir::operation::MatrixBandPart &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT)};
const auto num_lower_index{
- node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_LOWER_DIAG)};
+ node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_LOWER_DIAG)};
const auto num_upper_index{
- node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_UPPER_DIAG)};
+ node.getInputs().at(ir::operation::MatrixBandPart::Input::NUM_UPPER_DIAG)};
// Check for dimension constraints
if (_ctx.at(output_index).info().isDynamic())
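The ShapeValidator hunks above funnel every rank and dimension constraint through an OP_REQUIRES macro. Below is a minimal sketch of that idiom, assuming a simple exception-based failure path; OP_REQUIRES_SKETCH and the toy shapes are illustrative stand-ins, not onert's actual macro or types.

```cpp
// Hedged sketch of an OP_REQUIRES-style validation macro (hypothetical name).
#include <sstream>
#include <stdexcept>
#include <vector>

#define OP_REQUIRES_SKETCH(cond)                                        \
  do                                                                    \
  {                                                                     \
    if (!(cond))                                                        \
    {                                                                   \
      std::ostringstream oss;                                           \
      oss << "Shape validation failed: " << #cond << " at " << __FILE__ \
          << ":" << __LINE__;                                           \
      throw std::runtime_error(oss.str());                              \
    }                                                                   \
  } while (0)

int main()
{
  std::vector<int> output_shape{2, 3};
  std::vector<int> input_shape{2, 3};
  // Mirrors the rank equality check at the top of the LSTM validator.
  OP_REQUIRES_SKETCH(output_shape.size() == input_shape.size());
  return 0;
}
```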
diff --git a/runtime/onert/core/src/compiler/StaticShapeInferer.cc b/runtime/onert/core/src/compiler/StaticShapeInferer.cc
index 1f2c6f3b9..5849a9801 100644
--- a/runtime/onert/core/src/compiler/StaticShapeInferer.cc
+++ b/runtime/onert/core/src/compiler/StaticShapeInferer.cc
@@ -25,40 +25,48 @@ namespace onert
namespace compiler
{
-bool StaticShapeInferer::infer(const ir::OpSequence &op_seq)
+void StaticShapeInferer::inferSubgraph(ir::SubgraphIndex subg_ind)
+{
+ StaticShapeInferer inferer(subg_ind, _lowered_subgs);
+ auto &lgraph = _lowered_subgs.at(subg_ind);
+ for (auto op_ind : lgraph->graph().topolSortOperations())
+ {
+ auto &op = lgraph->graph().operations().at(op_ind);
+ bool has_dynamic_tensor = inferer.infer(op);
+ lgraph->setHasDynamicTensor(op_ind, has_dynamic_tensor);
+ }
+}
+
+bool StaticShapeInferer::infer(const ir::Operation &op)
{
bool has_dynamic_tensor = false;
- for (const auto &operation_idx : op_seq.operations())
- {
- auto &op = _operations.at(operation_idx);
- auto opcode = op.opcode();
+ auto opcode = op.opcode();
+
+ _return_has_dynamic_tensor = false; // this is used as a return value inside operation's visit()
- _return_has_dynamic_tensor = false; // this is used as a return value inside operation's visit()
+ // IF: need shape inference for then, else
+ // While: need shape inference for condition, body
+ if (opcode == ir::OpCode::If || opcode == ir::OpCode::While)
+ {
+ op.accept(*this);
+ }
+ else
+ {
+ _return_has_dynamic_tensor = checkDynamicInput(op);
- // IF: need shape inference for then, else
- // While: need shape inference for condition, body
- if (opcode == ir::OpCode::If || opcode == ir::OpCode::While)
+ if (_return_has_dynamic_tensor)
{
- op.accept(*this);
+ setDynamicOutput(op);
}
else
{
- _return_has_dynamic_tensor = checkDynamicInput(op);
-
- if (_return_has_dynamic_tensor)
- {
- setDynamicOutput(op);
- }
- else
- {
- op.accept(*this);
- }
+ op.accept(*this);
}
-
- has_dynamic_tensor = has_dynamic_tensor || _return_has_dynamic_tensor;
}
+ has_dynamic_tensor = has_dynamic_tensor || _return_has_dynamic_tensor;
+
return has_dynamic_tensor;
}
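The rewritten infer() above works per operation instead of per OpSequence: control-flow ops are always visited so their nested subgraphs get inferred, while any other op with a dynamic input simply marks its outputs dynamic and skips static inference. A compilable sketch of that decision flow, with Op, OpCode, visitControlFlow and inferOne as simplified stand-ins for the onert types:

```cpp
#include <vector>

enum class OpCode { Add, If, While };

struct Op
{
  OpCode code;
  std::vector<bool> input_dynamic;
  std::vector<bool> output_dynamic;
};

// Stand-in for op.accept(*this) on If/While: nested subgraph inference would
// run here and report whether it produced any dynamic tensor.
bool visitControlFlow(const Op &) { return false; }

bool inferOne(Op &op)
{
  if (op.code == OpCode::If || op.code == OpCode::While)
    return visitControlFlow(op);

  bool has_dynamic_input = false;
  for (bool d : op.input_dynamic)
    has_dynamic_input = has_dynamic_input || d;

  if (has_dynamic_input)
  {
    // setDynamicOutput(): shapes become known only at run time
    for (auto &&o : op.output_dynamic)
      o = true;
    return true;
  }
  // Otherwise the static shape-inference visitor (op.accept) would run here.
  return false;
}

int main()
{
  Op add{OpCode::Add, {false, true}, {false}};
  return inferOne(add) && add.output_dynamic[0] ? 0 : 1; // dynamic in -> dynamic out
}
```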
@@ -132,13 +140,13 @@ void StaticShapeInferer::dump()
{
const auto index = pair.first;
const auto &lowered_subg = pair.second;
- VERBOSE(StaticShapeInferer) << "SubGraph #" << index.value() << std::endl;
+ VERBOSE(StaticShapeInferer) << index << std::endl;
lowered_subg->graph().operands().iterate(
- [&](const ir::OperandIndex &ind, const ir::Operand &operand) {
- VERBOSE(StaticShapeInferer) << "Operand #" << ind.value() << ", "
- << (operand.info().isDynamic() ? "Dynamic" : "Static") << ", "
- << get_shape_str(operand.info().shape()) << std::endl;
- });
+ [&](const ir::OperandIndex &ind, const ir::Operand &operand) {
+ VERBOSE(StaticShapeInferer)
+ << " " << ind << ", " << (operand.info().isDynamic() ? "Dynamic" : "Static") << ", "
+ << get_shape_str(operand.info().shape()) << std::endl;
+ });
}
}
@@ -167,7 +175,7 @@ void StaticShapeInferer::visit(const ir::operation::ArgMinMax &op)
// re-sizing output shape
ir::Shape new_shape =
- shape_inference::inferArgMinMaxShape(input.info().shape(), axis_value, rank);
+ shape_inference::inferArgMinMaxShape(input.info().shape(), axis_value, rank);
output.info().shape(new_shape);
}
@@ -189,7 +197,7 @@ void StaticShapeInferer::visit(const ir::operation::BCQFullyConnected &op)
const auto &input = _operands.at(input_idx);
const auto cluster_idx{
- op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
+ op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
const auto &cluster = _operands.at(cluster_idx);
const auto output_idx = op.getOutputs().at(0);
@@ -200,7 +208,7 @@ void StaticShapeInferer::visit(const ir::operation::BCQFullyConnected &op)
// re-sizing output shape
ir::Shape new_shape = shape_inference::inferBCQFullyConnectedShape(
- input.info().shape(), cluster.info().shape(), cluster_buf);
+ input.info().shape(), cluster.info().shape(), cluster_buf);
output.info().shape(new_shape);
}
@@ -225,7 +233,7 @@ void StaticShapeInferer::visit(const ir::operation::BCQGather &op)
// re-sizing output shape
ir::Shape new_shape = shape_inference::inferBCQGatherShape(
- indices.info().shape(), cluster.info().shape(), cluster_buf, rank, op.param());
+ indices.info().shape(), cluster.info().shape(), cluster_buf, rank, op.param());
output.info().shape(new_shape);
}
@@ -298,7 +306,7 @@ void StaticShapeInferer::visit(const ir::operation::Conv2D &op)
// re-sizing output shape
ir::Shape new_shape =
- shape_inference::inferConv2DShape(input.info().shape(), ker.info().shape(), op.param());
+ shape_inference::inferConv2DShape(input.info().shape(), ker.info().shape(), op.param());
output.info().shape(new_shape);
}
@@ -341,9 +349,9 @@ void StaticShapeInferer::visit(const ir::operation::ExpandDims &op)
assert(axis.data()->base());
int32_t axis_value =
- (axis_type == ir::DataType::INT32)
- ? reinterpret_cast<const int32_t *>(axis.data()->base())[0]
- : static_cast<int32_t>(reinterpret_cast<const int64_t *>(axis.data()->base())[0]);
+ (axis_type == ir::DataType::INT32)
+ ? reinterpret_cast<const int32_t *>(axis.data()->base())[0]
+ : static_cast<int32_t>(reinterpret_cast<const int64_t *>(axis.data()->base())[0]);
// re-sizing output shape
ir::Shape new_shape = shape_inference::inferExpandDimsShape(input.info().shape(), axis_value);
@@ -372,10 +380,10 @@ void StaticShapeInferer::visit(const ir::operation::Fill &op)
const auto &dims_shape = shape.info().shape();
auto new_shape = ((dims_type == ir::DataType::INT32)
- ? shape_inference::inferFillShape<int32_t>(
- dims_shape, reinterpret_cast<const int32_t *>(dims_buf))
- : shape_inference::inferFillShape<int64_t>(
- dims_shape, reinterpret_cast<const int64_t *>(dims_buf)));
+ ? shape_inference::inferFillShape<int32_t>(
+ dims_shape, reinterpret_cast<const int32_t *>(dims_buf))
+ : shape_inference::inferFillShape<int64_t>(
+ dims_shape, reinterpret_cast<const int64_t *>(dims_buf)));
output.info().shape(new_shape);
}
@@ -393,7 +401,7 @@ void StaticShapeInferer::visit(const ir::operation::FullyConnected &op)
ir::Operand &output = _operands.at(output_idx);
// re-sizing output shape
ir::Shape new_shape =
- shape_inference::inferFullyConnectedShape(input.info().shape(), ker.info().shape());
+ shape_inference::inferFullyConnectedShape(input.info().shape(), ker.info().shape());
output.info().shape(new_shape);
}
@@ -420,7 +428,7 @@ void StaticShapeInferer::visit(const ir::operation::Gather &op)
// re-sizing output shape
ir::Shape new_shape =
- shape_inference::inferGatherShape(input.info().shape(), indices.info().shape(), axis, rank);
+ shape_inference::inferGatherShape(input.info().shape(), indices.info().shape(), axis, rank);
output.info().shape(new_shape);
}
@@ -465,23 +473,11 @@ void StaticShapeInferer::visit(const ir::operation::If &op)
}
}
- // re-sizing operands of then subgraph
- StaticShapeInferer then_inferer(op.param().then_subg_index, _lowered_subgs);
- _lowered_subgs.at(op.param().then_subg_index)
- ->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- bool has_dynamic_tensor = then_inferer.infer(op_seq);
- op_seq.has_dynamic_tensor(has_dynamic_tensor);
- });
-
- // re-sizing operands of else subgraph
- StaticShapeInferer else_inferer(op.param().else_subg_index, _lowered_subgs);
- _lowered_subgs.at(op.param().else_subg_index)
- ->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- bool has_dynamic_tensor = else_inferer.infer(op_seq);
- op_seq.has_dynamic_tensor(has_dynamic_tensor);
- });
+ inferSubgraph(op.param().then_subg_index);
+ inferSubgraph(op.param().else_subg_index);
// re-sizing output shapes
+ // TODO use then_graph / else_graph instead
const auto &then_outputs = _lowered_subgs.at(op.param().then_subg_index)->graph().getOutputs();
const auto &else_outputs = _lowered_subgs.at(op.param().else_subg_index)->graph().getOutputs();
assert(outputs.size() == then_outputs.size());
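For If, the outer outputs are re-sized from the branch outputs; the implied rule is that an output can stay static only when both branches agree on its shape. A hedged one-function sketch (Shape is a plain alias here, not onert's ir::Shape):

```cpp
#include <vector>

using Shape = std::vector<int>;

// An If output keeps a static shape only if then- and else-branch agree;
// otherwise it must be treated as dynamic.
bool ifOutputIsStatic(const Shape &then_out, const Shape &else_out)
{
  return then_out == else_out;
}

int main()
{
  return ifOutputIsStatic({1, 4}, {1, 4}) ? 0 : 1;
}
```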
@@ -515,14 +511,15 @@ void StaticShapeInferer::visit(const ir::operation::LSTM &op)
auto &output = _operands.at(output_index);
const auto output_state_out_index{
- op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+ op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
const auto cell_state_out_index{op.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
const auto scratch_buffer_index{op.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
- if (output.info().isDynamic() || (_operands.exist(output_state_out_index) &&
- _operands.at(output_state_out_index).info().isDynamic()) ||
+ if (output.info().isDynamic() ||
+ (_operands.exist(output_state_out_index) &&
+ _operands.at(output_state_out_index).info().isDynamic()) ||
(_operands.exist(cell_state_out_index) &&
_operands.at(cell_state_out_index).info().isDynamic()) ||
(_operands.exist(scratch_buffer_index) &&
@@ -533,11 +530,11 @@ void StaticShapeInferer::visit(const ir::operation::LSTM &op)
const auto &input = _operands.at(input_index);
const auto input_to_output_weights_index{
- op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+ op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
const auto &input_to_output_weights = _operands.at(input_to_output_weights_index);
const auto recurrent_to_output_weights_index{
- op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+ op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
const auto &recurrent_to_output_weights = _operands.at(recurrent_to_output_weights_index);
// re-sizing outputs
@@ -575,16 +572,16 @@ void StaticShapeInferer::visit(const ir::operation::LSTM &op)
auto &scratch_buffer = _operands.at(scratch_buffer_index);
const auto input_to_input_weights_index{
- op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)};
+ op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)};
const auto recurrent_to_input_weights_index{
- op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)};
+ op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)};
bool has_input_to_input_weights =
- _operands.at(input_to_input_weights_index).shape().dim(0) != 0 &&
- _operands.at(input_to_input_weights_index).shape().dim(1) != 0;
+ _operands.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+ _operands.at(input_to_input_weights_index).shape().dim(1) != 0;
bool has_recurrent_to_input_weights =
- _operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- _operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+ _operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+ _operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
  // NOTE The cell_to_input_weights do not exist in non-peephole mode even for a regular (non-CIFG) LSTM.
// true: no CIFG
@@ -674,8 +671,8 @@ void StaticShapeInferer::visit(const ir::operation::Pad &op)
// re-sizing output shape
const auto new_shape = shape_inference::inferPadShape(
- input.shape(), reinterpret_cast<const int32_t *>(pad.data()->base()),
- pad.shape().num_elements());
+ input.shape(), reinterpret_cast<const int32_t *>(pad.data()->base()),
+ pad.shape().num_elements());
output.info().shape(new_shape);
}
@@ -722,12 +719,12 @@ void StaticShapeInferer::visit(const ir::operation::Range &op)
if (output.typeInfo().type() == ir::DataType::FLOAT32)
{
new_shape = shape_inference::inferRangeShape<float>(
- start_op.asScalar<float>(), limit_op.asScalar<float>(), delta_op.asScalar<float>());
+ start_op.asScalar<float>(), limit_op.asScalar<float>(), delta_op.asScalar<float>());
}
else if (output.typeInfo().type() == ir::DataType::INT32)
{
new_shape = shape_inference::inferRangeShape<int32_t>(
- start_op.asScalar<int32_t>(), limit_op.asScalar<int32_t>(), delta_op.asScalar<int32_t>());
+ start_op.asScalar<int32_t>(), limit_op.asScalar<int32_t>(), delta_op.asScalar<int32_t>());
}
assert(output.shape() == new_shape);
}
@@ -774,7 +771,7 @@ void StaticShapeInferer::visit(const ir::operation::Reduce &op)
// re-sizing output shape
ir::Shape new_shape =
- shape_inference::inferReduceShape(input.info().shape(), axes_vec, keep_dims);
+ shape_inference::inferReduceShape(input.info().shape(), axes_vec, keep_dims);
output.info().shape(new_shape);
}
@@ -800,7 +797,7 @@ void StaticShapeInferer::visit(const ir::operation::Reshape &op)
assert(shape_buf);
ir::Shape new_shape = shape_inference::inferReshapeShape(
- shape_buf, shape.shape().num_elements(), input.shape().num_elements());
+ shape_buf, shape.shape().num_elements(), input.shape().num_elements());
  // if shape is from Const, TFLC puts the shape of the output into the tensor
if (new_shape != output.shape())
@@ -821,8 +818,8 @@ void StaticShapeInferer::visit(const ir::operation::Reshape &op)
{
// Let's check the new_shape option
auto shape = op.param().new_shape;
- ir::Shape new_shape = shape_inference::inferReshapeShape(shape.data(), shape.size(),
- input.shape().num_elements());
+ ir::Shape new_shape =
+ shape_inference::inferReshapeShape(shape.data(), shape.size(), input.shape().num_elements());
if (new_shape != output.shape())
{
@@ -867,7 +864,7 @@ void StaticShapeInferer::visit(const ir::operation::ResizeBilinear &op)
// Shape inferencing logic based on Params
ir::Shape new_shape =
- shape_inference::inferResizeBilinearShape(input.shape(), height_out, width_out);
+ shape_inference::inferResizeBilinearShape(input.shape(), height_out, width_out);
  // if size_op is from Const, TFLC puts the shape of the output into the tensor
if (new_shape != output.shape())
@@ -898,7 +895,7 @@ void StaticShapeInferer::visit(const ir::operation::Select &op)
  // Select output shape
ir::Shape new_shape = shape_inference::inferSelectShape(
- input_cond.info().shape(), input_true.info().shape(), input_false.info().shape());
+ input_cond.info().shape(), input_true.info().shape(), input_false.info().shape());
output.info().shape(new_shape);
}
@@ -937,11 +934,21 @@ void StaticShapeInferer::visit(const ir::operation::Slice &op)
return;
}
- auto begins_buf = reinterpret_cast<const int32_t *>(begins.data()->base());
- auto sizes_buf = reinterpret_cast<const int32_t *>(sizes.data()->base());
+ auto begins_buf = begins.data()->base();
+ auto sizes_buf = sizes.data()->base();
+
+ const auto begins_type = begins.typeInfo().type();
+ assert(begins_type == ir::DataType::INT32 || begins_type == ir::DataType::INT64);
+ assert(begins_type == sizes.typeInfo().type());
ir::Shape new_shape =
- shape_inference::inferSliceShape(input.info().shape(), begins_buf, sizes_buf);
+ (begins_type == ir::DataType::INT32)
+ ? shape_inference::inferSliceShape<int32_t>(input.info().shape(),
+ reinterpret_cast<const int32_t *>(begins_buf),
+ reinterpret_cast<const int32_t *>(sizes_buf))
+ : shape_inference::inferSliceShape<int64_t>(input.info().shape(),
+ reinterpret_cast<const int64_t *>(begins_buf),
+ reinterpret_cast<const int64_t *>(sizes_buf));
output.info().shape(new_shape);
}
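The Slice change above replaces a fixed int32_t read with a dtype dispatch: the begins/sizes buffers are reinterpreted as INT32 or INT64 and fed to a templated shape routine. A self-contained sketch under the usual TFLite convention that size == -1 means "to the end of the dimension"; inferSliceShapeSketch is a hypothetical name:

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

using Shape = std::vector<int64_t>;

template <typename T>
Shape inferSliceShapeSketch(const Shape &input, const T *begins, const T *sizes)
{
  Shape out(input.size());
  for (std::size_t i = 0; i < input.size(); ++i)
  {
    // size == -1 means "slice to the end of this dimension"
    out[i] = (sizes[i] == static_cast<T>(-1)) ? input[i] - begins[i]
                                              : static_cast<int64_t>(sizes[i]);
  }
  return out;
}

int main()
{
  Shape input{4, 6};
  const int64_t begins[] = {1, 0};
  const int64_t sizes[] = {-1, 3};
  Shape out = inferSliceShapeSketch<int64_t>(input, begins, sizes);
  return (out == Shape{3, 3}) ? 0 : 1;
}
```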
@@ -978,7 +985,7 @@ void StaticShapeInferer::visit(const ir::operation::SpaceToBatchND &op)
auto padding_data = reinterpret_cast<const int32_t *>(padding.data()->base());
ir::Shape new_shape = shape_inference::inferSpaceToBatchNDShape(
- input_shape, block_shape_shape, padding_shape, block_shape_data, padding_data);
+ input_shape, block_shape_shape, padding_shape, block_shape_data, padding_data);
output.info().shape(new_shape);
}
@@ -1012,7 +1019,7 @@ void StaticShapeInferer::visit(const ir::operation::Split &op)
assert(0 <= axis_value && axis_value < rank);
ir::Shape new_shape =
- shape_inference::inferSplitShape(input.info().shape(), axis_value, num_splits);
+ shape_inference::inferSplitShape(input.info().shape(), axis_value, num_splits);
for (auto output_idx : outputs)
{
ir::Operand &output = _operands.at(output_idx);
@@ -1069,10 +1076,10 @@ void StaticShapeInferer::visit(const ir::operation::StridedSlice &op)
auto strides_buf = reinterpret_cast<const uint32_t *>(strides.data()->base());
auto op_params = shape_inference::buildStridedSliceParams(
- starts_buf, ends_buf, strides_buf, begin_mask, end_mask, shrink_axis_mask, rank);
+ starts_buf, ends_buf, strides_buf, begin_mask, end_mask, shrink_axis_mask, rank);
ir::Shape new_shape =
- shape_inference::inferStridedSliceShape(input.info().shape(), op_params, rank);
+ shape_inference::inferStridedSliceShape(input.info().shape(), op_params, rank);
output.info().shape(new_shape);
}
@@ -1224,12 +1231,7 @@ void StaticShapeInferer::visit(const ir::operation::While &op)
}
// re-sizing operands of body subgraph
- StaticShapeInferer body_inferer(op.param().body_subg_index, _lowered_subgs);
- _lowered_subgs.at(op.param().body_subg_index)
- ->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- bool has_dynamic_tensor = body_inferer.infer(op_seq);
- op_seq.has_dynamic_tensor(has_dynamic_tensor);
- });
+ inferSubgraph(op.param().body_subg_index);
// Check whether while operation's shapes are predictable
  // If any shape of the body outputs and cond inputs differs, non-constant operands would be
@@ -1272,23 +1274,13 @@ void StaticShapeInferer::visit(const ir::operation::While &op)
}
// Set non-constant operands of body subgraph to dynamic
- StaticShapeInferer body_inferer(op.param().body_subg_index, _lowered_subgs);
- _lowered_subgs.at(op.param().body_subg_index)
- ->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- bool has_dynamic_tensor = body_inferer.infer(op_seq);
- op_seq.has_dynamic_tensor(has_dynamic_tensor);
- });
+ inferSubgraph(op.param().body_subg_index);
}
// re-sizing operands of cond subgraph
// If check_unpredictable_dynamic is true, non-constant operands of cond subgraph would be set to
// dynamic
- StaticShapeInferer cond_inferer(op.param().cond_subg_index, _lowered_subgs);
- _lowered_subgs.at(op.param().cond_subg_index)
- ->iterateTopolOpSeqs([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- bool has_dynamic_tensor = cond_inferer.infer(op_seq);
- op_seq.has_dynamic_tensor(has_dynamic_tensor);
- });
+ inferSubgraph(op.param().cond_subg_index);
// re-sizing outputs of while operation
// If check_unpredictable_dynamic is true, outputs of while operation would be set to dynamic
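For While, the surrounding hunks encode a predictability rule: if any body-output shape can differ from the matching cond-input shape, loop-carried operands must be treated as dynamic. A sketch with plain aliases in place of onert's types:

```cpp
#include <cstddef>
#include <vector>

using Shape = std::vector<int>;

// True when the loop-carried shapes are stable across iterations; false maps
// to check_unpredictable_dynamic being set in the pass above.
bool whileIsPredictable(const std::vector<Shape> &cond_inputs,
                        const std::vector<Shape> &body_outputs)
{
  if (cond_inputs.size() != body_outputs.size())
    return false;
  for (std::size_t i = 0; i < cond_inputs.size(); ++i)
    if (cond_inputs[i] != body_outputs[i])
      return false;
  return true;
}

int main()
{
  return whileIsPredictable({{1, 4}}, {{1, 4}}) ? 0 : 1;
}
```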
diff --git a/runtime/onert/core/src/compiler/TensorRegistries.h b/runtime/onert/core/src/compiler/TensorRegistries.h
index e42225cbf..2a99db781 100644
--- a/runtime/onert/core/src/compiler/TensorRegistries.h
+++ b/runtime/onert/core/src/compiler/TensorRegistries.h
@@ -21,9 +21,9 @@
#include <memory>
#include "backend/BackendContext.h"
#include "backend/Backend.h"
-#include "backend/controlflow/Config.h"
-#include "backend/controlflow/TensorBuilder.h"
-#include "backend/controlflow/TensorRegistry.h"
+#include "backend/builtin/Config.h"
+#include "backend/builtin/TensorBuilder.h"
+#include "backend/builtin/TensorRegistry.h"
namespace onert
{
@@ -35,17 +35,16 @@ class TensorRegistries
public:
TensorRegistries() = default;
- TensorRegistries(const onert::backend::BackendContexts &backend_contexts,
- bool include_controlflow)
+ TensorRegistries(const onert::backend::BackendContexts &backend_contexts, bool include_builtin)
{
for (const auto &e : backend_contexts)
{
auto tensor_reg = e.second->tensor_registry;
- if (e.first->config()->id() == backend::controlflow::Config::ID)
+ if (e.first->config()->id() == backend::builtin::Config::ID)
{
- _cf_tensor_reg =
- std::dynamic_pointer_cast<backend::controlflow::TensorRegistry>(tensor_reg);
- if (include_controlflow)
+ _builtin_tensor_reg =
+ std::dynamic_pointer_cast<backend::builtin::TensorRegistry>(tensor_reg);
+ if (include_builtin)
_tensor_regs.insert(tensor_reg);
}
else
@@ -64,9 +63,9 @@ public:
return _tensor_regs.cend();
}
- std::shared_ptr<backend::controlflow::TensorRegistry> getControlflowTensorRegistry() const
+ std::shared_ptr<backend::builtin::TensorRegistry> getBuiltinTensorRegistry() const
{
- return _cf_tensor_reg;
+ return _builtin_tensor_reg;
}
backend::ITensor *getITensor(ir::OperandIndex ind) const
@@ -82,7 +81,7 @@ public:
private:
std::unordered_set<std::shared_ptr<backend::ITensorRegistry>> _tensor_regs;
- std::shared_ptr<backend::controlflow::TensorRegistry> _cf_tensor_reg;
+ std::shared_ptr<backend::builtin::TensorRegistry> _builtin_tensor_reg;
};
} // namespace compiler
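TensorRegistries above keeps one typed pointer to the builtin registry while collecting the rest generically, using std::dynamic_pointer_cast. A minimal sketch of that selection pattern; Registry and BuiltinRegistry are stand-ins for onert's interfaces:

```cpp
#include <memory>
#include <unordered_set>
#include <vector>

struct Registry
{
  virtual ~Registry() = default;
};
struct BuiltinRegistry : Registry
{
};

int main()
{
  std::vector<std::shared_ptr<Registry>> contexts{
    std::make_shared<Registry>(), std::make_shared<BuiltinRegistry>()};

  std::unordered_set<std::shared_ptr<Registry>> regs;
  std::shared_ptr<BuiltinRegistry> builtin;
  for (auto &r : contexts)
  {
    if (auto b = std::dynamic_pointer_cast<BuiltinRegistry>(r))
      builtin = b; // remembered separately, like _builtin_tensor_reg
    else
      regs.insert(r);
  }
  return builtin ? 0 : 1;
}
```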
diff --git a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc
index ef6240894..89dd303d4 100644
--- a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc
+++ b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.cc
@@ -17,8 +17,9 @@
#include "ConstantInsertionPass.h"
#include "backend/Backend.h"
-#include <ir/Graph.h>
-#include <util/Utils.h>
+#include "ir/Graph.h"
+#include "util/Utils.h"
+#include "util/logging.h"
namespace onert
{
@@ -29,11 +30,10 @@ namespace pass
void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::Operation &node)
{
- const auto &op_sequence_index = _lowered_graph.op_seqs().getOperation(node_index);
- const auto op_seq_lower_info = _lowered_graph.getLowerInfo(op_sequence_index);
- const auto backend = op_seq_lower_info->backend();
- const auto layout = op_seq_lower_info->layout();
- const auto factor = ir::operand::PermuteFactor{backend, layout};
+ const auto op_lower_info = _lowered_graph.lower_info().operation.getRawPtr(node_index);
+ const auto backend = op_lower_info->backend();
+ const auto layout = op_lower_info->layout();
+ const auto factor = PermuteFactor{backend, layout};
for (const auto input : node.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
{
@@ -45,21 +45,12 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O
if (_replace_operands_map.count(key) == 0)
{
ir::Operand new_object(object);
- new_object.unsetDef();
- // TODO Remove const_case
- const_cast<ir::OperationIndexSet &>(new_object.getUses()).clear();
+ new_object.clearDefUse();
const auto new_index = _graph.operands().emplace(new_object);
_replace_operands_map[key] = new_index;
}
const auto replaced_input = _replace_operands_map[key];
- // Update op_seq
- if (_lowered_graph.op_seqs().at(op_sequence_index).getInputs().contains(input))
- {
- // All inputs of op_seq have the same PermuteFactor because those inputs are inputs of first
- // operation
- _lowered_graph.op_seqs().at(op_sequence_index).replaceInputs(input, replaced_input);
- }
// Update the same inputs of a node at once because inputs of an operation have the same
// PermuteFactor
@@ -69,6 +60,8 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O
auto &replaced_object = _graph.operands().at(replaced_input);
replaced_object.insertUse(node_index);
+ VERBOSE(ConstInsertPass) << "New operand " << replaced_input << " added(copy of " << input
+ << ") for " << factor << std::endl;
// Remove this node from uses of origin operand
// Constant operand has no def.
assert(!object.getDef().valid());
@@ -76,7 +69,11 @@ void ConstantInsertionPass::callback(const ir::OperationIndex &node_index, ir::O
// Remove origin operand
if (object.getUses().size() == 0)
+ {
_graph.removeOperand(input);
+ VERBOSE(ConstInsertPass) << "Original operand " << input << " removed - no uses"
+ << std::endl;
+ }
}
}
diff --git a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.h b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.h
index 052883c92..4911ace2f 100644
--- a/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.h
+++ b/runtime/onert/core/src/compiler/pass/ConstantInsertionPass.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_COMPILER_PASS_CONSTANT_INSERTION_PASS_H__
#define __ONERT_COMPILER_PASS_CONSTANT_INSERTION_PASS_H__
-#include <ir/operand/PermuteFactor.h>
+#include <compiler/PermuteFactor.h>
#include <ir/Index.h>
#include "LoweredOperationPass.h"
#include <unordered_map>
@@ -45,7 +45,7 @@ private:
struct ReplaceKey
{
ir::OperandIndex index;
- ir::operand::PermuteFactor factor;
+ PermuteFactor factor;
bool operator==(const ReplaceKey &other) const
{
@@ -61,8 +61,7 @@ private:
std::size_t operator()(const ReplaceKey &key) const noexcept
{
using std::hash;
- return hash<ir::OperandIndex>()(key.index) ^
- (hash<ir::operand::PermuteFactor>()(key.factor) << 1);
+ return hash<ir::OperandIndex>()(key.index) ^ (hash<PermuteFactor>()(key.factor) << 1);
}
};
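ConstantInsertionPass dedupes cloned constants with a map keyed on (operand index, PermuteFactor), combining the two hashes with the XOR-and-shift shown in the header above. A sketch with simplified key types:

```cpp
#include <cstddef>
#include <functional>
#include <unordered_map>

struct Key
{
  int operand;
  int factor; // stand-in for PermuteFactor (backend + layout)
  bool operator==(const Key &o) const { return operand == o.operand && factor == o.factor; }
};

struct KeyHash
{
  std::size_t operator()(const Key &k) const noexcept
  {
    // Same XOR-with-shift combine as the ReplaceKey hash above
    return std::hash<int>()(k.operand) ^ (std::hash<int>()(k.factor) << 1);
  }
};

int main()
{
  std::unordered_map<Key, int, KeyHash> replace_map;
  Key k{7, 1};
  if (replace_map.count(k) == 0)
    replace_map[k] = 42; // clone the constant once per (operand, factor)
  return replace_map.at(k) == 42 ? 0 : 1;
}
```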
diff --git a/runtime/onert/core/src/compiler/pass/ConstantLoweringPass.cc b/runtime/onert/core/src/compiler/pass/ConstantLoweringPass.cc
index 1c1dbe0ee..6ed154548 100644
--- a/runtime/onert/core/src/compiler/pass/ConstantLoweringPass.cc
+++ b/runtime/onert/core/src/compiler/pass/ConstantLoweringPass.cc
@@ -18,8 +18,9 @@
#include "backend/Backend.h"
#include <ir/Graph.h>
-#include <ir/operand/PermuteFactor.h>
+#include <compiler/PermuteFactor.h>
#include <util/Utils.h>
+#include "util/logging.h"
namespace onert
{
@@ -30,11 +31,10 @@ namespace pass
void ConstantLoweringPass::callback(const ir::OperationIndex &node_index, ir::Operation &node)
{
- const auto &op_sequence_index = _lowered_graph.op_seqs().getOperation(node_index);
- const auto op_seq_lower_info = _lowered_graph.getLowerInfo(op_sequence_index);
- const auto backend = op_seq_lower_info->backend();
- const auto layout = op_seq_lower_info->layout();
- const auto factor = ir::operand::PermuteFactor{backend, layout};
+ const auto op_lower_info = _lowered_graph.lower_info().operation.getRawPtr(node_index);
+ const auto backend = op_lower_info->backend();
+ const auto layout = op_lower_info->layout();
+ const auto factor = PermuteFactor{backend, layout};
  // Now this runtime does not support a node that makes an operation's output a constant
for (const auto input : node.getInputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
@@ -44,9 +44,10 @@ void ConstantLoweringPass::callback(const ir::OperationIndex &node_index, ir::Op
{
  // All constant operands are already assigned to each backend by ConstantInsertionPass. So a
  // constant has `def` and `use` with the same PermuteFactor
- _lowered_graph.setLowerInfo(input, std::make_unique<ir::operand::LowerInfo>());
- _lowered_graph.getLowerInfo(input)->addDefPermuteFactor(factor);
- _lowered_graph.getLowerInfo(input)->addUsePermuteFactor(factor);
+ auto operand_li = std::make_unique<compiler::OperandLowerInfo>();
+ operand_li->addDefPermuteFactor(factor);
+ operand_li->addUsePermuteFactor(factor);
+ _lowered_graph.lower_info().operand.set(input, std::move(operand_li));
}
}
}
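The lowering above gives each constant a fresh OperandLowerInfo whose def and use factors are the same (backend, layout) pair. A tiny sketch of that invariant, with OperandLI and Factor as stand-in types:

```cpp
#include <set>
#include <utility>

using Factor = std::pair<int, int>; // (backend id, layout id)

struct OperandLI
{
  std::set<Factor> def_factors;
  std::set<Factor> use_factors;
};

OperandLI lowerConstant(Factor factor)
{
  OperandLI li;
  li.def_factors.insert(factor); // same factor for def...
  li.use_factors.insert(factor); // ...and for use
  return li;
}

int main()
{
  auto li = lowerConstant({0, 0});
  return (li.def_factors == li.use_factors) ? 0 : 1;
}
```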
diff --git a/runtime/onert/core/src/compiler/pass/LoweredOperandPass.h b/runtime/onert/core/src/compiler/pass/LoweredOperandPass.h
index 0c5f7d745..1f1f32f6d 100644
--- a/runtime/onert/core/src/compiler/pass/LoweredOperandPass.h
+++ b/runtime/onert/core/src/compiler/pass/LoweredOperandPass.h
@@ -31,7 +31,7 @@ class LoweredOperandPass : public OperandPass
{
public:
LoweredOperandPass(compiler::LoweredGraph &lowered_graph)
- : OperandPass{lowered_graph.graph()}, _lowered_graph{lowered_graph}
+ : OperandPass{lowered_graph.graph()}, _lowered_graph{lowered_graph}
{
// DO NOTHING
}
diff --git a/runtime/onert/core/src/compiler/pass/LoweredOperationPass.h b/runtime/onert/core/src/compiler/pass/LoweredOperationPass.h
index 5c8569be2..76ee3d7ff 100644
--- a/runtime/onert/core/src/compiler/pass/LoweredOperationPass.h
+++ b/runtime/onert/core/src/compiler/pass/LoweredOperationPass.h
@@ -31,7 +31,7 @@ class LoweredOperationPass : public OperationPass
{
public:
LoweredOperationPass(LoweredGraph &lowered_graph)
- : OperationPass{lowered_graph.graph()}, _lowered_graph{lowered_graph}
+ : OperationPass{lowered_graph.graph()}, _lowered_graph{lowered_graph}
{
// DO NOTHING
}
diff --git a/runtime/onert/core/src/compiler/pass/OperandPass.cc b/runtime/onert/core/src/compiler/pass/OperandPass.cc
index 50c001c30..db8ebedcd 100644
--- a/runtime/onert/core/src/compiler/pass/OperandPass.cc
+++ b/runtime/onert/core/src/compiler/pass/OperandPass.cc
@@ -28,7 +28,7 @@ namespace pass
void OperandPass::run()
{
_graph.operands().iterate(
- [&](const ir::OperandIndex &index, ir::Operand &object) { callback(index, object); });
+ [&](const ir::OperandIndex &index, ir::Operand &object) { callback(index, object); });
}
} // namespace pass
diff --git a/runtime/onert/core/src/compiler/pass/OperationPass.cc b/runtime/onert/core/src/compiler/pass/OperationPass.cc
index d7a55cb22..357a8798a 100644
--- a/runtime/onert/core/src/compiler/pass/OperationPass.cc
+++ b/runtime/onert/core/src/compiler/pass/OperationPass.cc
@@ -30,7 +30,7 @@ namespace pass
void OperationPass::run()
{
_graph.operations().iterate(
- [&](const ir::OperationIndex &index, ir::Operation &node) { callback(index, node); });
+ [&](const ir::OperationIndex &index, ir::Operation &node) { callback(index, node); });
}
} // namespace pass
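OperandPass and OperationPass above share one pattern: the base class owns iteration and each derived pass only supplies callback(). A hedged sketch of that template-method style; all names here are simplified stand-ins:

```cpp
#include <iostream>
#include <map>
#include <string>

struct OperandPassSketch
{
  std::map<int, std::string> &operands;
  explicit OperandPassSketch(std::map<int, std::string> &ops) : operands(ops) {}
  virtual ~OperandPassSketch() = default;
  virtual void callback(int index, std::string &object) = 0;
  void run()
  {
    // iterate() drives the pass; derived classes never loop themselves
    for (auto &pair : operands)
      callback(pair.first, pair.second);
  }
};

struct DumpPass : OperandPassSketch
{
  using OperandPassSketch::OperandPassSketch;
  void callback(int index, std::string &object) override
  {
    std::cout << "Operand " << index << ": " << object << "\n";
  }
};

int main()
{
  std::map<int, std::string> operands{{0, "input"}, {1, "weights"}};
  DumpPass{operands}.run();
  return 0;
}
```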
diff --git a/runtime/onert/core/src/compiler/pass/Pass.h b/runtime/onert/core/src/compiler/pass/Pass.h
index 3f356c337..3016df490 100644
--- a/runtime/onert/core/src/compiler/pass/Pass.h
+++ b/runtime/onert/core/src/compiler/pass/Pass.h
@@ -24,7 +24,7 @@ namespace onert
namespace ir
{
class Graph;
-} // namespace compiler
+} // namespace ir
} // namespace onert
namespace onert
diff --git a/runtime/onert/core/src/compiler/pass/PassRunner.cc b/runtime/onert/core/src/compiler/pass/PassRunner.cc
index 2a058c8ac..1be6d7794 100644
--- a/runtime/onert/core/src/compiler/pass/PassRunner.cc
+++ b/runtime/onert/core/src/compiler/pass/PassRunner.cc
@@ -36,7 +36,7 @@ void PassRunner::run()
VERBOSE(PassRunner) << "Start running '" << pass->id() << "'" << std::endl;
pass->run();
VERBOSE(PassRunner) << "Finished running '" << pass->id() << "'" << std::endl;
- // TODO Dump graph(LowerInfo, OpSequence, ...)?
+ // TODO Dump graph?
}
}
diff --git a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc
index 504f1b995..181f388de 100644
--- a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc
+++ b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.cc
@@ -15,7 +15,7 @@
*/
#include "PermutationEliminationPass.h"
-#include "backend/controlflow/Config.h"
+#include "backend/builtin/Config.h"
#include "util/logging.h"
@@ -39,8 +39,9 @@ void PermutationEliminationPass::visit(const ir::operation::Permute &node)
// Check if two tensors are both portable if not, we can't eliminate the node
{
- auto in_def_factor = _lowered_graph.getLowerInfo(in_operand)->def_factors().getOnlyElement();
- auto out_def_factor = _lowered_graph.getLowerInfo(out_operand)->def_factors().getOnlyElement();
+ auto &operand_li_map = _lowered_graph.lower_info().operand;
+ auto in_def_factor = operand_li_map.getRawPtr(in_operand)->def_factors().getOnlyElement();
+ auto out_def_factor = operand_li_map.getRawPtr(out_operand)->def_factors().getOnlyElement();
auto in_config = in_def_factor.backend()->config();
auto out_config = out_def_factor.backend()->config();
@@ -73,53 +74,30 @@ void PermutationEliminationPass::visit(const ir::operation::Permute &node)
auto &out_operand_obj = _graph.operands().at(out_operand);
assert(out_operand_obj.getDef() == _op_ind);
out_operand_obj.unsetDef();
- _lowered_graph.op_seqs().iterate([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- if (!op_seq.getOutputs().contains(in_operand))
+ _graph.operations().iterate([&](const ir::OperationIndex &op_ind, ir::Operation &op) {
+ if (!op.getOutputs().contains(in_operand))
return;
-
- // Update OpSequence/ir::Operation edges and ir::Operand edges
- op_seq.replaceOutputs(in_operand, out_operand);
- for (auto op : op_seq.operations())
- {
- auto &operation_obj = _graph.operations().at(op);
- if (operation_obj.getOutputs().contains(in_operand))
- {
- operation_obj.replaceOutputs(in_operand, out_operand);
- out_operand_obj.setDef(op);
- }
- }
+ // Update Operation and Operand edges
+ op.replaceOutputs(in_operand, out_operand);
+ out_operand_obj.setDef(op_ind);
});
- // Remove Permute operation, enclosing OpSequence and the operand
+ // Remove Permute operation and the operand
{
_graph.removeOperand(in_operand);
-
- auto op_seq_ind = _lowered_graph.op_seqs().getOperation(_op_ind);
- // Assumes enclosing OpSequence contatins just this Permute operation
- assert(_lowered_graph.op_seqs().at(op_seq_ind).size() == 1);
- _lowered_graph.op_seqs().remove(op_seq_ind);
_graph.operations().remove(_op_ind);
}
- _lowered_graph.op_seqs().iterate([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- if (!op_seq.getInputs().contains(in_operand))
+ _graph.operations().iterate([&](const ir::OperationIndex &op_ind, ir::Operation &op) {
+ if (!op.getInputs().contains(in_operand))
return;
-
- op_seq.replaceInputs(in_operand, out_operand);
- for (auto op : op_seq.operations())
- {
- auto &operation_obj = _graph.operations().at(op);
- if (operation_obj.getInputs().contains(in_operand))
- {
- operation_obj.replaceInputs(in_operand, out_operand);
- out_operand_obj.insertUse(op);
- }
- }
+ op.replaceInputs(in_operand, out_operand);
+ out_operand_obj.insertUse(op_ind);
});
VERBOSE(removePermute) << "Permute Op removed, node index : " << _op_ind << std::endl;
- VERBOSE(removePermute) << " - Input (removed) ir::Operand : " << in_operand << std::endl;
- VERBOSE(removePermute) << " - Output(kept) ir::Operand : " << out_operand << std::endl;
+ VERBOSE(removePermute) << " - Input (removed) Operand : " << in_operand << std::endl;
+ VERBOSE(removePermute) << " - Output(kept) Operand : " << out_operand << std::endl;
}
else
{
@@ -128,37 +106,23 @@ void PermutationEliminationPass::visit(const ir::operation::Permute &node)
auto &in_operand_obj = _graph.operands().at(in_operand);
in_operand_obj.removeUse(_op_ind);
- // Make OpSequences(that use the output) use the input
- _lowered_graph.op_seqs().iterate([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
- if (!op_seq.getInputs().contains(out_operand))
+ // Make operations(that use the output) use the input
+ _graph.operations().iterate([&](const ir::OperationIndex &op_ind, ir::Operation &op) {
+ if (!op.getInputs().contains(out_operand))
return;
-
- op_seq.replaceInputs(out_operand, in_operand);
- for (auto op : op_seq.operations())
- {
- auto &operation_obj = _graph.operations().at(op);
- if (operation_obj.getInputs().contains(out_operand))
- {
- operation_obj.replaceInputs(out_operand, in_operand);
- in_operand_obj.insertUse(op);
- }
- }
+ op.replaceInputs(out_operand, in_operand);
+ in_operand_obj.insertUse(op_ind);
});
- // Remove Permute operation, enclosing OpSequence and the operand
+ // Remove the Permute operation and out_operand
{
_graph.removeOperand(out_operand);
-
- auto op_seq_ind = _lowered_graph.op_seqs().getOperation(_op_ind);
- // Assumes enclosing OpSequence contatins just this Permute operation
- assert(_lowered_graph.op_seqs().at(op_seq_ind).size() == 1);
- _lowered_graph.op_seqs().remove(op_seq_ind);
_graph.operations().remove(_op_ind);
}
- VERBOSE(removePermute) << "Permute Op removed, node index : " << _op_ind << std::endl;
- VERBOSE(removePermute) << " - Input (kept) ir::Operand : " << in_operand << std::endl;
- VERBOSE(removePermute) << " - Output(removed) ir::Operand : " << out_operand << std::endl;
+ VERBOSE(removePermute) << "Permute Op removed : " << _op_ind << std::endl;
+ VERBOSE(removePermute) << " - Input (kept) Operand : " << in_operand << std::endl;
+ VERBOSE(removePermute) << " - Output(removed) Operand : " << out_operand << std::endl;
}
}
diff --git a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.h b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.h
index 29daf1a82..50c38c53f 100644
--- a/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.h
+++ b/runtime/onert/core/src/compiler/pass/PermutationEliminationPass.h
@@ -35,7 +35,7 @@ namespace pass
* are compatible and layouts match.
*
* Permute input tensor is kept and the output is removed for all the cases, except model outputs.
- * As all output tensors have to be controlflow backend, so the output is kept.
+ * As all output tensors have to be on the builtin backend, the output is kept.
*
* @note This is an optimization pass which means that everything should work fine even if this pass
* was skipped.
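As the comment block explains, the pass drops a Permute whose endpoints are compatible by rewiring consumers from the output operand back to the input operand. A toy-graph sketch of that rewiring (Node and eliminatePermute are illustrative only):

```cpp
#include <algorithm>
#include <vector>

struct Node
{
  std::vector<int> inputs;
};

void eliminatePermute(std::vector<Node> &nodes, int in_operand, int out_operand)
{
  // Every consumer of the Permute's output now reads the Permute's input
  for (auto &n : nodes)
    std::replace(n.inputs.begin(), n.inputs.end(), out_operand, in_operand);
  // The Permute node and out_operand would then be removed from the graph.
}

int main()
{
  std::vector<Node> nodes{{{5}}, {{5, 2}}};
  eliminatePermute(nodes, /*in=*/3, /*out=*/5);
  return (nodes[0].inputs[0] == 3 && nodes[1].inputs[0] == 3) ? 0 : 1;
}
```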
diff --git a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc
index 8467d51c8..6f9899114 100644
--- a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc
+++ b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.cc
@@ -9,6 +9,7 @@
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
@@ -20,9 +21,9 @@
#include <utility>
#include <unordered_map>
-#include "backend/controlflow/Config.h"
+#include "backend/builtin/Config.h"
#include "ir/Operand.h"
-#include "ir/operation/LowerInfo.h"
+#include "compiler/OperationLowerInfo.h"
#include "ir/Graph.h"
#include "backend/IConfig.h"
#include "util/logging.h"
@@ -38,7 +39,8 @@ namespace pass
void PermutationInsertionPass::callback(const ir::OperandIndex &index, ir::Operand &object)
{
- auto &&operand_li = _lowered_graph.getLowerInfo(index);
+ auto &operand_li_map = _lowered_graph.lower_info().operand;
+ auto &&operand_li = operand_li_map.getRawPtr(index);
assert(operand_li);
  // NOTE Later, constants will also have Def
@@ -51,7 +53,7 @@ void PermutationInsertionPass::callback(const ir::OperandIndex &index, ir::Opera
std::list<ir::OperationIndex> permute_indexes;
// Build a map for all necessary type of operands
- std::unordered_map<ir::operand::PermuteFactor, ir::OperandIndex> factor_to_index;
+ std::unordered_map<PermuteFactor, ir::OperandIndex> factor_to_index;
{
assert(operand_li->def_factors().size() == 1);
for (auto factor : operand_li->def_factors())
@@ -82,26 +84,17 @@ void PermutationInsertionPass::callback(const ir::OperandIndex &index, ir::Opera
continue;
auto &operation = _graph.operations().at(use);
- assert(_lowered_graph.op_seqs().containsOperation(use));
- auto op_seq_index = _lowered_graph.op_seqs().getOperation(use);
- auto op_seq_li = _lowered_graph.getLowerInfo(op_seq_index);
- assert(op_seq_li);
- const auto op_seq_layout = op_seq_li->layout();
- const backend::Backend *backend = op_seq_li->backend();
+ auto op_li = _lowered_graph.lower_info().operation.getRawPtr(use);
+ assert(op_li);
+ const auto op_layout = op_li->layout();
+ const backend::Backend *backend = op_li->backend();
assert(backend);
auto use_node_inputs = operation.getInputs();
assert(use_node_inputs.contains(index));
- auto new_index = factor_to_index.at({backend, op_seq_layout});
+ auto new_index = factor_to_index.at({backend, op_layout});
if (index != new_index)
{
- // Update from op_seq
- // Replace the same inputs of an OpSequence at once for the following reasons:
- // 1. An OpSequence's inputs are the same inputs of first operation
- // 2. An OpSequence may have inputs as the same operand (2 or more).
- // 3. The same inputs of OpSequence have the same PermuteFactor.
- _lowered_graph.op_seqs().at(op_seq_index).replaceInputs(index, new_index);
-
// Update from operation
// Replace the same inputs of an operation at once for the following reasons:
// No. 2 and 3 above
@@ -109,7 +102,7 @@ void PermutationInsertionPass::callback(const ir::OperandIndex &index, ir::Opera
// Update from operand
remove_list.push_back(
- use); // Removal should be done in another loop since we are in the loop
+ use); // Removal should be done in another loop since we are in the loop
_graph.operands().at(new_index).insertUse(use);
}
}
@@ -122,52 +115,52 @@ void PermutationInsertionPass::callback(const ir::OperandIndex &index, ir::Opera
}
ir::OperationIndex PermutationInsertionPass::insertPermute(const ir::OperandIndex &operand_index,
- const ir::operand::PermuteFactor &factor)
+ const PermuteFactor &factor)
{
- assert(!_graph.isBuildingPhase());
-
auto &operand = _graph.operands().at(operand_index);
// Generate output operand and permute operation
auto out_operand_index = _graph.addOperand(operand.shape(), operand.typeInfo());
- // change model output if operand_index is model output index and the out operand is controlflow
+ // change model output if operand_index is model output index and the out operand is builtin
// backend
auto &model_outputs = _graph.getOutputs();
- const backend::Backend *cf_backend = compiler::BackendManager::get().getControlflow();
- if (model_outputs.contains(operand_index) && factor.backend() == cf_backend)
+ const backend::Backend *builtin_backend = compiler::BackendManager::get().getBuiltin();
+ if (model_outputs.contains(operand_index) && factor.backend() == builtin_backend)
{
model_outputs.replace(operand_index, out_operand_index);
}
+ auto &operand_li_map = _lowered_graph.lower_info().operand;
+
// Find Permute information
- auto input_factor = _lowered_graph.getLowerInfo(operand_index)->def_factors().getOnlyElement();
+ auto input_factor = operand_li_map.getRawPtr(operand_index)->def_factors().getOnlyElement();
auto input_backend = input_factor.backend();
auto output_backend = factor.backend();
  // NOTE Permute may not have a specific layout because the layouts of input and output may
  // differ.
const auto permute_node_layout = ir::Layout::UNKNOWN;
// NOTE If one backend supports several layout, the backend must support Permute operation
- const backend::Backend *permute_node_backend = compiler::BackendManager::get().getControlflow();
+ const backend::Backend *permute_node_backend = compiler::BackendManager::get().getBuiltin();
if (input_backend == output_backend)
{
permute_node_backend = input_backend;
}
- const ir::operand::PermuteFactor permute_node_factor{permute_node_backend, permute_node_layout};
+ const PermuteFactor permute_node_factor{permute_node_backend, permute_node_layout};
// Update LowerInfo of input operand
- auto operand_lower_info = _lowered_graph.getLowerInfo(operand_index);
+ auto operand_lower_info = operand_li_map.getRawPtr(operand_index);
operand_lower_info->removeUsePermuteFactor(factor);
operand_lower_info->addUsePermuteFactor(permute_node_factor);
// Update LowerInfo of output operand
- auto out_operand_li = std::make_unique<ir::operand::LowerInfo>();
+ auto out_operand_li = std::make_unique<compiler::OperandLowerInfo>();
  // The input and output factors of all nodes will be the same except Permute. So Tensor's
  // allocators allocate memory using only the def permute factor information for now.
// TODO Change param to permute_node_factor
out_operand_li->addDefPermuteFactor(factor);
out_operand_li->addUsePermuteFactor(factor);
- _lowered_graph.setLowerInfo(out_operand_index, std::move(out_operand_li));
+ operand_li_map.set(out_operand_index, std::move(out_operand_li));
// Insert permute operation to the graph
const auto input_layout = input_factor.layout();
@@ -190,7 +183,6 @@ ir::OperationIndex PermutationInsertionPass::insertPermute(const ir::OperandInde
auto insert_node = std::make_unique<Permute>(operand_index, out_operand_index, permute_type);
auto node_index = _graph.operations().push(std::move(insert_node));
- const auto &node = _graph.operations().at(node_index);
VERBOSE_F() << "Permute Op inserted, node index : " << node_index << std::endl;
VERBOSE_F() << " - Input (original) Operand : " << operand_index << "("
@@ -198,14 +190,11 @@ ir::OperationIndex PermutationInsertionPass::insertPermute(const ir::OperandInde
VERBOSE_F() << " - Output(inserted) Operand : " << out_operand_index << "("
<< factor.backend()->config()->id() << ")" << std::endl;
- // OpSequence
+ // Operation LowerInfo
{
- auto op_seq_index = _lowered_graph.op_seqs().emplace(node_index, permute_node_layout);
- auto &op_seq = _lowered_graph.op_seqs().at(op_seq_index);
- op_seq.setInputs(node.getInputs());
- op_seq.setOutputs(node.getOutputs());
- _lowered_graph.setLowerInfo(op_seq_index, std::make_unique<ir::operation::LowerInfo>(
- permute_node_backend, permute_node_layout));
+ auto &operation_li_map = _lowered_graph.lower_info().operation;
+ operation_li_map.set(node_index, std::make_unique<compiler::OperationLowerInfo>(
+ permute_node_backend, permute_node_layout));
}
// Update Use/Def info
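insertPermute above picks the Permute node's backend as follows: reuse the producing backend when producer and consumer agree, otherwise fall back to the builtin backend. A sketch with an enum stand-in for Backend:

```cpp
enum class Backend { Builtin, CPU, ACL };

Backend choosePermuteBackend(Backend input, Backend output)
{
  // NOTE If one backend supports several layouts it must support Permute,
  // so a same-backend permute stays on that backend.
  return (input == output) ? input : Backend::Builtin;
}

int main()
{
  bool same = choosePermuteBackend(Backend::CPU, Backend::CPU) == Backend::CPU;
  bool mixed = choosePermuteBackend(Backend::CPU, Backend::ACL) == Backend::Builtin;
  return (same && mixed) ? 0 : 1;
}
```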
diff --git a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.h b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.h
index 758515385..ee0a1464c 100644
--- a/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.h
+++ b/runtime/onert/core/src/compiler/pass/PermutationInsertionPass.h
@@ -20,7 +20,7 @@
#include "LoweredOperandPass.h"
#include "compiler/BackendManager.h"
#include "ir/Operand.h"
-#include "ir/operand/PermuteFactor.h"
+#include "compiler/PermuteFactor.h"
namespace onert
{
@@ -48,7 +48,7 @@ private:
* @return ir::OperationIndex
*/
ir::OperationIndex insertPermute(const ir::OperandIndex &operand_index,
- const ir::operand::PermuteFactor &factor);
+ const PermuteFactor &factor);
};
} // namespace pass
diff --git a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc
index 93d125307..f83b1ba31 100644
--- a/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc
+++ b/runtime/onert/core/src/compiler/pass/PermutationOperationPass.cc
@@ -33,7 +33,7 @@ using namespace ir;
void PermutationOperationPass::callback(const OperationIndex &, Operation &node)
{
node.accept(*this);
-};
+}
// TODO Remove this. Expanding ranks of Operand is dangerous
void PermutationOperationPass::applyExpandRanks(const Operation &node)
@@ -43,9 +43,8 @@ void PermutationOperationPass::applyExpandRanks(const Operation &node)
assert(output.getDef().valid());
const auto node_index = output.getDef();
- const auto &op_seq_index = _lowered_graph.op_seqs().getOperation(node_index);
- const auto frontend_layout = _lowered_graph.op_seqs().at(op_seq_index).getLayout();
- const auto backend_layout = _lowered_graph.getLowerInfo(op_seq_index)->layout();
+ const auto frontend_layout = _graph.layout();
+ const auto backend_layout = _lowered_graph.lower_info().operation.getRawPtr(node_index)->layout();
if (frontend_layout == backend_layout)
{
@@ -84,10 +83,11 @@ void PermutationOperationPass::changeToKeepLayout(const Operation &node)
assert(output_obj.getDef().valid());
const auto node_index = output_obj.getDef();
- const auto &op_seq_index = _lowered_graph.op_seqs().getOperation(node_index);
- const auto frontend_layout = _lowered_graph.op_seqs().at(op_seq_index).getLayout();
- const auto backend_layout = _lowered_graph.getLowerInfo(op_seq_index)->layout();
+ auto &operation_li_map = _lowered_graph.lower_info().operation;
+ auto &operand_li_map = _lowered_graph.lower_info().operand;
+ const auto frontend_layout = _graph.layout();
+ const auto backend_layout = operation_li_map.getRawPtr(node_index)->layout();
if (frontend_layout == backend_layout)
{
@@ -97,96 +97,27 @@ void PermutationOperationPass::changeToKeepLayout(const Operation &node)
// Permutation changing layout beyond 4-D is not supported yet
assert(output_obj.shape().rank() <= 4);
- // Divide op_seq based on target operation
+ // Change PermuteFactors of operands and the operation of target node
{
- auto &prev_op_seq = _lowered_graph.op_seqs().at(op_seq_index);
- auto &operations = _lowered_graph.graph().operations();
-
- // Create new op_seq and move information from existing op_seq to new op_seq if target
- // node is the end of op_seq
- auto it = prev_op_seq.begin();
- // Find iterator of target node in op_seq
- while (*(it++) != node_index)
- ;
- if (it != prev_op_seq.end())
- {
- const auto &target_op_idx = *it;
- const auto &target_node = operations.at(target_op_idx);
- const auto &next_op_seq_index =
- _lowered_graph.op_seqs().emplace(target_op_idx, prev_op_seq.getLayout());
- auto &next_op_seq = _lowered_graph.op_seqs().at(next_op_seq_index);
- next_op_seq.setInputs(target_node.getInputs());
- next_op_seq.setOutputs(target_node.getOutputs());
-
- std::vector<OperationIndex> remove_list;
- remove_list.emplace_back(target_op_idx);
- while (++it != prev_op_seq.end())
- {
- next_op_seq.appendOperation(target_op_idx);
- next_op_seq.setOutputs(target_node.getOutputs());
- remove_list.emplace_back(target_op_idx);
- }
+ const auto op_li = operation_li_map.getRawPtr(node_index);
+ const auto backend = op_li->backend();
- prev_op_seq.setOutputs(node.getOutputs());
- for (const auto &index : remove_list)
- {
- prev_op_seq.remove(index);
- }
-
- const auto op_seq_li = _lowered_graph.getLowerInfo(op_seq_index);
- _lowered_graph.setLowerInfo(
- next_op_seq_index,
- std::make_unique<ir::operation::LowerInfo>(op_seq_li->backend(), op_seq_li->layout()));
- }
- }
-
- // Remove target operation from op_seq and insert the target operation to new op_seq
- {
- const auto backend = _lowered_graph.getLowerInfo(op_seq_index)->backend();
+ operation_li_map.set(node_index,
+ std::make_unique<compiler::OperationLowerInfo>(backend, frontend_layout));
- // Remove target operation from op_sequence
- _lowered_graph.op_seqs().removeFromOpSequence(node_index);
-
- if (!_lowered_graph.op_seqs().exist(op_seq_index))
- {
- // Remove lowerinfo for op_seq of target operation if the op_seq does not exist
- _lowered_graph.removeLowerInfo(op_seq_index);
- }
- else
- {
- // Update op_seq of target operation if the op_seq exists
- auto &prev_op_seq = _lowered_graph.op_seqs().at(op_seq_index);
- const auto &last_node_idx = *(--prev_op_seq.end());
- const auto &last_node = _lowered_graph.graph().operations().at(last_node_idx);
- prev_op_seq.setOutputs(last_node.getOutputs());
- }
-
- // Create new op_seq and set information to the op_seq
- auto new_op_seq_index = _lowered_graph.op_seqs().emplace(node_index, frontend_layout);
- auto &new_op_seq = _lowered_graph.op_seqs().at(new_op_seq_index);
- new_op_seq.setInputs(node.getInputs());
- new_op_seq.setOutputs(node.getOutputs());
- _lowered_graph.setLowerInfo(
- new_op_seq_index, std::make_unique<ir::operation::LowerInfo>(backend, frontend_layout));
- }
-
- // Change PermuteFactors of operands of target node
- {
- const auto &op_seq_index = _lowered_graph.op_seqs().getOperation(node_index);
- const auto op_seq_li = _lowered_graph.getLowerInfo(op_seq_index);
- const auto backend = op_seq_li->backend();
- const operand::PermuteFactor removed_factor{backend, backend_layout};
- const operand::PermuteFactor new_factor{backend, frontend_layout};
+ const PermuteFactor removed_factor{backend, backend_layout};
+ const PermuteFactor new_factor{backend, frontend_layout};
for (const auto &input : node.getInputs() | Remove::DUPLICATED | Remove::UNDEFINED)
{
+ // The old factor can be removed only if no other operation that uses this operand
+ // runs on the same backend and layout
bool canRemove = true;
for (const auto &use : _graph.operands().at(input).getUses())
{
if (use != node_index)
{
- const auto &use_op_seq_index = _lowered_graph.op_seqs().getOperation(use);
- auto use_op_seq_li = _lowered_graph.getLowerInfo(use_op_seq_index);
- if (use_op_seq_li->backend() == backend && use_op_seq_li->layout() == backend_layout)
+ auto use_op_li = operation_li_map.getRawPtr(use);
+ if (use_op_li->backend() == backend && use_op_li->layout() == backend_layout)
{
canRemove = false;
break;
@@ -194,27 +125,27 @@ void PermutationOperationPass::changeToKeepLayout(const Operation &node)
}
}
- auto lower_info = _lowered_graph.getLowerInfo(input);
+ auto input_li = operand_li_map.getRawPtr(input);
if (canRemove)
{
- lower_info->removeUsePermuteFactor(removed_factor);
+ input_li->removeUsePermuteFactor(removed_factor);
}
- lower_info->addUsePermuteFactor(new_factor);
+ input_li->addUsePermuteFactor(new_factor);
// Check whether the node's input is a model input or a constant
if (!_graph.operands().at(input).getDef().valid() &&
- (lower_info->def_factors().size() == 1 &&
- lower_info->def_factors().getOnlyElement() == removed_factor))
+ (input_li->def_factors().size() == 1 &&
+ input_li->def_factors().getOnlyElement() == removed_factor))
{
assert(_graph.getInputs().contains(input) || _graph.operands().at(input).isConstant());
- lower_info->removeDefPermuteFactor(removed_factor);
- lower_info->addDefPermuteFactor(new_factor);
+ input_li->removeDefPermuteFactor(removed_factor);
+ input_li->addDefPermuteFactor(new_factor);
}
}
for (const auto &output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED)
{
- auto lower_info = _lowered_graph.getLowerInfo(output);
+ auto lower_info = operand_li_map.getRawPtr(output);
lower_info->removeDefPermuteFactor(removed_factor);
lower_info->addDefPermuteFactor(new_factor);
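Editor's note: the def/use factor swap in this hunk is plain set bookkeeping per operand. A minimal standalone sketch of that mechanism, where Backend, PermuteFactor, and the lower-info type are simplified stand-ins rather than the real onert headers:

#include <cassert>
#include <set>
#include <string>
#include <tuple>

struct Backend
{
  std::string id;
};

enum class Layout
{
  NHWC,
  NCHW
};

// A (backend, layout) pair attached to an operand's defs/uses
struct PermuteFactor
{
  const Backend *backend;
  Layout layout;
  bool operator<(const PermuteFactor &o) const
  {
    return std::tie(backend, layout) < std::tie(o.backend, o.layout);
  }
};

struct OperandLowerInfo
{
  std::set<PermuteFactor> use_factors;
  void addUsePermuteFactor(const PermuteFactor &f) { use_factors.insert(f); }
  void removeUsePermuteFactor(const PermuteFactor &f) { use_factors.erase(f); }
};

int main()
{
  Backend cpu{"cpu"};
  OperandLowerInfo li;
  const PermuteFactor removed{&cpu, Layout::NCHW}; // old backend layout
  const PermuteFactor added{&cpu, Layout::NHWC};   // new frontend layout
  li.addUsePermuteFactor(removed);
  // Swap factors the way changeToKeepLayout does for each input operand
  li.removeUsePermuteFactor(removed);
  li.addUsePermuteFactor(added);
  assert(li.use_factors.count(added) == 1);
}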
diff --git a/runtime/onert/core/src/compiler/pass/UnusedOperandEliminationPass.cc b/runtime/onert/core/src/compiler/pass/UnusedOperandEliminationPass.cc
new file mode 100644
index 000000000..35fb575b0
--- /dev/null
+++ b/runtime/onert/core/src/compiler/pass/UnusedOperandEliminationPass.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Pass.h"
+
+#include "UnusedOperandEliminationPass.h"
+#include "ir/Index.h"
+#include "util/Set.h"
+#include "ir/Graph.h"
+
+/**
+ * @file UnusedOperandEliminationPass.cc
+ * @brief This file contains UnusedOperandEliminationPass class implementation
+ */
+
+namespace onert
+{
+namespace compiler
+{
+namespace pass
+{
+
+void UnusedOperandEliminationPass::run()
+{
+ util::Set<ir::OperandIndex> used;
+
+ _graph.operations().iterate([&](const ir::OperationIndex &, const ir::Operation &node) {
+ for (auto ind : (node.getInputs() + node.getOutputs()) | ir::Remove::UNDEFINED)
+ {
+ used.add(ind);
+ }
+ });
+
+ // Graph inputs/outputs are always considered used
+ for (auto ind : (_graph.getInputs() + _graph.getOutputs()) | ir::Remove::UNDEFINED)
+ {
+ used.add(ind);
+ }
+
+ _graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &) {
+ if (!used.contains(ind))
+ {
+ VERBOSE() << "Remove unused operand " << ind << std::endl;
+ _graph.operands().remove(ind);
+ }
+ });
+}
+
+} // namespace pass
+} // namespace compiler
+} // namespace onert
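Editor's note: the pass above is a mark-and-sweep over operand indices. A self-contained sketch of the same idea using plain STL containers in place of util::Set and the iterate() visitors (toy types, not the real IR):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

using OperandIndex = int;

struct Operation
{
  std::vector<OperandIndex> inputs, outputs;
};

int main()
{
  std::map<OperandIndex, const char *> operands{{0, "in"}, {1, "w"}, {2, "out"}, {3, "dangling"}};
  std::vector<Operation> operations{{{0, 1}, {2}}};
  std::vector<OperandIndex> graph_io{0, 2};

  // Mark: everything referenced by an operation or by graph I/O is used
  std::set<OperandIndex> used;
  for (const auto &op : operations)
  {
    used.insert(op.inputs.begin(), op.inputs.end());
    used.insert(op.outputs.begin(), op.outputs.end());
  }
  used.insert(graph_io.begin(), graph_io.end());

  // Sweep: remove operands never marked (here, index 3)
  for (auto it = operands.begin(); it != operands.end();)
  {
    if (!used.count(it->first))
    {
      std::printf("Remove unused operand %d\n", it->first);
      it = operands.erase(it);
    }
    else
      ++it;
  }
}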
diff --git a/runtime/onert/core/src/compiler/pass/UnusedOperandEliminationPass.h b/runtime/onert/core/src/compiler/pass/UnusedOperandEliminationPass.h
new file mode 100644
index 000000000..8078f4246
--- /dev/null
+++ b/runtime/onert/core/src/compiler/pass/UnusedOperandEliminationPass.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file UnusedOperandEliminationPass.h
+ * @brief This file contains UnusedOperandEliminationPass class
+ */
+
+#ifndef __ONERT_COMPILER_PASS_UNUSED_OPERAND_ELIMINATION_PASS_H__
+#define __ONERT_COMPILER_PASS_UNUSED_OPERAND_ELIMINATION_PASS_H__
+
+#include "Pass.h"
+
+namespace onert
+{
+namespace compiler
+{
+namespace pass
+{
+
+/**
+ * @brief A pass to eliminate unused operands from the graph
+ *
+ * Remove operands that are not used by any operations, except Graph inputs/outputs.
+ *
+ */
+class UnusedOperandEliminationPass : public Pass
+{
+public:
+ using Pass::Pass;
+
+public:
+ std::string id() override { return "UnusedOperandEliminationPass"; }
+ void run() final;
+};
+
+} // namespace pass
+} // namespace compiler
+} // namespace onert
+
+#endif // __ONERT_COMPILER_PASS_UNUSED_OPERAND_ELIMINATION_PASS_H__
diff --git a/runtime/onert/core/src/dumper/dot/DotBuilder.cc b/runtime/onert/core/src/dumper/dot/DotBuilder.cc
index 38a69696e..d4e4d5484 100644
--- a/runtime/onert/core/src/dumper/dot/DotBuilder.cc
+++ b/runtime/onert/core/src/dumper/dot/DotBuilder.cc
@@ -35,25 +35,6 @@ void DotBuilder::update(const Node &node_info)
}
}
-void DotBuilder::addOpSequence(const DotSubgraphInfo &subgraph_info)
-{
- _dot << "subgraph cluster_" << subgraph_info.index().value() << " {\n";
- _dot << " label=\"" << subgraph_info.label() << "\";\n";
- _dot << " style=filled;\n";
- _dot << " color=lightgrey;\n";
- _dot << " ";
- for (auto op : subgraph_info.operations())
- {
- _dot << "operation" << op.value() << "; ";
- }
- for (auto op : subgraph_info.operands())
- {
- _dot << "operand" << op.value() << "; ";
- }
- _dot << "\n";
- _dot << "}\n";
-}
-
void DotBuilder::writeDot(std::ostream &os)
{
os << "digraph D {\n"
diff --git a/runtime/onert/core/src/dumper/dot/DotBuilder.h b/runtime/onert/core/src/dumper/dot/DotBuilder.h
index 681cbbf5d..30f32f8f9 100644
--- a/runtime/onert/core/src/dumper/dot/DotBuilder.h
+++ b/runtime/onert/core/src/dumper/dot/DotBuilder.h
@@ -25,7 +25,6 @@
#include "OperationNode.h"
#include "OperandNode.h"
-#include "DotSubgraphInfo.h"
using Operation = onert::ir::Operation;
using Object = onert::ir::Operand;
@@ -44,7 +43,6 @@ public:
public:
void update(const Node &dotinfo);
- void addOpSequence(const DotSubgraphInfo &subgraph_info);
void writeDot(std::ostream &os);
diff --git a/runtime/onert/core/src/dumper/dot/DotDumper.cc b/runtime/onert/core/src/dumper/dot/DotDumper.cc
index fdf5c6eaa..714fb6fda 100644
--- a/runtime/onert/core/src/dumper/dot/DotDumper.cc
+++ b/runtime/onert/core/src/dumper/dot/DotDumper.cc
@@ -19,8 +19,6 @@
#include "DotDumper.h"
#include "DotBuilder.h"
-#include "DotSubgraphInfo.h"
-#include "ir/OpSequence.h"
#include "ir/OperationIndexMap.h"
#include "backend/Backend.h"
#include "backend/IConfig.h"
@@ -82,7 +80,7 @@ void DotDumper::dump(const std::string &tag)
else
{
showing_cond =
- !object.isConstant() || (_graph.getInputs() + _graph.getOutputs()).contains(index);
+ !object.isConstant() || (_graph.getInputs() + _graph.getOutputs()).contains(index);
}
if (showing_cond)
{
@@ -105,7 +103,7 @@ void DotDumper::dump(const std::string &tag)
std::string fillcolor = "";
if (_lowered_graph)
{
- auto lower_info = _lowered_graph->getLowerInfo(index);
+ auto lower_info = _lowered_graph->lower_info().operand.getRawPtr(index);
const auto &def_factors = lower_info->def_factors();
if (def_factors.size() > 0)
{
@@ -151,25 +149,18 @@ void DotDumper::dump(const std::string &tag)
if (_lowered_graph)
{
- const auto &op_seqs = _lowered_graph->op_seqs();
- op_seqs.iterate([&](const ir::OpSequenceIndex &index, const ir::OpSequence &op_seq) {
- const auto lower_info = _lowered_graph->getLowerInfo(index);
- auto fillcolor = backend_to_fillcolor(lower_info->backend());
- std::string label =
- std::to_string(index.value()) + " [" + lower_info->backend()->config()->id() + "]";
- DotSubgraphInfo subgraph_info{index, op_seq, shown_operand_set, _graph.operations()};
- subgraph_info.label(label);
- subgraph_info.fillcolor(fillcolor);
- dot_builder.addOpSequence(subgraph_info);
-
- // Set fillcolor of all operations in the op_seq
- for (const auto &op_idx : op_seq.operations())
+ _graph.operations().iterate([&](const ir::OperationIndex &index, const ir::Operation &) {
+ const auto lower_info = _lowered_graph->lower_info().operation.getRawPtr(index);
+ if (lower_info)
{
- auto found = operation_nodes.find(op_idx);
- if (found != operation_nodes.end())
+ auto fillcolor = backend_to_fillcolor(lower_info->backend());
+ std::string backend_label = "[" + lower_info->backend()->config()->id() + "]";
+ auto itr = operation_nodes.find(index);
+ if (itr != operation_nodes.end())
{
- auto &&op = found->second;
- op->setAttribute("fillcolor", fillcolor);
+ auto &node = itr->second;
+ node->setAttribute("label", node->getAttribute("label") + "\n" + backend_label);
+ node->setAttribute("fillcolor", fillcolor);
}
}
});
diff --git a/runtime/onert/core/src/dumper/dot/DotDumper.h b/runtime/onert/core/src/dumper/dot/DotDumper.h
index fdbca1642..f300c3432 100644
--- a/runtime/onert/core/src/dumper/dot/DotDumper.h
+++ b/runtime/onert/core/src/dumper/dot/DotDumper.h
@@ -39,11 +39,11 @@ public:
public:
DotDumper(const ir::Graph &graph, Level level)
- : _lowered_graph{nullptr}, _graph(graph), _level{level}
+ : _lowered_graph{nullptr}, _graph(graph), _level{level}
{
}
DotDumper(const compiler::LoweredGraph *lowered_graph, Level level)
- : _lowered_graph{lowered_graph}, _graph(_lowered_graph->graph()), _level{level}
+ : _lowered_graph{lowered_graph}, _graph(_lowered_graph->graph()), _level{level}
{
}
diff --git a/runtime/onert/core/src/dumper/dot/DotSubgraphInfo.cc b/runtime/onert/core/src/dumper/dot/DotSubgraphInfo.cc
deleted file mode 100644
index 52e9c758d..000000000
--- a/runtime/onert/core/src/dumper/dot/DotSubgraphInfo.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "DotSubgraphInfo.h"
-
-#include <sstream>
-
-namespace onert
-{
-namespace dumper
-{
-namespace dot
-{
-
-DotSubgraphInfo::DotSubgraphInfo(const ir::OpSequenceIndex &index, const ir::OpSequence &op_seq,
- const util::Set<ir::OperandIndex> &shown_operands,
- const ir::Operations &operations_ctx)
- : _index{index}
-{
- for (const auto &op_idx : op_seq.operations())
- {
- _operations.insert(op_idx);
- const auto &node = operations_ctx.at(op_idx);
- for (auto o : node.getInputs())
- {
- // Must be a shown operand, not op_seq's inputs
- if (shown_operands.contains(o) && !op_seq.getInputs().contains(o))
- {
- _operands.insert(o);
- }
- }
- for (auto o : node.getOutputs())
- {
- // Must be a shown operand, not op_seq's inputs
- if (shown_operands.contains(o) && !op_seq.getOutputs().contains(o))
- {
- _operands.insert(o);
- }
- }
- }
-}
-
-} // namespace dot
-} // namespace dumper
-} // namespace onert
diff --git a/runtime/onert/core/src/dumper/dot/DotSubgraphInfo.h b/runtime/onert/core/src/dumper/dot/DotSubgraphInfo.h
deleted file mode 100644
index 95ba8953e..000000000
--- a/runtime/onert/core/src/dumper/dot/DotSubgraphInfo.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_CORE_DUMPER_DOT_DOT_SUBGRAPH_INFO_H__
-#define __ONERT_CORE_DUMPER_DOT_DOT_SUBGRAPH_INFO_H__
-
-#include <unordered_set>
-
-#include "ir/Index.h"
-#include <ir/Operations.h>
-#include "ir/OpSequence.h"
-#include "util/Set.h"
-
-namespace onert
-{
-namespace dumper
-{
-namespace dot
-{
-
-class DotSubgraphInfo
-{
-public:
- DotSubgraphInfo(const ir::OpSequenceIndex &index, const ir::OpSequence &op_seq,
- const util::Set<ir::OperandIndex> &shown_operands,
- const ir::Operations &operations_ctx);
-
- ir::OpSequenceIndex index() const { return _index; }
- std::string label() const { return _label; }
- void label(const std::string &val) { _label = val; }
- std::string fillcolor() const { return _fillcolor; }
- void fillcolor(const std::string &val) { _fillcolor = val; }
- const std::unordered_set<ir::OperationIndex> &operations() const { return _operations; }
- const std::unordered_set<ir::OperandIndex> &operands() const { return _operands; }
-
-private:
- ir::OpSequenceIndex _index;
- std::string _label;
- std::string _fillcolor;
- std::unordered_set<ir::OperationIndex> _operations;
- std::unordered_set<ir::OperandIndex> _operands;
-};
-
-} // namespace dot
-} // namespace dumper
-} // namespace onert
-
-#endif // __ONERT_CORE_DUMPER_DOT_DOT_SUBGRAPH_INFO_H__
diff --git a/runtime/onert/core/src/dumper/dot/OperandNode.cc b/runtime/onert/core/src/dumper/dot/OperandNode.cc
index 5a6015ca9..88f5254f3 100644
--- a/runtime/onert/core/src/dumper/dot/OperandNode.cc
+++ b/runtime/onert/core/src/dumper/dot/OperandNode.cc
@@ -18,7 +18,6 @@
#include "OperandNode.h"
#include "ir/Graph.h"
-#include "ir/operand/LowerInfo.h"
namespace onert
{
@@ -33,7 +32,7 @@ const std::string Operand::OPERAND_SHAPE = "ellipse";
const std::string Operand::BG_COLOR_SCHEME = "set18";
Operand::Operand(const ir::OperandIndex &index, Type type)
- : Node{"operand" + std::to_string(index.value())}
+ : Node{"operand" + std::to_string(index.value())}
{
{
auto type_to_shape = [](Type type) {
diff --git a/runtime/onert/core/src/dumper/dot/OperandNode.h b/runtime/onert/core/src/dumper/dot/OperandNode.h
index 2e7cc5861..f2aea80ad 100644
--- a/runtime/onert/core/src/dumper/dot/OperandNode.h
+++ b/runtime/onert/core/src/dumper/dot/OperandNode.h
@@ -64,7 +64,6 @@ public:
*
* @param[in] index Operand index
* @param[in] type Operand type
- * @param[in] lower_info Operand LowerInfo
*/
Operand(const ir::OperandIndex &index, Type type);
diff --git a/runtime/onert/core/src/dumper/dot/OperationNode.cc b/runtime/onert/core/src/dumper/dot/OperationNode.cc
index bee137e7c..87c5ba148 100644
--- a/runtime/onert/core/src/dumper/dot/OperationNode.cc
+++ b/runtime/onert/core/src/dumper/dot/OperationNode.cc
@@ -18,7 +18,6 @@
#include "OperationNode.h"
#include "ir/Graph.h"
-#include "ir/operation/LowerInfo.h"
#include "backend/IConfig.h"
#include "backend/Backend.h"
@@ -33,7 +32,7 @@ const std::string Operation::OPERATION_SHAPE = "rect";
const std::string Operation::BG_COLOR_SCHEME = "pastel18";
Operation::Operation(const ir::OperationIndex &index, const ir::Operation &node)
- : Node{"operation" + std::to_string(index.value())}
+ : Node{"operation" + std::to_string(index.value())}
{
setAttribute("label", std::to_string(index.value()) + " : " + node.name());
setAttribute("shape", OPERATION_SHAPE);
diff --git a/runtime/onert/core/src/dumper/text/GraphDumper.cc b/runtime/onert/core/src/dumper/text/GraphDumper.cc
new file mode 100644
index 000000000..80cfbbc34
--- /dev/null
+++ b/runtime/onert/core/src/dumper/text/GraphDumper.cc
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GraphDumper.h"
+
+#include "ir/Graph.h"
+#include "compiler/LoweredGraph.h"
+#include "util/logging.h"
+#include "misc/string_helpers.h"
+
+namespace onert
+{
+namespace dumper
+{
+namespace text
+{
+
+namespace
+{
+
+std::string formatOperandIndexSequence(const ir::OperandIndexSequence &seq)
+{
+ std::vector<std::string> strs;
+ for (auto ind : seq)
+ strs.push_back(dumper::text::formatOperandBrief(ind));
+ return nnfw::misc::join(strs.begin(), strs.end(), ", ");
+}
+
+} // namespace
+
+std::string formatOperandBrief(ir::OperandIndex ind)
+{
+ std::stringstream ss;
+ ss << ind;
+ return ss.str();
+}
+
+std::string formatOperand(const ir::Graph &, ir::OperandIndex ind)
+{
+ std::stringstream ss;
+ ss << ind;
+ // TODO Print shape, type and maybe more
+ return ss.str();
+}
+
+std::string formatOperation(const ir::Graph &graph, ir::OperationIndex ind)
+{
+ std::stringstream ss;
+ const auto &op = graph.operations().at(ind);
+
+ ss << formatOperandIndexSequence(op.getOutputs());
+ ss << " = ";
+ ss << ind << "_" << op.name() << "(";
+ ss << formatOperandIndexSequence(op.getInputs());
+ ss << ")";
+ return ss.str();
+}
+
+void dumpGraph(const ir::Graph &graph)
+{
+ VERBOSE(GraphDumper) << "{\n";
+ auto ops_topol = graph.topolSortOperations();
+ for (auto op_ind : ops_topol)
+ {
+ VERBOSE(GraphDumper) << " " << formatOperation(graph, op_ind) << "\n";
+ }
+ VERBOSE(GraphDumper) << "}\n";
+ VERBOSE(GraphDumper) << std::endl;
+}
+
+void dumpLoweredGraph(const compiler::LoweredGraph &lgraph)
+{
+ // TODO Graph dump with backend info
+ dumpGraph(lgraph.graph());
+}
+
+} // namespace text
+} // namespace dumper
+} // namespace onert
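Editor's note: a hedged example of the text this dumper emits, assuming each index streams as a bare number; the actual rendering comes from the operator<< overloads in ir/Index.h. Each operation prints as "outputs = index_name(inputs)", so a two-op graph would dump roughly as:

{
  2 = 0_Conv2D(0, 1)
  3 = 1_Relu(2)
}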
diff --git a/runtime/onert/core/src/dumper/text/GraphDumper.h b/runtime/onert/core/src/dumper/text/GraphDumper.h
new file mode 100644
index 000000000..0501ff050
--- /dev/null
+++ b/runtime/onert/core/src/dumper/text/GraphDumper.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_DUMPER_TEXT_GRAPH_DUMPER_H__
+#define __ONERT_DUMPER_TEXT_GRAPH_DUMPER_H__
+
+#include <ir/Index.h>
+
+namespace onert
+{
+namespace ir
+{
+class Graph;
+}
+} // namespace onert
+
+namespace onert
+{
+namespace compiler
+{
+class LoweredGraph;
+}
+} // namespace onert
+
+namespace onert
+{
+namespace dumper
+{
+namespace text
+{
+
+std::string formatOperandBrief(ir::OperandIndex ind);
+std::string formatOperand(const ir::Graph &, ir::OperandIndex ind);
+std::string formatOperation(const ir::Graph &graph, ir::OperationIndex ind);
+void dumpGraph(const ir::Graph &graph);
+void dumpLoweredGraph(const compiler::LoweredGraph &lgraph);
+
+} // namespace text
+} // namespace dumper
+} // namespace onert
+
+#endif // __ONERT_DUMPER_TEXT_GRAPH_DUMPER_H__
diff --git a/runtime/onert/core/src/exec/DataflowExecutor.cc b/runtime/onert/core/src/exec/DataflowExecutor.cc
index b81a75794..8dac1219e 100644
--- a/runtime/onert/core/src/exec/DataflowExecutor.cc
+++ b/runtime/onert/core/src/exec/DataflowExecutor.cc
@@ -54,8 +54,7 @@ void DataflowExecutor::emplaceToReadyJobs(const uint32_t &id)
{
auto &job = _waiting_jobs[id];
assert(job != nullptr);
- auto &op_seq = _lowered_graph->op_seqs().at(_job_to_op_seq[job->index()]);
- auto rank = calculateRank(op_seq.operations());
+ auto rank = calculateRank({_job_to_op[job->index()]});
_ready_jobs.emplace(rank, std::move(job));
}
@@ -78,48 +77,48 @@ bool DataflowExecutor::noWaitingJobs()
}
DataflowExecutor::DataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+ backend::BackendContexts &&backend_contexts,
const compiler::TensorRegistries &tensor_regs,
compiler::CodeMap &&code_map,
const util::TracingCtx *tracing_ctx)
- : ExecutorBase{std::move(lowered_graph), tensor_regs, tracing_ctx},
- _code_map{std::move(code_map)}
+ : ExecutorBase{std::move(lowered_graph), std::move(backend_contexts), tensor_regs, tracing_ctx},
+ _code_map{std::move(code_map)}
{
VERBOSE(DataflowExecutor) << "Constructing Dataflow Executor" << std::endl;
- const auto &op_seqs = _lowered_graph->op_seqs();
- // Assign jobs convert OpSequenceIndex to job index(uint32_t)
+ // Assign jobs: convert each OperationIndex to a job index (uint32_t)
uint32_t next_job_index = 0;
- std::unordered_map<ir::OpSequenceIndex, uint32_t> op_seq_to_job;
- op_seqs.iterate([&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &) {
- VERBOSE(DataflowExecutor) << "Create a job #" << next_job_index << " with OpSequenceIndex "
- << op_seq_index.value() << std::endl;
+ std::unordered_map<ir::OperationIndex, uint32_t> op_to_job;
+ const auto &operations = _lowered_graph->graph().operations();
+ operations.iterate([&](const ir::OperationIndex &op_ind, const ir::Operation &) {
+ VERBOSE(DataflowExecutor) << "Create a job " << next_job_index << " with Operation " << op_ind
+ << std::endl;
_finished_jobs.emplace_back(
- std::make_unique<Job>(next_job_index, _code_map.at(op_seq_index).fn_seq.get()));
- op_seq_to_job[op_seq_index] = next_job_index++;
+ std::make_unique<Job>(next_job_index, _code_map.at(op_ind).fn_seq.get()));
+ op_to_job[op_ind] = next_job_index++;
});
_waiting_jobs.resize(next_job_index);
_output_info.resize(next_job_index);
_initial_input_info.resize(next_job_index, 0);
- op_seqs.iterate([&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
- auto job_index = op_seq_to_job[op_seq_index];
- for (auto output : op_seq.getOutputs())
+ operations.iterate([&](const ir::OperationIndex &op_ind, const ir::Operation &op) {
+ auto job_index = op_to_job[op_ind];
+ for (auto output : op.getOutputs())
{
// Update output and input info
- op_seqs.iterate(
- [&](const ir::OpSequenceIndex &op_seq_cur_index, const ir::OpSequence &op_seq_cur) {
- if (op_seq_cur.getInputs().contains(output))
- {
- auto dep_index = op_seq_to_job[op_seq_cur_index];
- ++_initial_input_info[dep_index];
- _output_info[job_index].push_back(dep_index);
- }
- });
+ operations.iterate([&](const ir::OperationIndex &op_cur_ind, const ir::Operation &op_cur) {
+ if (op_cur.getInputs().contains(output))
+ {
+ auto dep_index = op_to_job[op_cur_ind];
+ ++_initial_input_info[dep_index];
+ _output_info[job_index].push_back(dep_index);
+ }
+ });
}
});
- for (const auto &s : op_seq_to_job)
- _job_to_op_seq.emplace(s.second, s.first);
+ for (const auto &s : op_to_job)
+ _job_to_op.emplace(s.second, s.first);
_input_info = _initial_input_info;
}
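Editor's note: the constructor hunk above builds a dependency-counted job graph, where _initial_input_info[j] counts how many producer jobs j waits on and _output_info[p] lists the consumers to notify. A compact standalone sketch of that scheduling core (toy containers, not the executor's real classes):

#include <cstdio>
#include <queue>
#include <vector>

int main()
{
  // Jobs 0 and 1 feed job 2; job 2 feeds job 3 (a tiny dataflow DAG)
  std::vector<std::vector<int>> consumers{{2}, {2}, {3}, {}};
  std::vector<int> wait_count{0, 0, 2, 1}; // _initial_input_info analogue

  std::queue<int> ready;
  for (int j = 0; j < (int)wait_count.size(); ++j)
    if (wait_count[j] == 0)
      ready.push(j);

  while (!ready.empty())
  {
    int job = ready.front();
    ready.pop();
    std::printf("Run job %d\n", job); // kernel execution would happen here
    for (int dep : consumers[job])    // notify(job_index) analogue
      if (--wait_count[dep] == 0)
        ready.push(dep);
  }
}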
@@ -151,24 +150,23 @@ void DataflowExecutor::executeImpl()
auto job = std::move((_ready_jobs.begin())->second);
_ready_jobs.erase(_ready_jobs.begin());
auto job_index = job->index();
- VERBOSE(DataflowExecutor) << "Run job #" << job_index << std::endl;
+ VERBOSE(DataflowExecutor) << "Run job " << job_index << std::endl;
- auto op_seq_index = _job_to_op_seq[job_index];
- auto op_seq = &_lowered_graph->op_seqs().at(op_seq_index);
- const backend::Backend *backend =
- _lowered_graph->getLowerInfo()->op_seq.at(op_seq_index)->backend();
+ auto op_ind = _job_to_op[job_index];
+ const backend::Backend *backend = _lowered_graph->lower_info().operation.at(op_ind).backend();
- _subject.notifyJobBegin(this, profiling_subg_index, op_seq, backend);
+ _subject.notifyJobBegin(this, profiling_subg_index, op_ind, backend);
job->fn_seq()->initRunning();
// check if FunctionSequence needs to handle dynamic tensor
- bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || dynamic_input_exists;
+ bool handle_dynamic_tensor =
+ _lowered_graph->getHasDynamicTensor(op_ind) || dynamic_input_exists;
job->fn_seq()->enableDynamicShapeInferer(handle_dynamic_tensor);
job->run();
- _subject.notifyJobEnd(this, profiling_subg_index, op_seq, backend);
+ _subject.notifyJobEnd(this, profiling_subg_index, op_ind, backend);
notify(job_index);
_finished_jobs[job_index] = std::move(job);
}
diff --git a/runtime/onert/core/src/exec/DataflowExecutor.h b/runtime/onert/core/src/exec/DataflowExecutor.h
index b72c0d030..bcac19d2e 100644
--- a/runtime/onert/core/src/exec/DataflowExecutor.h
+++ b/runtime/onert/core/src/exec/DataflowExecutor.h
@@ -48,9 +48,10 @@ public:
*
* @param lowered_graph LoweredGraph object
* @param tensor_builders Tensor builders that are currently used
- * @param code_map OpSequence and its code map
+ * @param code_map @c ir::Operation and its code map
*/
DataflowExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+ backend::BackendContexts &&backend_contexts,
const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map,
const util::TracingCtx *tracing_ctx);
@@ -87,7 +88,7 @@ protected:
std::multimap<int64_t, std::unique_ptr<Job>, std::greater<int64_t>> _ready_jobs;
/// @brief Which job runs which op and function.
- std::unordered_map<uint32_t, ir::OpSequenceIndex> _job_to_op_seq;
+ std::unordered_map<uint32_t, ir::OperationIndex> _job_to_op;
};
} // namespace exec
diff --git a/runtime/onert/core/src/exec/DynamicShapeInferer.cc b/runtime/onert/core/src/exec/DynamicShapeInferer.cc
index 2d9d534f1..dbf4eb28f 100644
--- a/runtime/onert/core/src/exec/DynamicShapeInferer.cc
+++ b/runtime/onert/core/src/exec/DynamicShapeInferer.cc
@@ -48,12 +48,12 @@ void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op,
So, only when all inputs are static, we can skip dynamic shape inference.
*/
- if ((!lhs->is_dynamic()) && (!rhs->is_dynamic()))
- return;
-
auto output_idx = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_idx);
+ if ((currently_static(lhs) && currently_static(rhs)) && previously_static(output))
+ return;
+
ir::Shape new_shape = shape_inference::inferEltwiseShape(lhs_shape, rhs_shape);
output->applyShape(new_shape);
@@ -144,7 +144,7 @@ void DynamicShapeInferer::visit(const ir::operation::BCQFullyConnected &op)
const auto &input = _tensor_registry->getITensor(input_idx);
const auto cluster_idx{
- op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
+ op.getInputs().at(ir::operation::BCQFullyConnected::Input::WEIGHTS_CLUSTERS)};
const auto &cluster = _tensor_registry->getITensor(cluster_idx);
assert(cluster->is_constant());
@@ -158,7 +158,7 @@ void DynamicShapeInferer::visit(const ir::operation::BCQFullyConnected &op)
assert(cluster_buf);
ir::Shape new_shape =
- shape_inference::inferBCQFullyConnectedShape(input_shape, cluster_shape, cluster_buf);
+ shape_inference::inferBCQFullyConnectedShape(input_shape, cluster_shape, cluster_buf);
auto output_ind = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_ind);
@@ -222,7 +222,7 @@ void DynamicShapeInferer::visit(const ir::operation::BroadcastTo &op)
assert(shape); // It shouldn't be 0.
auto output_shape = shape_inference::inferBroadcastToShape(
- shape->getShape(), reinterpret_cast<const int32_t *>(shape->buffer()));
+ shape->getShape(), reinterpret_cast<const int32_t *>(shape->buffer()));
// set output shape and output buffer
output->applyShape(output_shape);
@@ -270,15 +270,17 @@ void DynamicShapeInferer::visit(const ir::operation::Concat &op)
{
auto isConcatible = [](const backend::ITensor *input1, const backend::ITensor *input2,
int32_t axis) {
- if (input1->num_dimensions() != input2->num_dimensions())
+ auto shape1 = input1->getShape();
+ auto shape2 = input2->getShape();
+ if (shape1.rank() != shape2.rank())
return false;
- for (size_t i = 0; i < input1->num_dimensions(); i++)
+ for (int i = 0; i < shape1.rank(); i++)
{
- auto positive_axis = (axis >= 0) ? axis : axis + input1->num_dimensions();
+ auto positive_axis = (axis >= 0) ? axis : axis + input1->getShape().rank();
if (i != positive_axis)
- if (input1->dimension(i) != input2->dimension(i))
+ if (shape1.dim(i) != shape2.dim(i))
return false;
}
@@ -393,9 +395,9 @@ void DynamicShapeInferer::visit(const ir::operation::ExpandDims &op)
assert(axis->buffer());
int32_t axis_value =
- (axis_type == ir::DataType::INT32)
- ? reinterpret_cast<const int32_t *>(axis->buffer())[0]
- : static_cast<int32_t>(reinterpret_cast<const int64_t *>(axis->buffer())[0]);
+ (axis_type == ir::DataType::INT32)
+ ? reinterpret_cast<const int32_t *>(axis->buffer())[0]
+ : static_cast<int32_t>(reinterpret_cast<const int64_t *>(axis->buffer())[0]);
auto output_shape = shape_inference::inferExpandDimsShape(input_shape, axis_value);
@@ -422,10 +424,10 @@ void DynamicShapeInferer::visit(const ir::operation::Fill &op)
const auto &dims_shape = shape->getShape();
auto output_shape = ((dims_type == ir::DataType::INT32)
- ? shape_inference::inferFillShape<int32_t>(
- dims_shape, reinterpret_cast<const int32_t *>(dims_buf))
- : shape_inference::inferFillShape<int64_t>(
- dims_shape, reinterpret_cast<const int64_t *>(dims_buf)));
+ ? shape_inference::inferFillShape<int32_t>(
+ dims_shape, reinterpret_cast<const int32_t *>(dims_buf))
+ : shape_inference::inferFillShape<int64_t>(
+ dims_shape, reinterpret_cast<const int64_t *>(dims_buf)));
output->applyShape(output_shape);
assert(output->buffer() != nullptr);
@@ -497,7 +499,7 @@ void DynamicShapeInferer::visit(const ir::operation::LSTM &op)
auto output = _tensor_registry->getITensor(output_index);
const auto output_state_out_index{
- op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+ op.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
const auto cell_state_out_index{op.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
@@ -517,19 +519,19 @@ void DynamicShapeInferer::visit(const ir::operation::LSTM &op)
const auto input_shape = input->getShape();
const auto input_to_output_weights_index{
- op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+ op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
const auto input_to_output_weights = _tensor_registry->getITensor(input_to_output_weights_index);
const auto input_to_output_weights_shape = input_to_output_weights->getShape();
const auto recurrent_to_output_weights_index{
- op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+ op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
const auto recurrent_to_output_weights =
- _tensor_registry->getITensor(recurrent_to_output_weights_index);
+ _tensor_registry->getITensor(recurrent_to_output_weights_index);
const auto recurrent_to_output_weights_shape = recurrent_to_output_weights->getShape();
// re-sizing outputs
const int n_batch =
- (input_shape.rank() == 3 && op.param().time_major) ? input_shape.dim(1) : input_shape.dim(0);
+ (input_shape.rank() == 3 && op.param().time_major) ? input_shape.dim(1) : input_shape.dim(0);
const int n_cell = input_to_output_weights_shape.dim(0);
const int n_output = recurrent_to_output_weights_shape.dim(1);
if (input_shape.rank() == 3)
@@ -564,19 +566,19 @@ void DynamicShapeInferer::visit(const ir::operation::LSTM &op)
if (scratch_buffer != nullptr)
{
const auto input_to_input_weights_index{
- op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)};
+ op.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)};
const auto recurrent_to_input_weights_index{
- op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)};
+ op.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)};
const auto input_to_input_weights_shape =
- _tensor_registry->getITensor(input_to_input_weights_index)->getShape();
+ _tensor_registry->getITensor(input_to_input_weights_index)->getShape();
bool has_input_to_input_weights =
- input_to_input_weights_shape.dim(0) != 0 && input_to_input_weights_shape.dim(1) != 0;
+ input_to_input_weights_shape.dim(0) != 0 && input_to_input_weights_shape.dim(1) != 0;
const auto recurrent_to_input_weights_shape =
- _tensor_registry->getITensor(recurrent_to_input_weights_index)->getShape();
- bool has_recurrent_to_input_weights = recurrent_to_input_weights_shape.dim(0) != 0 &&
- recurrent_to_input_weights_shape.dim(1) != 0;
+ _tensor_registry->getITensor(recurrent_to_input_weights_index)->getShape();
+ bool has_recurrent_to_input_weights =
+ recurrent_to_input_weights_shape.dim(0) != 0 && recurrent_to_input_weights_shape.dim(1) != 0;
// NOTE The cell_to_input_weights do not exist in non-peephole although regular LSTM(non-CIFG).
// true: no CIFG
@@ -681,7 +683,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pad &op)
assert(pad_buf);
auto output_shape =
- shape_inference::inferPadShape(input->getShape(), pad_buf, pad->getShape().num_elements());
+ shape_inference::inferPadShape(input->getShape(), pad_buf, pad->getShape().num_elements());
// change output shape and reallocate output tensor memory
output->applyShape(output_shape);
@@ -725,16 +727,16 @@ void DynamicShapeInferer::visit(const ir::operation::Range &op)
if (output->data_type() == ir::DataType::FLOAT32)
{
new_shape =
- shape_inference::inferRangeShape<float>(*reinterpret_cast<float *>(start_tensor->buffer()),
- *reinterpret_cast<float *>(limit_tensor->buffer()),
- *reinterpret_cast<float *>(delta_tensor->buffer()));
+ shape_inference::inferRangeShape<float>(*reinterpret_cast<float *>(start_tensor->buffer()),
+ *reinterpret_cast<float *>(limit_tensor->buffer()),
+ *reinterpret_cast<float *>(delta_tensor->buffer()));
}
else if (output->data_type() == ir::DataType::INT32)
{
new_shape = shape_inference::inferRangeShape<int32_t>(
- *reinterpret_cast<int32_t *>(start_tensor->buffer()),
- *reinterpret_cast<int32_t *>(limit_tensor->buffer()),
- *reinterpret_cast<int32_t *>(delta_tensor->buffer()));
+ *reinterpret_cast<int32_t *>(start_tensor->buffer()),
+ *reinterpret_cast<int32_t *>(limit_tensor->buffer()),
+ *reinterpret_cast<int32_t *>(delta_tensor->buffer()));
}
output->applyShape(new_shape);
assert(output->buffer() != nullptr);
@@ -828,7 +830,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op)
assert(new_shape_buf);
auto output_shape = shape_inference::inferReshapeShape(
- new_shape_buf, new_shape->getShape().num_elements(), input->getShape().num_elements());
+ new_shape_buf, new_shape->getShape().num_elements(), input->getShape().num_elements());
// if shape is changed, change output shape and reallocate output tensor memory
if (output_shape != output->getShape() || output->buffer() == nullptr)
@@ -896,7 +898,7 @@ void DynamicShapeInferer::visit(const ir::operation::ResizeBilinear &op)
width_out = op.param().width_out;
}
auto output_shape =
- shape_inference::inferResizeBilinearShape(input->getShape(), height_out, width_out);
+ shape_inference::inferResizeBilinearShape(input->getShape(), height_out, width_out);
// if shape is changed, change output shape and reallocate output tensor memory
if (output_shape != output->getShape() || output->buffer() == nullptr)
@@ -934,7 +936,7 @@ void DynamicShapeInferer::visit(const ir::operation::Select &op)
// Select output shape
ir::Shape new_shape =
- shape_inference::inferSelectShape(input_cond_shape, input_true_shape, input_false_shape);
+ shape_inference::inferSelectShape(input_cond_shape, input_true_shape, input_false_shape);
auto output_ind = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_ind);
@@ -1019,7 +1021,7 @@ void DynamicShapeInferer::visit(const ir::operation::SpaceToBatchND &op)
auto padding_data = reinterpret_cast<int32_t *>(padding->buffer());
ir::Shape new_shape = shape_inference::inferSpaceToBatchNDShape(
- input_shape, block_shape_shape, padding_shape, block_shape_data, padding_data);
+ input_shape, block_shape_shape, padding_shape, block_shape_data, padding_data);
output->applyShape(new_shape);
assert(output->buffer() != nullptr);
@@ -1120,15 +1122,14 @@ void DynamicShapeInferer::visit(const ir::operation::StridedSlice &op)
const auto rank = input_shape.rank();
auto op_params = shape_inference::buildStridedSliceParams(
- reinterpret_cast<uint32_t *>(starts->buffer()), reinterpret_cast<uint32_t *>(ends->buffer()),
- reinterpret_cast<uint32_t *>(strides->buffer()), begin_mask, end_mask, shrink_axis_mask,
- rank);
+ reinterpret_cast<uint32_t *>(starts->buffer()), reinterpret_cast<uint32_t *>(ends->buffer()),
+ reinterpret_cast<uint32_t *>(strides->buffer()), begin_mask, end_mask, shrink_axis_mask, rank);
auto output_index = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_index);
ir::Shape output_shape =
- onert::shape_inference::inferStridedSliceShape(input_shape, op_params, rank);
+ onert::shape_inference::inferStridedSliceShape(input_shape, op_params, rank);
output->applyShape(output_shape);
assert(output->buffer() != nullptr);
@@ -1152,8 +1153,9 @@ void DynamicShapeInferer::visit(const ir::operation::Tile &op)
auto multiplier_buffer = reinterpret_cast<const int32_t *>(multiplier->buffer());
assert(multiplier_buffer);
- auto output_shape =
- shape_inference::inferTileShape(input_shape, multiplier_buffer, multiplier->dimension(0));
+ auto mult_shape = multiplier->getShape();
+ auto output_shape = shape_inference::inferTileShape(
+ input_shape, multiplier_buffer, mult_shape.rank() == 0 ? 1 : mult_shape.dim(0));
// set output shape and output buffer
output->applyShape(output_shape);
@@ -1191,7 +1193,7 @@ void DynamicShapeInferer::visit(const ir::operation::Transpose &op)
ir::Shape new_shape;
// TODO Change perm->dimension(0) == 0 to perm->num_elements() == 0
- if (perm->dimension(0) == 0) // This condition means that perm is (n-1...0)
+ if (perm->getShape().dim(0) == 0) // This condition means that perm is (n-1...0)
{
// Call by (n-1...0)
new_shape = shape_inference::inferTransposeShape(input_shape, nullptr, 0);
@@ -1199,7 +1201,7 @@ void DynamicShapeInferer::visit(const ir::operation::Transpose &op)
else
{
// Check rank
- if (input->num_dimensions() != perm->getShape().num_elements())
+ if (static_cast<size_t>(input->getShape().rank()) != perm->getShape().num_elements())
{
throw std::runtime_error("DynamicShapeInferer failed, bad rank size: " +
std::to_string(perm->getShape().num_elements()));
@@ -1207,7 +1209,8 @@ void DynamicShapeInferer::visit(const ir::operation::Transpose &op)
// set output shape, based on input and params
const auto perm_buffer = reinterpret_cast<const int32_t *>(perm->buffer());
- new_shape = shape_inference::inferTransposeShape(input_shape, perm_buffer, perm->dimension(0));
+ new_shape =
+ shape_inference::inferTransposeShape(input_shape, perm_buffer, perm->getShape().dim(0));
}
output->applyShape(new_shape);
assert(output->buffer() != nullptr);
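Editor's note: for the Transpose path above, shape inference is just index permutation: output dim i takes input dim perm[i], and an empty perm means the reversed order (n-1...0). A hedged sketch of that rule with a hypothetical helper, not the actual shape_inference::inferTransposeShape signature:

#include <cassert>
#include <vector>

// output.dim(i) = input.dim(perm[i]); empty perm reverses the dimensions
std::vector<int> inferTranspose(const std::vector<int> &in, const std::vector<int> &perm)
{
  std::vector<int> out(in.size());
  for (size_t i = 0; i < in.size(); ++i)
    out[i] = perm.empty() ? in[in.size() - 1 - i] : in[perm[i]];
  return out;
}

int main()
{
  assert((inferTranspose({2, 3, 4}, {2, 0, 1}) == std::vector<int>{4, 2, 3}));
  assert((inferTranspose({2, 3, 4}, {}) == std::vector<int>{4, 3, 2}));
}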
diff --git a/runtime/onert/core/src/exec/ExecTime.h b/runtime/onert/core/src/exec/ExecTime.h
index d2ddbad34..95f460053 100644
--- a/runtime/onert/core/src/exec/ExecTime.h
+++ b/runtime/onert/core/src/exec/ExecTime.h
@@ -34,7 +34,7 @@ class ExecTime
{
public:
explicit ExecTime(const std::vector<const backend::Backend *> &backends)
- : _json(backends, _measurements)
+ : _json(backends, _measurements)
{
}
diff --git a/runtime/onert/core/src/exec/Execution.cc b/runtime/onert/core/src/exec/Execution.cc
index 21fdd9c05..3d88cf5ff 100644
--- a/runtime/onert/core/src/exec/Execution.cc
+++ b/runtime/onert/core/src/exec/Execution.cc
@@ -40,7 +40,7 @@ void Execution::changeInputShape(const ir::IOIndex &index, const ir::Shape &new_
_io_desc.dynamic_input_shapes[index] = new_shape;
VERBOSE(Execution) << "Model input shape will be changed at the start of execute()"
- << "(index: " << index.value() << ")" << std::endl;
+ << "(index: " << index << ")" << std::endl;
}
// TODO Remove default parameter
@@ -57,10 +57,10 @@ void Execution::setInput(const ir::IOIndex &index, const void *buffer, size_t le
// note: input_shape_sig contains shape passed by nnfw_set_input_tensorinfo()
{
auto input_shape_sig = _io_desc.dynamic_input_shapes.find(index);
- auto size_required = (input_shape_sig != _io_desc.dynamic_input_shapes.end())
- ? input_shape_sig->second.num_elements() *
- onert::ir::sizeOfDataType(info.typeInfo().type())
- : info.total_size();
+ auto size_required =
+ (input_shape_sig != _io_desc.dynamic_input_shapes.end())
+ ? input_shape_sig->second.num_elements() * onert::ir::sizeOfDataType(info.typeInfo().type())
+ : info.total_size();
if (length < size_required)
{
@@ -117,14 +117,14 @@ void Execution::setInputLayout(const ir::IOIndex &index, ir::Layout layout)
{
const auto &input_desc = _io_desc.inputs.at(index.value());
_io_desc.inputs.at(index.value()) =
- std::make_unique<InputDesc>(input_desc->info, input_desc->buffer, input_desc->size, layout);
+ std::make_unique<InputDesc>(input_desc->info, input_desc->buffer, input_desc->size, layout);
}
void Execution::setOutputLayout(const ir::IOIndex &index, ir::Layout layout)
{
const auto &output_desc = _io_desc.outputs.at(index.value());
- _io_desc.outputs.at(index.value()) = std::make_unique<OutputDesc>(
- output_desc->info, output_desc->buffer, output_desc->size, layout);
+ _io_desc.outputs.at(index.value()) =
+ std::make_unique<OutputDesc>(output_desc->info, output_desc->buffer, output_desc->size, layout);
}
void Execution::execute()
@@ -159,7 +159,7 @@ ir::Shape Execution::getInputShape(ir::IOIndex ind) const
auto itr = _io_desc.dynamic_input_shapes.find(ind);
if (itr == _io_desc.dynamic_input_shapes.end())
{
- auto operand_idx = primary_subgraph().getInputs().at(ind.value());
+ auto operand_idx = primary_subgraph().getInputs().at(ind);
return primary_subgraph().operands().at(operand_idx).shape();
}
else
diff --git a/runtime/onert/core/src/exec/ExecutionObservee.cc b/runtime/onert/core/src/exec/ExecutionObservee.cc
index d5003b126..d6a2bfd17 100644
--- a/runtime/onert/core/src/exec/ExecutionObservee.cc
+++ b/runtime/onert/core/src/exec/ExecutionObservee.cc
@@ -42,22 +42,21 @@ void ExecutionObservee::notifySubgraphEnd(ir::SubgraphIndex ind)
}
}
-void ExecutionObservee::notifyJobBegin(IExecutor *executor, ir::SubgraphIndex index,
- const ir::OpSequence *op_seq,
- const backend::Backend *backend)
+void ExecutionObservee::notifyJobBegin(IExecutor *executor, ir::SubgraphIndex subg_ind,
+ ir::OperationIndex op_ind, const backend::Backend *backend)
{
for (auto &o : _observers)
{
- o->handleJobBegin(executor, index, op_seq, backend);
+ o->handleJobBegin(executor, subg_ind, op_ind, backend);
}
}
-void ExecutionObservee::notifyJobEnd(IExecutor *executor, ir::SubgraphIndex index,
- const ir::OpSequence *op_seq, const backend::Backend *backend)
+void ExecutionObservee::notifyJobEnd(IExecutor *executor, ir::SubgraphIndex subg_ind,
+ ir::OperationIndex op_ind, const backend::Backend *backend)
{
for (auto &o : _observers)
{
- o->handleJobEnd(executor, index, op_seq, backend);
+ o->handleJobEnd(executor, subg_ind, op_ind, backend);
}
}
diff --git a/runtime/onert/core/src/exec/ExecutionObservee.h b/runtime/onert/core/src/exec/ExecutionObservee.h
index 62b3f6201..423b5026b 100644
--- a/runtime/onert/core/src/exec/ExecutionObservee.h
+++ b/runtime/onert/core/src/exec/ExecutionObservee.h
@@ -42,9 +42,9 @@ public:
void add(std::unique_ptr<IExecutionObserver> observer);
void notifySubgraphBegin(ir::SubgraphIndex ind);
void notifySubgraphEnd(ir::SubgraphIndex ind);
- void notifyJobBegin(IExecutor *executor, ir::SubgraphIndex index, const ir::OpSequence *op_seq,
+ void notifyJobBegin(IExecutor *executor, ir::SubgraphIndex subg_ind, ir::OperationIndex op_ind,
const backend::Backend *backend);
- void notifyJobEnd(IExecutor *executor, ir::SubgraphIndex index, const ir::OpSequence *op_seq,
+ void notifyJobEnd(IExecutor *executor, ir::SubgraphIndex subg_ind, ir::OperationIndex op_ind,
const backend::Backend *backend);
private:
diff --git a/runtime/onert/core/src/exec/ExecutionObservers.cc b/runtime/onert/core/src/exec/ExecutionObservers.cc
index 18c0c1dd3..386178ae6 100644
--- a/runtime/onert/core/src/exec/ExecutionObservers.cc
+++ b/runtime/onert/core/src/exec/ExecutionObservers.cc
@@ -22,19 +22,15 @@
#include "util/logging.h"
#include "exec/IExecutor.h"
#include "misc/polymorphic_downcast.h"
-#include "ir/OpSequence.h"
+#include "ir/Operation.h"
#include "util/EventWriter.h"
-#include "util/Utils.h"
namespace
{
-void setUserData(const onert::ir::Graph &g, const onert::ir::OpSequence *op_seq,
+void setUserData(const onert::ir::Graph &g, const onert::ir::Operation *op,
decltype(EventCollector::Event::userData) &data)
{
- if (op_seq->size() == 0)
- return;
-
// From a tensor of shape [a, b, c], this will return a string "shape(a b c)".
// String like "[1, 2, 3]" looks better but this will be considered as a list in Json
// so text search (e.g., Ctrl-F in Chrome Tracing) could be difficult
@@ -53,10 +49,7 @@ void setUserData(const onert::ir::Graph &g, const onert::ir::OpSequence *op_seq,
return shape_str;
};
- const auto &first_op_idx = op_seq->operations().at(0);
- const auto &first_op_node = g.operations().at(first_op_idx);
-
- auto &inputs = first_op_node.getInputs();
+ auto &inputs = op->getInputs();
auto size = inputs.size();
for (size_t i = 0; i < size; i++)
{
@@ -81,7 +74,7 @@ namespace exec
{
void ProfileObserver::handleJobBegin(onert::exec::IExecutor *, ir::SubgraphIndex,
- const ir::OpSequence *, const onert::backend::Backend *backend)
+ ir::OperationIndex, const onert::backend::Backend *backend)
{
_timer = backend->config()->timer();
if (_timer == nullptr)
@@ -89,14 +82,14 @@ void ProfileObserver::handleJobBegin(onert::exec::IExecutor *, ir::SubgraphIndex
_timer->handleBegin();
}
-void ProfileObserver::handleJobEnd(IExecutor *exec, ir::SubgraphIndex, const ir::OpSequence *op_seq,
- const backend::Backend *backend)
+void ProfileObserver::handleJobEnd(IExecutor *exec, ir::SubgraphIndex,
+ const ir::OperationIndex op_ind, const backend::Backend *backend)
{
_timer->handleEnd();
const auto timer_res = _timer->getTime();
- // NOTE This assumes there is just one operation in a op_seq
- const auto &node = _graph.operations().at(op_seq->operations().at(0));
+ // NOTE A job now maps to exactly one operation
+ const auto &node = _graph.operations().at(op_ind);
auto node_name = node.name();
VERBOSE(ProfileInfo) << "Time for " << node_name << " : " << timer_res << std::endl;
@@ -122,12 +115,9 @@ void ProfileObserver::handleJobEnd(IExecutor *exec, ir::SubgraphIndex, const ir:
TracingObserver::TracingObserver(const std::string &filepath, const ir::Graph &graph,
const util::TracingCtx *tracing_ctx)
- : _recorder{std::make_unique<EventRecorder>()}, _collector{_recorder.get()}, _graph{graph},
- _tracing_ctx{tracing_ctx}
+ : _recorder{std::make_unique<EventRecorder>()}, _collector{_recorder.get()}, _graph{graph},
+ _tracing_ctx{tracing_ctx}
{
- // TODO Remove below after using _tracing_ctx
- UNUSED_RELEASE(_tracing_ctx);
-
_event_writer = EventWriter::get(filepath);
_event_writer->startToUse();
}
@@ -146,61 +136,36 @@ TracingObserver::~TracingObserver()
void TracingObserver::handleSubgraphBegin(ir::SubgraphIndex subg_ind)
{
- // TODO Write subg_ind into profling result
- UNUSED_RELEASE(subg_ind);
- _collector.onEvent(EventCollector::Event{EventCollector::Edge::BEGIN, "runtime", "Graph"});
+ _collector.onEvent(
+ EventCollector::SubgEvent{_tracing_ctx, EventCollector::Edge::BEGIN, subg_ind.value()});
}
void TracingObserver::handleJobBegin(IExecutor *, ir::SubgraphIndex subg_ind,
- const ir::OpSequence *op_seq, const backend::Backend *backend)
+ ir::OperationIndex op_ind, const backend::Backend *backend)
{
- // TODO Write subg_ind into profling result
- UNUSED_RELEASE(subg_ind);
-
std::string backend_id = backend->config()->id();
-
- auto ev = EventCollector::Event{EventCollector::Edge::BEGIN, backend_id,
- opSequenceTag(op_seq, _graph.operations())};
+ const auto &op = _graph.operations().at(op_ind);
+ auto ev = EventCollector::OpSeqEvent{_tracing_ctx, EventCollector::Edge::BEGIN,
+ subg_ind.value(), backend_id,
+ op_ind.value(), op.name()};
// add shape of inputs
- setUserData(_graph, op_seq, ev.userData);
-
+ setUserData(_graph, &op, ev.userData);
_collector.onEvent(ev);
}
void TracingObserver::handleJobEnd(IExecutor *, ir::SubgraphIndex subg_ind,
- const ir::OpSequence *op_seq, const backend::Backend *backend)
+ ir::OperationIndex op_ind, const backend::Backend *backend)
{
- // TODO Write subg_ind into profling result
- UNUSED_RELEASE(subg_ind);
-
std::string backend_id = backend->config()->id();
- _collector.onEvent(EventCollector::Event{EventCollector::Edge::END, backend_id,
- opSequenceTag(op_seq, _graph.operations())});
+ _collector.onEvent(EventCollector::OpSeqEvent{_tracing_ctx, EventCollector::Edge::END,
+ subg_ind.value(), backend_id, op_ind.value(),
+ _graph.operations().at(op_ind).name()});
}
void TracingObserver::handleSubgraphEnd(ir::SubgraphIndex subg_ind)
{
- // TODO Write subg_ind into profling result
- UNUSED_RELEASE(subg_ind);
-
- _collector.onEvent(EventCollector::Event{EventCollector::Edge::END, "runtime", "Graph"});
-}
-
-std::string TracingObserver::opSequenceTag(const ir::OpSequence *op_seq,
- const ir::Operations &operations)
-{
- if (op_seq->size() == 0)
- return "Empty OpSequence";
-
- const auto &first_op_idx = op_seq->operations().at(0);
- const auto &first_op_node = operations.at(first_op_idx);
- std::string tag = "$" + std::to_string(first_op_idx.value());
- tag += " " + first_op_node.name();
- if (op_seq->size() > 1)
- {
- tag += " (+" + std::to_string(op_seq->size() - 1) + ")";
- }
- return tag;
+ _collector.onEvent(
+ EventCollector::SubgEvent{_tracing_ctx, EventCollector::Edge::END, subg_ind.value()});
}
} // namespace exec
diff --git a/runtime/onert/core/src/exec/ExecutionObservers.h b/runtime/onert/core/src/exec/ExecutionObservers.h
index a9eebfee1..4c6c7b18e 100644
--- a/runtime/onert/core/src/exec/ExecutionObservers.h
+++ b/runtime/onert/core/src/exec/ExecutionObservers.h
@@ -19,7 +19,7 @@
#include "exec/IFunction.h"
#include "ir/Index.h"
-#include "ir/OpSequence.h"
+#include "ir/Operation.h"
#include "ExecTime.h"
#include "util/ITimer.h"
#include "exec/IExecutor.h"
@@ -39,9 +39,9 @@ public:
/// @brief Invoked just before model (not individual operation) execution begins
virtual void handleSubgraphBegin(ir::SubgraphIndex) { return; }
- virtual void handleJobBegin(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *,
+ virtual void handleJobBegin(IExecutor *, ir::SubgraphIndex, ir::OperationIndex,
const backend::Backend *) = 0;
- virtual void handleJobEnd(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *,
+ virtual void handleJobEnd(IExecutor *, ir::SubgraphIndex, ir::OperationIndex,
const backend::Backend *) = 0;
/// @brief Invoked just after model (not individual operation) execution ends
@@ -54,12 +54,12 @@ class ProfileObserver : public IExecutionObserver
{
public:
explicit ProfileObserver(std::shared_ptr<ExecTime> et, const ir::Graph &graph)
- : _et(std::move(et)), _graph(graph)
+ : _et(std::move(et)), _graph(graph)
{
}
- void handleJobBegin(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *,
+ void handleJobBegin(IExecutor *, ir::SubgraphIndex, ir::OperationIndex,
const backend::Backend *) override;
- void handleJobEnd(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *,
+ void handleJobEnd(IExecutor *, ir::SubgraphIndex, ir::OperationIndex,
const backend::Backend *) override;
void handleSubgraphEnd(ir::SubgraphIndex) override { _et->storeOperationsExecTime(); }
@@ -77,16 +77,13 @@ public:
const util::TracingCtx *tracing_ctx);
~TracingObserver();
void handleSubgraphBegin(ir::SubgraphIndex) override;
- void handleJobBegin(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *,
+ void handleJobBegin(IExecutor *, ir::SubgraphIndex, ir::OperationIndex,
const backend::Backend *) override;
- void handleJobEnd(IExecutor *, ir::SubgraphIndex, const ir::OpSequence *,
+ void handleJobEnd(IExecutor *, ir::SubgraphIndex, ir::OperationIndex,
const backend::Backend *) override;
void handleSubgraphEnd(ir::SubgraphIndex) override;
private:
- static std::string opSequenceTag(const ir::OpSequence *op_seq, const ir::Operations &operations);
-
-private:
std::unique_ptr<EventRecorder> _recorder;
EventCollector _collector;
const ir::Graph &_graph;
diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc
index 588a3258d..3a624adef 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.cc
+++ b/runtime/onert/core/src/exec/ExecutorBase.cc
@@ -17,7 +17,7 @@
#include "ExecutorBase.h"
#include "ShapeConverter.h"
-#include "backend/controlflow/UserTensor.h"
+#include "backend/builtin/UserTensor.h"
#include "util/logging.h"
#include "misc/polymorphic_downcast.h"
@@ -27,10 +27,12 @@ namespace exec
{
ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_graph,
+ backend::BackendContexts &&backend_contexts,
const compiler::TensorRegistries &tensor_regs,
const util::TracingCtx *tracing_ctx)
- : _lowered_graph{std::move(lowered_graph)}, _graph{_lowered_graph->graph()}, _mutex(),
- _tracing_ctx(tracing_ctx)
+ : _lowered_graph{std::move(lowered_graph)},
+ _backend_contexts{std::move(backend_contexts)}, _graph{_lowered_graph->graph()}, _mutex(),
+ _tracing_ctx(tracing_ctx)
{
auto build_tensor_list = [&](const auto &ind_seq, auto &tensors) {
assert(tensors.empty());
@@ -38,7 +40,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_gra
{
backend::ITensor *tensor = tensor_regs.getITensor(ind);
assert(tensor != nullptr);
- auto io_tensor = nnfw::misc::polymorphic_downcast<backend::controlflow::IOTensor *>(tensor);
+ auto io_tensor = nnfw::misc::polymorphic_downcast<backend::builtin::IOTensor *>(tensor);
tensors.push_back(io_tensor);
}
};
@@ -67,7 +69,13 @@ void ExecutorBase::execute(const std::vector<backend::IPortableTensor *> &inputs
{
const auto orig_input_shape = input_tensor->orig_info().shape();
const auto changed_input_shape =
- convertShape(input->getShape(), input->layout(), input_tensor->orig_layout());
+ convertShape(input->getShape(), input->layout(), input_tensor->orig_layout());
+ if (input_tensor->get_info().shape() != changed_input_shape)
+ {
+ // TODO Fix this workaround that is introduced since cpu based kernels directly use `_info`
+ // rather than interface methods to avoid virtual function calls.
+ input_tensor->setShapeOfIPortableTensor(changed_input_shape);
+ }
if (orig_input_shape != changed_input_shape)
{
input_tensor->set_dynamic();
@@ -145,7 +153,7 @@ void ExecutorBase::execute(const IODescription &desc)
// set shape of outputDesc to tensor shape since tensor can be dynamic
const auto output_tensor_shape = _output_tensors[n]->getShape();
output.info.shape(
- convertShape(output_tensor_shape, _output_tensors[n]->layout(), output.layout));
+ convertShape(output_tensor_shape, _output_tensors[n]->layout(), output.layout));
}
}
diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h
index 5d95c10bf..3a124bd5b 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.h
+++ b/runtime/onert/core/src/exec/ExecutorBase.h
@@ -25,11 +25,11 @@
#include "exec/IODescription.h"
#include "ir/Graph.h"
#include "ir/Index.h"
-#include "ir/LowerInfoMap.h"
+#include "compiler/GraphLowerInfo.h"
#include "ir/OperationIndexMap.h"
#include "compiler/LoweredGraph.h"
#include "compiler/TensorRegistries.h"
-#include "backend/controlflow/IOTensor.h"
+#include "backend/builtin/IOTensor.h"
#include "util/TracingCtx.h"
#include <cstdint>
@@ -51,6 +51,7 @@ public:
* @param tensor_builders Tensor builders that are currently used
*/
ExecutorBase(std::unique_ptr<compiler::LoweredGraph> &&lowered_graph,
+ backend::BackendContexts &&backend_contexts,
const compiler::TensorRegistries &tensor_regs, const util::TracingCtx *tracing_ctx);
virtual ~ExecutorBase() = default;
@@ -72,7 +73,7 @@ public:
void addObserver(std::unique_ptr<IExecutionObserver> ref) { _subject.add(std::move(ref)); };
- const std::vector<backend::controlflow::IOTensor *> &getOutputTensors() const override
+ const std::vector<backend::builtin::IOTensor *> &getOutputTensors() const override
{
return _output_tensors;
}
@@ -87,9 +88,10 @@ protected:
ExecutionObservee _subject;
std::shared_ptr<ir::OperationIndexMap<int64_t>> _indexed_ranks;
std::unique_ptr<compiler::LoweredGraph> _lowered_graph;
+ backend::BackendContexts _backend_contexts;
const ir::Graph &_graph;
- std::vector<backend::controlflow::IOTensor *> _input_tensors;
- std::vector<backend::controlflow::IOTensor *> _output_tensors;
+ std::vector<backend::builtin::IOTensor *> _input_tensors;
+ std::vector<backend::builtin::IOTensor *> _output_tensors;
std::mutex _mutex;
const util::TracingCtx *_tracing_ctx;
diff --git a/runtime/onert/core/src/exec/FunctionSequence.cc b/runtime/onert/core/src/exec/FunctionSequence.cc
index 8aefa5eeb..df68b1b64 100644
--- a/runtime/onert/core/src/exec/FunctionSequence.cc
+++ b/runtime/onert/core/src/exec/FunctionSequence.cc
@@ -17,7 +17,6 @@
#include "exec/FunctionSequence.h"
#include "ir/Operation.h"
-#include "backend/IDynamicTensorManager.h"
#include "backend/ITensorRegistry.h"
#include "util/logging.h"
@@ -33,16 +32,16 @@ void FunctionSequence::run()
// acl_cl and acl_neon backends don't support dynamic shapes.
// _dynamic_tensor_ctx is always nullptr for acl_cl and acl_neon.
// Thus, those two backends cannot reach here.
- if (_dynamic_tensor_ctx->op_seq->size() != _functions.size())
- throw std::runtime_error("operation and functions should be mapped one by one");
- auto op_seq_iter = _dynamic_tensor_ctx->op_seq->begin();
+ // Do dynamic shape inference
+ auto op_ind = _dynamic_tensor_ctx->op_ind;
+ auto &op = _dynamic_tensor_ctx->operations->at(op_ind);
+ op.accept(*_dynamic_tensor_ctx->dynamic_shape_inferer);
+
for (const auto &function : _functions)
{
- // set shape of output and allocate memory when needed
- auto &op = _dynamic_tensor_ctx->operations->at(*op_seq_iter);
- op.accept(*_dynamic_tensor_ctx->dynamic_shape_inferer);
-
+ // NOTE The function could also be a FunctionSequence, so we handle that case here
+ // TODO Remove this or do this recursively
auto *sub_func_seq = dynamic_cast<FunctionSequence *>(function.get());
if (sub_func_seq != nullptr)
{
@@ -52,11 +51,6 @@ void FunctionSequence::run()
// run kernel
function->run();
-
- // deallocate input tensors which is no longer used
- _dynamic_tensor_ctx->dynamic_tensor_manager->deallocInput(*op_seq_iter);
-
- op_seq_iter++;
}
}
else
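
With OpSequence gone, FunctionSequence::run() now performs dynamic shape inference once for its single operation and then runs every attached function, recursing implicitly when a function is itself a FunctionSequence. A rough sketch of the new control flow under stand-in types:

#include <functional>
#include <memory>
#include <vector>

// Stand-in for exec::IFunction.
struct IFunction
{
  virtual ~IFunction() = default;
  virtual void run() = 0;
};

// infer_shapes stands in for op.accept(*dynamic_shape_inferer) in the diff.
void runFunctionSequence(const std::function<void()> &infer_shapes,
                         std::vector<std::unique_ptr<IFunction>> &functions,
                         bool dynamic_shapes)
{
  if (dynamic_shapes)
    infer_shapes(); // resolve output shapes before any kernel runs
  for (auto &fn : functions)
    fn->run(); // a nested FunctionSequence simply runs its own list
}
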
diff --git a/runtime/onert/core/src/exec/IPermuteFunction.h b/runtime/onert/core/src/exec/IPermuteFunction.h
index 8f62156a6..8e343cffa 100644
--- a/runtime/onert/core/src/exec/IPermuteFunction.h
+++ b/runtime/onert/core/src/exec/IPermuteFunction.h
@@ -99,7 +99,7 @@ public:
auto &dst_offsets = _dst_tensors_offsets.at(i);
if (src_tensor != dst_tensor)
{
- const auto rank = src_tensor->num_dimensions();
+ const auto rank = src_tensor->getShape().rank();
permute(src_tensor, dst_tensor, rank, src_offsets, dst_offsets);
}
}
@@ -225,17 +225,18 @@ private:
case PermuteType::NHWC_TO_NCHW:
{
ir::FeatureShape shape;
- shape.N = dst->dimension(0);
- shape.C = dst->dimension(1);
- shape.H = dst->dimension(2);
- shape.W = dst->dimension(3);
+ auto dst_shape = dst->getShape();
+ shape.N = dst_shape.dim(0);
+ shape.C = dst_shape.dim(1);
+ shape.H = dst_shape.dim(2);
+ shape.W = dst_shape.dim(3);
typename feature::nchw::View<T>::Strides strides;
const auto start_offset = dst->calcOffset({0, 0, 0, 0});
- strides.W = dst->dimension(3) == 1 ? 0 : dst->calcOffset({0, 0, 0, 1}) - start_offset;
- strides.H = dst->dimension(2) == 1 ? 0 : dst->calcOffset({0, 0, 1, 0}) - start_offset;
- strides.C = dst->dimension(1) == 1 ? 0 : dst->calcOffset({0, 1, 0, 0}) - start_offset;
- strides.N = dst->dimension(0) == 1 ? 0 : dst->calcOffset({1, 0, 0, 0}) - start_offset;
+ strides.W = dst_shape.dim(3) == 1 ? 0 : dst->calcOffset({0, 0, 0, 1}) - start_offset;
+ strides.H = dst_shape.dim(2) == 1 ? 0 : dst->calcOffset({0, 0, 1, 0}) - start_offset;
+ strides.C = dst_shape.dim(1) == 1 ? 0 : dst->calcOffset({0, 1, 0, 0}) - start_offset;
+ strides.N = dst_shape.dim(0) == 1 ? 0 : dst->calcOffset({1, 0, 0, 0}) - start_offset;
const feature::nhwc::Reader<T> from(src);
feature::nchw::View<T> into(shape, strides,
@@ -249,17 +250,18 @@ private:
case PermuteType::NCHW_TO_NHWC:
{
ir::FeatureShape shape;
- shape.N = dst->dimension(0);
- shape.H = dst->dimension(1);
- shape.W = dst->dimension(2);
- shape.C = dst->dimension(3);
+ auto dst_shape = dst->getShape();
+ shape.N = dst_shape.dim(0);
+ shape.H = dst_shape.dim(1);
+ shape.W = dst_shape.dim(2);
+ shape.C = dst_shape.dim(3);
typename feature::nhwc::View<T>::Strides strides;
const auto start_offset = dst->calcOffset({0, 0, 0, 0});
- strides.C = dst->dimension(3) == 1 ? 0 : dst->calcOffset({0, 0, 0, 1}) - start_offset;
- strides.W = dst->dimension(2) == 1 ? 0 : dst->calcOffset({0, 0, 1, 0}) - start_offset;
- strides.H = dst->dimension(1) == 1 ? 0 : dst->calcOffset({0, 1, 0, 0}) - start_offset;
- strides.N = dst->dimension(0) == 1 ? 0 : dst->calcOffset({1, 0, 0, 0}) - start_offset;
+ strides.C = dst_shape.dim(3) == 1 ? 0 : dst->calcOffset({0, 0, 0, 1}) - start_offset;
+ strides.W = dst_shape.dim(2) == 1 ? 0 : dst->calcOffset({0, 0, 1, 0}) - start_offset;
+ strides.H = dst_shape.dim(1) == 1 ? 0 : dst->calcOffset({0, 1, 0, 0}) - start_offset;
+ strides.N = dst_shape.dim(0) == 1 ? 0 : dst->calcOffset({1, 0, 0, 0}) - start_offset;
const feature::nchw::Reader<T> from(src);
feature::nhwc::View<T> into(shape, strides,
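
Both permute branches above compute strides the same way: take calcOffset() of a unit step along each axis, subtract the offset of the origin, and force the stride to 0 when the axis extent is 1 so a size-1 axis never advances the pointer. A self-contained sketch of that derivation (the row-major calcOffset below is a hypothetical stand-in for ITensor::calcOffset()):

#include <array>

using Coords = std::array<int, 4>;

// Hypothetical row-major offset function standing in for calcOffset().
int calcOffset(const Coords &dims, const Coords &c)
{
  return ((c[0] * dims[1] + c[1]) * dims[2] + c[2]) * dims[3] + c[3];
}

std::array<int, 4> computeStrides(const Coords &dims)
{
  const int base = calcOffset(dims, {0, 0, 0, 0});
  std::array<int, 4> strides{};
  for (int axis = 0; axis < 4; ++axis)
  {
    Coords step{0, 0, 0, 0};
    step[axis] = 1;
    // Extent-1 axes get stride 0, exactly as in the diff above.
    strides[axis] = dims[axis] == 1 ? 0 : calcOffset(dims, step) - base;
  }
  return strides;
}
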
diff --git a/runtime/onert/core/src/exec/JSONExecTime.h b/runtime/onert/core/src/exec/JSONExecTime.h
index 8987d723c..e01723611 100644
--- a/runtime/onert/core/src/exec/JSONExecTime.h
+++ b/runtime/onert/core/src/exec/JSONExecTime.h
@@ -37,15 +37,15 @@ namespace exec
* _measurements[Backend*]["string"][bool][uint32_t] = int64_t
*/
using MeasurementData = std::unordered_map<
- const backend::Backend *,
- std::unordered_map<std::string, std::unordered_map<bool, std::map<uint32_t, int64_t>>>>;
+ const backend::Backend *,
+ std::unordered_map<std::string, std::unordered_map<bool, std::map<uint32_t, int64_t>>>>;
class JSON
{
public:
explicit JSON(const std::vector<const backend::Backend *> &backends,
MeasurementData &measurements)
- : _measurement_file("exec_time.json"), _backends(), _measurements(measurements)
+ : _measurement_file("exec_time.json"), _backends(), _measurements(measurements)
{
for (const auto b : backends)
{
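
MeasurementData above is a four-level lookup keyed by backend pointer, operation name, a quantized flag, and an operand size, storing an int64_t execution time. A small read-side sketch, with a dummy Backend type standing in for backend::Backend:

#include <cstdint>
#include <map>
#include <string>
#include <unordered_map>

struct Backend {}; // stand-in for backend::Backend

using MeasurementData = std::unordered_map<
  const Backend *,
  std::unordered_map<std::string, std::unordered_map<bool, std::map<uint32_t, int64_t>>>>;

// Returns the recorded time, or -1 when any level of the map is missing.
int64_t lookup(const MeasurementData &data, const Backend *b,
               const std::string &op, bool quant, uint32_t size)
{
  auto it = data.find(b);
  if (it == data.end()) return -1;
  auto op_it = it->second.find(op);
  if (op_it == it->second.end()) return -1;
  auto q_it = op_it->second.find(quant);
  if (q_it == op_it->second.end()) return -1;
  auto s_it = q_it->second.find(size);
  return s_it == q_it->second.end() ? -1 : s_it->second;
}
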
diff --git a/runtime/onert/core/src/exec/LinearExecutor.cc b/runtime/onert/core/src/exec/LinearExecutor.cc
index a6d447312..4d10c869b 100644
--- a/runtime/onert/core/src/exec/LinearExecutor.cc
+++ b/runtime/onert/core/src/exec/LinearExecutor.cc
@@ -24,19 +24,6 @@ namespace onert
namespace exec
{
-#ifdef RUY_PROFILER
-namespace
-{
-char *seq_to_label(const onert::ir::OpSequence *op_seq, const onert::ir::Operations &operations)
-{
- auto node_name = operations.at(*op_seq->begin()).name();
- char *cstr = new char[node_name.length() + 1];
- std::strcpy(cstr, node_name.c_str());
- return cstr;
-}
-} // namespace
-#endif
-
void LinearExecutor::executeImpl()
{
auto profiling_subg_index = _tracing_ctx->getSubgraphIndex(&_graph);
@@ -44,23 +31,23 @@ void LinearExecutor::executeImpl()
_subject.notifySubgraphBegin(profiling_subg_index);
for (auto &&code : _code)
{
- const auto op_seq = code.op_seq;
const auto backend = code.lower_info->backend();
// TODO : Move ruy profiler into ExecutionObserver
#ifdef RUY_PROFILER
- ruy::profiler::ScopeLabel label(seq_to_label(op_seq, _graph.operations()));
+ ruy::profiler::ScopeLabel label(code.op->name());
#endif
- _subject.notifyJobBegin(this, profiling_subg_index, op_seq, backend);
+ _subject.notifyJobBegin(this, profiling_subg_index, code.op_ind, backend);
auto &fn_seq = code.fn_seq;
fn_seq->initRunning();
- bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || hasDynamicInput();
+ bool handle_dynamic_tensor =
+ _lowered_graph->getHasDynamicTensor(code.op_ind) || hasDynamicInput();
fn_seq->enableDynamicShapeInferer(handle_dynamic_tensor);
fn_seq->run();
- _subject.notifyJobEnd(this, profiling_subg_index, op_seq, backend);
+ _subject.notifyJobEnd(this, profiling_subg_index, code.op_ind, backend);
}
_subject.notifySubgraphEnd(profiling_subg_index);
}
diff --git a/runtime/onert/core/src/exec/LinearExecutor.h b/runtime/onert/core/src/exec/LinearExecutor.h
index d43c97012..39d653154 100644
--- a/runtime/onert/core/src/exec/LinearExecutor.h
+++ b/runtime/onert/core/src/exec/LinearExecutor.h
@@ -45,12 +45,13 @@ public:
* @brief Construct a new LinearExecutor object
* @param lowered_graph LoweredGraph object
* @param tensor_builders Tensor builders that are currently used
- * @param code_map OpSequence and its code map
+ * @param code_map @c ir::Operation and its code map
*/
LinearExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+ backend::BackendContexts &&backend_contexts,
const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map,
- const std::vector<ir::OpSequenceIndex> &order, const util::TracingCtx *tracing_ctx)
- : ExecutorBase{std::move(lowered_graph), tensor_regs, tracing_ctx}
+ const std::vector<ir::OperationIndex> &order, const util::TracingCtx *tracing_ctx)
+ : ExecutorBase{std::move(lowered_graph), std::move(backend_contexts), tensor_regs, tracing_ctx}
{
for (auto index : order)
{
diff --git a/runtime/onert/core/src/exec/ParallelExecutor.cc b/runtime/onert/core/src/exec/ParallelExecutor.cc
index e9e576ce8..9da7c82b4 100644
--- a/runtime/onert/core/src/exec/ParallelExecutor.cc
+++ b/runtime/onert/core/src/exec/ParallelExecutor.cc
@@ -31,7 +31,7 @@ class HookFunction : public IFunction
public:
HookFunction(IFunction *fn, const std::function<void()> &setup,
const std::function<void()> &teardown)
- : _fn{fn}, _setup{setup}, _teardown{teardown}
+ : _fn{fn}, _setup{setup}, _teardown{teardown}
{
}
@@ -60,10 +60,12 @@ void ParallelExecutor::notify(uint32_t finished_job_id)
}
ParallelExecutor::ParallelExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+ backend::BackendContexts &&backend_contexts,
const compiler::TensorRegistries &tensor_regs,
compiler::CodeMap &&code_map,
const util::TracingCtx *tracing_ctx)
- : DataflowExecutor{std::move(lowered_graph), tensor_regs, std::move(code_map), tracing_ctx}
+ : DataflowExecutor{std::move(lowered_graph), std::move(backend_contexts), tensor_regs,
+ std::move(code_map), tracing_ctx}
{
VERBOSE(ParallelExecutor) << "Constructing Parallel Executor" << std::endl;
}
@@ -73,12 +75,12 @@ void ParallelExecutor::executeImpl()
bool dynamic_input_exists = hasDynamicInput();
// Init scheduler
- // TODO Consider to have distinct backend set in LowerInfoMap
+ // TODO Consider to have distinct backend set in GraphLowerInfo
BackendSet backends;
- for (auto &itr : _lowered_graph->getLowerInfo()->op_seq)
- {
- backends.add(itr.second->backend());
- }
+ _lowered_graph->lower_info().operation.iterate(
+ [&](const ir::OperationIndex &, const compiler::OperationLowerInfo &lower_info) {
+ backends.add(lower_info.backend());
+ });
_scheduler = std::make_unique<ParallelScheduler>(backends);
assert(noWaitingJobs());
@@ -121,24 +123,24 @@ void ParallelExecutor::executeImpl()
lock.unlock();
- VERBOSE(ParallelExecutor) << "Assigning fn #" << job->index() << std::endl;
+ VERBOSE(ParallelExecutor) << "Assigning fn " << job->index() << std::endl;
auto job_index = job->index();
- auto op_sequence_index = _job_to_op_seq[job_index];
- auto op_seq = &_lowered_graph->op_seqs().at(op_sequence_index);
- auto backend = _lowered_graph->getLowerInfo()->op_seq.at(op_sequence_index)->backend();
- auto setup = [&, op_seq, backend]() {
- _subject.notifyJobBegin(this, profiling_subg_index, op_seq, backend);
+ auto op_ind = _job_to_op[job_index];
+ auto backend = _lowered_graph->lower_info().operation.at(op_ind).backend();
+ auto setup = [&, op_ind, backend]() {
+ _subject.notifyJobBegin(this, profiling_subg_index, op_ind, backend);
};
- auto teardown = [&, job_index, op_seq, backend]() {
- _subject.notifyJobEnd(this, profiling_subg_index, op_seq, backend);
+ auto teardown = [&, job_index, op_ind, backend]() {
+ _subject.notifyJobEnd(this, profiling_subg_index, op_ind, backend);
notify(job_index);
};
job->fn_seq()->initRunning();
// dynamic tensor setting
- bool handle_dynamic_tensor = op_seq->has_dynamic_tensor() || dynamic_input_exists;
+ bool handle_dynamic_tensor =
+ _lowered_graph->getHasDynamicTensor(op_ind) || dynamic_input_exists;
job->fn_seq()->enableDynamicShapeInferer(handle_dynamic_tensor);
_scheduler->assign(std::make_unique<HookFunction>(job->fn_seq(), setup, teardown), backend);
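
The setup/teardown lambdas above are threaded through HookFunction so that job-begin and job-end notifications bracket the kernel even when it runs on a scheduler thread. A stripped-down sketch of that wrapper pattern (the IFunction here is a stand-in, not the onert interface):

#include <functional>
#include <utility>

struct IFunction
{
  virtual ~IFunction() = default;
  virtual void run() = 0;
};

class HookFunction : public IFunction
{
public:
  HookFunction(IFunction *fn, std::function<void()> setup, std::function<void()> teardown)
    : _fn{fn}, _setup{std::move(setup)}, _teardown{std::move(teardown)}
  {
  }
  void run() override
  {
    _setup();    // e.g. notifyJobBegin(this, subg_index, op_ind, backend)
    _fn->run();  // the actual kernel sequence
    _teardown(); // e.g. notifyJobEnd(...) followed by notify(job_index)
  }

private:
  IFunction *_fn;
  std::function<void()> _setup;
  std::function<void()> _teardown;
};
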
diff --git a/runtime/onert/core/src/exec/ParallelExecutor.h b/runtime/onert/core/src/exec/ParallelExecutor.h
index fd9db42e1..7f107fa22 100644
--- a/runtime/onert/core/src/exec/ParallelExecutor.h
+++ b/runtime/onert/core/src/exec/ParallelExecutor.h
@@ -49,9 +49,10 @@ public:
*
* @param lowered_graph LoweredGraph object
* @param tensor_builders Tensor builders that are currently used
- * @param code_map OpSequence and its code map
+ * @param code_map @c ir::Operation and its code map
*/
ParallelExecutor(std::unique_ptr<compiler::LoweredGraph> lowered_graph,
+ backend::BackendContexts &&backend_contexts,
const compiler::TensorRegistries &tensor_regs, compiler::CodeMap &&code_map,
const util::TracingCtx *tracing_ctx);
diff --git a/runtime/onert/core/src/exec/feature/nchw/Reader.h b/runtime/onert/core/src/exec/feature/nchw/Reader.h
index aebedd853..d5e3cb97c 100644
--- a/runtime/onert/core/src/exec/feature/nchw/Reader.h
+++ b/runtime/onert/core/src/exec/feature/nchw/Reader.h
@@ -39,32 +39,33 @@ public:
using Strides = ir::FeatureShape;
// Construct for buffer and strides
Reader(const ir::FeatureShape &shape, const Strides &strides, const T *ptr, size_t len)
- : _shape{shape}, _strides{strides}, _ptr{reinterpret_cast<const uint8_t *>(ptr)}, _len{len}
+ : _shape{shape}, _strides{strides}, _ptr{reinterpret_cast<const uint8_t *>(ptr)}, _len{len}
{
UNUSED_RELEASE(len); // Workaround for unused variable in release mode
assert(len == static_cast<size_t>(strides.N != 0
- ? shape.N * strides.N
- : strides.C != 0 ? shape.C * strides.C
- : strides.H != 0 ? shape.H * strides.H
- : shape.W * strides.W));
+ ? shape.N * strides.N
+ : strides.C != 0 ? shape.C * strides.C
+ : strides.H != 0 ? shape.H * strides.H
+ : shape.W * strides.W));
}
// Construct for backend tensor
Reader(backend::ITensor *tensor)
- : _ptr{tensor->buffer() + tensor->calcOffset({0, 0, 0, 0})}, _len{tensor->total_size()}
+ : _ptr{tensor->buffer() + tensor->calcOffset({0, 0, 0, 0})}, _len{tensor->total_size()}
{
assert(tensor->layout() == ir::Layout::NCHW);
const auto start_offset = tensor->calcOffset({0, 0, 0, 0});
- _strides.W = tensor->dimension(3) == 1 ? 0 : tensor->calcOffset({0, 0, 0, 1}) - start_offset;
- _strides.H = tensor->dimension(2) == 1 ? 0 : tensor->calcOffset({0, 0, 1, 0}) - start_offset;
- _strides.C = tensor->dimension(1) == 1 ? 0 : tensor->calcOffset({0, 1, 0, 0}) - start_offset;
- _strides.N = tensor->dimension(0) == 1 ? 0 : tensor->calcOffset({1, 0, 0, 0}) - start_offset;
-
- _shape.W = tensor->dimension(3);
- _shape.H = tensor->dimension(2);
- _shape.C = tensor->dimension(1);
- _shape.N = tensor->dimension(0);
+ auto shape = tensor->getShape();
+ _strides.W = shape.dim(3) == 1 ? 0 : tensor->calcOffset({0, 0, 0, 1}) - start_offset;
+ _strides.H = shape.dim(2) == 1 ? 0 : tensor->calcOffset({0, 0, 1, 0}) - start_offset;
+ _strides.C = shape.dim(1) == 1 ? 0 : tensor->calcOffset({0, 1, 0, 0}) - start_offset;
+ _strides.N = shape.dim(0) == 1 ? 0 : tensor->calcOffset({1, 0, 0, 0}) - start_offset;
+
+ _shape.W = shape.dim(3);
+ _shape.H = shape.dim(2);
+ _shape.C = shape.dim(1);
+ _shape.N = shape.dim(0);
}
public:
diff --git a/runtime/onert/core/src/exec/feature/nchw/View.h b/runtime/onert/core/src/exec/feature/nchw/View.h
index df3576264..cdbb0cd7c 100644
--- a/runtime/onert/core/src/exec/feature/nchw/View.h
+++ b/runtime/onert/core/src/exec/feature/nchw/View.h
@@ -40,7 +40,7 @@ public:
using Strides = typename Reader<T>::Strides;
// Construct for buffer of model inputs
View(const ir::FeatureShape &shape, const Strides &strides, T *ptr, size_t len)
- : Reader<T>{shape, strides, ptr, len}
+ : Reader<T>{shape, strides, ptr, len}
{
// DO NOTHING
}
diff --git a/runtime/onert/core/src/exec/feature/nhwc/Reader.h b/runtime/onert/core/src/exec/feature/nhwc/Reader.h
index da6a5f6a9..0bc1ee95b 100644
--- a/runtime/onert/core/src/exec/feature/nhwc/Reader.h
+++ b/runtime/onert/core/src/exec/feature/nhwc/Reader.h
@@ -40,32 +40,33 @@ public:
using Strides = ir::FeatureShape;
// Construct for buffer and strides
Reader(const ir::FeatureShape &shape, const Strides &strides, const T *ptr, size_t len)
- : _shape{shape}, _strides{strides}, _ptr{reinterpret_cast<const uint8_t *>(ptr)}, _len{len}
+ : _shape{shape}, _strides{strides}, _ptr{reinterpret_cast<const uint8_t *>(ptr)}, _len{len}
{
UNUSED_RELEASE(len); // Workaround for unused variable in release mode
assert(len == static_cast<size_t>(strides.N != 0
- ? shape.N * strides.N
- : strides.H != 0 ? shape.H * strides.H
- : strides.W != 0 ? shape.W * strides.W
- : shape.C * strides.C));
+ ? shape.N * strides.N
+ : strides.H != 0 ? shape.H * strides.H
+ : strides.W != 0 ? shape.W * strides.W
+ : shape.C * strides.C));
}
// Construct for backend tensor
Reader(const backend::ITensor *tensor)
- : _ptr{tensor->buffer() + tensor->calcOffset({0, 0, 0, 0})}, _len{tensor->total_size()}
+ : _ptr{tensor->buffer() + tensor->calcOffset({0, 0, 0, 0})}, _len{tensor->total_size()}
{
assert(tensor->layout() == ir::Layout::NHWC);
const auto start_offset = tensor->calcOffset({0, 0, 0, 0});
- _strides.C = tensor->dimension(3) == 1 ? 0 : tensor->calcOffset({0, 0, 0, 1}) - start_offset;
- _strides.W = tensor->dimension(2) == 1 ? 0 : tensor->calcOffset({0, 0, 1, 0}) - start_offset;
- _strides.H = tensor->dimension(1) == 1 ? 0 : tensor->calcOffset({0, 1, 0, 0}) - start_offset;
- _strides.N = tensor->dimension(0) == 1 ? 0 : tensor->calcOffset({1, 0, 0, 0}) - start_offset;
-
- _shape.C = tensor->dimension(3);
- _shape.W = tensor->dimension(2);
- _shape.H = tensor->dimension(1);
- _shape.N = tensor->dimension(0);
+ auto shape = tensor->getShape();
+ _strides.C = shape.dim(3) == 1 ? 0 : tensor->calcOffset({0, 0, 0, 1}) - start_offset;
+ _strides.W = shape.dim(2) == 1 ? 0 : tensor->calcOffset({0, 0, 1, 0}) - start_offset;
+ _strides.H = shape.dim(1) == 1 ? 0 : tensor->calcOffset({0, 1, 0, 0}) - start_offset;
+ _strides.N = shape.dim(0) == 1 ? 0 : tensor->calcOffset({1, 0, 0, 0}) - start_offset;
+
+ _shape.C = shape.dim(3);
+ _shape.W = shape.dim(2);
+ _shape.H = shape.dim(1);
+ _shape.N = shape.dim(0);
}
public:
diff --git a/runtime/onert/core/src/exec/feature/nhwc/View.h b/runtime/onert/core/src/exec/feature/nhwc/View.h
index a77f68024..40d1d237c 100644
--- a/runtime/onert/core/src/exec/feature/nhwc/View.h
+++ b/runtime/onert/core/src/exec/feature/nhwc/View.h
@@ -41,7 +41,7 @@ public:
using Strides = typename Reader<T>::Strides;
// Construct for buffer and strides
View(const ir::FeatureShape &shape, const Strides &strides, T *ptr, size_t len)
- : Reader<T>{shape, strides, ptr, len}
+ : Reader<T>{shape, strides, ptr, len}
{
// DO NOTHING
}
diff --git a/runtime/onert/core/src/interp/InterpExecutor.cc b/runtime/onert/core/src/interp/InterpExecutor.cc
index cd31a4dca..44d1575d7 100644
--- a/runtime/onert/core/src/interp/InterpExecutor.cc
+++ b/runtime/onert/core/src/interp/InterpExecutor.cc
@@ -50,7 +50,7 @@ void InterpExecutor::execute(const exec::IODescription &desc)
auto input_tensor = std::make_shared<ROTensor>(input->info);
input_tensor->setData(std::make_shared<const ir::ExternalData>(
- reinterpret_cast<const uint8_t *>(input->buffer), input->size));
+ reinterpret_cast<const uint8_t *>(input->buffer), input->size));
tensor_map[input_index] = input_tensor;
}
@@ -66,7 +66,7 @@ void InterpExecutor::execute(const exec::IODescription &desc)
{
if (tensor_map.find(index) != tensor_map.end())
{
- VERBOSE(INTERPRETER) << "Assign input tensor. operand index:" << index.value() << std::endl;
+ VERBOSE(INTERPRETER) << "Assign input tensor. operand index:" << index << std::endl;
interp_env->assignTensor(index, tensor_map.at(index));
}
}
@@ -86,22 +86,22 @@ void InterpExecutor::execute(const exec::IODescription &desc)
<< std::endl;
interp_env->assignExternalBuffer(
- output_index, std::make_shared<ExternalBuffer>(reinterpret_cast<uint8_t *>(output->buffer),
- output->size));
+ output_index,
+ std::make_shared<ExternalBuffer>(reinterpret_cast<uint8_t *>(output->buffer), output->size));
}
// Allocate constant tensor
_graph.operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
if (obj.isConstant())
{
- VERBOSE(INTERPRETER) << "Allocate and assign constant tensor. operand index:" << ind.value()
+ VERBOSE(INTERPRETER) << "Allocate and assign constant tensor. operand index:" << ind
<< std::endl;
assert(obj.data());
auto const_tensor = std::make_shared<ROTensor>(obj.info());
// Assume that interpreter's tensor layout is same with model (NHWC)
const_tensor->setData(
- std::make_shared<ir::ExternalData>(obj.data()->base(), obj.info().total_size()));
+ std::make_shared<ir::ExternalData>(obj.data()->base(), obj.info().total_size()));
interp_env->assignTensor(ind, const_tensor);
}
});
diff --git a/runtime/onert/core/src/interp/InterpExecutor.h b/runtime/onert/core/src/interp/InterpExecutor.h
index 99d7b3af7..6e3a02327 100644
--- a/runtime/onert/core/src/interp/InterpExecutor.h
+++ b/runtime/onert/core/src/interp/InterpExecutor.h
@@ -51,7 +51,7 @@ public:
*/
const ir::Graph &graph() final { return _graph; }
void setIndexedRanks(std::shared_ptr<ir::OperationIndexMap<int64_t>>) override{
- // Not implemented
+ // Not implemented
};
/**
* @brief Start execution
@@ -63,7 +63,7 @@ public:
{
throw new std::runtime_error{"Interpreter does not support subgraph calls(control flow ops)"};
}
- const std::vector<backend::controlflow::IOTensor *> &getOutputTensors() const final
+ const std::vector<backend::builtin::IOTensor *> &getOutputTensors() const final
{
throw new std::runtime_error{"Interpreter does not support this function."};
}
diff --git a/runtime/onert/core/src/interp/Interpreter.cc b/runtime/onert/core/src/interp/Interpreter.cc
index b92afbe73..e01afb8a6 100644
--- a/runtime/onert/core/src/interp/Interpreter.cc
+++ b/runtime/onert/core/src/interp/Interpreter.cc
@@ -49,7 +49,7 @@ public:
const ir::Operation &node = _env->graph().operations().at(idx);
const auto nodeName = node.name();
VERBOSE(INTERPRETER) << "Prepare output operands and execute " << nodeName
- << " operation (id: " << idx.value() << ")" << std::endl;
+ << " operation (id: " << idx << ")" << std::endl;
const auto nodeOpCode = node.opcode();
if (_kernels.find(nodeOpCode) == _kernels.end())
@@ -83,7 +83,7 @@ void Interpreter::run()
// But that scenario may not exist
for (auto ind : _env->graph().getInputs())
{
- VERBOSE(INTERPRETER) << "Input: Push to operand stack " << ind.value() << std::endl;
+ VERBOSE(INTERPRETER) << "Input: Push to operand stack " << ind << std::endl;
operand_stack.push(ind);
}
@@ -91,7 +91,7 @@ void Interpreter::run()
_env->graph().operands().iterate([&](const ir::OperandIndex &ind, const ir::Operand &obj) {
if (obj.isConstant())
{
- VERBOSE(INTERPRETER) << "Constant: Push to operand stack " << ind.value() << std::endl;
+ VERBOSE(INTERPRETER) << "Constant: Push to operand stack " << ind << std::endl;
operand_stack.push(ind);
}
@@ -129,7 +129,7 @@ void Interpreter::run()
if (operator_ready)
{
- VERBOSE(INTERPRETER) << "Ready to execute operation " << use_operator.value() << std::endl;
+ VERBOSE(INTERPRETER) << "Ready to execute operation " << use_operator << std::endl;
operation_stack.push(use_operator);
}
}
@@ -138,7 +138,7 @@ void Interpreter::run()
{
const auto current_operation_index = operation_stack.top();
operation_stack.pop();
- VERBOSE(INTERPRETER) << "Poped operation: " << current_operation_index.value() << "("
+ VERBOSE(INTERPRETER) << "Poped operation: " << current_operation_index << "("
<< _env->graph().operations().at(current_operation_index).name() << ")"
<< std::endl;
diff --git a/runtime/onert/core/src/interp/Tensor.cc b/runtime/onert/core/src/interp/Tensor.cc
index 07f8b75dc..de095c9e4 100644
--- a/runtime/onert/core/src/interp/Tensor.cc
+++ b/runtime/onert/core/src/interp/Tensor.cc
@@ -49,5 +49,9 @@ ir::Layout Tensor::layout() const
return ir::Layout::NHWC;
}
+ir::Shape Tensor::getShape() const { return _info.shape(); }
+
+ir::Shape ROTensor::getShape() const { return _info.shape(); }
+
} // namespace interp
} // namespace onert
diff --git a/runtime/onert/core/src/interp/Tensor.h b/runtime/onert/core/src/interp/Tensor.h
index 8b72d537d..642fdc164 100644
--- a/runtime/onert/core/src/interp/Tensor.h
+++ b/runtime/onert/core/src/interp/Tensor.h
@@ -70,8 +70,6 @@ public:
virtual void releaseData() = 0;
virtual size_t total_size() const = 0;
- virtual size_t dimension(size_t index) const = 0;
- virtual size_t num_dimensions() const = 0;
virtual size_t calcOffset(const ir::Coordinates &coords) const = 0;
virtual bool has_padding() const = 0;
@@ -118,17 +116,21 @@ public:
void releaseData() override { _data = nullptr; }
size_t total_size() const override { return _info.total_size(); }
- size_t dimension(size_t index) const override { return _info.shape().dim(index); }
- size_t num_dimensions() const override { return _info.shape().rank(); }
size_t calcOffset(const ir::Coordinates &coords) const override;
ir::Layout layout() const override;
bool is_dynamic() const override { return false; }
bool has_padding() const override { return false; }
ir::DataType data_type() const override { return _info.typeInfo().type(); }
float data_scale() const override { return _info.typeInfo().scale(); }
- int32_t data_offset() const override { return _info.typeInfo().offset(); }
+ int32_t data_zero_point() const override { return _info.typeInfo().zero_point(); }
+ const std::vector<float> &data_scales() const override { return _info.typeInfo().scales(); }
+ const std::vector<int32_t> &data_zero_points() const override
+ {
+ return _info.typeInfo().zero_points();
+ }
const ir::OperandInfo &tensorInfo() const override { return _info; }
uint64_t num_elements() const override { return _info.shape().num_elements(); };
+ ir::Shape getShape() const override;
private:
const ir::OperandInfo _info;
@@ -160,17 +162,21 @@ public:
void releaseData() override { _buffer = nullptr; }
size_t total_size() const override { return _info.total_size(); }
- size_t dimension(size_t index) const override { return _info.shape().dim(index); }
- size_t num_dimensions() const override { return _info.shape().rank(); }
size_t calcOffset(const ir::Coordinates &coords) const override;
ir::Layout layout() const override;
bool is_dynamic() const override { return false; }
bool has_padding() const override { return false; }
ir::DataType data_type() const override { return _info.typeInfo().type(); }
float data_scale() const override { return _info.typeInfo().scale(); }
- int32_t data_offset() const override { return _info.typeInfo().offset(); }
+ int32_t data_zero_point() const override { return _info.typeInfo().zero_point(); }
+ const std::vector<float> &data_scales() const override { return _info.typeInfo().scales(); }
+ const std::vector<int32_t> &data_zero_points() const override
+ {
+ return _info.typeInfo().zero_points();
+ }
const ir::OperandInfo &tensorInfo() const override { return _info; }
uint64_t num_elements() const override { return _info.shape().num_elements(); };
+ ir::Shape getShape() const override;
private:
const ir::OperandInfo _info;
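
The interface change above retires data_offset() in favour of zero-point accessors, including per-channel vectors. For reference, these parameters feed the standard affine quantization rule real = scale * (q - zero_point); the following is a sketch of that rule, not onert code:

#include <cstdint>
#include <vector>

// Per-tensor affine dequantization: real = scale * (q - zero_point).
float dequantize(uint8_t q, float scale, int32_t zero_point)
{
  return scale * (static_cast<int32_t>(q) - zero_point);
}

// Per-channel variant: one (scale, zero_point) pair per channel, as the
// data_scales()/data_zero_points() accessors above expose.
float dequantizePerChannel(uint8_t q, int channel,
                           const std::vector<float> &scales,
                           const std::vector<int32_t> &zero_points)
{
  return scales[channel] * (static_cast<int32_t>(q) - zero_points[channel]);
}
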
diff --git a/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc b/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc
index 86e883524..804e9fb51 100644
--- a/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc
+++ b/runtime/onert/core/src/interp/operations/BinaryArithmeticOps.cc
@@ -40,7 +40,7 @@ enum class OpType
void prepare(ExecEnv *env, const ir::Operation &node)
{
const auto &arithmetic_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::BinaryArithmetic &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::BinaryArithmetic &>(node);
const auto lhs_index = node.getInputs().at(arithmetic_node.LHS);
const auto rhs_index = node.getInputs().at(arithmetic_node.RHS);
@@ -68,7 +68,7 @@ void prepare(ExecEnv *env, const ir::Operation &node)
}
auto output_info =
- ir::OperandInfo::createStaticInfo(out_shape, lhs_tensor->tensorInfo().typeInfo());
+ ir::OperandInfo::createStaticInfo(out_shape, lhs_tensor->tensorInfo().typeInfo());
// We can handle already allocated (ex. model output)
env->allocateIfNeeded(out_index, output_info);
}
@@ -119,14 +119,13 @@ void invoke(const ITensor *lhs_tensor, const ITensor *rhs_tensor, const ITensor
raw_type *out_ptr = reinterpret_cast<raw_type *>(out_buffer);
const auto cker_op_type =
- (op_type == OpType::ADD)
- ? nnfw::cker::BinaryArithmeticOpType::ADD
- : ((op_type == OpType::SUB) ? nnfw::cker::BinaryArithmeticOpType::SUB
- : nnfw::cker::BinaryArithmeticOpType::MUL);
+ (op_type == OpType::ADD) ? nnfw::cker::BinaryArithmeticOpType::ADD
+ : ((op_type == OpType::SUB) ? nnfw::cker::BinaryArithmeticOpType::SUB
+ : nnfw::cker::BinaryArithmeticOpType::MUL);
- const bool need_broadcast = nnfw::cker::ProcessBroadcastShapes(
- convertShape(lhs_tensor->tensorInfo().shape()),
- convertShape(rhs_tensor->tensorInfo().shape()), &cker_param);
+ const bool need_broadcast =
+ nnfw::cker::ProcessBroadcastShapes(convertShape(lhs_tensor->tensorInfo().shape()),
+ convertShape(rhs_tensor->tensorInfo().shape()), &cker_param);
if (need_broadcast)
{
@@ -173,7 +172,7 @@ void invokeBinaryArithmetic(const ExecEnv *env, const ir::operation::BinaryArith
void invokeBinaryArithmeticOps(const ExecEnv *env, const ir::Operation &node)
{
const auto &arithmetic_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::BinaryArithmetic &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::BinaryArithmetic &>(node);
switch (arithmetic_node.param().arithmetic_type)
{
diff --git a/runtime/onert/core/src/interp/operations/Concat.cc b/runtime/onert/core/src/interp/operations/Concat.cc
index efc46c66b..a063ab14a 100644
--- a/runtime/onert/core/src/interp/operations/Concat.cc
+++ b/runtime/onert/core/src/interp/operations/Concat.cc
@@ -39,44 +39,44 @@ void prepareConcat(ExecEnv *env, const ir::Operation &node)
const auto first_tensor = env->tensorAt(first_index);
uint32_t out_axis_dimension = 0;
const int32_t axis_raw = concat_node.param().axis;
- const uint32_t axis = (axis_raw < 0) ? (axis_raw + first_tensor->num_dimensions()) : axis_raw;
+ const int32_t axis = (axis_raw < 0) ? (axis_raw + first_tensor->getShape().rank()) : axis_raw;
// All inputs shape should be same except axis dimension
// All inputs type should be same
for (auto input : node.getInputs())
{
- assert(first_tensor->num_dimensions() == env->tensorAt(input)->num_dimensions());
+ assert(first_tensor->getShape().rank() == env->tensorAt(input)->getShape().rank());
assert(first_tensor->data_type() == env->tensorAt(input)->data_type());
- for (uint32_t i = 0; i < first_tensor->num_dimensions(); i++)
+ for (int i = 0; i < first_tensor->getShape().rank(); i++)
{
if (i == axis)
{
- out_axis_dimension += env->tensorAt(input)->dimension(i);
+ out_axis_dimension += env->tensorAt(input)->getShape().dim(i);
continue;
}
- assert(first_tensor->dimension(i) == env->tensorAt(input)->dimension(i));
+ assert(first_tensor->getShape().dim(i) == env->tensorAt(input)->getShape().dim(i));
}
}
// Make output tensor info using first input tensor info, and accumulated axis dimension value
auto out_shape = first_tensor->tensorInfo().shape();
out_shape.dim(axis) = out_axis_dimension;
- env->allocateIfNeeded(out_index, ir::OperandInfo::createStaticInfo(
- out_shape, first_tensor->tensorInfo().typeInfo()));
+ env->allocateIfNeeded(
+ out_index, ir::OperandInfo::createStaticInfo(out_shape, first_tensor->tensorInfo().typeInfo()));
auto out_tensor = env->tensorAt(out_index);
UNUSED_RELEASE(out_tensor);
- // Output shape should be same with input except axis dimension
+ // Output shape should be same with input except axis dimension
// Output type should be same with input
assert(first_tensor->data_type() == out_tensor->data_type());
- for (uint32_t i = 0; i < first_tensor->num_dimensions(); i++)
+ for (int i = 0; i < first_tensor->getShape().rank(); i++)
{
if (i == axis)
{
continue;
}
- assert(first_tensor->dimension(i) == out_tensor->dimension(i));
+ assert(first_tensor->getShape().dim(i) == out_tensor->getShape().dim(i));
}
}
@@ -123,7 +123,7 @@ void invokeConcat(const ExecEnv *env, const ir::Operation &node)
const auto out_index = node.getOutputs().at(0);
const auto out_tensor = env->tensorAt(out_index);
- const uint32_t axis = (axis_raw < 0) ? (axis_raw + out_tensor->num_dimensions()) : axis_raw;
+ const uint32_t axis = (axis_raw < 0) ? (axis_raw + out_tensor->getShape().rank()) : axis_raw;
const auto data_type = in_tensors[0]->data_type();
if (data_type == ir::DataType::FLOAT32)
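
Both Concat hunks normalize a possibly negative axis against the tensor rank before using it. The convention, as a tiny sketch:

#include <cassert>

// Map an axis in [-rank, rank) onto [0, rank), as the concat code above does.
// Example: normalizeAxis(-1, 4) == 3 (the innermost dimension).
int normalizeAxis(int axis_raw, int rank)
{
  const int axis = axis_raw < 0 ? axis_raw + rank : axis_raw;
  assert(axis >= 0 && axis < rank);
  return axis;
}
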
diff --git a/runtime/onert/core/src/interp/operations/Conv2D.cc b/runtime/onert/core/src/interp/operations/Conv2D.cc
index bb00b828c..0b43a4799 100644
--- a/runtime/onert/core/src/interp/operations/Conv2D.cc
+++ b/runtime/onert/core/src/interp/operations/Conv2D.cc
@@ -42,9 +42,9 @@ void prepareConv2D(ExecEnv *env, const ir::Operation &node)
const auto kernel_tensor = env->tensorAt(kernel_index);
const auto bias_tensor = env->tensorAt(bias_index);
- assert(in_tensor->num_dimensions() == 4);
- assert(kernel_tensor->num_dimensions() == 4);
- assert(bias_tensor->num_dimensions() == 1);
+ assert(in_tensor->getShape().rank() == 4);
+ assert(kernel_tensor->getShape().rank() == 4);
+ assert(bias_tensor->getShape().rank() == 1);
UNUSED_RELEASE(in_tensor);
UNUSED_RELEASE(kernel_tensor);
@@ -56,9 +56,9 @@ void prepareConv2D(ExecEnv *env, const ir::Operation &node)
// Handle unspecified output shape
const auto &conv_node = nnfw::misc::polymorphic_downcast<const ir::operation::Conv2D &>(node);
const auto infered_output_shape = shape_inference::inferConv2DShape(
- in_tensor->tensorInfo().shape(), kernel_tensor->tensorInfo().shape(), conv_node.param());
+ in_tensor->tensorInfo().shape(), kernel_tensor->tensorInfo().shape(), conv_node.param());
env->allocateIfNeeded(
- out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo()));
+ out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo()));
}
else
{
@@ -70,7 +70,7 @@ void prepareConv2D(ExecEnv *env, const ir::Operation &node)
// Handle same ifm & ofm data type only
assert(in_tensor->data_type() == out_tensor->data_type());
- assert(out_tensor->num_dimensions() == 4);
+ assert(out_tensor->getShape().rank() == 4);
}
void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor,
@@ -83,8 +83,8 @@ void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor
const auto &ker_shape = ker_tensor->tensorInfo().shape();
const auto ker_height = ker_shape.dim(1);
const auto ker_width = ker_shape.dim(2);
- const auto padding = ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride,
- ker_width, ker_height);
+ const auto padding =
+ ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, ker_width, ker_height);
// Calculate
float activation_min, activation_max;
diff --git a/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc b/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc
index e1fb767fe..d1c62d73f 100644
--- a/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc
+++ b/runtime/onert/core/src/interp/operations/DepthwiseConv2D.cc
@@ -43,9 +43,9 @@ void prepareDepthwiseConv(ExecEnv *env, const ir::Operation &node)
const auto kernel_tensor = env->tensorAt(kernel_index);
const auto bias_tensor = env->tensorAt(bias_index);
- assert(in_tensor->num_dimensions() == 4);
- assert(kernel_tensor->num_dimensions() == 4);
- assert(bias_tensor->num_dimensions() == 1);
+ assert(in_tensor->getShape().rank() == 4);
+ assert(kernel_tensor->getShape().rank() == 4);
+ assert(bias_tensor->getShape().rank() == 1);
UNUSED_RELEASE(in_tensor);
UNUSED_RELEASE(kernel_tensor);
@@ -58,12 +58,12 @@ void prepareDepthwiseConv(ExecEnv *env, const ir::Operation &node)
{
// Handle unspecified output shape
const auto &depth_conv_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::DepthwiseConv2D &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::DepthwiseConv2D &>(node);
const auto infered_output_shape = shape_inference::inferDepthwiseConv2DShape(
- in_tensor->tensorInfo().shape(), kernel_tensor->tensorInfo().shape(),
- depth_conv_node.param());
+ in_tensor->tensorInfo().shape(), kernel_tensor->tensorInfo().shape(),
+ depth_conv_node.param());
env->allocateIfNeeded(
- out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo()));
+ out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo()));
}
else
{
@@ -75,7 +75,7 @@ void prepareDepthwiseConv(ExecEnv *env, const ir::Operation &node)
// Handle same ifm & ofm data type only
assert(in_tensor->data_type() == out_tensor->data_type());
- assert(out_tensor->num_dimensions() == 4);
+ assert(out_tensor->getShape().rank() == 4);
}
void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor,
@@ -88,8 +88,8 @@ void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor
const auto &ker_shape = ker_tensor->tensorInfo().shape();
const auto ker_height = ker_shape.dim(1);
const auto ker_width = ker_shape.dim(2);
- const auto padding = ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride,
- ker_width, ker_height);
+ const auto padding =
+ ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, ker_width, ker_height);
// Calculate
float activation_min, activation_max;
diff --git a/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc b/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc
index c8773bef4..197855ff4 100644
--- a/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc
+++ b/runtime/onert/core/src/interp/operations/ElementwiseActivations.cc
@@ -118,7 +118,7 @@ template <ActivationType act_type> void invoke(const ExecEnv *env, const ir::Ope
else
{
const auto &act_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::ElementwiseActivation &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::ElementwiseActivation &>(node);
evalFloat<act_type>(input_start, out, elements, act_node.param().alpha,
act_node.param().beta);
}
@@ -132,7 +132,7 @@ template <ActivationType act_type> void invoke(const ExecEnv *env, const ir::Ope
void invokeElementwiseActivation(const ExecEnv *env, const ir::Operation &node)
{
const auto &act_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::ElementwiseActivation &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::ElementwiseActivation &>(node);
switch (act_node.param().op_type)
{
case ir::operation::ElementwiseActivation::Type::LOGISTIC:
diff --git a/runtime/onert/core/src/interp/operations/FullyConnected.cc b/runtime/onert/core/src/interp/operations/FullyConnected.cc
index 4f97632b2..ef827605b 100644
--- a/runtime/onert/core/src/interp/operations/FullyConnected.cc
+++ b/runtime/onert/core/src/interp/operations/FullyConnected.cc
@@ -44,23 +44,23 @@ void prepareFC(ExecEnv *env, const ir::Operation &node)
UNUSED_RELEASE(kernel_tensor);
UNUSED_RELEASE(bias_tensor);
- assert(in_tensor->num_dimensions() >= 2);
- assert(kernel_tensor->num_dimensions() == 2);
- assert(bias_tensor->num_dimensions() == 1);
+ assert(in_tensor->getShape().rank() >= 2);
+ assert(kernel_tensor->getShape().rank() == 2);
+ assert(bias_tensor->getShape().rank() == 1);
const auto input_size_with_batch = in_tensor->num_elements();
- const auto num_units = kernel_tensor->dimension(0);
- const auto input_size = kernel_tensor->dimension(1);
- const auto batch_size = input_size_with_batch / input_size;
+ const auto num_units = kernel_tensor->getShape().dim(0);
+ const auto input_size = kernel_tensor->getShape().dim(1);
+ const int32_t batch_size = input_size_with_batch / input_size;
assert(input_size_with_batch % input_size == 0);
- assert(num_units == bias_tensor->dimension(0));
+ assert(num_units == bias_tensor->getShape().dim(0));
// Make output tensor info
ir::Shape output_shape(2);
output_shape.dim(0) = batch_size;
output_shape.dim(1) = num_units;
const auto out_info =
- ir::OperandInfo::createStaticInfo(output_shape, in_tensor->tensorInfo().typeInfo());
+ ir::OperandInfo::createStaticInfo(output_shape, in_tensor->tensorInfo().typeInfo());
env->allocateIfNeeded(out_index, out_info);
auto out_tensor = env->tensorAt(out_index);
@@ -68,9 +68,9 @@ void prepareFC(ExecEnv *env, const ir::Operation &node)
// Handle same ifm & ofm data type only
assert(in_tensor->data_type() == out_tensor->data_type());
- assert(out_tensor->num_dimensions() == 2);
- assert(out_tensor->dimension(0) == batch_size);
- assert(out_tensor->dimension(1) == num_units);
+ assert(out_tensor->getShape().rank() == 2);
+ assert(out_tensor->getShape().dim(0) == batch_size);
+ assert(out_tensor->getShape().dim(1) == num_units);
}
void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor *bias_tensor,
@@ -100,7 +100,7 @@ void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor
void invokeFC(const ExecEnv *env, const ir::Operation &node)
{
const auto &conv_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::FullyConnected &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::FullyConnected &>(node);
const auto ifm_index = node.getInputs().at(ir::operation::FullyConnected::INPUT);
const auto ker_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT);
diff --git a/runtime/onert/core/src/interp/operations/Gather.cc b/runtime/onert/core/src/interp/operations/Gather.cc
index 9e82def5f..0ea60875c 100644
--- a/runtime/onert/core/src/interp/operations/Gather.cc
+++ b/runtime/onert/core/src/interp/operations/Gather.cc
@@ -56,9 +56,9 @@ void prepareGather(ExecEnv *env, const ir::Operation &node)
}
auto output_tensor = env->tensorAt(output_index);
- auto output_rank = input_tensor->num_dimensions() + indices_tensor->num_dimensions() - 1;
+ auto output_rank = input_tensor->getShape().rank() + indices_tensor->getShape().rank() - 1;
- if (output_rank != output_tensor->num_dimensions())
+ if (output_rank != output_tensor->getShape().rank())
{
throw std::runtime_error{"Interp(Gather): Invalid output rank"};
}
@@ -71,7 +71,7 @@ void prepareGather(ExecEnv *env, const ir::Operation &node)
input_tensor->tensorInfo().typeInfo() != output_tensor->tensorInfo().typeInfo())
{
throw std::runtime_error{
- "Interp(Gather): Cannot handle different I/O QUANT_UINT8_ASYMM scale/offset"};
+ "Interp(Gather): Cannot handle different I/O QUANT_UINT8_ASYMM scale/offset"};
}
}
@@ -106,7 +106,7 @@ void invokeGather(const ExecEnv *env, const ir::Operation &node)
const auto input_tensor = env->tensorAt(input_index);
const auto indices_tensor = env->tensorAt(indices_index);
const auto output_tensor = env->tensorAt(output_index);
- const uint32_t axis = (axis_raw < 0) ? (axis_raw + input_tensor->num_dimensions()) : axis_raw;
+ const uint32_t axis = (axis_raw < 0) ? (axis_raw + input_tensor->getShape().rank()) : axis_raw;
const auto data_type = input_tensor->data_type();
diff --git a/runtime/onert/core/src/interp/operations/InstanceNorm.cc b/runtime/onert/core/src/interp/operations/InstanceNorm.cc
index 2538bcc39..b5c38819d 100644
--- a/runtime/onert/core/src/interp/operations/InstanceNorm.cc
+++ b/runtime/onert/core/src/interp/operations/InstanceNorm.cc
@@ -32,13 +32,13 @@ namespace instancenorm
void prepareInstanceNorm(ExecEnv *env, const ir::Operation &node)
{
const auto &instancenorm_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::InstanceNorm &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::InstanceNorm &>(node);
const auto input_index = node.getInputs().at(instancenorm_node.INPUT);
const auto output_index = node.getOutputs().at(0);
const auto input_tensor = env->tensorAt(input_index);
- if (input_tensor->num_dimensions() != 4)
+ if (input_tensor->getShape().rank() != 4)
{
throw std::runtime_error{"Interp(InstanceNorm): Input should be 4D-tensor"};
}
@@ -88,7 +88,7 @@ void invoke(const ITensor *input_tensor, const ITensor *gamma_tensor, const ITen
void invokeInstanceNorm(const ExecEnv *env, const ir::Operation &node)
{
const auto &instancenorm_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::InstanceNorm &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::InstanceNorm &>(node);
const auto input_index = node.getInputs().at(instancenorm_node.INPUT);
const auto gamma_index = node.getInputs().at(instancenorm_node.GAMMA);
diff --git a/runtime/onert/core/src/interp/operations/Pad.cc b/runtime/onert/core/src/interp/operations/Pad.cc
index c8dce698d..0eec7fe9a 100644
--- a/runtime/onert/core/src/interp/operations/Pad.cc
+++ b/runtime/onert/core/src/interp/operations/Pad.cc
@@ -61,7 +61,7 @@ void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITenso
const auto pad_buffer = pad_tensor->bufferRO();
auto output_buffer = output_tensor->buffer();
- int32_t pad_rank = pad_tensor->dimension(0);
+ int32_t pad_rank = pad_tensor->getShape().dim(0);
const auto cker_input_shape = convertShape(input_tensor->tensorInfo().shape());
const auto cker_output_shape = convertShape(output_tensor->tensorInfo().shape());
diff --git a/runtime/onert/core/src/interp/operations/Pool2D.cc b/runtime/onert/core/src/interp/operations/Pool2D.cc
index 92f9d70b2..2f3b71655 100644
--- a/runtime/onert/core/src/interp/operations/Pool2D.cc
+++ b/runtime/onert/core/src/interp/operations/Pool2D.cc
@@ -41,16 +41,16 @@ void preparePool2D(ExecEnv *env, const ir::Operation &node)
const auto in_tensor = env->tensorAt(in_index);
UNUSED_RELEASE(in_tensor);
- assert(in_tensor->num_dimensions() == 4);
+ assert(in_tensor->getShape().rank() == 4);
const auto output_info = env->graph().operands().at(out_index).info();
if (output_info.total_size() == 0)
{
// Handle unspecified output shape
const auto infered_output_shape =
- shape_inference::inferPoolShape(in_tensor->tensorInfo().shape(), pool_node.param());
+ shape_inference::inferPoolShape(in_tensor->tensorInfo().shape(), pool_node.param());
env->allocateIfNeeded(
- out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo()));
+ out_index, ir::OperandInfo::createStaticInfo(infered_output_shape, output_info.typeInfo()));
}
else
{
@@ -62,7 +62,7 @@ void preparePool2D(ExecEnv *env, const ir::Operation &node)
// Handle same ifm & ofm data type only
assert(in_tensor->data_type() == out_tensor->data_type());
- assert(out_tensor->num_dimensions() == 4);
+ assert(out_tensor->getShape().rank() == 4);
}
template <typename T>
@@ -100,7 +100,7 @@ void invokePool2DOps(const ExecEnv *env, const ir::Operation &node)
const auto ofm_shape = out_tensor->tensorInfo().shape().asFeature(ir::Layout::NHWC);
const auto param = pool_node.param();
const auto padding =
- ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, param.kw, param.kh);
+ ir::calculatePadding(param.padding, ifm_shape, ofm_shape, param.stride, param.kw, param.kh);
// Calculate
nnfw::cker::PoolParams cker_param;
cker_param.filter_width = param.kw;
diff --git a/runtime/onert/core/src/interp/operations/Softmax.cc b/runtime/onert/core/src/interp/operations/Softmax.cc
index d30f78deb..1fc303117 100644
--- a/runtime/onert/core/src/interp/operations/Softmax.cc
+++ b/runtime/onert/core/src/interp/operations/Softmax.cc
@@ -37,7 +37,7 @@ void prepareSoftMax(ExecEnv *env, const ir::Operation &node)
const auto in_tensor = env->tensorAt(in_index);
UNUSED_RELEASE(in_tensor);
- assert((in_tensor->num_dimensions() == 4) || (in_tensor->num_dimensions() == 2));
+ assert((in_tensor->getShape().rank() == 4) || (in_tensor->getShape().rank() == 2));
// Output shape should be same with input
// Output type is pre-defined in model
@@ -51,10 +51,10 @@ void prepareSoftMax(ExecEnv *env, const ir::Operation &node)
UNUSED_RELEASE(out_tensor);
// Check output shape is same with input
- assert(out_tensor->num_dimensions() == out_tensor->num_dimensions());
- for (uint32_t i = 0; i < in_tensor->num_dimensions(); i++)
+ assert(in_tensor->getShape().rank() == out_tensor->getShape().rank());
+ for (int32_t i = 0; i < in_tensor->getShape().rank(); i++)
{
- assert(in_tensor->dimension(i) == out_tensor->dimension(i));
+ assert(in_tensor->getShape().dim(i) == out_tensor->getShape().dim(i));
}
}
@@ -66,14 +66,14 @@ void invoke(const ITensor *in_tensor, const ITensor *out_tensor,
float beta = param.beta;
- if (in_tensor->num_dimensions() == 2)
+ if (in_tensor->getShape().rank() == 2)
{
- uint32_t batch_size = in_tensor->dimension(0);
- uint32_t input_size = in_tensor->dimension(1);
+ uint32_t batch_size = in_tensor->getShape().dim(0);
+ uint32_t input_size = in_tensor->getShape().dim(1);
nnfw::cker::Softmax(in_ptr, input_size, batch_size, beta, out_ptr);
}
- else if (in_tensor->num_dimensions() == 4)
+ else if (in_tensor->getShape().rank() == 4)
{
const auto in_shape = convertShape(in_tensor->tensorInfo().shape());
const auto out_shape = convertShape(out_tensor->tensorInfo().shape());
diff --git a/runtime/onert/core/src/interp/operations/TransposeConv.cc b/runtime/onert/core/src/interp/operations/TransposeConv.cc
index cc2ced26b..755103dc2 100644
--- a/runtime/onert/core/src/interp/operations/TransposeConv.cc
+++ b/runtime/onert/core/src/interp/operations/TransposeConv.cc
@@ -40,9 +40,9 @@ void prepareTransposeConv(ExecEnv *env, const ir::Operation &node)
const auto ker_tensor = env->tensorAt(ker_index);
const auto ofm_shape_tensor = env->tensorAt(ofm_shape_index);
- assert(ifm_tensor->num_dimensions() == 4);
- assert(ker_tensor->num_dimensions() == 4);
- assert(ofm_shape_tensor->num_dimensions() == 1);
+ assert(ifm_tensor->getShape().rank() == 4);
+ assert(ker_tensor->getShape().rank() == 4);
+ assert(ofm_shape_tensor->getShape().rank() == 1);
UNUSED_RELEASE(ifm_tensor);
UNUSED_RELEASE(ker_tensor);
@@ -68,7 +68,7 @@ void prepareTransposeConv(ExecEnv *env, const ir::Operation &node)
throw std::runtime_error{"Interp(TConv): Different I/O data dype"};
}
- if (ofm_tensor->num_dimensions() != 4)
+ if (ofm_tensor->getShape().rank() != 4)
{
throw std::runtime_error{"Interp(TConv): Invalid output rank"};
}
@@ -83,8 +83,8 @@ void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor
const auto ker_shape = ker_tensor->tensorInfo().shape();
const auto ker_height = ker_shape.dim(1);
const auto ker_width = ker_shape.dim(2);
- const auto padding = ir::calculatePadding(param.padding, ofm_shape, ifm_shape, param.stride,
- ker_width, ker_height);
+ const auto padding =
+ ir::calculatePadding(param.padding, ofm_shape, ifm_shape, param.stride, ker_width, ker_height);
nnfw::cker::TransposeConvParams cker_param;
cker_param.padding_values.width = padding.left;
@@ -108,7 +108,7 @@ void invoke(const ITensor *ifm_tensor, const ITensor *ker_tensor, const ITensor
void invokeTransposeConv(const ExecEnv *env, const ir::Operation &node)
{
const auto &tconv_node =
- nnfw::misc::polymorphic_downcast<const ir::operation::TransposeConv &>(node);
+ nnfw::misc::polymorphic_downcast<const ir::operation::TransposeConv &>(node);
const auto ifm_index = node.getInputs().at(ir::operation::TransposeConv::INPUT);
const auto ker_index = node.getInputs().at(ir::operation::TransposeConv::KERNEL);
diff --git a/runtime/onert/core/src/ir/Graph.cc b/runtime/onert/core/src/ir/Graph.cc
index 1b8300f40..df30bbdbe 100644
--- a/runtime/onert/core/src/ir/Graph.cc
+++ b/runtime/onert/core/src/ir/Graph.cc
@@ -19,16 +19,16 @@
#include "OperationValidator.h"
#include <algorithm>
+
#include <bitset>
#include <sstream>
#include "util/logging.h"
+#include "util/Set.h"
#include "verifier/Verifier.h"
-#include "ir/operation/LowerInfo.h"
-#include "ir/operand/LowerInfo.h"
-#include "ir/operand/PermuteFactor.h"
#include "ir/OperandIndexMap.h"
-#include "ir/GraphIterator.h"
+#include "ir/OperationIndexMap.h"
+#include "dumper/text/GraphDumper.h"
#include "backend/IConfig.h"
namespace onert
@@ -45,22 +45,68 @@ OperandIndex Graph::addOperand(const Shape &shape, const TypeInfo &type)
return _operands.emplace(shape, type);
}
-OperationIndex Graph::addOperation(std::unique_ptr<Operation> &&node)
+OperandIndex Graph::addOperand(OperandIndex index, std::unique_ptr<Operand> &&operand)
+{
+ return _operands.push(std::move(operand), index);
+}
+
+bool Graph::checkOperandsForOperation(const Operation &operation)
+{
+ auto inputs = operation.getInputs() | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+ auto outputs = operation.getOutputs() | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+ for (auto input : inputs)
+ if (!operands().exist(input))
+ return false;
+ for (auto input : outputs)
+ if (!operands().exist(input))
+ return false;
+ return true;
+}
+
+void Graph::linkOperandToOperation(OperationIndex index, const Operation &operation)
{
- assert(isBuildingPhase());
- return _operations.push(std::move(node));
+ auto inputs = operation.getInputs() | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+ auto outputs = operation.getOutputs() | ir::Remove::UNDEFINED | ir::Remove::DUPLICATED;
+
+ for (auto input : inputs)
+ operands().at(input).insertUse(index);
+ for (auto output : outputs)
+ operands().at(output).setDef(index);
+}
+
+OperationIndex Graph::addOperation(std::unique_ptr<Operation> &&operation)
+{
+ const Operation &op_ref = *operation;
+ if (!checkOperandsForOperation(op_ref))
+ return OperationIndex{};
+ auto ind = _operations.push(std::move(operation));
+ if (ind.valid())
+ linkOperandToOperation(ind, op_ref);
+ return ind;
+}
+
+OperationIndex Graph::addOperation(OperationIndex index, std::unique_ptr<Operation> &&operation)
+{
+ const Operation &op_ref = *operation;
+ if (!checkOperandsForOperation(op_ref))
+ return OperationIndex{};
+ auto ind_gen = _operations.push(std::move(operation), index);
+ if (ind_gen.valid())
+ {
+ assert(ind_gen == index);
+ linkOperandToOperation(index, op_ref);
+ }
+ return index;
}
void Graph::setOperandValue(const OperandIndex &ind, std::shared_ptr<Data> data)
{
- assert(isBuildingPhase());
assert(_operands.exist(ind));
_operands.at(ind).data(std::move(data));
}
void Graph::addInput(const OperandIndex &ind, const std::string &name)
{
- assert(isBuildingPhase());
if (!name.empty())
_name_to_input.emplace(name, IOIndex{_inputs.size()});
_inputs.append(ind);
@@ -68,7 +114,6 @@ void Graph::addInput(const OperandIndex &ind, const std::string &name)
void Graph::addOutput(const OperandIndex &ind, const std::string &name)
{
- assert(isBuildingPhase());
if (!name.empty())
_name_to_output.emplace(name, IOIndex{_outputs.size()});
_outputs.append(ind);
@@ -86,14 +131,8 @@ IOIndex Graph::getOutputIndex(const std::string &name) const
return (itr == _name_to_output.end()) ? IOIndex{} : itr->second;
}
-void Graph::finishBuilding(void)
+void Graph::verify(void)
{
- assert(isBuildingPhase());
- _phase = Phase::MODEL;
-
- initializeUseDef();
- sweepGarbageOperands();
-
// Call graph verifications for the MODEL phase
{
// Except for edge consistency, the user might have provided a bad model
@@ -102,7 +141,7 @@ void Graph::finishBuilding(void)
throw std::runtime_error{"One of model input and output operands does not exist."};
if (!verifier::DAGChecker().verify(*this))
throw std::runtime_error{"The graph is cyclic."};
- assert(verifier::EdgeConsistencyChecker().verify(*this));
+ assert(verifier::EdgeChecker().verify(*this));
}
// Check shape independent operation feature
@@ -127,31 +166,35 @@ void Graph::initializeUseDef()
});
}
-void Graph::sweepGarbageOperands()
+std::vector<ir::OperationIndex> Graph::topolSortOperations() const
{
- // Remove operands that are not used by any operations, except Graph inputs/outputs
- ir::OperandIndexMap<bool> visited;
-
- operations().iterate([&](const OperationIndex &, const Operation &node) {
- for (auto ind : node.getInputs() + node.getOutputs())
+ std::vector<ir::OperationIndex> ret;
+ util::Set<ir::OperationIndex> unvisited;
+ operations().iterate(
+ [&](const ir::OperationIndex &index, const ir::Operation &) { unvisited.add(index); });
+
+ std::function<void(const ir::OperationIndex &, const ir::Operation &)> dfs =
+ [&](const ir::OperationIndex &index, const ir::Operation &op) -> void {
+ if (!unvisited.contains(index))
+ return;
+ unvisited.remove(index);
+
+ for (const auto output : op.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
{
- visited[ind] = true;
+ const auto &operand = operands().at(output);
+ for (const auto &use : operand.getUses())
+ {
+ dfs(use, operations().at(use));
+ }
}
- });
-
- // Graph's inputs/outputs are always reachable
- for (auto ind : getInputs() + getOutputs())
- {
- visited[ind] = true;
- }
-
- operands().iterate([&](const OperandIndex &ind, const Operand &) {
- if (!visited[ind])
- {
- VERBOSE(Graph::sweepGarbageOperands) << "Sweep garbage operand " << ind.value() << std::endl;
- operands().remove(ind);
- }
- });
+ ret.push_back(index);
+ };
+ operations().iterate(dfs);
+
+ assert(unvisited.empty()); // All of the nodes must have been visited
+ // Reversing the post-order DFS result to make it sorted in topological order
+ std::reverse(ret.begin(), ret.end());
+ return ret;
}
} // namespace ir
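The Graph changes above drop the explicit building phase: addOperation() now validates the referenced operands itself and returns an invalid index on failure, verify() replaces finishBuilding(), and topolSortOperations() takes over from the deleted PostDfsIterator. A minimal sketch of the resulting flow, assuming only the onert::ir API shown in this diff (the concrete operation passed in is left to the caller):

    #include "ir/Graph.h"

    void buildAndSort(onert::ir::Graph &g, std::unique_ptr<onert::ir::Operation> op)
    {
      // addOperation checks that every input/output operand exists and
      // returns an invalid OperationIndex instead of asserting a phase
      const auto ind = g.addOperation(std::move(op));
      if (!ind.valid())
        return; // a referenced operand was missing

      g.verify(); // throws std::runtime_error for missing I/O or a cyclic graph

      // reversed post-order DFS over def-use edges, i.e. topological order
      for (const auto &op_ind : g.topolSortOperations())
      {
        (void)op_ind; // visit operations in dependency order
      }
    }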
diff --git a/runtime/onert/core/src/ir/GraphIterator.cc b/runtime/onert/core/src/ir/GraphIterator.cc
deleted file mode 100644
index ac67771c4..000000000
--- a/runtime/onert/core/src/ir/GraphIterator.cc
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "GraphIterator.h"
-
-#include "ir/OperationIndexMap.h"
-#include "compiler/LoweredGraph.h"
-
-namespace onert
-{
-namespace ir
-{
-
-//
-// Graph::DefaultIterator
-//
-
-template <bool is_const>
-void DefaultIterator<is_const>::iterate(GraphRef graph, const IterFn &fn) const
-{
- graph.operations().iterate(
- [&](const OperationIndex &index, NodeRef node) -> void { fn(index, node); });
-}
-
-//
-// Graph::PostDfsIterator
-//
-
-template <bool is_const>
-void PostDfsIterator<is_const>::iterate(GraphRef graph, const IterFn &fn) const
-{
- assert(!graph.isBuildingPhase()); // Restrict iteration condition
-
- OperationIndexMap<bool> visited;
- graph.operations().iterate([&](const OperationIndex &index, NodeRef) { visited[index] = false; });
-
- std::function<void(const OperationIndex &, NodeRef)> dfs_recursive =
- [&](const OperationIndex &index, NodeRef node) -> void {
- if (visited[index])
- return;
- visited[index] = true;
-
- for (const auto output : node.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED)
- {
- const auto &operand = graph.operands().at(output);
- for (const auto &use : operand.getUses())
- {
- dfs_recursive(use, graph.operations().at(use));
- }
- }
-
- fn(index, node);
- };
-
- graph.operations().iterate(dfs_recursive);
-
- // All of the operations(nodes) must have been visited.
- assert(std::all_of(visited.begin(), visited.end(),
- [](const std::pair<const OperationIndex, bool> &v) { return v.second; }));
-}
-
-template <bool is_const>
-void PostDfsIterator<is_const>::iterateOpSeqs(LoweredGraphRef lowered_graph,
- const OpSeqIterFn &fn) const
-{
- std::unordered_map<OpSequenceIndex, bool> visited;
- lowered_graph.op_seqs().iterate(
- [&](const OpSequenceIndex &index, OpSequenceRef) { visited[index] = false; });
-
- std::function<void(const OpSequenceIndex &, OpSequenceRef)> dfs_recursive =
- [&](const OpSequenceIndex &index, OpSequenceRef op_seq) -> void {
- if (visited[index])
- return;
- visited[index] = true;
-
- for (const auto output : op_seq.getOutputs() | Remove::DUPLICATED | Remove::UNDEFINED)
- {
- const auto &operand = lowered_graph.graph().operands().at(output);
- for (const auto &use : operand.getUses())
- {
- const auto use_op_seq_index = lowered_graph.op_seqs().getOperation(use);
- dfs_recursive(use_op_seq_index, lowered_graph.op_seqs().at(use_op_seq_index));
- }
- }
-
- fn(index, op_seq);
- };
-
- lowered_graph.op_seqs().iterate(dfs_recursive);
-
- // All of the operations(nodes) must have been visited.
- assert(std::all_of(visited.begin(), visited.end(),
- [](const std::pair<const OpSequenceIndex, bool> &v) { return v.second; }));
-}
-
-// Explicit instantiations to have implementation in the source file.
-// NOTE If these instatiations were in the top of this file, `iterate` is compiled and saved in
-// `GraphIterator.cc.o` but `iterateOpSeqs`. This happens only when cross-building for Android.
-// (Maybe a bug of NDK toolchain(clang)?)
-
-template class DefaultIterator<true>;
-template class DefaultIterator<false>;
-
-template class PostDfsIterator<true>;
-template class PostDfsIterator<false>;
-
-} // namespace ir
-} // namespace onert
diff --git a/runtime/onert/core/src/ir/GraphIterator.h b/runtime/onert/core/src/ir/GraphIterator.h
deleted file mode 100644
index b54314e0e..000000000
--- a/runtime/onert/core/src/ir/GraphIterator.h
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_IR_GRAPH_ITERATOR_H__
-#define __ONERT_IR_GRAPH_ITERATOR_H__
-
-#include <type_traits>
-
-#include "ir/Index.h"
-
-namespace onert
-{
-namespace compiler
-{
-class LoweredGraph;
-} // namespace compiler
-} // namespace onert
-
-namespace onert
-{
-namespace ir
-{
-
-class Graph;
-class Operation;
-class OpSequence;
-
-template <bool is_const> class Iterator
-{
-public:
- using GraphRef = typename std::conditional<is_const, const Graph &, Graph &>::type;
- using IndexRef = const OperationIndex &;
- using NodeRef = typename std::conditional<is_const, const Operation &, Operation &>::type;
- using IterFn = std::function<void(IndexRef, NodeRef)>;
-
-public:
- virtual ~Iterator() = default;
- virtual void iterate(GraphRef graph, const IterFn &fn) const = 0;
-};
-
-template <bool is_const = false> class DefaultIterator final : public Iterator<is_const>
-{
-public:
- using GraphRef = typename Iterator<is_const>::GraphRef;
- using IndexRef = typename Iterator<is_const>::IndexRef;
- using NodeRef = typename Iterator<is_const>::NodeRef;
- using IterFn = typename Iterator<is_const>::IterFn;
-
-public:
- void iterate(GraphRef graph, const IterFn &fn) const;
-};
-using DefaultConstIterator = DefaultIterator<true>;
-
-template <bool is_const = false> class PostDfsIterator final : public Iterator<is_const>
-{
-public:
- using GraphRef = typename Iterator<is_const>::GraphRef;
- using IndexRef = typename Iterator<is_const>::IndexRef;
- using NodeRef = typename Iterator<is_const>::NodeRef;
- using IterFn = typename Iterator<is_const>::IterFn;
- using LoweredGraphRef =
- typename std::conditional<is_const, const typename compiler::LoweredGraph &,
- typename compiler::LoweredGraph &>::type;
- using OpSequenceRef = typename std::conditional<is_const, const OpSequence &, OpSequence &>::type;
- using OpSeqIndexRef = const OpSequenceIndex &;
- using OpSeqIterFn = std::function<void(OpSeqIndexRef, OpSequenceRef)>;
-
-public:
- void iterate(GraphRef graph, const IterFn &fn) const;
- void iterateOpSeqs(LoweredGraphRef lowered_graph, const OpSeqIterFn &f) const;
-};
-using PostDfsConstIterator = PostDfsIterator<true>;
-
-} // namespace ir
-} // namespace onert
-
-#endif // __ONERT_IR_GRAPH_ITERATOR_H__
diff --git a/runtime/onert/core/src/ir/OpSequence.cc b/runtime/onert/core/src/ir/OpSequence.cc
deleted file mode 100644
index e2b989d8c..000000000
--- a/runtime/onert/core/src/ir/OpSequence.cc
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ir/OpSequence.h"
-
-#include "ir/Operations.h"
-#include "ir/OperationVisitor.h"
-#include <sstream>
-
-namespace
-{
-
-std::string getStrFromIndice(const onert::ir::OperandIndexSequence &indice)
-{
- std::string str;
- for (const auto &ind : indice)
- {
- str += std::to_string(ind.value());
- str.push_back(',');
- }
- if (str.back() == ',')
- str.pop_back();
-
- return str;
-}
-}
-
-namespace onert
-{
-namespace ir
-{
-
-OpSequence::OpSequence(Layout layout) : _layout{layout}, _has_dynamic_tensor{false}
-{
- // DO NOTHING
-}
-
-void OpSequence::accept(OperationVisitor &v) const { v.visit(*this); }
-
-// TODO: Impl Dumper instead of this method
-std::string getStrFromOpSeq(const OpSequence &op_seq, const Operations &operations)
-{
- // " OpSequence IN(0,1,2) -> { op0(0,1,2:3), op1(3:4), op2(4:5) } -> OUT(5)"
- std::stringstream ss;
- ss << " OpSequence IN(" << getStrFromIndice(op_seq.getInputs()) << ") -> {";
- for (const auto &op_idx : op_seq)
- {
- ss << " " << op_idx.value() << "(" << operations.at(op_idx).name() << ":"
- << getStrFromIndice(operations.at(op_idx).getInputs()) << ":"
- << getStrFromIndice(operations.at(op_idx).getOutputs()) << ")";
- }
- ss << " } -> OUT(" << getStrFromIndice(op_seq.getOutputs()) << ")";
- return ss.str();
-}
-
-void OpSequence::remove(const OperationIndex &index)
-{
- assert(exist(index));
- for (auto it = _operations.cbegin(); it != _operations.cend(); ++it)
- {
- if (*it == index)
- {
- _operations.erase(it);
- break;
- }
- }
-}
-
-bool OpSequence::exist(const OperationIndex &index) const
-{
- for (const auto &inner_op_idx : _operations)
- {
- if (inner_op_idx == index)
- {
- return true;
- }
- }
- return false;
-}
-
-} // namespace ir
-} // namespace onert
diff --git a/runtime/onert/core/src/ir/OpSequences.cc b/runtime/onert/core/src/ir/OpSequences.cc
deleted file mode 100644
index 68884783e..000000000
--- a/runtime/onert/core/src/ir/OpSequences.cc
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "ir/OpSequences.h"
-#include "util/logging.h"
-#include <memory>
-
-#include <cassert>
-#include <string>
-
-namespace onert
-{
-namespace ir
-{
-
-OpSequenceIndex OpSequences::emplace(const OperationIndex &index, Layout layout)
-{
- std::unique_ptr<OpSequence> op_seq = std::make_unique<OpSequence>(layout);
- op_seq->appendOperation(index);
- const OpSequenceIndex &seq_index = push(std::move(op_seq));
- cacheSequenceIndex(seq_index, index);
- return seq_index;
-}
-
-OpSequenceIndex OpSequences::emplace(std::unique_ptr<OpSequence> &&op_seq)
-{
- auto &operations = op_seq->operations();
- const OpSequenceIndex &seq_index = push(std::move(op_seq));
- for (const auto &op_idx : operations)
- {
- cacheSequenceIndex(seq_index, op_idx);
- }
- return seq_index;
-}
-
-void OpSequences::cacheSequenceIndex(const OpSequenceIndex &seq_index,
- const OperationIndex &op_index) const
-{
- _seq_indexes.emplace(op_index, seq_index);
-}
-
-OpSequenceIndex *OpSequences::findSequenceIndex(const OperationIndex &operation_index) const
-{
- // If opration_index is cached, return sequence_index from cache
- if (_seq_indexes.count(operation_index))
- {
- auto &op_seq_index = _seq_indexes.at(operation_index);
- if (_objects.count(op_seq_index) && _objects.at(op_seq_index)->exist(operation_index))
- {
- return &op_seq_index;
- }
- else
- {
- _seq_indexes.erase(operation_index);
- return nullptr;
- }
- }
- return nullptr;
-}
-
-bool OpSequences::containsOperation(const OperationIndex &operation_index) const
-{
- return findOperation(operation_index).valid();
-}
-
-OpSequenceIndex OpSequences::getOperation(const OperationIndex &operation_index) const
-{
- OpSequenceIndex ret = findOperation(operation_index);
- assert(ret.valid());
- return ret;
-}
-
-void OpSequences::removeFromOpSequence(const OperationIndex &operation_index)
-{
- const auto op_seq_index = findOperation(operation_index);
- auto &op_seq = at(op_seq_index);
- _seq_indexes.erase(operation_index);
- op_seq.remove(operation_index);
- if (op_seq.size() == 0)
- {
- remove(op_seq_index);
- }
-}
-
-OpSequenceIndex OpSequences::findOperation(const OperationIndex &operation_index) const
-{
- if (OpSequenceIndex *op_seq_index = findSequenceIndex(operation_index))
- return *op_seq_index;
-
- for (auto &e : _objects)
- {
- OpSequence &object = *e.second;
- auto it = find(object.operations().begin(), object.operations().end(), operation_index);
- if (it != object.operations().end())
- {
- cacheSequenceIndex(e.first, operation_index);
- return e.first;
- }
- }
- throw std::runtime_error("Operation not found");
-}
-
-void dumpOpSequences(const OpSequences &op_seqs, const Operations &operations)
-{
- op_seqs.iterate([&](const OpSequenceIndex &idx, const OpSequence &op_seq) {
- VERBOSE(OpSequences) << idx.value() << "] " << getStrFromOpSeq(op_seq, operations) << std::endl;
- });
-}
-
-} // namespace ir
-} // namespace onert
diff --git a/runtime/onert/core/src/ir/Operand.cc b/runtime/onert/core/src/ir/Operand.cc
index e29c7a6ec..18981dbf1 100644
--- a/runtime/onert/core/src/ir/Operand.cc
+++ b/runtime/onert/core/src/ir/Operand.cc
@@ -46,5 +46,11 @@ void Operand::setDef(const OperationIndex &idx) { _def = idx; }
void Operand::unsetDef() { _def = OperationIndex{}; }
+void Operand::clearDefUse()
+{
+ unsetDef();
+ _uses.clear();
+}
+
} // namespace ir
} // namespace onert
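The new clearDefUse() helper resets both the defining operation and the use list of an operand in one call. A sketch of the rebuild pattern it enables, assuming the mutable iterate() overload of the Operands container (not shown in this diff):

    operands.iterate([](const onert::ir::OperandIndex &, onert::ir::Operand &obj) {
      obj.clearDefUse(); // drop stale def/use links before re-linking the graph
    });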
diff --git a/runtime/onert/core/src/ir/OperandIndexSequence.cc b/runtime/onert/core/src/ir/OperandIndexSequence.cc
index 73f928280..b092f5cee 100644
--- a/runtime/onert/core/src/ir/OperandIndexSequence.cc
+++ b/runtime/onert/core/src/ir/OperandIndexSequence.cc
@@ -62,10 +62,10 @@ OperandIndexSequence OperandIndexSequence::operator+(const OperandIndexSequence
return ret;
}
-std::ostream &operator<<(std::ostream &o, const OperandIndexSequence &op_seq)
+std::ostream &operator<<(std::ostream &o, const OperandIndexSequence &operand_seq)
{
std::string delimeter;
- for (const auto &ind : op_seq._vec)
+ for (const auto &ind : operand_seq._vec)
{
o << delimeter << ind;
delimeter = ',';
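The operator<< rename above is cosmetic (the parameter holds operand indices, not an operation sequence); the printed form is unchanged. For illustration, assuming OperandIndexSequence's initializer-list constructor:

    onert::ir::OperandIndexSequence seq{1, 2, 3};
    std::cout << seq; // comma-separated indices, e.g. "1,2,3"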
diff --git a/runtime/onert/core/src/ir/Operands.cc b/runtime/onert/core/src/ir/Operands.cc
index ab32e478a..f8cfd16ef 100644
--- a/runtime/onert/core/src/ir/Operands.cc
+++ b/runtime/onert/core/src/ir/Operands.cc
@@ -29,7 +29,7 @@ Operands::Operands(const Operands &obj)
obj.iterate([&](const OperandIndex &index, const Operand &operand) {
_objects.emplace(index, std::make_unique<Operand>(operand));
});
- _index_count = obj._index_count;
+ _next_index = obj._next_index;
}
} // namespace ir
diff --git a/runtime/onert/core/src/ir/Operation.cc b/runtime/onert/core/src/ir/Operation.cc
index 4af878541..64792525d 100644
--- a/runtime/onert/core/src/ir/Operation.cc
+++ b/runtime/onert/core/src/ir/Operation.cc
@@ -25,14 +25,14 @@ namespace ir
Operation::Operation(OperandConstraint input_constr, const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, OperandConstraint output_constr)
- : _input_constr{input_constr}, _output_constr{output_constr}
+ : _input_constr{input_constr}, _output_constr{output_constr}
{
setInputs(inputs);
setOutputs(outputs);
}
Operation::Operation(OperandConstraint input_constr, OperandConstraint output_constr)
- : _input_constr{input_constr}, _output_constr{output_constr}
+ : _input_constr{input_constr}, _output_constr{output_constr}
{
}
diff --git a/runtime/onert/core/src/ir/OperationCloner.cc b/runtime/onert/core/src/ir/OperationCloner.cc
index b4e60f0bc..c06315814 100644
--- a/runtime/onert/core/src/ir/OperationCloner.cc
+++ b/runtime/onert/core/src/ir/OperationCloner.cc
@@ -23,6 +23,23 @@ namespace onert
namespace ir
{
+namespace
+{
+
+class OperationCloner : public OperationVisitor
+{
+public:
+#define OP(Name) void visit(const operation::Name &o) override;
+#include "ir/Operations.lst"
+#undef OP
+
+public:
+ std::unique_ptr<Operation> releaseClone();
+
+private:
+ std::unique_ptr<Operation> _return_op;
+};
+
#define OP(Name) \
void OperationCloner::visit(const operation::Name &o) \
{ \
@@ -38,5 +55,14 @@ std::unique_ptr<Operation> OperationCloner::releaseClone()
return std::move(_return_op);
}
+} // namespace
+
+std::unique_ptr<Operation> clone(const Operation &operation)
+{
+ OperationCloner cloner;
+ operation.accept(cloner);
+ return cloner.releaseClone();
+}
+
} // namespace ir
} // namespace onert
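OperationCloner is now an implementation detail of this translation unit; callers use the free clone() function instead, as the Operations copy constructor does later in this diff. A sketch, where some_op stands for any existing Operation:

    #include "OperationCloner.h"

    // deep-copies any concrete operation type through the hidden visitor
    std::unique_ptr<onert::ir::Operation> copy = onert::ir::clone(some_op);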
diff --git a/runtime/onert/core/src/ir/OperationCloner.h b/runtime/onert/core/src/ir/OperationCloner.h
index 0e8cda2a0..6424549e9 100644
--- a/runtime/onert/core/src/ir/OperationCloner.h
+++ b/runtime/onert/core/src/ir/OperationCloner.h
@@ -26,19 +26,7 @@ namespace onert
namespace ir
{
-class OperationCloner : public OperationVisitor
-{
-public:
-#define OP(Name) void visit(const operation::Name &o) override;
-#include "ir/Operations.lst"
-#undef OP
-
-public:
- std::unique_ptr<Operation> releaseClone();
-
-private:
- std::unique_ptr<Operation> _return_op;
-};
+std::unique_ptr<Operation> clone(const Operation &operation);
} // namespace ir
} // namespace onert
diff --git a/runtime/onert/core/src/ir/OperationDumper.cc b/runtime/onert/core/src/ir/OperationDumper.cc
index a8578b4ce..80e2a3f7a 100644
--- a/runtime/onert/core/src/ir/OperationDumper.cc
+++ b/runtime/onert/core/src/ir/OperationDumper.cc
@@ -84,8 +84,8 @@ void OperationDumper::visit(const ArgMinMax &node)
void OperationDumper::visit(const BatchToSpaceND &node)
{
std::string block_size =
- "BlockSize(" +
- std::to_string(node.getInputs().at(BatchToSpaceND::Input::BLOCK_SIZE).value()) + ")";
+ "BlockSize(" + std::to_string(node.getInputs().at(BatchToSpaceND::Input::BLOCK_SIZE).value()) +
+ ")";
dumpUnaryInputOp(node, block_size);
}
@@ -114,7 +114,7 @@ void OperationDumper::visit(const Concat &node) { dumpPackingOp(node); }
void OperationDumper::visit(const Conv2D &node)
{
std::string padding_type =
- node.param().padding.type == PaddingType::EXPLICIT ? "Explicit" : "Implicit";
+ node.param().padding.type == PaddingType::EXPLICIT ? "Explicit" : "Implicit";
dumpConvOp(node, padding_type);
}
@@ -127,7 +127,7 @@ void OperationDumper::visit(const DepthToSpace &node) { dumpUnaryInputOp(node);
void OperationDumper::visit(const DepthwiseConv2D &node)
{
std::string padding_type =
- node.param().padding.type == PaddingType::EXPLICIT ? "Explicit" : "Implicit";
+ node.param().padding.type == PaddingType::EXPLICIT ? "Explicit" : "Implicit";
dumpConvOp(node, padding_type);
}
@@ -162,7 +162,7 @@ void OperationDumper::visit(const EmbeddingLookup &node)
void OperationDumper::visit(const ExpandDims &node)
{
std::string axis =
- "AXIS(" + std::to_string(node.getInputs().at(ExpandDims::Input::AXIS).value()) + ")";
+ "AXIS(" + std::to_string(node.getInputs().at(ExpandDims::Input::AXIS).value()) + ")";
dumpUnaryInputOp(node, axis);
}
@@ -177,15 +177,15 @@ void OperationDumper::visit(const Fill &node)
void OperationDumper::visit(const FullyConnected &node)
{
std::string inputs =
- "Weight(" + std::to_string(node.getInputs().at(FullyConnected::Input::WEIGHT).value()) +
- ") Bias(" + std::to_string(node.getInputs().at(FullyConnected::Input::BIAS).value()) + ")";
+ "Weight(" + std::to_string(node.getInputs().at(FullyConnected::Input::WEIGHT).value()) +
+ ") Bias(" + std::to_string(node.getInputs().at(FullyConnected::Input::BIAS).value()) + ")";
dumpUnaryInputOp(node, inputs);
}
void OperationDumper::visit(const Gather &node)
{
std::string indices =
- "Indices(" + std::to_string(node.getInputs().at(Gather::Input::INDICES).value()) + ")";
+ "Indices(" + std::to_string(node.getInputs().at(Gather::Input::INDICES).value()) + ")";
dumpUnaryInputOp(node, indices);
}
@@ -203,8 +203,8 @@ void OperationDumper::visit(const HashtableLookup &node)
void OperationDumper::visit(const InstanceNorm &node)
{
std::string inputs =
- "Gamma(" + std::to_string(node.getInputs().at(InstanceNorm::Input::GAMMA).value()) +
- ") Beta(" + std::to_string(node.getInputs().at(InstanceNorm::Input::BETA).value()) + ")";
+ "Gamma(" + std::to_string(node.getInputs().at(InstanceNorm::Input::GAMMA).value()) + ") Beta(" +
+ std::to_string(node.getInputs().at(InstanceNorm::Input::BETA).value()) + ")";
dumpUnaryInputOp(node, inputs);
}
@@ -216,30 +216,29 @@ void OperationDumper::visit(const LSTM &node)
{
VERBOSE(LIR) << "* " << node.name() << std::endl;
VERBOSE(LIR)
- << " - Inputs : Input(" << node.getInputs().at(LSTM::Input::INPUT)
- << ") Input To Input Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_INPUT_WEIGHTS)
- << ") Input To Forget Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_FORGET_WEIGHTS)
- << ") Input To Cell Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_CELL_WEIGHTS)
- << ") Input To Output Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)
- << ") Recurrent To Input Weights("
- << node.getInputs().at(LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)
- << ") Recurrent To Forget Weights("
- << node.getInputs().at(LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)
- << ") Recurrent To Cell Weights("
- << node.getInputs().at(LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)
- << ") Recurrent To Output Weights("
- << node.getInputs().at(LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS) << ") Cell To Input Weights("
- << node.getInputs().at(LSTM::Input::CELL_TO_INPUT_WEIGHTS) << ") Cell To Forget Weights("
- << node.getInputs().at(LSTM::Input::CELL_TO_FORGET_WEIGHTS) << ") Cell To OUTPUT Weights("
- << node.getInputs().at(LSTM::Input::CELL_TO_OUTPUT_WEIGHTS) << ") Input Gate Bias("
- << node.getInputs().at(LSTM::Input::INPUT_GATE_BIAS) << ") Forget Gate Bias("
- << node.getInputs().at(LSTM::Input::FORGET_GATE_BIAS) << ") Cell Bias("
- << node.getInputs().at(LSTM::Input::CELL_BIAS) << ") Output Gate Bias("
- << node.getInputs().at(LSTM::Input::OUTPUT_GATE_BIAS) << ") Projection Weights("
- << node.getInputs().at(LSTM::Input::PROJECTION_WEIGHTS) << ") Projection Bias("
- << node.getInputs().at(LSTM::Input::PROJECTION_BIAS) << ") Output State In("
- << node.getInputs().at(LSTM::Input::OUTPUT_STATE_IN) << ") Cell State In("
- << node.getInputs().at(LSTM::Input::CELL_STATE_IN);
+ << " - Inputs : Input(" << node.getInputs().at(LSTM::Input::INPUT)
+ << ") Input To Input Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_INPUT_WEIGHTS)
+ << ") Input To Forget Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_FORGET_WEIGHTS)
+ << ") Input To Cell Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_CELL_WEIGHTS)
+ << ") Input To Output Weights(" << node.getInputs().at(LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)
+ << ") Recurrent To Input Weights("
+ << node.getInputs().at(LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)
+ << ") Recurrent To Forget Weights("
+ << node.getInputs().at(LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)
+ << ") Recurrent To Cell Weights(" << node.getInputs().at(LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)
+ << ") Recurrent To Output Weights("
+ << node.getInputs().at(LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS) << ") Cell To Input Weights("
+ << node.getInputs().at(LSTM::Input::CELL_TO_INPUT_WEIGHTS) << ") Cell To Forget Weights("
+ << node.getInputs().at(LSTM::Input::CELL_TO_FORGET_WEIGHTS) << ") Cell To OUTPUT Weights("
+ << node.getInputs().at(LSTM::Input::CELL_TO_OUTPUT_WEIGHTS) << ") Input Gate Bias("
+ << node.getInputs().at(LSTM::Input::INPUT_GATE_BIAS) << ") Forget Gate Bias("
+ << node.getInputs().at(LSTM::Input::FORGET_GATE_BIAS) << ") Cell Bias("
+ << node.getInputs().at(LSTM::Input::CELL_BIAS) << ") Output Gate Bias("
+ << node.getInputs().at(LSTM::Input::OUTPUT_GATE_BIAS) << ") Projection Weights("
+ << node.getInputs().at(LSTM::Input::PROJECTION_WEIGHTS) << ") Projection Bias("
+ << node.getInputs().at(LSTM::Input::PROJECTION_BIAS) << ") Output State In("
+ << node.getInputs().at(LSTM::Input::OUTPUT_STATE_IN) << ") Cell State In("
+ << node.getInputs().at(LSTM::Input::CELL_STATE_IN);
if (node.getInputs().size() == 24)
{
VERBOSE(LIR) << ") Input Layer Normalization Weights("
@@ -291,7 +290,7 @@ void OperationDumper::visit(const Permute &node)
void OperationDumper::visit(const Pool2D &node)
{
std::string padding_type =
- node.param().padding.type == PaddingType::EXPLICIT ? "Explicit" : "Implicit";
+ node.param().padding.type == PaddingType::EXPLICIT ? "Explicit" : "Implicit";
VERBOSE(LIR) << "* " << node.name() << "(" << padding_type << ")" << std::endl;
VERBOSE(LIR) << " - Inputs : IFM(" << node.getInputs().at(Pool2D::Input::INPUT) << ")"
<< std::endl;
@@ -303,7 +302,7 @@ void OperationDumper::visit(const Pow &node) { dumpBinaryInputOp(node); }
void OperationDumper::visit(const PReLU &node)
{
std::string alpha =
- "Alpha(" + std::to_string(node.getInputs().at(PReLU::Input::ALPHA).value()) + ")";
+ "Alpha(" + std::to_string(node.getInputs().at(PReLU::Input::ALPHA).value()) + ")";
dumpUnaryInputOp(node, alpha);
}
@@ -315,9 +314,9 @@ void OperationDumper::visit(const Reshape &node)
{
// optional param
std::string shape =
- node.getInputs().size() == 2
- ? "Shape(" + std::to_string(node.getInputs().at(Reshape::Input::SHAPE).value()) + ")"
- : "Shape(not provided)";
+ node.getInputs().size() == 2
+ ? "Shape(" + std::to_string(node.getInputs().at(Reshape::Input::SHAPE).value()) + ")"
+ : "Shape(not provided)";
dumpUnaryInputOp(node, shape);
}
@@ -356,7 +355,7 @@ void OperationDumper::visit(const ResizeNearestNeighbor &node)
void OperationDumper::visit(const Reverse &node)
{
std::string axis =
- "Axis(" + std::to_string(node.getInputs().at(Reverse::Input::AXIS).value()) + ")";
+ "Axis(" + std::to_string(node.getInputs().at(Reverse::Input::AXIS).value()) + ")";
dumpUnaryInputOp(node, axis);
}
@@ -399,10 +398,9 @@ void OperationDumper::visit(const Softmax &node) { dumpUnaryInputOp(node); }
void OperationDumper::visit(const SpaceToBatchND &node)
{
std::string inputs =
- "BlockSize(" +
- std::to_string(node.getInputs().at(SpaceToBatchND::Input::BLOCK_SIZE).value()) +
- ") Paddings(" + std::to_string(node.getInputs().at(SpaceToBatchND::Input::PADDINGS).value()) +
- ")";
+ "BlockSize(" + std::to_string(node.getInputs().at(SpaceToBatchND::Input::BLOCK_SIZE).value()) +
+ ") Paddings(" + std::to_string(node.getInputs().at(SpaceToBatchND::Input::PADDINGS).value()) +
+ ")";
dumpUnaryInputOp(node, inputs);
}
@@ -430,7 +428,7 @@ void OperationDumper::visit(const StridedSlice &node) { dumpUnaryInputOp(node);
void OperationDumper::visit(const Tile &node)
{
std::string multiples =
- "Multiples(" + std::to_string(node.getInputs().at(Tile::Input::MULTIPLES).value()) + ")";
+ "Multiples(" + std::to_string(node.getInputs().at(Tile::Input::MULTIPLES).value()) + ")";
dumpUnaryInputOp(node, multiples);
}
@@ -447,7 +445,7 @@ void OperationDumper::visit(const TopKV2 &node)
void OperationDumper::visit(const TransposeConv &node)
{
std::string padding_type =
- node.param().padding.type == PaddingType::EXPLICIT ? "Explicit" : "Implicit";
+ node.param().padding.type == PaddingType::EXPLICIT ? "Explicit" : "Implicit";
VERBOSE(LIR) << "* TransposeConv(" << padding_type << ")" << std::endl;
VERBOSE(LIR) << " - Inputs : Output Shape("
<< node.getInputs().at(TransposeConv::Input::OUTPUT_SHAPE) << ") KERNEL("
diff --git a/runtime/onert/core/src/ir/OperationValidator.cc b/runtime/onert/core/src/ir/OperationValidator.cc
index 6f81c2a56..705a37e2c 100644
--- a/runtime/onert/core/src/ir/OperationValidator.cc
+++ b/runtime/onert/core/src/ir/OperationValidator.cc
@@ -17,6 +17,7 @@
#include "OperationValidator.h"
#include "ir/Graph.h"
+#include "util/logging.h"
#define OP_REQUIRES(EXP) \
do \
@@ -31,7 +32,7 @@ namespace ir
{
OperationValidator::OperationValidator(const Graph &graph)
- : _operations{graph.operations()}, _operands{graph.operands()}
+ : _operations{graph.operations()}, _operands{graph.operands()}
{
}
@@ -60,7 +61,7 @@ bool OperationValidator::isSameQuantParam(const OperandIndex &idx1, const Operan
if (_operands.at(idx1).typeInfo().scale() != _operands.at(idx2).typeInfo().scale())
return false;
- if (_operands.at(idx1).typeInfo().offset() != _operands.at(idx2).typeInfo().offset())
+ if (_operands.at(idx1).typeInfo().zero_point() != _operands.at(idx2).typeInfo().zero_point())
return false;
return true;
@@ -177,6 +178,7 @@ void OperationValidator::visit(const operation::Concat &node)
void OperationValidator::visit(const operation::Conv2D &node)
{
const auto input_index{node.getInputs().at(operation::Conv2D::Input::INPUT)};
+ const auto kernel_index{node.getInputs().at(operation::Conv2D::Input::KERNEL)};
const auto output_index{node.getOutputs().at(0)};
uint32_t stride_horizontal = node.param().stride.horizontal;
@@ -187,6 +189,12 @@ void OperationValidator::visit(const operation::Conv2D &node)
OP_REQUIRES((stride_horizontal > 0) && (stride_vertical > 0));
OP_REQUIRES((dilation_width > 0) && (dilation_height > 0));
OP_REQUIRES(isSameType(input_index, output_index));
+
+ if (isConstant(kernel_index) && operandType(kernel_index) == DataType::QUANT_INT8_ASYMM)
+ {
+ for (const auto zeropoint : _operands.at(kernel_index).typeInfo().zero_points())
+ OP_REQUIRES(zeropoint == 0);
+ }
}
void OperationValidator::visit(const operation::DepthToSpace &node)
@@ -206,6 +214,7 @@ void OperationValidator::visit(const operation::DepthToSpace &node)
void OperationValidator::visit(const operation::DepthwiseConv2D &node)
{
const auto input_index{node.getInputs().at(operation::DepthwiseConv2D::Input::INPUT)};
+ const auto kernel_index{node.getInputs().at(operation::DepthwiseConv2D::Input::KERNEL)};
const auto output_index{node.getOutputs().at(0)};
uint32_t stride_horizontal = node.param().stride.horizontal;
@@ -216,6 +225,12 @@ void OperationValidator::visit(const operation::DepthwiseConv2D &node)
OP_REQUIRES((stride_horizontal > 0) && (stride_vertical > 0));
OP_REQUIRES((dilation_width > 0) && (dilation_height > 0));
OP_REQUIRES(isSameType(input_index, output_index));
+
+ if (isConstant(kernel_index) && operandType(kernel_index) == DataType::QUANT_INT8_ASYMM)
+ {
+ for (const auto zeropoint : _operands.at(kernel_index).typeInfo().zero_points())
+ OP_REQUIRES(zeropoint == 0);
+ }
}
void OperationValidator::visit(const operation::ElementwiseActivation &node)
@@ -233,22 +248,22 @@ void OperationValidator::visit(const operation::ElementwiseActivation &node)
break;
case operation::ElementwiseActivation::Type::LEAKY_RELU:
OP_REQUIRES(
- isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM,
- DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM}));
+ isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM,
+ DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM}));
break;
case operation::ElementwiseActivation::Type::LOGISTIC:
OP_REQUIRES(
- isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM,
- DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM}));
+ isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM,
+ DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM}));
break;
case operation::ElementwiseActivation::Type::RELU:
- OP_REQUIRES(isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM,
- DataType::QUANT_INT8_ASYMM}));
+ OP_REQUIRES(isValidType(
+ input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, DataType::QUANT_INT8_ASYMM}));
break;
case operation::ElementwiseActivation::Type::TANH:
OP_REQUIRES(
- isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM,
- DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM}));
+ isValidType(input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM,
+ DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT16_ASYMM}));
break;
}
}
@@ -285,8 +300,10 @@ void OperationValidator::visit(const operation::ElementwiseUnary &node)
}
else if (node.param().op_type == operation::ElementwiseUnary::Type::QUANTIZE)
{
- OP_REQUIRES(isValidType(input_index, DataType::FLOAT32));
- OP_REQUIRES(isValidType(output_index, DataType::QUANT_UINT8_ASYMM));
+ OP_REQUIRES(isValidType(
+ input_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, DataType::QUANT_INT8_ASYMM}));
+ OP_REQUIRES(
+ isValidType(output_index, {DataType::QUANT_UINT8_ASYMM, DataType::QUANT_INT8_ASYMM}));
}
else if (node.param().op_type == operation::ElementwiseUnary::Type::FLOOR)
{
@@ -310,9 +327,9 @@ void OperationValidator::visit(const operation::EmbeddingLookup &node)
// TFLite: Allow hybrid type - value table & output
// NNAPI: Require same value table and output type
OP_REQUIRES(
- isSameType(values_index, output_index) ||
- (isValidType(output_index, DataType::FLOAT32) &&
- (isValidType(values_index, {DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT8_SYMM}))));
+ isSameType(values_index, output_index) ||
+ (isValidType(output_index, DataType::FLOAT32) &&
+ (isValidType(values_index, {DataType::QUANT_INT8_ASYMM, DataType::QUANT_INT8_SYMM}))));
}
void OperationValidator::visit(const operation::ExpandDims &node)
@@ -357,9 +374,32 @@ void OperationValidator::visit(const operation::Pack &node)
void OperationValidator::visit(const operation::Pad &node)
{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(operation::Pad::Input::INPUT)};
const auto pad_index{node.getInputs().at(operation::Pad::Input::PAD)};
+ bool isQuantType =
+ isValidType(output_index, {DataType::QUANT_UINT8_ASYMM, DataType::QUANT_INT8_ASYMM});
+ bool isPadV2 = node.getInputs().size() == 3;
OP_REQUIRES(isValidType(pad_index, DataType::INT32));
+ OP_REQUIRES(isSameType(input_index, output_index));
+
+ if (isQuantType)
+ OP_REQUIRES(isSameQuantParam(input_index, output_index));
+
+ if (isPadV2)
+ {
+ const auto value_index{node.getInputs().at(operation::Pad::Input::VALUE)};
+ const bool cond_same = isSameType(input_index, value_index);
+ const bool cond_same_quant = (!isQuantType || isSameQuantParam(input_index, value_index));
+ const auto input_t = operandType(input_index);
+ const auto value_t = operandType(value_index);
+ // NNAPI accepts this case. Scale and zeroPoint are assumed to be the same as in input0.
+ const bool cond_quant8 =
+ ((input_t == DataType::QUANT_UINT8_ASYMM || input_t == DataType::QUANT_INT8_ASYMM) &&
+ value_t == DataType::INT32);
+ OP_REQUIRES((cond_same && cond_same_quant) || cond_quant8);
+ }
}
void OperationValidator::visit(const operation::Rank &node)
@@ -404,6 +444,25 @@ void OperationValidator::visit(const operation::Shape &node)
OP_REQUIRES(isValidType(output_index, {DataType::UINT32, DataType::INT32, DataType::INT64}));
}
+void OperationValidator::visit(const operation::Slice &node)
+{
+ const auto begins_index{node.getInputs().at(operation::Slice::BEGINS)};
+ const auto sizes_index{node.getInputs().at(operation::Slice::SIZES)};
+
+ OP_REQUIRES(isValidType(begins_index, {DataType::INT32, DataType::INT64}));
+ OP_REQUIRES(isSameType(begins_index, sizes_index));
+}
+
+void OperationValidator::visit(const operation::Softmax &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(operation::Softmax::INPUT)};
+
+ OP_REQUIRES(isSameType(input_index, output_index));
+ OP_REQUIRES(isValidType(
+ output_index, {DataType::FLOAT32, DataType::QUANT_UINT8_ASYMM, DataType::QUANT_INT8_ASYMM}));
+}
+
void OperationValidator::visit(const operation::SpaceToBatchND &node)
{
const auto block_size_index{node.getInputs().at(operation::SpaceToBatchND::Input::BLOCK_SIZE)};
@@ -438,6 +497,17 @@ void OperationValidator::visit(const operation::SquaredDifference &node)
OP_REQUIRES(isSameType(lhs_index, rhs_index));
}
+void OperationValidator::visit(const operation::StatelessRandomUniform &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto shape_index{node.getInputs().at(operation::StatelessRandomUniform::Input::SHAPE)};
+ const auto seed_index{node.getInputs().at(operation::StatelessRandomUniform::Input::SEED)};
+
+ OP_REQUIRES(isValidType(output_index, DataType::FLOAT32));
+ OP_REQUIRES(isValidType(shape_index, DataType::INT32));
+ OP_REQUIRES(isValidType(seed_index, DataType::INT32));
+}
+
void OperationValidator::visit(const operation::StridedSlice &node)
{
const auto output_index{node.getOutputs().at(0)};
@@ -463,5 +533,5 @@ void OperationValidator::visit(const operation::While &node)
OP_REQUIRES(node.getInputs().size() == node.getOutputs().size());
}
-} // namespace compiler
+} // namespace ir
} // namespace onert
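The validator additions above (the Conv2D/DepthwiseConv2D zero-point checks, the Pad/PadV2 type checks, and the new Slice, Softmax, and StatelessRandomUniform visitors) all funnel through OP_REQUIRES, whose body is elided in this hunk. Assuming it raises on a violated condition, a caller sketch looks like:

    onert::ir::OperationValidator validator{graph};
    try
    {
      validator(); // assumption: the validator walks every operation in the graph
    }
    catch (const std::exception &e)
    {
      // an OP_REQUIRES condition failed, e.g. a quantized Conv2D kernel
      // with a non-zero zero_point
    }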
diff --git a/runtime/onert/core/src/ir/OperationValidator.h b/runtime/onert/core/src/ir/OperationValidator.h
index 5b95b16ba..9829ca095 100644
--- a/runtime/onert/core/src/ir/OperationValidator.h
+++ b/runtime/onert/core/src/ir/OperationValidator.h
@@ -18,6 +18,8 @@
#define __ONERT_IR_OPERATION_VALIDATOR_H__
#include "ir/OperationVisitor.h"
+#include "ir/Operations.h"
+#include "ir/Operands.h"
namespace onert
{
@@ -67,10 +69,13 @@ public:
void visit(const operation::Reverse &node) override;
void visit(const operation::Select &node) override;
void visit(const operation::Shape &node) override;
+ void visit(const operation::Slice &node) override;
+ void visit(const operation::Softmax &node) override;
void visit(const operation::SpaceToBatchND &node) override;
void visit(const operation::SpaceToDepth &node) override;
void visit(const operation::Split &node) override;
void visit(const operation::SquaredDifference &node) override;
+ void visit(const operation::StatelessRandomUniform &node) override;
void visit(const operation::StridedSlice &node) override;
void visit(const operation::TransposeConv &node) override;
void visit(const operation::Unpack &node) override;
diff --git a/runtime/onert/core/src/ir/Operations.cc b/runtime/onert/core/src/ir/Operations.cc
index 64d0bd6f0..e7e0c88cf 100644
--- a/runtime/onert/core/src/ir/Operations.cc
+++ b/runtime/onert/core/src/ir/Operations.cc
@@ -25,12 +25,9 @@ namespace ir
Operations::Operations(const Operations &obj)
{
- obj.iterate([&](const OperationIndex &index, const Operation &op) {
- OperationCloner cloner;
- op.accept(cloner);
- _objects.emplace(index, cloner.releaseClone());
- });
- _index_count = obj._index_count;
+ obj.iterate(
+ [&](const OperationIndex &index, const Operation &op) { _objects.emplace(index, clone(op)); });
+ _next_index = obj._next_index;
}
} // namespace ir
diff --git a/runtime/onert/core/src/ir/Padding.cc b/runtime/onert/core/src/ir/Padding.cc
index d74f80217..b2b004e7a 100644
--- a/runtime/onert/core/src/ir/Padding.cc
+++ b/runtime/onert/core/src/ir/Padding.cc
@@ -66,14 +66,14 @@ inline ExplicitPadding samePaddingUsingIFM(const FeatureShape &ifm_shape, const
const int32_t vertical_expected_output = (ifm_shape.H + stride.vertical - 1) / stride.vertical;
const int32_t horizontal_expected_output =
- (ifm_shape.W + stride.horizontal - 1) / stride.horizontal;
+ (ifm_shape.W + stride.horizontal - 1) / stride.horizontal;
const int32_t vertical_needed_input =
- (vertical_expected_output - 1) * stride.vertical + effective_filter_h_size;
+ (vertical_expected_output - 1) * stride.vertical + effective_filter_h_size;
const int32_t vertical_total_padding = std::max(0, vertical_needed_input - ifm_shape.H);
const int32_t horizontal_needed_input =
- (horizontal_expected_output - 1) * stride.horizontal + effective_filter_w_size;
+ (horizontal_expected_output - 1) * stride.horizontal + effective_filter_w_size;
const int32_t horizontal_total_padding = std::max(0, horizontal_needed_input - ifm_shape.W);
padding.top = vertical_total_padding / 2;
@@ -90,7 +90,7 @@ inline ExplicitPadding samePadding(const FeatureShape &ifm_shape, const FeatureS
{
const int32_t vertical_expected_output = (ifm_shape.H + stride.vertical - 1) / stride.vertical;
const int32_t horizontal_expected_output =
- (ifm_shape.W + stride.horizontal - 1) / stride.horizontal;
+ (ifm_shape.W + stride.horizontal - 1) / stride.horizontal;
assert(vertical_expected_output == ofm_shape.H);
assert(horizontal_expected_output == ofm_shape.W);
@@ -129,7 +129,7 @@ Padding::Padding(PaddingType paddingType) : type{paddingType}, param{0, 0, 0, 0}
}
Padding::Padding(uint32_t left, uint32_t right, uint32_t top, uint32_t bottom)
- : type{PaddingType::EXPLICIT}, param{left, right, top, bottom}
+ : type{PaddingType::EXPLICIT}, param{left, right, top, bottom}
{
// DO NOTHING
}
diff --git a/runtime/onert/core/src/ir/Shape.cc b/runtime/onert/core/src/ir/Shape.cc
index 322df7b4c..a7c50a266 100644
--- a/runtime/onert/core/src/ir/Shape.cc
+++ b/runtime/onert/core/src/ir/Shape.cc
@@ -87,27 +87,30 @@ uint64_t Shape::num_elements() const
std::multiplies<uint64_t>());
}
-Shape permuteShape(const Shape &shape, Layout frontend_layout, Layout backend_layout)
+Shape permuteShape(const Shape &shape, Layout from, Layout to)
{
assert(shape.rank() <= Shape::MAX_RANK);
- Shape backend_shape{shape};
- if (shape.rank() >= 4 && frontend_layout == Layout::NHWC && backend_layout == Layout::NCHW)
+ Shape ret{shape};
+ if (from == to)
+ return ret;
+ if (shape.rank() < 4)
+ return ret;
+ // Permutation changing layout beyond 4-D is not supported yet
+ assert(shape.rank() <= 4);
+ if (from == Layout::NHWC && to == Layout::NCHW)
{
- // Permutation changing layout beyond 4-D is not supported yet
- assert(shape.rank() <= 4);
- backend_shape.dim(1) = shape.dim(3);
- backend_shape.dim(2) = shape.dim(1);
- backend_shape.dim(3) = shape.dim(2);
+ ret.dim(1) = shape.dim(3);
+ ret.dim(2) = shape.dim(1);
+ ret.dim(3) = shape.dim(2);
}
- else if (shape.rank() >= 4 && frontend_layout == Layout::NCHW && backend_layout == Layout::NHWC)
+ else if (from == Layout::NCHW && to == Layout::NHWC)
{
- // Permutation changing layout beyond 4-D is not supported yet
- assert(shape.rank() <= 4);
- backend_shape.dim(1) = shape.dim(2);
- backend_shape.dim(2) = shape.dim(3);
- backend_shape.dim(3) = shape.dim(1);
+ ret.dim(1) = shape.dim(2);
+ ret.dim(2) = shape.dim(3);
+ ret.dim(3) = shape.dim(1);
}
- return backend_shape;
+ // Other cases (either `from` or `to` is UNKNOWN): just return the original shape
+ return ret;
}
} // namespace ir
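A worked example of the rewritten permuteShape(), which now returns early when the layouts match or the rank is below 4 (assuming Shape's initializer-list constructor):

    onert::ir::Shape nhwc{1, 224, 224, 3}; // N, H, W, C
    auto nchw = onert::ir::permuteShape(nhwc, onert::ir::Layout::NHWC,
                                        onert::ir::Layout::NCHW);
    // nchw == {1, 3, 224, 224}: dim(1)<-dim(3), dim(2)<-dim(1), dim(3)<-dim(2)

    auto same = onert::ir::permuteShape(nhwc, onert::ir::Layout::NHWC,
                                        onert::ir::Layout::NHWC); // returned as-is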
diff --git a/runtime/onert/core/src/ir/TypeInfo.cc b/runtime/onert/core/src/ir/TypeInfo.cc
index ab8af287e..5d1c7ba8b 100644
--- a/runtime/onert/core/src/ir/TypeInfo.cc
+++ b/runtime/onert/core/src/ir/TypeInfo.cc
@@ -28,7 +28,7 @@ bool operator==(const TypeInfo &lhs, const TypeInfo &rhs)
return false;
}
- if (lhs.offset() != rhs.offset())
+ if (lhs.zero_point() != rhs.zero_point())
{
return false;
}
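For reference on the offset()-to-zero_point() rename: a quantized value q decodes as real = scale * (q - zero_point), so two operands share quantization parameters only when both scale and zero point match, which is exactly what isSameQuantParam() compares in OperationValidator above.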
diff --git a/runtime/onert/core/src/ir/operation/AddN.cc b/runtime/onert/core/src/ir/operation/AddN.cc
index ce471252d..110aeebe7 100644
--- a/runtime/onert/core/src/ir/operation/AddN.cc
+++ b/runtime/onert/core/src/ir/operation/AddN.cc
@@ -30,7 +30,7 @@ namespace operation
void AddN::accept(OperationVisitor &v) const { v.visit(*this); }
AddN::AddN(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(inputs.size()), inputs, outputs}
+ : Operation{OperandConstraint::createExact(inputs.size()), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/ArgMinMax.cc b/runtime/onert/core/src/ir/operation/ArgMinMax.cc
index 989d905bf..2f18ff2e2 100644
--- a/runtime/onert/core/src/ir/operation/ArgMinMax.cc
+++ b/runtime/onert/core/src/ir/operation/ArgMinMax.cc
@@ -28,7 +28,7 @@ void ArgMinMax::accept(OperationVisitor &v) const { v.visit(*this); }
ArgMinMax::ArgMinMax(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/BCQFullyConnected.cc b/runtime/onert/core/src/ir/operation/BCQFullyConnected.cc
index 9dc54e6e9..e918d27ae 100644
--- a/runtime/onert/core/src/ir/operation/BCQFullyConnected.cc
+++ b/runtime/onert/core/src/ir/operation/BCQFullyConnected.cc
@@ -31,7 +31,7 @@ void BCQFullyConnected::accept(OperationVisitor &v) const { v.visit(*this); }
BCQFullyConnected::BCQFullyConnected(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createExact(5u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(5u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/BCQGather.cc b/runtime/onert/core/src/ir/operation/BCQGather.cc
index 80efa6460..f9dfaa3f6 100644
--- a/runtime/onert/core/src/ir/operation/BCQGather.cc
+++ b/runtime/onert/core/src/ir/operation/BCQGather.cc
@@ -31,7 +31,7 @@ void BCQGather::accept(OperationVisitor &v) const { v.visit(*this); }
BCQGather::BCQGather(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(4u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(4u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/BatchMatMul.cc b/runtime/onert/core/src/ir/operation/BatchMatMul.cc
index b9616158d..20c5682f9 100644
--- a/runtime/onert/core/src/ir/operation/BatchMatMul.cc
+++ b/runtime/onert/core/src/ir/operation/BatchMatMul.cc
@@ -28,7 +28,7 @@ void BatchMatMul::accept(OperationVisitor &v) const { v.visit(*this); }
BatchMatMul::BatchMatMul(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
index 34be79dd2..e58e0f486 100644
--- a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
+++ b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
@@ -31,7 +31,7 @@ void BatchToSpaceND::accept(OperationVisitor &v) const { v.visit(*this); }
BatchToSpaceND::BatchToSpaceND(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}
+ : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/BinaryArithmetic.cc b/runtime/onert/core/src/ir/operation/BinaryArithmetic.cc
index 2b1422c73..2d439194f 100644
--- a/runtime/onert/core/src/ir/operation/BinaryArithmetic.cc
+++ b/runtime/onert/core/src/ir/operation/BinaryArithmetic.cc
@@ -32,7 +32,7 @@ void BinaryArithmetic::accept(OperationVisitor &v) const { v.visit(*this); }
BinaryArithmetic::BinaryArithmetic(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
{
}
@@ -40,10 +40,10 @@ std::string BinaryArithmetic::name() const
{
using ArithmeticType = onert::ir::operation::BinaryArithmetic::ArithmeticType;
static const std::unordered_map<ArithmeticType, std::string> name_map{
- {ArithmeticType::ADD, std::string{"Add"}},
- {ArithmeticType::SUB, std::string{"Sub"}},
- {ArithmeticType::MUL, std::string{"Mul"}},
- {ArithmeticType::DIV, std::string{"Div"}}};
+ {ArithmeticType::ADD, std::string{"Add"}},
+ {ArithmeticType::SUB, std::string{"Sub"}},
+ {ArithmeticType::MUL, std::string{"Mul"}},
+ {ArithmeticType::DIV, std::string{"Div"}}};
return name_map.at(_param.arithmetic_type);
}
diff --git a/runtime/onert/core/src/ir/operation/BroadcastTo.cc b/runtime/onert/core/src/ir/operation/BroadcastTo.cc
index a8f5e59cf..5da7b5abc 100644
--- a/runtime/onert/core/src/ir/operation/BroadcastTo.cc
+++ b/runtime/onert/core/src/ir/operation/BroadcastTo.cc
@@ -29,7 +29,7 @@ namespace operation
void BroadcastTo::accept(OperationVisitor &v) const { v.visit(*this); }
BroadcastTo::BroadcastTo(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Comparison.cc b/runtime/onert/core/src/ir/operation/Comparison.cc
index 2f6775411..94c96ff69 100644
--- a/runtime/onert/core/src/ir/operation/Comparison.cc
+++ b/runtime/onert/core/src/ir/operation/Comparison.cc
@@ -31,7 +31,7 @@ void Comparison::accept(OperationVisitor &v) const { v.visit(*this); }
Comparison::Comparison(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Concat.cc b/runtime/onert/core/src/ir/operation/Concat.cc
index 608bc29a6..5d99debb7 100644
--- a/runtime/onert/core/src/ir/operation/Concat.cc
+++ b/runtime/onert/core/src/ir/operation/Concat.cc
@@ -31,7 +31,7 @@ void Concat::accept(OperationVisitor &v) const { v.visit(*this); }
Concat::Concat(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Conv2D.cc b/runtime/onert/core/src/ir/operation/Conv2D.cc
index 3a2e1d1fe..725f3e70b 100644
--- a/runtime/onert/core/src/ir/operation/Conv2D.cc
+++ b/runtime/onert/core/src/ir/operation/Conv2D.cc
@@ -31,7 +31,7 @@ void Conv2D::accept(OperationVisitor &v) const { v.visit(*this); }
Conv2D::Conv2D(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/ConvertFp16ToFp32.cc b/runtime/onert/core/src/ir/operation/ConvertFp16ToFp32.cc
index 676e039fa..822eb30a9 100644
--- a/runtime/onert/core/src/ir/operation/ConvertFp16ToFp32.cc
+++ b/runtime/onert/core/src/ir/operation/ConvertFp16ToFp32.cc
@@ -31,7 +31,7 @@ void ConvertFp16ToFp32::accept(OperationVisitor &v) const { v.visit(*this); }
ConvertFp16ToFp32::ConvertFp16ToFp32(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/ConvertFp32ToFp16.cc b/runtime/onert/core/src/ir/operation/ConvertFp32ToFp16.cc
index bcfcbfc04..5e5b42f3b 100644
--- a/runtime/onert/core/src/ir/operation/ConvertFp32ToFp16.cc
+++ b/runtime/onert/core/src/ir/operation/ConvertFp32ToFp16.cc
@@ -31,7 +31,7 @@ void ConvertFp32ToFp16::accept(OperationVisitor &v) const { v.visit(*this); }
ConvertFp32ToFp16::ConvertFp32ToFp16(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Custom.cc b/runtime/onert/core/src/ir/operation/Custom.cc
index 25c53e1ba..06c84f81a 100644
--- a/runtime/onert/core/src/ir/operation/Custom.cc
+++ b/runtime/onert/core/src/ir/operation/Custom.cc
@@ -29,7 +29,7 @@ void Custom::accept(OperationVisitor &v) const { v.visit(*this); }
Custom::Custom(OperandConstraint input_constr, const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, std::string id, const Userdata &userdata)
- : Operation{input_constr, inputs, outputs}, _id(std::move(id)), _userdata(userdata)
+ : Operation{input_constr, inputs, outputs}, _id(std::move(id)), _userdata(userdata)
{
}
diff --git a/runtime/onert/core/src/ir/operation/DepthToSpace.cc b/runtime/onert/core/src/ir/operation/DepthToSpace.cc
index f2d6c7c1b..197c7ee48 100644
--- a/runtime/onert/core/src/ir/operation/DepthToSpace.cc
+++ b/runtime/onert/core/src/ir/operation/DepthToSpace.cc
@@ -31,7 +31,7 @@ void DepthToSpace::accept(OperationVisitor &v) const { v.visit(*this); }
DepthToSpace::DepthToSpace(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/DepthwiseConv2D.cc b/runtime/onert/core/src/ir/operation/DepthwiseConv2D.cc
index d587a5591..bef75c5cf 100644
--- a/runtime/onert/core/src/ir/operation/DepthwiseConv2D.cc
+++ b/runtime/onert/core/src/ir/operation/DepthwiseConv2D.cc
@@ -31,7 +31,7 @@ void DepthwiseConv2D::accept(OperationVisitor &v) const { v.visit(*this); }
DepthwiseConv2D::DepthwiseConv2D(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Einsum.cc b/runtime/onert/core/src/ir/operation/Einsum.cc
index 3c1473aaa..b50f070e7 100644
--- a/runtime/onert/core/src/ir/operation/Einsum.cc
+++ b/runtime/onert/core/src/ir/operation/Einsum.cc
@@ -28,7 +28,7 @@ void Einsum::accept(OperationVisitor &v) const { v.visit(*this); }
Einsum::Einsum(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/ElementwiseActivation.cc b/runtime/onert/core/src/ir/operation/ElementwiseActivation.cc
index f6718b656..f3e942f7d 100644
--- a/runtime/onert/core/src/ir/operation/ElementwiseActivation.cc
+++ b/runtime/onert/core/src/ir/operation/ElementwiseActivation.cc
@@ -33,13 +33,14 @@ void ElementwiseActivation::accept(OperationVisitor &v) const { v.visit(*this);
ElementwiseActivation::ElementwiseActivation(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
if (param.op_type == Type::LOGISTIC)
{
- assert(param.alpha == 0.0f && param.beta == 0.0f && "Logistic will be supported only as "
- "sigmoid function(L=1, k=1, x0=0). So, do "
- "not use alpha and beta");
+      assert(param.alpha == 0.0f && param.beta == 0.0f &&
+             "Logistic is supported only as the "
+             "sigmoid function (L=1, k=1, x0=0), so do "
+             "not use alpha and beta");
}
else if (param.op_type == Type::RELU)
{
@@ -47,9 +48,10 @@ ElementwiseActivation::ElementwiseActivation(const OperandIndexSequence &inputs,
}
else if (param.op_type == Type::TANH)
{
- assert(param.alpha == 1.0f && param.beta == 1.0f && "f(x) = alpha * tanh(beta * x), Tanh is "
- "supported only the values of alpha and "
- "beta are 1.f");
+ assert(param.alpha == 1.0f && param.beta == 1.0f &&
+ "f(x) = alpha * tanh(beta * x), Tanh is "
+ "supported only the values of alpha and "
+ "beta are 1.f");
}
}
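
The reflowed asserts above use the common C++ trick of &&-ing the condition
with a string literal: the literal converts to a non-null pointer and is
always true, so it never changes the check, but it shows up in the abort
message when the assertion fires. A minimal standalone sketch of the idiom
(illustrative, not onert code):

    #include <cassert>

    void check_logistic_params(float alpha, float beta)
    {
      // The string literal is always truthy; it only decorates the
      // diagnostic printed when the condition is false.
      assert(alpha == 0.0f && beta == 0.0f && "alpha and beta must be zero");
    }
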
@@ -57,11 +59,11 @@ std::string ElementwiseActivation::name() const
{
using ElementwiseActivationType = onert::ir::operation::ElementwiseActivation::Type;
static const std::unordered_map<Type, std::string> name_map{
- {ElementwiseActivationType::ELU, "ELU"},
- {ElementwiseActivationType::LOGISTIC, "Logistic"},
- {ElementwiseActivationType::RELU, "ReLU"},
- {ElementwiseActivationType::TANH, "Tanh"},
- {ElementwiseActivationType::LEAKY_RELU, "LeakyRelu"}};
+ {ElementwiseActivationType::ELU, "ELU"},
+ {ElementwiseActivationType::LOGISTIC, "Logistic"},
+ {ElementwiseActivationType::RELU, "ReLU"},
+ {ElementwiseActivationType::TANH, "Tanh"},
+ {ElementwiseActivationType::LEAKY_RELU, "LeakyRelu"}};
return name_map.at(_param.op_type);
}
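
ElementwiseActivation::name() above shows the lookup pattern this patch
reindents in several files: a function-local static map from the op-type
enum to a display name, queried with at() so an unmapped value throws
std::out_of_range instead of being silently default-inserted. A sketch of
the same idiom with illustrative names (not onert's types):

    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    enum class ActType { ELU, RELU, TANH };

    std::string act_name(ActType t)
    {
      // Built once on first call; initialization is thread-safe in C++11.
      static const std::unordered_map<ActType, std::string> name_map{
        {ActType::ELU, "ELU"}, {ActType::RELU, "ReLU"}, {ActType::TANH, "Tanh"}};
      return name_map.at(t); // throws std::out_of_range for unmapped values
    }
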
diff --git a/runtime/onert/core/src/ir/operation/ElementwiseBinary.cc b/runtime/onert/core/src/ir/operation/ElementwiseBinary.cc
index 3287fc0a3..8dc42903c 100644
--- a/runtime/onert/core/src/ir/operation/ElementwiseBinary.cc
+++ b/runtime/onert/core/src/ir/operation/ElementwiseBinary.cc
@@ -32,7 +32,7 @@ void ElementwiseBinary::accept(OperationVisitor &v) const { v.visit(*this); }
ElementwiseBinary::ElementwiseBinary(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
{
}
@@ -40,10 +40,10 @@ std::string ElementwiseBinary::name() const
{
using ElementwiseBinaryType = onert::ir::operation::ElementwiseBinary::ElementwiseBinaryType;
static const std::unordered_map<ElementwiseBinaryType, std::string> name_map{
- {ElementwiseBinaryType::LOGICAL_AND, std::string{"LogicalAnd"}},
- {ElementwiseBinaryType::LOGICAL_OR, std::string{"LogicalOr"}},
- {ElementwiseBinaryType::MAX, std::string{"Max"}},
- {ElementwiseBinaryType::MIN, std::string{"Min"}}};
+ {ElementwiseBinaryType::LOGICAL_AND, std::string{"LogicalAnd"}},
+ {ElementwiseBinaryType::LOGICAL_OR, std::string{"LogicalOr"}},
+ {ElementwiseBinaryType::MAX, std::string{"Max"}},
+ {ElementwiseBinaryType::MIN, std::string{"Min"}}};
return name_map.at(_param.op_type);
}
diff --git a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc
index 20b6fa124..c21c51c05 100644
--- a/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc
+++ b/runtime/onert/core/src/ir/operation/ElementwiseUnary.cc
@@ -32,9 +32,9 @@ void ElementwiseUnary::accept(OperationVisitor &v) const { v.visit(*this); }
ElementwiseUnary::ElementwiseUnary(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs,
- OperandConstraint::createExact(1u)},
- _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs,
+ OperandConstraint::createExact(1u)},
+ _param{param}
{
}
@@ -42,23 +42,23 @@ std::string ElementwiseUnary::name() const
{
using ElementwiseUnaryType = onert::ir::operation::ElementwiseUnary::Type;
static const std::unordered_map<ElementwiseUnaryType, std::string> name_map{
- {ElementwiseUnaryType::ABS, std::string{"Abs"}},
- {ElementwiseUnaryType::CAST, std::string{"Cast"}},
- {ElementwiseUnaryType::COS, std::string{"Cos"}},
- {ElementwiseUnaryType::DEQUANTIZE, std::string{"Dequantize"}},
- {ElementwiseUnaryType::ERF, std::string{"Erf"}},
- {ElementwiseUnaryType::EXP, std::string{"Exp"}},
- {ElementwiseUnaryType::FLOOR, std::string{"Floor"}},
- {ElementwiseUnaryType::LOG, std::string{"Log"}},
- {ElementwiseUnaryType::LOGICAL_NOT, std::string{"LogicalNot"}},
- {ElementwiseUnaryType::NEG, std::string{"Neg"}},
- {ElementwiseUnaryType::QUANTIZE, std::string{"Quantize"}},
- {ElementwiseUnaryType::ROUND, std::string{"Round"}},
- {ElementwiseUnaryType::RSQRT, std::string{"RSqrt"}},
- {ElementwiseUnaryType::SIN, std::string{"Sin"}},
- {ElementwiseUnaryType::SQRT, std::string{"Sqrt"}},
- {ElementwiseUnaryType::SQUARE, std::string{"Square"}},
- {ElementwiseUnaryType::ZEROS_LIKE, std::string{"ZerosLike"}}};
+ {ElementwiseUnaryType::ABS, std::string{"Abs"}},
+ {ElementwiseUnaryType::CAST, std::string{"Cast"}},
+ {ElementwiseUnaryType::COS, std::string{"Cos"}},
+ {ElementwiseUnaryType::DEQUANTIZE, std::string{"Dequantize"}},
+ {ElementwiseUnaryType::ERF, std::string{"Erf"}},
+ {ElementwiseUnaryType::EXP, std::string{"Exp"}},
+ {ElementwiseUnaryType::FLOOR, std::string{"Floor"}},
+ {ElementwiseUnaryType::LOG, std::string{"Log"}},
+ {ElementwiseUnaryType::LOGICAL_NOT, std::string{"LogicalNot"}},
+ {ElementwiseUnaryType::NEG, std::string{"Neg"}},
+ {ElementwiseUnaryType::QUANTIZE, std::string{"Quantize"}},
+ {ElementwiseUnaryType::ROUND, std::string{"Round"}},
+ {ElementwiseUnaryType::RSQRT, std::string{"RSqrt"}},
+ {ElementwiseUnaryType::SIN, std::string{"Sin"}},
+ {ElementwiseUnaryType::SQRT, std::string{"Sqrt"}},
+ {ElementwiseUnaryType::SQUARE, std::string{"Square"}},
+ {ElementwiseUnaryType::ZEROS_LIKE, std::string{"ZerosLike"}}};
return name_map.at(_param.op_type);
}
diff --git a/runtime/onert/core/src/ir/operation/EmbeddingLookup.cc b/runtime/onert/core/src/ir/operation/EmbeddingLookup.cc
index b300b004e..e23674706 100644
--- a/runtime/onert/core/src/ir/operation/EmbeddingLookup.cc
+++ b/runtime/onert/core/src/ir/operation/EmbeddingLookup.cc
@@ -31,7 +31,7 @@ void EmbeddingLookup::accept(OperationVisitor &v) const { v.visit(*this); }
EmbeddingLookup::EmbeddingLookup(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/ExpandDims.cc b/runtime/onert/core/src/ir/operation/ExpandDims.cc
index 3f555bd23..50e3636f3 100644
--- a/runtime/onert/core/src/ir/operation/ExpandDims.cc
+++ b/runtime/onert/core/src/ir/operation/ExpandDims.cc
@@ -30,7 +30,7 @@ namespace operation
void ExpandDims::accept(OperationVisitor &v) const { v.visit(*this); }
ExpandDims::ExpandDims(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Fill.cc b/runtime/onert/core/src/ir/operation/Fill.cc
index b8b97d1c0..4a13737d4 100644
--- a/runtime/onert/core/src/ir/operation/Fill.cc
+++ b/runtime/onert/core/src/ir/operation/Fill.cc
@@ -30,7 +30,7 @@ namespace operation
void Fill::accept(OperationVisitor &v) const { v.visit(*this); }
Fill::Fill(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/FullyConnected.cc b/runtime/onert/core/src/ir/operation/FullyConnected.cc
index 9837a3137..335b7b209 100644
--- a/runtime/onert/core/src/ir/operation/FullyConnected.cc
+++ b/runtime/onert/core/src/ir/operation/FullyConnected.cc
@@ -31,7 +31,7 @@ void FullyConnected::accept(OperationVisitor &v) const { v.visit(*this); }
FullyConnected::FullyConnected(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/FusedBatchNorm.cc b/runtime/onert/core/src/ir/operation/FusedBatchNorm.cc
index 7b9301ea6..b5679f308 100644
--- a/runtime/onert/core/src/ir/operation/FusedBatchNorm.cc
+++ b/runtime/onert/core/src/ir/operation/FusedBatchNorm.cc
@@ -28,7 +28,7 @@ void FusedBatchNorm::accept(OperationVisitor &v) const { v.visit(*this); }
FusedBatchNorm::FusedBatchNorm(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createAtLeast(5u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createAtLeast(5u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Gather.cc b/runtime/onert/core/src/ir/operation/Gather.cc
index 11d46e75b..96a39b3f2 100644
--- a/runtime/onert/core/src/ir/operation/Gather.cc
+++ b/runtime/onert/core/src/ir/operation/Gather.cc
@@ -31,7 +31,7 @@ void Gather::accept(OperationVisitor &v) const { v.visit(*this); }
Gather::Gather(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/HashtableLookup.cc b/runtime/onert/core/src/ir/operation/HashtableLookup.cc
index e9a7a82ff..2974679d4 100644
--- a/runtime/onert/core/src/ir/operation/HashtableLookup.cc
+++ b/runtime/onert/core/src/ir/operation/HashtableLookup.cc
@@ -31,7 +31,7 @@ void HashtableLookup::accept(OperationVisitor &v) const { v.visit(*this); }
HashtableLookup::HashtableLookup(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/If.cc b/runtime/onert/core/src/ir/operation/If.cc
index 599751dfd..380c87dbe 100644
--- a/runtime/onert/core/src/ir/operation/If.cc
+++ b/runtime/onert/core/src/ir/operation/If.cc
@@ -24,7 +24,7 @@ namespace operation
{
void If::accept(OperationVisitor &v) const { v.visit(*this); }
If::If(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createAny(), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createAny(), inputs, outputs}, _param{param}
{
}
} // namespace operation
diff --git a/runtime/onert/core/src/ir/operation/InstanceNorm.cc b/runtime/onert/core/src/ir/operation/InstanceNorm.cc
index 2334560ef..d9af9d0b7 100644
--- a/runtime/onert/core/src/ir/operation/InstanceNorm.cc
+++ b/runtime/onert/core/src/ir/operation/InstanceNorm.cc
@@ -31,7 +31,7 @@ void InstanceNorm::accept(OperationVisitor &v) const { v.visit(*this); }
InstanceNorm::InstanceNorm(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/L2Normalization.cc b/runtime/onert/core/src/ir/operation/L2Normalization.cc
index 9a7d3eb61..0184ef628 100644
--- a/runtime/onert/core/src/ir/operation/L2Normalization.cc
+++ b/runtime/onert/core/src/ir/operation/L2Normalization.cc
@@ -31,7 +31,7 @@ void L2Normalization::accept(OperationVisitor &v) const { v.visit(*this); }
L2Normalization::L2Normalization(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/LSTM.cc b/runtime/onert/core/src/ir/operation/LSTM.cc
index 5cd7c793a..45a1fd120 100644
--- a/runtime/onert/core/src/ir/operation/LSTM.cc
+++ b/runtime/onert/core/src/ir/operation/LSTM.cc
@@ -31,7 +31,7 @@ void LSTM::accept(OperationVisitor &v) const { v.visit(*this); }
LSTM::LSTM(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createInRange(20u, 24u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createInRange(20u, 24u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/LocalResponseNormalization.cc b/runtime/onert/core/src/ir/operation/LocalResponseNormalization.cc
index 1ae97c142..52037cc72 100644
--- a/runtime/onert/core/src/ir/operation/LocalResponseNormalization.cc
+++ b/runtime/onert/core/src/ir/operation/LocalResponseNormalization.cc
@@ -32,7 +32,7 @@ void LocalResponseNormalization::accept(OperationVisitor &v) const { v.visit(*th
LocalResponseNormalization::LocalResponseNormalization(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/LogSoftmax.cc b/runtime/onert/core/src/ir/operation/LogSoftmax.cc
index 73c6580ec..51f6a6c5d 100644
--- a/runtime/onert/core/src/ir/operation/LogSoftmax.cc
+++ b/runtime/onert/core/src/ir/operation/LogSoftmax.cc
@@ -31,7 +31,7 @@ void LogSoftmax::accept(OperationVisitor &v) const { v.visit(*this); }
LogSoftmax::LogSoftmax(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/MatrixBandPart.cc b/runtime/onert/core/src/ir/operation/MatrixBandPart.cc
index bac31f13e..6046e36fe 100644
--- a/runtime/onert/core/src/ir/operation/MatrixBandPart.cc
+++ b/runtime/onert/core/src/ir/operation/MatrixBandPart.cc
@@ -31,7 +31,7 @@ void MatrixBandPart::accept(OperationVisitor &v) const { v.visit(*this); }
MatrixBandPart::MatrixBandPart(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/OneHot.cc b/runtime/onert/core/src/ir/operation/OneHot.cc
index 22935e7d6..90898f1ed 100644
--- a/runtime/onert/core/src/ir/operation/OneHot.cc
+++ b/runtime/onert/core/src/ir/operation/OneHot.cc
@@ -28,7 +28,7 @@ void OneHot::accept(OperationVisitor &v) const { v.visit(*this); }
OneHot::OneHot(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(4u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(4u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/PReLU.cc b/runtime/onert/core/src/ir/operation/PReLU.cc
index a2e37e0ad..5ed31c2b9 100644
--- a/runtime/onert/core/src/ir/operation/PReLU.cc
+++ b/runtime/onert/core/src/ir/operation/PReLU.cc
@@ -30,7 +30,7 @@ namespace operation
void PReLU::accept(OperationVisitor &v) const { v.visit(*this); }
PReLU::PReLU(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Pack.cc b/runtime/onert/core/src/ir/operation/Pack.cc
index 784d4162a..00feadfb0 100644
--- a/runtime/onert/core/src/ir/operation/Pack.cc
+++ b/runtime/onert/core/src/ir/operation/Pack.cc
@@ -25,7 +25,7 @@ namespace operation
void Pack::accept(OperationVisitor &v) const { v.visit(*this); }
Pack::Pack(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createAtLeast(1u), inputs, outputs}, _param{param}
{
}
} // namespace operation
diff --git a/runtime/onert/core/src/ir/operation/Pad.cc b/runtime/onert/core/src/ir/operation/Pad.cc
index 0c56e92e3..a3f2d9752 100644
--- a/runtime/onert/core/src/ir/operation/Pad.cc
+++ b/runtime/onert/core/src/ir/operation/Pad.cc
@@ -30,7 +30,7 @@ void Pad::accept(OperationVisitor &v) const { v.visit(*this); }
// PAD: 2 inputs
// PADV2: 3 inputs
Pad::Pad(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}
+ : Operation{OperandConstraint::createInRange(2u, 3u), inputs, outputs}
{
}
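
Pad accepts two inputs for PAD and three for PADV2, hence
createInRange(2u, 3u); most constructors in this patch pin an exact count
(createExact) or a lower bound (createAtLeast). A hypothetical sketch of
what such a min/max operand-count constraint could look like -- an
assumption for illustration, not onert's actual OperandConstraint:

    #include <cstdint>

    class CountConstraint // hypothetical stand-in for OperandConstraint
    {
    public:
      static CountConstraint createExact(uint32_t n) { return {n, n}; }
      static CountConstraint createAtLeast(uint32_t n) { return {n, UINT32_MAX}; }
      static CountConstraint createInRange(uint32_t lo, uint32_t hi) { return {lo, hi}; }

      // True when an operation's operand count falls in the inclusive range.
      bool check(uint32_t count) const { return _min <= count && count <= _max; }

    private:
      CountConstraint(uint32_t lo, uint32_t hi) : _min{lo}, _max{hi} {}
      uint32_t _min, _max;
    };
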
diff --git a/runtime/onert/core/src/ir/operation/Permute.cc b/runtime/onert/core/src/ir/operation/Permute.cc
index eefb6c542..571965de8 100644
--- a/runtime/onert/core/src/ir/operation/Permute.cc
+++ b/runtime/onert/core/src/ir/operation/Permute.cc
@@ -30,7 +30,7 @@ namespace operation
void Permute::accept(OperationVisitor &v) const { v.visit(*this); }
Permute::Permute(const OperandIndex &input, const OperandIndex &output, Type type)
- : Operation{OperandConstraint::createExact(1u)}, _type{type}
+ : Operation{OperandConstraint::createExact(1u)}, _type{type}
{
setInputs({input});
setOutputs({output});
diff --git a/runtime/onert/core/src/ir/operation/Pool2D.cc b/runtime/onert/core/src/ir/operation/Pool2D.cc
index 761d14c3d..cbb42a80a 100644
--- a/runtime/onert/core/src/ir/operation/Pool2D.cc
+++ b/runtime/onert/core/src/ir/operation/Pool2D.cc
@@ -32,7 +32,7 @@ void Pool2D::accept(OperationVisitor &v) const { v.visit(*this); }
Pool2D::Pool2D(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
}
@@ -40,9 +40,9 @@ std::string Pool2D::name() const
{
using PoolType = onert::ir::operation::Pool2D::PoolType;
static const std::unordered_map<PoolType, std::string> name_map{
- {PoolType::AVG, "Avg" + std::string{toString(opcode())}},
- {PoolType::L2, "L2" + std::string{toString(opcode())}},
- {PoolType::MAX, "Max" + std::string{toString(opcode())}}};
+ {PoolType::AVG, "Avg" + std::string{toString(opcode())}},
+ {PoolType::L2, "L2" + std::string{toString(opcode())}},
+ {PoolType::MAX, "Max" + std::string{toString(opcode())}}};
return name_map.at(_param.op_type);
}
diff --git a/runtime/onert/core/src/ir/operation/Pow.cc b/runtime/onert/core/src/ir/operation/Pow.cc
index 940b1391a..f1df54c60 100644
--- a/runtime/onert/core/src/ir/operation/Pow.cc
+++ b/runtime/onert/core/src/ir/operation/Pow.cc
@@ -30,7 +30,7 @@ namespace operation
void Pow::accept(OperationVisitor &v) const { v.visit(*this); }
Pow::Pow(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/RNN.cc b/runtime/onert/core/src/ir/operation/RNN.cc
index 298c5e745..a40e5bdc9 100644
--- a/runtime/onert/core/src/ir/operation/RNN.cc
+++ b/runtime/onert/core/src/ir/operation/RNN.cc
@@ -31,7 +31,7 @@ void RNN::accept(OperationVisitor &v) const { v.visit(*this); }
RNN::RNN(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(5u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(5u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Range.cc b/runtime/onert/core/src/ir/operation/Range.cc
index 96ab04c1b..f85d52cb0 100644
--- a/runtime/onert/core/src/ir/operation/Range.cc
+++ b/runtime/onert/core/src/ir/operation/Range.cc
@@ -30,7 +30,7 @@ namespace operation
void Range::accept(OperationVisitor &v) const { v.visit(*this); }
Range::Range(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Rank.cc b/runtime/onert/core/src/ir/operation/Rank.cc
index c357e9018..c33ed0a80 100644
--- a/runtime/onert/core/src/ir/operation/Rank.cc
+++ b/runtime/onert/core/src/ir/operation/Rank.cc
@@ -30,7 +30,7 @@ namespace operation
void Rank::accept(OperationVisitor &v) const { v.visit(*this); }
Rank::Rank(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Reduce.cc b/runtime/onert/core/src/ir/operation/Reduce.cc
index d6a1d953c..0811f1c37 100644
--- a/runtime/onert/core/src/ir/operation/Reduce.cc
+++ b/runtime/onert/core/src/ir/operation/Reduce.cc
@@ -32,7 +32,7 @@ void Reduce::accept(OperationVisitor &v) const { v.visit(*this); }
Reduce::Reduce(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
{
}
@@ -40,13 +40,13 @@ std::string Reduce::name() const
{
using ReduceType = onert::ir::operation::Reduce::ReduceType;
static const std::unordered_map<ReduceType, std::string> name_map{
- {ReduceType::ALL, std::string{toString(opcode())} + "All"},
- {ReduceType::ANY, std::string{toString(opcode())} + "Any"},
- {ReduceType::MAX, std::string{toString(opcode())} + "Max"},
- {ReduceType::MEAN, std::string{toString(opcode())} + "Mean"},
- {ReduceType::MIN, std::string{toString(opcode())} + "Min"},
- {ReduceType::PROD, std::string{toString(opcode())} + "Prod"},
- {ReduceType::SUM, std::string{toString(opcode())} + "SUM"}};
+ {ReduceType::ALL, std::string{toString(opcode())} + "All"},
+ {ReduceType::ANY, std::string{toString(opcode())} + "Any"},
+ {ReduceType::MAX, std::string{toString(opcode())} + "Max"},
+ {ReduceType::MEAN, std::string{toString(opcode())} + "Mean"},
+ {ReduceType::MIN, std::string{toString(opcode())} + "Min"},
+ {ReduceType::PROD, std::string{toString(opcode())} + "Prod"},
+ {ReduceType::SUM, std::string{toString(opcode())} + "SUM"}};
return name_map.at(_param.reduce_type);
// return std::string(toString(opcode())) + reduce_type_str_map.at(_param.reduce_type);
}
diff --git a/runtime/onert/core/src/ir/operation/Reshape.cc b/runtime/onert/core/src/ir/operation/Reshape.cc
index 92aa89ac6..54c12574a 100644
--- a/runtime/onert/core/src/ir/operation/Reshape.cc
+++ b/runtime/onert/core/src/ir/operation/Reshape.cc
@@ -31,7 +31,7 @@ void Reshape::accept(OperationVisitor &v) const { v.visit(*this); }
Reshape::Reshape(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param(param)
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param(param)
{
}
diff --git a/runtime/onert/core/src/ir/operation/ResizeBilinear.cc b/runtime/onert/core/src/ir/operation/ResizeBilinear.cc
index 71925bb44..7c9f5e104 100644
--- a/runtime/onert/core/src/ir/operation/ResizeBilinear.cc
+++ b/runtime/onert/core/src/ir/operation/ResizeBilinear.cc
@@ -31,7 +31,7 @@ void ResizeBilinear::accept(OperationVisitor &v) const { v.visit(*this); }
ResizeBilinear::ResizeBilinear(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc b/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc
index 98d0b5f26..9792b292d 100644
--- a/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc
+++ b/runtime/onert/core/src/ir/operation/ResizeNearestNeighbor.cc
@@ -32,7 +32,7 @@ void ResizeNearestNeighbor::accept(OperationVisitor &v) const { v.visit(*this);
ResizeNearestNeighbor::ResizeNearestNeighbor(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createInRange(1u, 2u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Reverse.cc b/runtime/onert/core/src/ir/operation/Reverse.cc
index 4b3c1e1af..471457739 100644
--- a/runtime/onert/core/src/ir/operation/Reverse.cc
+++ b/runtime/onert/core/src/ir/operation/Reverse.cc
@@ -30,7 +30,7 @@ namespace operation
void Reverse::accept(OperationVisitor &v) const { v.visit(*this); }
Reverse::Reverse(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Select.cc b/runtime/onert/core/src/ir/operation/Select.cc
index 1f22b5234..59684190c 100644
--- a/runtime/onert/core/src/ir/operation/Select.cc
+++ b/runtime/onert/core/src/ir/operation/Select.cc
@@ -28,7 +28,7 @@ namespace operation
void Select::accept(OperationVisitor &v) const { v.visit(*this); }
Select::Select(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Shape.cc b/runtime/onert/core/src/ir/operation/Shape.cc
index 2a63d6dcf..1b2cd6241 100644
--- a/runtime/onert/core/src/ir/operation/Shape.cc
+++ b/runtime/onert/core/src/ir/operation/Shape.cc
@@ -30,7 +30,7 @@ namespace operation
void Shape::accept(OperationVisitor &v) const { v.visit(*this); }
Shape::Shape(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Slice.cc b/runtime/onert/core/src/ir/operation/Slice.cc
index 888b563fb..1362c0f91 100644
--- a/runtime/onert/core/src/ir/operation/Slice.cc
+++ b/runtime/onert/core/src/ir/operation/Slice.cc
@@ -27,7 +27,7 @@ namespace operation
void Slice::accept(OperationVisitor &v) const { v.visit(*this); }
Slice::Slice(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Softmax.cc b/runtime/onert/core/src/ir/operation/Softmax.cc
index 3f1aa0af1..91850fa33 100644
--- a/runtime/onert/core/src/ir/operation/Softmax.cc
+++ b/runtime/onert/core/src/ir/operation/Softmax.cc
@@ -31,7 +31,7 @@ void Softmax::accept(OperationVisitor &v) const { v.visit(*this); }
Softmax::Softmax(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/SpaceToBatchND.cc b/runtime/onert/core/src/ir/operation/SpaceToBatchND.cc
index 53fab4fa9..97c630888 100644
--- a/runtime/onert/core/src/ir/operation/SpaceToBatchND.cc
+++ b/runtime/onert/core/src/ir/operation/SpaceToBatchND.cc
@@ -31,7 +31,7 @@ void SpaceToBatchND::accept(OperationVisitor &v) const { v.visit(*this); }
SpaceToBatchND::SpaceToBatchND(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/SpaceToDepth.cc b/runtime/onert/core/src/ir/operation/SpaceToDepth.cc
index d8a45aee5..e1fd27a55 100644
--- a/runtime/onert/core/src/ir/operation/SpaceToDepth.cc
+++ b/runtime/onert/core/src/ir/operation/SpaceToDepth.cc
@@ -31,7 +31,7 @@ void SpaceToDepth::accept(OperationVisitor &v) const { v.visit(*this); }
SpaceToDepth::SpaceToDepth(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Split.cc b/runtime/onert/core/src/ir/operation/Split.cc
index b538e9206..96822822b 100644
--- a/runtime/onert/core/src/ir/operation/Split.cc
+++ b/runtime/onert/core/src/ir/operation/Split.cc
@@ -25,7 +25,7 @@ namespace operation
void Split::accept(OperationVisitor &v) const { v.visit(*this); }
Split::Split(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}, _param{param}
{
}
} // namespace operation
diff --git a/runtime/onert/core/src/ir/operation/SplitV.cc b/runtime/onert/core/src/ir/operation/SplitV.cc
index e638c9ac9..38918cd81 100644
--- a/runtime/onert/core/src/ir/operation/SplitV.cc
+++ b/runtime/onert/core/src/ir/operation/SplitV.cc
@@ -25,7 +25,7 @@ namespace operation
void SplitV::accept(OperationVisitor &v) const { v.visit(*this); }
SplitV::SplitV(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
{
}
} // namespace operation
diff --git a/runtime/onert/core/src/ir/operation/SquaredDifference.cc b/runtime/onert/core/src/ir/operation/SquaredDifference.cc
index 49e58aaf2..705b60abc 100644
--- a/runtime/onert/core/src/ir/operation/SquaredDifference.cc
+++ b/runtime/onert/core/src/ir/operation/SquaredDifference.cc
@@ -31,7 +31,7 @@ void SquaredDifference::accept(OperationVisitor &v) const { v.visit(*this); }
SquaredDifference::SquaredDifference(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Squeeze.cc b/runtime/onert/core/src/ir/operation/Squeeze.cc
index 8cf928fb4..e059c4bee 100644
--- a/runtime/onert/core/src/ir/operation/Squeeze.cc
+++ b/runtime/onert/core/src/ir/operation/Squeeze.cc
@@ -28,7 +28,7 @@ void Squeeze::accept(OperationVisitor &v) const { v.visit(*this); }
Squeeze::Squeeze(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param(param)
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param(param)
{
}
diff --git a/runtime/onert/core/src/ir/operation/StatelessRandomUniform.cc b/runtime/onert/core/src/ir/operation/StatelessRandomUniform.cc
index cbb0ff251..18f1cf5a6 100644
--- a/runtime/onert/core/src/ir/operation/StatelessRandomUniform.cc
+++ b/runtime/onert/core/src/ir/operation/StatelessRandomUniform.cc
@@ -30,7 +30,7 @@ void StatelessRandomUniform::accept(OperationVisitor &v) const { v.visit(*this);
StatelessRandomUniform::StatelessRandomUniform(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/StridedSlice.cc b/runtime/onert/core/src/ir/operation/StridedSlice.cc
index 2a7905995..e8278b456 100644
--- a/runtime/onert/core/src/ir/operation/StridedSlice.cc
+++ b/runtime/onert/core/src/ir/operation/StridedSlice.cc
@@ -31,7 +31,7 @@ void StridedSlice::accept(OperationVisitor &v) const { v.visit(*this); }
StridedSlice::StridedSlice(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(4u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(4u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Tile.cc b/runtime/onert/core/src/ir/operation/Tile.cc
index 5ba3df2ad..0ec785579 100644
--- a/runtime/onert/core/src/ir/operation/Tile.cc
+++ b/runtime/onert/core/src/ir/operation/Tile.cc
@@ -30,7 +30,7 @@ namespace operation
void Tile::accept(OperationVisitor &v) const { v.visit(*this); }
Tile::Tile(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/TopKV2.cc b/runtime/onert/core/src/ir/operation/TopKV2.cc
index a5e6c6a85..a1f39202d 100644
--- a/runtime/onert/core/src/ir/operation/TopKV2.cc
+++ b/runtime/onert/core/src/ir/operation/TopKV2.cc
@@ -31,7 +31,7 @@ void TopKV2::accept(OperationVisitor &v) const { v.visit(*this); }
TopKV2::TopKV2(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Transpose.cc b/runtime/onert/core/src/ir/operation/Transpose.cc
index 997f98ab0..f2ee52f0e 100644
--- a/runtime/onert/core/src/ir/operation/Transpose.cc
+++ b/runtime/onert/core/src/ir/operation/Transpose.cc
@@ -30,7 +30,7 @@ namespace operation
void Transpose::accept(OperationVisitor &v) const { v.visit(*this); }
Transpose::Transpose(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/TransposeConv.cc b/runtime/onert/core/src/ir/operation/TransposeConv.cc
index 7f29ca44e..1f405dc6b 100644
--- a/runtime/onert/core/src/ir/operation/TransposeConv.cc
+++ b/runtime/onert/core/src/ir/operation/TransposeConv.cc
@@ -31,7 +31,7 @@ void TransposeConv::accept(OperationVisitor &v) const { v.visit(*this); }
TransposeConv::TransposeConv(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs, const Param &param)
- : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Unpack.cc b/runtime/onert/core/src/ir/operation/Unpack.cc
index 67aa54ab5..90d3c0c07 100644
--- a/runtime/onert/core/src/ir/operation/Unpack.cc
+++ b/runtime/onert/core/src/ir/operation/Unpack.cc
@@ -25,7 +25,7 @@ namespace operation
void Unpack::accept(OperationVisitor &v) const { v.visit(*this); }
Unpack::Unpack(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}, _param{param}
{
}
} // namespace operation
diff --git a/runtime/onert/core/src/ir/operation/While.cc b/runtime/onert/core/src/ir/operation/While.cc
index 2505c60e3..8a6f5c01e 100644
--- a/runtime/onert/core/src/ir/operation/While.cc
+++ b/runtime/onert/core/src/ir/operation/While.cc
@@ -25,7 +25,7 @@ namespace operation
void While::accept(OperationVisitor &v) const { v.visit(*this); }
While::While(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
const Param &param)
- : Operation{OperandConstraint::createAny(), inputs, outputs}, _param{param}
+ : Operation{OperandConstraint::createAny(), inputs, outputs}, _param{param}
{
}
} // namespace operation
diff --git a/runtime/onert/core/src/ir/verifier/Verifier.cc b/runtime/onert/core/src/ir/verifier/Verifier.cc
index 7d05acb28..25a82d5a2 100644
--- a/runtime/onert/core/src/ir/verifier/Verifier.cc
+++ b/runtime/onert/core/src/ir/verifier/Verifier.cc
@@ -39,11 +39,11 @@ bool DAGChecker::verify(const Graph &graph) const noexcept
OperationIndexMap<bool> visited;
operations.iterate(
- [&](const OperationIndex &index, const Operation &) { visited[index] = false; });
+ [&](const OperationIndex &index, const Operation &) { visited[index] = false; });
OperationIndexMap<bool> on_stack = visited; // Copy from visited
std::function<void(const OperationIndex &index, const Operation &)> dfs_recursive =
- [&](const OperationIndex &index, const Operation &node) -> void {
+ [&](const OperationIndex &index, const Operation &node) -> void {
if (on_stack[index])
cyclic = true;
if (visited[index])
@@ -72,7 +72,7 @@ bool DAGChecker::verify(const Graph &graph) const noexcept
// EdgeConsistencyVerifier
//
-bool EdgeConsistencyChecker::verify(const Graph &graph) const noexcept
+bool EdgeChecker::verify(const Graph &graph) const noexcept
{
auto &operations = graph.operations();
uint32_t errors = 0;
@@ -85,17 +85,16 @@ bool EdgeConsistencyChecker::verify(const Graph &graph) const noexcept
bool operand_has_use = operand.getUses().contains(index);
if (!operand_has_use)
{
- VERBOSE(EdgeConsistencyChecker) << "[ERROR] EDGE MISMATCH : Missing USE edge - Operand "
- << operand_index << " to Operation " << index
- << std::endl;
+ VERBOSE(EdgeChecker) << "[ERROR] EDGE MISMATCH : Missing USE edge - Operand "
+ << operand_index << " to Operation " << index << std::endl;
errors += 1;
}
}
catch (const std::out_of_range &e)
{
- VERBOSE(EdgeConsistencyChecker)
- << "[ERROR] OPEARAND NOT FOUND : Operation " << index << " has Operand "
- << operand_index << ", but the operand object is not present in the graph" << std::endl;
+      VERBOSE(EdgeChecker) << "[ERROR] OPERAND NOT FOUND : Operation " << index
+ << " has Operand " << operand_index
+ << ", but the operand object is not present in the graph" << std::endl;
errors += 1;
}
}
@@ -106,23 +105,22 @@ bool EdgeConsistencyChecker::verify(const Graph &graph) const noexcept
auto &operand = graph.operands().at(operand_index);
if (operand.getDef() != index)
{
- VERBOSE(EdgeConsistencyChecker) << "[ERROR] EDGE MISMATCH : Missing DEF edge - Operand"
- << operand_index << " to Operation " << index
- << std::endl;
+          VERBOSE(EdgeChecker) << "[ERROR] EDGE MISMATCH : Missing DEF edge - Operand "
+ << operand_index << " to Operation " << index << std::endl;
errors += 1;
}
}
catch (const std::out_of_range &e)
{
- VERBOSE(EdgeConsistencyChecker)
- << "[ERROR] OPEARAND NOT FOUND : Operation " << index << " has Operand "
- << operand_index << ", but the operand object is not present in the graph" << std::endl;
+          VERBOSE(EdgeChecker) << "[ERROR] OPERAND NOT FOUND : Operation " << index
+ << " has Operand " << operand_index
+ << ", but the operand object is not present in the graph" << std::endl;
errors += 1;
}
}
});
- VERBOSE(EdgeConsistencyChecker) << "Total Number of errors : " << errors << std::endl;
+ VERBOSE(EdgeChecker) << "Total Number of errors : " << errors << std::endl;
return errors == 0;
}
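
DAGChecker::verify (top of this file) detects cycles by tracking two flags
per node: visited, and on_stack for nodes still on the current DFS chain;
revisiting an on_stack node means a back edge, i.e. a cycle. The same
scheme over a plain adjacency list, as a self-contained sketch (it mirrors
the shape of the code above, not onert's graph types):

    #include <functional>
    #include <vector>

    bool has_cycle(const std::vector<std::vector<int>> &adj)
    {
      const int n = static_cast<int>(adj.size());
      std::vector<bool> visited(n, false), on_stack(n, false);
      bool cyclic = false;

      std::function<void(int)> dfs = [&](int u) {
        if (on_stack[u]) { cyclic = true; return; } // back edge: cycle
        if (visited[u]) return;                     // already explored
        visited[u] = true;
        on_stack[u] = true;
        for (int v : adj[u]) dfs(v);
        on_stack[u] = false; // leaving the current DFS chain
      };

      for (int u = 0; u < n; ++u) dfs(u);
      return cyclic;
    }
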
diff --git a/runtime/onert/core/src/ir/verifier/Verifier.h b/runtime/onert/core/src/ir/verifier/Verifier.h
index 143db343a..fa1311983 100644
--- a/runtime/onert/core/src/ir/verifier/Verifier.h
+++ b/runtime/onert/core/src/ir/verifier/Verifier.h
@@ -55,7 +55,7 @@ public:
bool verify(const Graph &graph) const noexcept override;
};
-class EdgeConsistencyChecker : public IVerifier
+class EdgeChecker : public IVerifier
{
public:
bool verify(const Graph &graph) const noexcept override;
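
The rename to EdgeChecker is purely cosmetic: both checkers still implement
the same IVerifier interface, so call sites keep a uniform verify(graph).
A simplified sketch of how such verifiers compose (the Graph type and
checker registration are elided; this mirrors the interface above rather
than a call site copied from onert):

    #include <memory>
    #include <vector>

    struct Graph; // opaque here

    struct IVerifier
    {
      virtual ~IVerifier() = default;
      virtual bool verify(const Graph &graph) const noexcept = 0;
    };

    // A graph is accepted only if every registered checker passes.
    bool verify_all(const std::vector<std::unique_ptr<IVerifier>> &checkers,
                    const Graph &g)
    {
      for (const auto &c : checkers)
        if (!c->verify(g))
          return false;
      return true;
    }
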
diff --git a/runtime/onert/core/src/util/ChromeTracingEventWriter.cc b/runtime/onert/core/src/util/ChromeTracingEventWriter.cc
new file mode 100644
index 000000000..3fc0c8ece
--- /dev/null
+++ b/runtime/onert/core/src/util/ChromeTracingEventWriter.cc
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/EventWriter.h"
+
+#include <sstream>
+#include <vector>
+#include <cassert>
+#include <utility>
+
+// json type for ChromeTracingWriter
+namespace
+{
+
+std::string quote(const std::string &value)
+{
+ std::stringstream ss;
+ ss << '"' << value << '"';
+ return ss.str();
+}
+
+std::string field(const std::string &k, const std::string &v)
+{
+ std::stringstream ss;
+ ss << quote(k) << " : " << quote(v);
+ return ss.str();
+}
+
+struct Content // One Entry in Chrome Event Trace
+{
+ std::vector<std::pair<std::string, std::string>> flds;
+ std::vector<std::pair<std::string, std::string>> args;
+};
+
+std::string object(const Content &content)
+{
+ std::stringstream ss;
+
+ ss << "{ ";
+
+ ss << field(content.flds[0].first, content.flds[0].second);
+
+ for (uint32_t n = 1; n < content.flds.size(); ++n)
+ {
+ ss << ", " << field(content.flds.at(n).first, content.flds.at(n).second);
+ }
+
+ if (content.args.size() > 0)
+ {
+ ss << ", " << quote("args") << " : { ";
+ ss << field(content.args.at(0).first, content.args.at(0).second);
+
+ for (uint32_t n = 1; n < content.args.size(); ++n)
+ {
+ ss << ", " << field(content.args.at(n).first, content.args.at(n).second);
+ }
+
+ ss << "}";
+ }
+
+ ss << " }";
+
+ return ss.str();
+}
+
+void fill(Content &content, const DurationEvent &evt, const std::string &name,
+ const std::string &tid)
+{
+ content.flds.emplace_back("name", name);
+ content.flds.emplace_back("pid", "0");
+ content.flds.emplace_back("tid", tid);
+ content.flds.emplace_back("ph", evt.ph);
+ content.flds.emplace_back("ts", evt.ts);
+ content.args = evt.args;
+}
+
+void fill(Content &content, const CounterEvent &evt)
+{
+ assert(evt.name != "");
+
+ content.flds.emplace_back("name", evt.name);
+ content.flds.emplace_back("pid", "0");
+ content.flds.emplace_back("tid", evt.tid);
+ content.flds.emplace_back("ph", evt.ph);
+ content.flds.emplace_back("ts", evt.ts);
+ content.args = evt.args;
+}
+
+std::string object(const DurationEvent &evt, const std::string &name, const std::string &tid)
+{
+ Content content;
+
+ fill(content, evt, name, tid);
+
+ return ::object(content);
+}
+
+std::string object(const CounterEvent &evt)
+{
+ Content content;
+
+ fill(content, evt);
+
+ for (auto it = evt.values.begin(); it != evt.values.end(); ++it)
+ {
+ content.args.emplace_back(it->first, it->second);
+ }
+
+ return ::object(content);
+}
+
+std::string getSessionLabel(const DurationEvent &evt)
+{
+ return "$" + std::to_string(evt.session_index) + " sess";
+}
+
+std::string getSubgLabel(const DurationEvent &evt)
+{
+ return "$" + std::to_string(evt.subg_index) + " subg";
+}
+
+std::string getOpLabel(const OpSeqDurationEvent &evt)
+{
+ return "@" + std::to_string(evt.op_index) + " " + evt.op_name;
+}
+
+std::string getLabel(const DurationEvent &evt)
+{
+ if (auto evt_ptr = dynamic_cast<const OpSeqDurationEvent *>(&evt))
+ {
+ return getOpLabel(*evt_ptr);
+ }
+ else // SubgDurationEvent
+ {
+ return getSubgLabel(evt);
+ }
+}
+
+std::string getTid(const DurationEvent &evt)
+{
+ if (auto evt_ptr = dynamic_cast<const OpSeqDurationEvent *>(&evt))
+ {
+ return getSessionLabel(*evt_ptr) + ", " + getSubgLabel(*evt_ptr) + ", " + evt_ptr->backend;
+ }
+ else // SubgDurationEvent
+ {
+ return getSessionLabel(evt) + ", " + getSubgLabel(evt);
+ }
+}
+
+} // namespace
+
+void ChromeTracingWriter::flush(const std::vector<std::unique_ptr<EventRecorder>> &recorders)
+{
+ _os << "{\n";
+ _os << " " << quote("traceEvents") << ": [\n";
+
+ for (auto &recorder : recorders)
+ {
+ flushOneRecord(*recorder);
+ }
+
+ _os << " { }\n";
+ _os << " ]\n";
+ _os << "}\n";
+}
+
+void ChromeTracingWriter::flushOneRecord(const EventRecorder &recorder)
+{
+ for (auto &evt : recorder.duration_events())
+ {
+ const std::string name = getLabel(*evt);
+ const std::string tid = getTid(*evt);
+
+ _os << " " << object(*evt, name, tid) << ",\n";
+ }
+
+ for (auto &evt : recorder.counter_events())
+ {
+ _os << " " << object(evt) << ",\n";
+ }
+}
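
The new writer emits the Chrome trace-event JSON format: a "traceEvents"
array whose entries carry name/pid/tid/ph/ts plus optional args, with
matching "B"/"E" pairs rendered as duration bars by chrome://tracing. The
trailing empty object "{ }" absorbs the comma left by the last record.
Roughly what one flushed operation event looks like (values illustrative):

    {
      "traceEvents": [
        { "name" : "@3 Conv2D", "pid" : "0",
          "tid" : "$0 sess, $0 subg, cpu", "ph" : "B", "ts" : "1000",
          "args" : { "session" : "0", "subgraph" : "0" } },
        { "name" : "@3 Conv2D", "pid" : "0",
          "tid" : "$0 sess, $0 subg, cpu", "ph" : "E", "ts" : "1350",
          "args" : { "session" : "0", "subgraph" : "0" } },
        { }
      ]
    }
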
diff --git a/runtime/onert/core/src/util/EventCollector.cc b/runtime/onert/core/src/util/EventCollector.cc
index fd5618714..83c2649d1 100644
--- a/runtime/onert/core/src/util/EventCollector.cc
+++ b/runtime/onert/core/src/util/EventCollector.cc
@@ -30,26 +30,62 @@ std::string timestamp(void)
{
auto now = std::chrono::steady_clock::now();
return std::to_string(
- std::chrono::duration_cast<std::chrono::microseconds>(now.time_since_epoch()).count());
+ std::chrono::duration_cast<std::chrono::microseconds>(now.time_since_epoch()).count());
}
-class DurationEventBuilder
+class DurationEventBuilder : public EventCollector::EventVisitor
{
public:
DurationEventBuilder(const std::string &ts) : _ts{ts} {}
- DurationEvent build(const EventCollector::Event &evt_collected, const std::string &ph) const
+ std::unique_ptr<SubgDurationEvent> build(const EventCollector::SubgEvent &evt_collected,
+ const std::string &ph) const
{
- DurationEvent evt;
+ auto dur_evt = std::make_unique<SubgDurationEvent>();
- evt.name = evt_collected.label;
- evt.tid = evt_collected.backend;
- evt.ph = ph;
- evt.ts = _ts;
+    // The following will be set by a child of EventWriter:
+ // dur_evt.name, dur_evt.tid
+ dur_evt->ph = ph;
+ dur_evt->ts = _ts;
+ dur_evt->tracing_ctx = evt_collected.tracing_ctx;
- evt.args = evt_collected.userData;
+ dur_evt->session_index = evt_collected.session_index;
+ dur_evt->subg_index = evt_collected.subg_index;
- return evt;
+ dur_evt->args = evt_collected.userData;
+ {
+ dur_evt->args.emplace_back("session", std::to_string(evt_collected.session_index));
+ dur_evt->args.emplace_back("subgraph", std::to_string(evt_collected.subg_index));
+ }
+
+ return dur_evt;
+ }
+
+      assert(param.alpha == 1.0f && param.beta == 1.0f &&
+             "f(x) = alpha * tanh(beta * x); Tanh is "
+             "supported only when alpha and "
+             "beta are 1.f");
+
+    // The following will be set by a child of EventWriter:
+ // dur_evt.name, dur_evt.tid
+ dur_evt->ph = ph;
+ dur_evt->ts = _ts;
+ dur_evt->tracing_ctx = evt_collected.tracing_ctx;
+
+ dur_evt->session_index = evt_collected.session_index;
+ dur_evt->subg_index = evt_collected.subg_index;
+
+ dur_evt->backend = evt_collected.backend;
+ dur_evt->op_index = evt_collected.op_index;
+ dur_evt->op_name = evt_collected.op_name;
+
+ dur_evt->args = evt_collected.userData;
+ {
+ dur_evt->args.emplace_back("session", std::to_string(evt_collected.session_index));
+ dur_evt->args.emplace_back("subgraph", std::to_string(evt_collected.subg_index));
+ }
+
+ return dur_evt;
}
private:
@@ -88,19 +124,26 @@ inline void emit_rusage(EventRecorder *rec, const std::string &ts)
} // namespace
-void EventCollector::onEvent(const Event &event)
+template <typename EventT> void EventCollector::onEvent(const EventT &event)
{
auto ts = timestamp();
+ DurationEventBuilder builder(ts);
+
switch (event.edge)
{
case Edge::BEGIN:
- _rec->emit(DurationEventBuilder(ts).build(event, "B"));
+ {
+ auto duration_evt = builder.build(event, "B");
+ _rec->emit(std::move(duration_evt));
break;
-
+ }
case Edge::END:
- _rec->emit(DurationEventBuilder(ts).build(event, "E"));
+ {
+ auto duration_evt = builder.build(event, "E");
+ _rec->emit(std::move(duration_evt));
break;
+ }
}
  // TODO: Add resource measurement (e.g. RSS)
@@ -109,3 +152,7 @@ void EventCollector::onEvent(const Event &event)
emit_rusage(_rec, ts);
#endif
}
+
+// template instantiation
+template void EventCollector::onEvent<EventCollector::SubgEvent>(const SubgEvent &event);
+template void EventCollector::onEvent<EventCollector::OpSeqEvent>(const OpSeqEvent &event);
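
onEvent is now a member template whose definition lives in this .cc file,
so the two explicit instantiations above are what give the linker bodies
for SubgEvent and OpSeqEvent; any other event type would fail at link
time. The pattern in isolation, with hypothetical names:

    // widget.h
    struct Widget
    {
      template <typename T> void log(const T &value);
    };

    // widget.cc
    #include <iostream>

    template <typename T> void Widget::log(const T &value)
    {
      std::cout << value << '\n';
    }

    // Emit code for exactly these types so the definition can stay out of
    // the header; calls with any other T will not link.
    template void Widget::log<int>(const int &);
    template void Widget::log<double>(const double &);
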
diff --git a/runtime/onert/core/src/util/EventCollector.h b/runtime/onert/core/src/util/EventCollector.h
index 7daa4851f..774fe05ef 100644
--- a/runtime/onert/core/src/util/EventCollector.h
+++ b/runtime/onert/core/src/util/EventCollector.h
@@ -18,6 +18,7 @@
#define __ONERT_UTIL_EVENT_COLLECTOR_H__
#include "util/EventRecorder.h"
+#include "util/TracingCtx.h"
#include <vector>
#include <utility>
@@ -32,29 +33,71 @@ public:
END
};
+ struct SubgEvent;
+ struct OpEvent;
+
+ class EventVisitor
+ {
+ public:
+ virtual ~EventVisitor() = default;
+
+ virtual std::unique_ptr<DurationEvent> visit(const SubgEvent &, const std::string &) const
+ {
+ throw std::runtime_error("Please implement");
+ }
+ virtual std::unique_ptr<DurationEvent> visit(const OpEvent &, const std::string &) const
+ {
+ throw std::runtime_error("Please implement");
+ }
+ };
+
struct Event
{
+ const onert::util::TracingCtx *tracing_ctx;
+
Edge edge;
uint32_t session_index;
uint32_t subg_index;
- std::string backend;
- uint32_t op_index;
- std::string op_name;
- uint32_t op_seq_size; // if this event is for an operation sequence of multiple operations
-
- // TODO deprecate this. label can be differ by writer. So let the writer decide label.
- std::string label;
// user-defined data: pairs of (key, value)
std::vector<std::pair<std::string, std::string>> userData;
- Event(Edge a_edge, const std::string &a_backend, const std::string &a_label)
- : edge(a_edge), session_index(0), subg_index(0), backend(a_backend), op_index(0),
- op_seq_size(0), label(a_label)
+ protected:
+ Event(const onert::util::TracingCtx *a_tracing_ctx, Edge a_edge, uint32_t a_subg_index)
+ : tracing_ctx(a_tracing_ctx), edge(a_edge), session_index(tracing_ctx->getSessionId()),
+ subg_index(a_subg_index)
+ { /* empty */
+ }
+
+ virtual ~Event() = default;
+ };
+
+ struct SubgEvent : public Event
+ {
+ // constructor for subgraph start and end event
+ SubgEvent(const onert::util::TracingCtx *a_tracing_ctx, Edge a_edge, uint32_t a_subg_index)
+ : Event(a_tracing_ctx, a_edge, a_subg_index)
{ /* empty */
}
};
+ // TODO Rename this to OperationEvent
+ struct OpSeqEvent : public Event
+ {
+ std::string backend;
+ uint32_t op_index;
+ std::string op_name;
+
+ OpSeqEvent(const onert::util::TracingCtx *a_tracing_ctx, Edge a_edge, uint32_t a_subg_index,
+ const std::string a_backend, uint32_t a_op_index, const std::string a_op_name)
+ : Event(a_tracing_ctx, a_edge, a_subg_index)
+ {
+ backend.assign(a_backend);
+ op_index = a_op_index;
+ op_name.assign(a_op_name);
+ }
+ };
+
public:
EventCollector(EventRecorder *rec) : _rec{rec}
{
@@ -62,7 +105,7 @@ public:
}
public:
- void onEvent(const Event &event);
+ template <typename EventT> void onEvent(const EventT &event);
protected:
EventRecorder *_rec;
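
After this change the collector is fed typed events rather than one
catch-all struct: subgraph begin/end goes through SubgEvent, and
per-operation timing through OpSeqEvent, each constructor pulling the
session index from the TracingCtx. Roughly how a call site might emit a
pair of events -- a sketch assuming an EventCollector named collector and
a valid TracingCtx *ctx (its getSessionId() is dereferenced in Event's
constructor), with subgraph index 0:

    collector.onEvent(
      EventCollector::SubgEvent{ctx, EventCollector::Edge::BEGIN, 0});
    // ... run the subgraph; operation events are emitted in between ...
    collector.onEvent(EventCollector::OpSeqEvent{
      ctx, EventCollector::Edge::END, 0, "cpu", /*op_index=*/3, "Conv2D"});
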
diff --git a/runtime/onert/core/src/util/EventRecorder.cc b/runtime/onert/core/src/util/EventRecorder.cc
index 3714e4f02..5d3d5f5c6 100644
--- a/runtime/onert/core/src/util/EventRecorder.cc
+++ b/runtime/onert/core/src/util/EventRecorder.cc
@@ -16,11 +16,11 @@
#include "util/EventRecorder.h"
-void EventRecorder::emit(const DurationEvent &evt)
+void EventRecorder::emit(std::unique_ptr<DurationEvent> &&evt)
{
std::lock_guard<std::mutex> lock{_mu};
- _duration_events.push_back(evt);
+ _duration_events.push_back(std::move(evt));
}
void EventRecorder::emit(const CounterEvent &evt)
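
emit now receives the duration event through a unique_ptr rvalue
reference, so the recorder takes ownership of the polymorphic event, and
the push_back happens under a lock so one recorder can serve several
threads. The same shape in isolation (a minimal sketch, not onert code):

    #include <memory>
    #include <mutex>
    #include <vector>

    struct Item { virtual ~Item() = default; };

    class Recorder
    {
    public:
      void emit(std::unique_ptr<Item> &&item)
      {
        std::lock_guard<std::mutex> lock{_mu};
        _items.push_back(std::move(item)); // caller's pointer is now empty
      }

    private:
      std::mutex _mu;
      std::vector<std::unique_ptr<Item>> _items;
    };
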
diff --git a/runtime/onert/core/src/util/EventRecorder.h b/runtime/onert/core/src/util/EventRecorder.h
index 3ed40875f..5cf03d8ac 100644
--- a/runtime/onert/core/src/util/EventRecorder.h
+++ b/runtime/onert/core/src/util/EventRecorder.h
@@ -17,28 +17,52 @@
#ifndef __ONERT_UTIL_EVENT_RECORDER_H__
#define __ONERT_UTIL_EVENT_RECORDER_H__
+#include "util/TracingCtx.h"
+
#include <map>
#include <memory>
#include <mutex>
#include <vector>
+// refer to https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/edit#
struct Event
{
- std::string name;
- std::string tid;
- std::string ph; /* REQUIRED */
- std::string ts; /* REQUIRED */
+ const onert::util::TracingCtx *tracing_ctx;
+
+ std::string ph; // Event type.
+  std::string ts; // timestamp of this event on the tracing clock
std::vector<std::pair<std::string, std::string>> args; // user-defined data: pairs of (key, value)
+
+ virtual ~Event() = default;
};
struct DurationEvent : public Event
{
- // TO BE FILLED
+ uint32_t session_index = 0;
+ uint32_t subg_index = 0;
+
+protected:
+ DurationEvent() = default;
+};
+
+struct SubgDurationEvent : public DurationEvent
+{ /* same with DurationEvent */
+};
+
+// TODO Rename it to OperationDurationEvent
+struct OpSeqDurationEvent : public DurationEvent
+{
+ // Note: DurationEvent's name and tid will be set by EventWriter
+ std::string backend;
+ uint32_t op_index;
+ std::string op_name;
};
struct CounterEvent : public Event
{
+ std::string name; // name of event
+ std::string tid; // thread ID
std::map<std::string, std::string> values;
};
@@ -53,17 +77,19 @@ public:
EventRecorder() = default;
public:
- void emit(const DurationEvent &evt);
+ void emit(std::unique_ptr<DurationEvent> &&evt);
void emit(const CounterEvent &evt);
public:
- bool empty() { return _duration_events.empty() && _counter_events.empty(); }
- const std::vector<DurationEvent> &duration_events() const { return _duration_events; }
+ const std::vector<std::unique_ptr<DurationEvent>> &duration_events() const
+ {
+ return _duration_events;
+ }
const std::vector<CounterEvent> &counter_events() const { return _counter_events; }
private:
std::mutex _mu;
- std::vector<DurationEvent> _duration_events;
+ std::vector<std::unique_ptr<DurationEvent>> _duration_events;
std::vector<CounterEvent> _counter_events;
};
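
Storing std::unique_ptr<DurationEvent> makes the recorded list
polymorphic, which is what lets ChromeTracingEventWriter's getLabel and
getTid (earlier in this patch) use dynamic_cast to distinguish operation
events from subgraph events. The downcast-as-type-test idiom, as a minimal
sketch:

    #include <string>

    struct DurEvent { virtual ~DurEvent() = default; };
    struct OpEvent : DurEvent { std::string op_name; };

    std::string label(const DurEvent &evt)
    {
      // dynamic_cast on a pointer yields nullptr when the runtime type
      // does not match, so it doubles as a type test.
      if (auto op = dynamic_cast<const OpEvent *>(&evt))
        return "op: " + op->op_name;
      return "subgraph"; // any other DurationEvent kind
    }
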
diff --git a/runtime/onert/core/src/util/EventWriter.cc b/runtime/onert/core/src/util/EventWriter.cc
index 8760a16db..c42c53730 100644
--- a/runtime/onert/core/src/util/EventWriter.cc
+++ b/runtime/onert/core/src/util/EventWriter.cc
@@ -16,547 +16,7 @@
#include "util/EventWriter.h"
-#include <sstream>
-#include <vector>
-#include <unordered_map>
-#include <json/json.h>
-#include <assert.h>
-#include <utility>
-#include <map>
-#include <set>
-#include <stdint.h>
-#include <fstream>
-
-// json type for Chrome Event Trace
-namespace
-{
-
-std::string quote(const std::string &value)
-{
- std::stringstream ss;
- ss << '"' << value << '"';
- return ss.str();
-}
-
-std::string field(const std::string &k, const std::string &v)
-{
- std::stringstream ss;
- ss << quote(k) << " : " << quote(v);
- return ss.str();
-}
-
-struct Content // One Entry in Chrome Event Trace
-{
- std::vector<std::pair<std::string, std::string>> flds;
- std::vector<std::pair<std::string, std::string>> args;
-};
-
-std::string object(const Content &content)
-{
- std::stringstream ss;
-
- ss << "{ ";
-
- ss << field(content.flds[0].first, content.flds[0].second);
-
- for (uint32_t n = 1; n < content.flds.size(); ++n)
- {
- ss << ", " << field(content.flds.at(n).first, content.flds.at(n).second);
- }
-
- if (content.args.size() > 0)
- {
- ss << ", " << quote("args") << " : { ";
- ss << field(content.args.at(0).first, content.args.at(0).second);
-
- for (uint32_t n = 1; n < content.args.size(); ++n)
- {
- ss << ", " << field(content.args.at(n).first, content.args.at(n).second);
- }
-
- ss << "}";
- }
-
- ss << " }";
-
- return ss.str();
-}
-
-void fill(Content &content, const Event &evt)
-{
- content.flds.emplace_back("name", evt.name);
- content.flds.emplace_back("pid", "0");
- content.flds.emplace_back("tid", evt.tid);
- content.flds.emplace_back("ph", evt.ph);
- content.flds.emplace_back("ts", evt.ts);
- content.args = evt.args;
-}
-
-std::string object(const DurationEvent &evt)
-{
- Content content;
-
- fill(content, evt);
-
- return ::object(content);
-}
-
-std::string object(const CounterEvent &evt)
-{
- Content content;
-
- fill(content, evt);
-
- for (auto it = evt.values.begin(); it != evt.values.end(); ++it)
- {
- content.args.emplace_back(it->first, it->second);
- }
-
- return ::object(content);
-}
-
-} // namespace
-
-// md table type
-namespace
-{
-
-void writeMDTableRow(std::ostream &os, const std::vector<std::string> &list)
-{
- os << "| ";
- for (auto &key : list)
- {
- os << key << " | ";
- }
- os << "\n";
-}
-
-struct MDContent
-{
- std::string name;
- uint64_t begin_ts;
- uint64_t end_ts;
- uint32_t min_rss;
- uint32_t max_rss;
- uint32_t min_page_reclaims;
- uint32_t max_page_reclaims;
-
- MDContent()
- : begin_ts(0), end_ts(0), min_rss(UINT32_MAX), max_rss(0), min_page_reclaims(UINT32_MAX),
- max_page_reclaims(0)
- {
- // DO NOTHING
- }
-
- virtual ~MDContent() = default;
-
- void updateRss(uint32_t rss)
- {
- if (min_rss == UINT32_MAX)
- min_rss = rss;
- if (max_rss == 0)
- max_rss = rss;
-
- if (min_rss > rss)
- min_rss = rss;
- else if (max_rss < rss)
- max_rss = rss;
- }
-
- void updateMinflt(uint32_t minflt)
- {
- if (min_page_reclaims == UINT32_MAX)
- min_page_reclaims = minflt;
- if (max_page_reclaims == 0)
- max_page_reclaims = minflt;
-
- if (min_page_reclaims > minflt)
- min_page_reclaims = minflt;
- else if (max_page_reclaims < minflt)
- max_page_reclaims = minflt;
- }
-
- virtual void write(std::ostream &os) const = 0;
-};
-
-struct OpSeq : public MDContent
-{
- std::string backend;
- uint64_t graph_latency;
-
- struct OpSeqCmp
- {
- bool operator()(const OpSeq &lhs, const OpSeq &rhs) const
- {
- return lhs.begin_ts < rhs.begin_ts;
- }
- bool operator()(const OpSeq &lhs, const OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; }
- bool operator()(OpSeq &lhs, OpSeq &rhs) { return lhs.begin_ts < rhs.begin_ts; }
- };
-
- void write(std::ostream &os) const override
- {
- uint64_t opseq_latency = end_ts - begin_ts;
- double opseq_per = static_cast<double>(opseq_latency) / graph_latency * 100.0;
- writeMDTableRow(os, {name, backend, std::to_string(opseq_latency), std::to_string(opseq_per),
- std::to_string(min_rss), std::to_string(max_rss),
- std::to_string(min_page_reclaims), std::to_string(max_page_reclaims)});
- }
-};
-
-struct Graph : public MDContent
-{
- std::set<OpSeq, OpSeq::OpSeqCmp> opseqs;
-
- void setOpSeqs(const std::map<std::string, OpSeq> &name_to_opseq)
- {
- uint64_t graph_latency = end_ts - begin_ts;
- for (auto it : name_to_opseq)
- {
- auto opseq = it.second;
- opseq.graph_latency = graph_latency;
-
- opseqs.insert(opseq);
-
- updateRss(opseq.min_rss);
- updateRss(opseq.max_rss);
- updateMinflt(opseq.min_page_reclaims);
- updateMinflt(opseq.max_page_reclaims);
- }
- }
-
- void write(std::ostream &os) const override
- {
- static std::vector<std::string> graph_headers{"latency(us)", "rss_min(kb)", "rss_max(kb)",
- "page_reclaims_min", "page_reclaims_max"};
-
- static std::vector<std::string> graph_headers_line{"-----------", "-------", "-------",
- "-----------------", "-----------------"};
-
- // Graph's Header
- writeMDTableRow(os, graph_headers);
- writeMDTableRow(os, graph_headers_line);
-
- // Graph's contents
- writeMDTableRow(os, {std::to_string(end_ts - begin_ts), std::to_string(min_rss),
- std::to_string(max_rss), std::to_string(min_page_reclaims),
- std::to_string(max_page_reclaims)});
-
- os << "\n";
-
- static std::vector<std::string> opseq_headers{
- "OpSeq name", "backend", "latency(us)", "latency(%)",
- "rss_min(kb)", "rss_max(kb)", "page_reclaims_min", "page_reclaims_max"};
-
- static std::vector<std::string> opseq_headers_line{
- "----------", "-------", "-----------", "-----------",
- "-------", "-------", "-----------------", "-----------------"};
-
- os << "## OpSequences \n";
-
- // OpSeq's Header
- writeMDTableRow(os, opseq_headers);
- writeMDTableRow(os, opseq_headers_line);
-
- // OpSeq's contents
- for (auto opseq : opseqs)
- {
- opseq.write(os);
- }
-
- os << "\n";
- }
-};
-
-struct MDTableBuilder
-{
- MDTableBuilder(const std::vector<DurationEvent> &duration_events,
- const std::vector<CounterEvent> &counter_events)
- : _duration_events(duration_events), _counter_events(counter_events)
- {
-// when ready with low overhead in release build
-#ifdef DEBUG
- for (const auto &evt : _counter_events)
- {
- uint64_t ts = std::stoull(evt.ts);
- auto &name = evt.name;
- assert(name.compare("maxrss") == 0 || name.compare("minflt") == 0);
- assert(evt.values.size() == 1);
- auto &val = evt.values.begin()->second;
- if (_ts_to_values.find(ts) == _ts_to_values.end())
- {
- std::pair<uint32_t, uint32_t> values;
- if (name.compare("maxrss") == 0)
- values.first = std::stoul(val);
- else
- values.second = std::stoul(val);
- _ts_to_values.insert({ts, values});
- }
- else
- {
- auto &values = _ts_to_values.at(ts);
- if (name.compare("maxrss") == 0)
- values.first = std::stoul(val);
- else
- values.second = std::stoul(val);
- }
- }
-#endif
- }
-
- MDTableBuilder &build()
- {
- for (auto &it : divideGraph())
- {
- size_t begin_idx = it.first;
- size_t end_idx = it.second;
- std::map<std::string, OpSeq> name_to_opseq;
- for (size_t i = begin_idx + 1; i < end_idx; ++i)
- {
- const auto &evt = _duration_events[i];
- assert(evt.name.compare("Graph") != 0);
- assert(evt.ph.compare("B") == 0 || evt.ph.compare("E") == 0);
- if (evt.ph.compare("B") == 0)
- {
- assert(name_to_opseq.find(evt.name) == name_to_opseq.end());
- name_to_opseq.insert({evt.name, makeOpSeq(evt)});
- }
- else
- {
- assert(name_to_opseq.find(evt.name) != name_to_opseq.end());
- auto &opseq = name_to_opseq.at(evt.name);
- updateOpSeq(opseq, evt);
- }
- }
-
- _graphs.emplace_back(makeGraph(begin_idx, end_idx, name_to_opseq));
- }
-
- return *this;
- }
-
- std::vector<std::pair<size_t, size_t>> divideGraph()
- {
- std::vector<std::pair<size_t, size_t>> graph_idx_list; // pair<begin_idx, end_idx>
- for (size_t i = 0, begin_idx = 0; i < _duration_events.size(); ++i)
- {
- const auto &evt = _duration_events.at(i);
- if (evt.name.compare("Graph") == 0)
- {
- if (evt.ph.compare("B") == 0)
- begin_idx = i;
- else
- graph_idx_list.emplace_back(begin_idx, i);
- }
- }
- return graph_idx_list;
- }
-
- OpSeq makeOpSeq(const DurationEvent &evt)
- {
- OpSeq opseq;
- opseq.name = evt.name;
- opseq.begin_ts = std::stoull(evt.ts);
- opseq.backend = evt.tid;
-#ifdef DEBUG
- opseq.updateRss(_ts_to_values.at(opseq.begin_ts).first);
- opseq.updateMinflt(_ts_to_values.at(opseq.begin_ts).second);
-#else
- opseq.updateRss(0);
- opseq.updateMinflt(0);
-#endif
- return opseq;
- }
-
- void updateOpSeq(OpSeq &opseq, const DurationEvent &evt)
- {
- opseq.end_ts = std::stoull(evt.ts);
-#ifdef DEBUG
- opseq.updateRss(_ts_to_values.at(opseq.end_ts).first);
- opseq.updateMinflt(_ts_to_values.at(opseq.end_ts).second);
-#else
- opseq.updateRss(0);
- opseq.updateMinflt(0);
-#endif
- }
-
- Graph makeGraph(size_t begin_idx, size_t end_idx,
- const std::map<std::string, OpSeq> &name_to_opseq)
- {
- Graph graph;
- graph.name = "Graph";
- graph.begin_ts = std::stoull(_duration_events[begin_idx].ts);
- graph.end_ts = std::stoull(_duration_events[end_idx].ts);
- graph.setOpSeqs(name_to_opseq);
-#ifdef DEBUG
- graph.updateRss(_ts_to_values.at(graph.begin_ts).first);
- graph.updateMinflt(_ts_to_values.at(graph.begin_ts).second);
- graph.updateRss(_ts_to_values.at(graph.end_ts).first);
- graph.updateMinflt(_ts_to_values.at(graph.end_ts).second);
-#else
- graph.updateRss(0);
- graph.updateMinflt(0);
-#endif
- return graph;
- }
-
- void write(std::ostream &os)
- {
- // Write contents
- for (size_t i = 0; i < _graphs.size(); ++i)
- {
- os << "# Graph " << i << "\n";
- _graphs.at(i).write(os);
- }
- }
-
- const std::vector<DurationEvent> &_duration_events;
- const std::vector<CounterEvent> &_counter_events;
- // timestamp to std::pair<maxrss, minflt>
- std::unordered_map<uint64_t, std::pair<uint32_t, uint32_t>> _ts_to_values;
- std::vector<Graph> _graphs;
-};
-
-} // namespace
-
-void SNPEWriter::flush(const std::vector<std::unique_ptr<EventRecorder>> &recorders)
-{
- Json::Value root;
- auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue};
-
- struct Stat
- {
- uint64_t sum = 0;
- uint64_t count = 0;
- uint64_t max = 0;
- uint64_t min = std::numeric_limits<uint64_t>::max();
-
- void accumulate(uint64_t val)
- {
- sum += val;
- count++;
- max = std::max(max, val);
- min = std::min(min, val);
- }
- };
-
- // Memory
- {
- std::unordered_map<std::string, Stat> mem_stats;
- for (auto &recorder : recorders)
- {
- for (auto &evt : recorder->counter_events())
- {
- auto &mem_stat = mem_stats[evt.name];
- uint64_t val = std::stoull(evt.values.at("value"));
- mem_stat.accumulate(val);
- }
- }
-
- auto &mem = exec_data["memory"] = Json::Value{Json::objectValue};
- for (auto &kv : mem_stats)
- {
- auto &key = kv.first;
- auto &val = kv.second;
- mem[key]["Avg_Size"] = val.sum / val.count;
- mem[key]["Max_Size"] = val.max;
- mem[key]["Min_Size"] = val.min;
- mem[key]["Runtime"] = "NA";
- }
- }
-
- // Operation Execution Time
- {
- // NOTE This assumes _duration_events is sorted by "ts" ascending
-
- // 2D keys : stats[tid][name]
- std::unordered_map<std::string, std::unordered_map<std::string, Stat>> stats;
- std::unordered_map<std::string, std::unordered_map<std::string, uint64_t>> begin_timestamps;
- for (auto &recorder : recorders)
- {
- for (auto &evt : recorder->duration_events())
- {
- auto &stat = stats[evt.tid][evt.name];
- auto &begin_ts = begin_timestamps[evt.tid][evt.name];
- uint64_t timestamp = std::stoull(evt.ts);
- if (evt.ph == "B")
- {
- if (begin_ts != 0)
- throw std::runtime_error{"Invalid Data"};
- begin_ts = timestamp;
- }
- else if (evt.ph == "E")
- {
- if (begin_ts == 0 || timestamp < begin_ts)
- throw std::runtime_error{"Invalid Data"};
- stat.accumulate(timestamp - begin_ts);
- begin_ts = 0;
- }
- else
- throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt.ph + "\""};
- }
- }
-
- for (auto &kv : begin_timestamps)
- for (auto &kv2 : kv.second)
- if (kv2.second != 0)
- throw std::runtime_error{"Invalid Data - B and E pair does not match."};
-
- for (auto &kv : stats)
- {
- auto &tid = kv.first;
- auto &map = kv.second;
- auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue};
- for (auto &kv : map)
- {
- auto &name = kv.first;
- auto &val = kv.second;
- json_tid[name]["Avg_Time"] = val.sum / val.count;
- json_tid[name]["Max_Time"] = val.max;
- json_tid[name]["Min_Time"] = val.min;
- json_tid[name]["Runtime"] = tid;
- }
- }
- }
-
- _os << root;
-}
-
-void ChromeTracingWriter::flush(const std::vector<std::unique_ptr<EventRecorder>> &recorders)
-{
- _os << "{\n";
- _os << " " << quote("traceEvents") << ": [\n";
-
- for (auto &recorder : recorders)
- {
- flushOneRecord(*recorder);
- }
-
- _os << " { }\n";
- _os << " ]\n";
- _os << "}\n";
-}
-
-void ChromeTracingWriter::flushOneRecord(const EventRecorder &recorder)
-{
- for (auto &evt : recorder.duration_events())
- {
- _os << " " << object(evt) << ",\n";
- }
-
- for (auto &evt : recorder.counter_events())
- {
- _os << " " << object(evt) << ",\n";
- }
-}
-
-void MDTableWriter::flush(const std::vector<std::unique_ptr<EventRecorder>> &records)
-{
- for (auto &recorder : records)
- {
- MDTableBuilder(recorder->duration_events(), recorder->counter_events()).build().write(_os);
- }
-}
+#include <cassert>
// initialization
std::mutex EventWriter::_mutex;
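
Note: the helpers removed above (quote/field/object) serialized each event into one
Chrome Event Trace entry. Per fill() and object(), a DurationEvent rendered roughly as
the following line (illustrative values; "args" appears only when the event carries any):

    { "name" : "ADD", "pid" : "0", "tid" : "cpu", "ph" : "B", "ts" : "1000", "args" : { "session" : "0" } }
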
diff --git a/runtime/onert/core/src/util/EventWriter.h b/runtime/onert/core/src/util/EventWriter.h
index 0dcd00be6..0a35a8508 100644
--- a/runtime/onert/core/src/util/EventWriter.h
+++ b/runtime/onert/core/src/util/EventWriter.h
@@ -29,7 +29,9 @@ class EventFormatWriter
{
public:
EventFormatWriter(const std::string &filepath) : _os{filepath, std::ofstream::out} {}
- virtual ~EventFormatWriter() { /* empty */}
+ virtual ~EventFormatWriter()
+ { /* empty */
+ }
virtual void flush(const std::vector<std::unique_ptr<EventRecorder>> &) = 0;
@@ -40,14 +42,22 @@ protected:
class SNPEWriter : public EventFormatWriter
{
public:
- SNPEWriter(const std::string &filepath) : EventFormatWriter(filepath) { /* empty */}
+ SNPEWriter(const std::string &filepath) : EventFormatWriter(filepath)
+ { /* empty */
+ }
+ ~SNPEWriter() {}
+
void flush(const std::vector<std::unique_ptr<EventRecorder>> &) override;
};
class ChromeTracingWriter : public EventFormatWriter
{
public:
- ChromeTracingWriter(const std::string &filepath) : EventFormatWriter(filepath) { /* empty */}
+ ChromeTracingWriter(const std::string &filepath) : EventFormatWriter(filepath)
+ { /* empty */
+ }
+ ~ChromeTracingWriter() {}
+
void flush(const std::vector<std::unique_ptr<EventRecorder>> &) override;
private:
@@ -57,13 +67,16 @@ private:
class MDTableWriter : public EventFormatWriter
{
public:
- MDTableWriter(const std::string &filepath) : EventFormatWriter(filepath) { /* empty */}
- void flush(const std::vector<std::unique_ptr<EventRecorder>> &) override;
+ MDTableWriter(const std::string &filepath) : EventFormatWriter(filepath)
+ { /* empty */
+ }
+ ~MDTableWriter() {}
-private:
- void flushOneRecord(const EventRecorder &);
+ void flush(const std::vector<std::unique_ptr<EventRecorder>> &) override;
};
+#include <mutex>
+
class EventWriter
{
public:
@@ -110,7 +123,7 @@ private:
_actual_writers[WriteFormat::SNPE_BENCHMARK] = std::make_unique<SNPEWriter>(snpe_log_name);
_actual_writers[WriteFormat::CHROME_TRACING] =
- std::make_unique<ChromeTracingWriter>(chrome_tracing_log_name);
+ std::make_unique<ChromeTracingWriter>(chrome_tracing_log_name);
_actual_writers[WriteFormat::MD_TABLE] = std::make_unique<MDTableWriter>(md_table_log_name);
};
diff --git a/runtime/onert/core/src/util/MDTableEventWriter.cc b/runtime/onert/core/src/util/MDTableEventWriter.cc
new file mode 100644
index 000000000..b7fbac5e2
--- /dev/null
+++ b/runtime/onert/core/src/util/MDTableEventWriter.cc
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/EventWriter.h"
+
+#include <sstream>
+#include <vector>
+#include <unordered_map>
+#include <cassert>
+#include <utility>
+#include <map>
+#include <set>
+#include <stdint.h>
+
+// md table type
+namespace
+{
+
+void writeMDTableRow(std::ostream &os, const std::vector<std::string> &list)
+{
+ os << "| ";
+ for (auto &key : list)
+ {
+ os << key << " | ";
+ }
+ os << "\n";
+}
+
+struct MDContent
+{
+ std::string name;
+ uint64_t begin_ts;
+ uint64_t end_ts;
+ uint32_t min_rss;
+ uint32_t max_rss;
+ uint32_t min_page_reclaims;
+ uint32_t max_page_reclaims;
+
+ MDContent()
+ : begin_ts(0), end_ts(0), min_rss(UINT32_MAX), max_rss(0), min_page_reclaims(UINT32_MAX),
+ max_page_reclaims(0)
+ {
+ // DO NOTHING
+ }
+
+ virtual ~MDContent() = default;
+
+ void updateRss(uint32_t rss)
+ {
+ if (min_rss == UINT32_MAX)
+ min_rss = rss;
+ if (max_rss == 0)
+ max_rss = rss;
+
+ if (min_rss > rss)
+ min_rss = rss;
+ else if (max_rss < rss)
+ max_rss = rss;
+ }
+
+ void updateMinflt(uint32_t minflt)
+ {
+ if (min_page_reclaims == UINT32_MAX)
+ min_page_reclaims = minflt;
+ if (max_page_reclaims == 0)
+ max_page_reclaims = minflt;
+
+ if (min_page_reclaims > minflt)
+ min_page_reclaims = minflt;
+ else if (max_page_reclaims < minflt)
+ max_page_reclaims = minflt;
+ }
+
+ virtual void write(std::ostream &os) const = 0;
+};
+
+struct Operation : public MDContent
+{
+ std::string backend;
+ uint64_t graph_latency;
+
+ struct OperationCmp
+ {
+ bool operator()(const Operation &lhs, const Operation &rhs) const
+ {
+ return lhs.begin_ts < rhs.begin_ts;
+ }
+ bool operator()(const Operation &lhs, const Operation &rhs)
+ {
+ return lhs.begin_ts < rhs.begin_ts;
+ }
+ bool operator()(Operation &lhs, Operation &rhs) { return lhs.begin_ts < rhs.begin_ts; }
+ };
+
+ void write(std::ostream &os) const override
+ {
+ uint64_t op_latency = end_ts - begin_ts;
+ double op_per = static_cast<double>(op_latency) / graph_latency * 100.0;
+ writeMDTableRow(os, {name, backend, std::to_string(op_latency), std::to_string(op_per),
+ std::to_string(min_rss), std::to_string(max_rss),
+ std::to_string(min_page_reclaims), std::to_string(max_page_reclaims)});
+ }
+};
+
+struct Graph : public MDContent
+{
+ std::set<Operation, Operation::OperationCmp> ops;
+ std::string session_index;
+ std::string subgraph_index;
+
+ void setOperations(const std::map<std::string, Operation> &name_to_op)
+ {
+ uint64_t graph_latency = end_ts - begin_ts;
+ for (auto it : name_to_op)
+ {
+ auto op = it.second;
+ op.graph_latency = graph_latency;
+
+ ops.insert(op);
+
+ updateRss(op.min_rss);
+ updateRss(op.max_rss);
+ updateMinflt(op.min_page_reclaims);
+ updateMinflt(op.max_page_reclaims);
+ }
+ }
+
+ void write(std::ostream &os) const override
+ {
+ static std::vector<std::string> graph_headers{"latency(us)", "rss_min(kb)", "rss_max(kb)",
+ "page_reclaims_min", "page_reclaims_max"};
+
+ static std::vector<std::string> graph_headers_line{"-----------", "-------", "-------",
+ "-----------------", "-----------------"};
+
+ // Graph's Header
+ writeMDTableRow(os, graph_headers);
+ writeMDTableRow(os, graph_headers_line);
+
+ // Graph's contents
+ writeMDTableRow(os, {std::to_string(end_ts - begin_ts), std::to_string(min_rss),
+ std::to_string(max_rss), std::to_string(min_page_reclaims),
+ std::to_string(max_page_reclaims)});
+
+ os << "\n";
+
+ static std::vector<std::string> op_headers{
+ "Op name", "backend", "latency(us)", "latency(%)",
+ "rss_min(kb)", "rss_max(kb)", "page_reclaims_min", "page_reclaims_max"};
+
+ static std::vector<std::string> op_headers_line{
+ "-------", "-------", "-----------", "-----------",
+ "-------", "-------", "-----------------", "-----------------"};
+
+ os << "## Op \n";
+
+ // Operation's Header
+ writeMDTableRow(os, op_headers);
+ writeMDTableRow(os, op_headers_line);
+
+ // Operation's contents
+ for (auto op : ops)
+ {
+ op.write(os);
+ }
+
+ os << "\n";
+ }
+};
+
+std::string getLabel(const OpSeqDurationEvent &evt)
+{
+ std::string subg_label("$" + std::to_string(evt.subg_index) + " subgraph");
+ std::string op_label("@" + std::to_string(evt.op_index) + " " + evt.op_name);
+
+ return subg_label + " " + op_label;
+}
+
+struct MDTableBuilder
+{
+ MDTableBuilder(const std::vector<std::unique_ptr<DurationEvent>> &duration_events,
+ const std::vector<CounterEvent> &counter_events)
+ : _duration_events(duration_events), _counter_events(counter_events)
+ {
+// TODO Enable this in release builds too, once it can be done with low overhead
+#ifdef DEBUG
+ for (const auto &evt : _counter_events)
+ {
+ uint64_t ts = std::stoull(evt.ts);
+ auto &name = evt.name;
+ assert(name.compare("maxrss") == 0 || name.compare("minflt") == 0);
+ assert(evt.values.size() == 1);
+ auto &val = evt.values.begin()->second;
+ if (_ts_to_values.find(ts) == _ts_to_values.end())
+ {
+ std::pair<uint32_t, uint32_t> values;
+ if (name.compare("maxrss") == 0)
+ values.first = std::stoul(val);
+ else
+ values.second = std::stoul(val);
+ _ts_to_values.insert({ts, values});
+ }
+ else
+ {
+ auto &values = _ts_to_values.at(ts);
+ if (name.compare("maxrss") == 0)
+ values.first = std::stoul(val);
+ else
+ values.second = std::stoul(val);
+ }
+ }
+#endif
+ }
+
+ MDTableBuilder &build()
+ {
+ for (auto &it : divideGraph())
+ {
+ size_t begin_idx = it.first;
+ size_t end_idx = it.second;
+ std::map<std::string, Operation> name_to_op;
+ for (size_t i = begin_idx + 1; i < end_idx; ++i)
+ {
+ const auto *evt = dynamic_cast<const OpSeqDurationEvent *>(_duration_events[i].get());
+ if (evt == nullptr)
+ continue;
+
+ const std::string evt_name = getLabel(*evt);
+ assert(evt->ph.compare("B") == 0 || evt->ph.compare("E") == 0);
+ if (evt->ph.compare("B") == 0)
+ {
+ assert(name_to_op.find(evt_name) == name_to_op.end());
+ name_to_op.insert({evt_name, makeOperation(*evt)});
+ }
+ else
+ {
+ assert(name_to_op.find(evt_name) != name_to_op.end());
+ auto &op = name_to_op.at(evt_name);
+ updateOperation(op, *evt);
+ }
+ }
+
+ _graphs.emplace_back(makeGraph(begin_idx, end_idx, name_to_op));
+ }
+
+ return *this;
+ }
+
+ std::vector<std::pair<size_t, size_t>> divideGraph()
+ {
+ std::vector<std::pair<size_t, size_t>> graph_idx_list; // pair<begin_idx, end_idx>
+ for (size_t i = 0, begin_idx = 0; i < _duration_events.size(); ++i)
+ {
+ const auto subg_evt = dynamic_cast<const SubgDurationEvent *>(_duration_events.at(i).get());
+ if (subg_evt == nullptr)
+ continue;
+
+ if (subg_evt->ph.compare("B") == 0)
+ begin_idx = i;
+ else
+ graph_idx_list.emplace_back(begin_idx, i);
+ }
+ return graph_idx_list;
+ }
+
+ Operation makeOperation(const OpSeqDurationEvent &evt)
+ {
+ Operation op;
+ const std::string &evt_name = getLabel(evt);
+ op.name = evt_name;
+ op.begin_ts = std::stoull(evt.ts);
+ op.backend = evt.backend;
+#ifdef DEBUG
+ op.updateRss(_ts_to_values.at(op.begin_ts).first);
+ op.updateMinflt(_ts_to_values.at(op.begin_ts).second);
+#else
+ op.updateRss(0);
+ op.updateMinflt(0);
+#endif
+ return op;
+ }
+
+ void updateOperation(Operation &op, const DurationEvent &evt)
+ {
+ op.end_ts = std::stoull(evt.ts);
+#ifdef DEBUG
+ op.updateRss(_ts_to_values.at(op.end_ts).first);
+ op.updateMinflt(_ts_to_values.at(op.end_ts).second);
+#else
+ op.updateRss(0);
+ op.updateMinflt(0);
+#endif
+ }
+
+ Graph makeGraph(size_t begin_idx, size_t end_idx,
+ const std::map<std::string, Operation> &name_to_op)
+ {
+ Graph graph;
+ graph.name = "Subgraph";
+ graph.begin_ts = std::stoull(_duration_events[begin_idx]->ts);
+ graph.end_ts = std::stoull(_duration_events[end_idx]->ts);
+ graph.setOperations(name_to_op);
+
+ for (auto &arg : _duration_events[end_idx]->args)
+ {
+ if (arg.first == "session")
+ graph.session_index = arg.second;
+ if (arg.first == "subgraph")
+ graph.subgraph_index = arg.second;
+ }
+
+#ifdef DEBUG
+ graph.updateRss(_ts_to_values.at(graph.begin_ts).first);
+ graph.updateMinflt(_ts_to_values.at(graph.begin_ts).second);
+ graph.updateRss(_ts_to_values.at(graph.end_ts).first);
+ graph.updateMinflt(_ts_to_values.at(graph.end_ts).second);
+#else
+ graph.updateRss(0);
+ graph.updateMinflt(0);
+#endif
+ return graph;
+ }
+
+ void write(std::ostream &os)
+ {
+ // Write contents
+ for (size_t i = 0; i < _graphs.size(); ++i)
+ {
+ auto &graph = _graphs.at(i);
+ os << "# Session: " << graph.session_index << ", Subgraph: " << graph.subgraph_index
+ << ", Running count: " << i << "\n";
+ _graphs.at(i).write(os);
+ }
+ }
+
+ const std::vector<std::unique_ptr<DurationEvent>> &_duration_events;
+ const std::vector<CounterEvent> &_counter_events;
+
+ // timestamp to std::pair<maxrss, minflt>
+ std::unordered_map<uint64_t, std::pair<uint32_t, uint32_t>> _ts_to_values;
+ std::vector<Graph> _graphs;
+};
+
+} // namespace
+
+void MDTableWriter::flush(const std::vector<std::unique_ptr<EventRecorder>> &records)
+{
+ for (auto &recorder : records)
+ {
+ MDTableBuilder(recorder->duration_events(), recorder->counter_events()).build().write(_os);
+ }
+}
diff --git a/runtime/onert/core/src/util/SNPEEventWriter.cc b/runtime/onert/core/src/util/SNPEEventWriter.cc
new file mode 100644
index 000000000..6f03cfccf
--- /dev/null
+++ b/runtime/onert/core/src/util/SNPEEventWriter.cc
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "util/EventWriter.h"
+
+#include <unordered_map>
+#include <json/json.h>
+#include <cassert>
+#include <utility>
+
+/**
+ * @brief Version of the SNPE format
+ * In version 1,
+ * - There is no "version" field in the JSON
+ * - Only one subgraph is supported
+ * - Operation names have the form "$3 ADD"
+ *
+ * In version 2,
+ * - "version" : "2" was added to the JSON
+ * - Multiple sessions and multiple subgraphs are supported
+ * - When there is only one session, operation names have the form "$2 subgraph $3 ADD",
+ *   meaning an ADD op whose operation index is 3 in a subgraph whose index is 2
+ * - When there are two or more sessions, operation names have the form
+ *   "$1 session $2 subgraph $3 ADD", meaning an ADD op whose operation index is 3
+ *   in a subgraph whose index is 2, run in the 1st session.
+ */
+#define SNPE_JSON_SCHEMA_VERSION "2"
+
+namespace
+{
+
+std::string getLabel(const DurationEvent &evt)
+{
+ if (auto evt_ptr = dynamic_cast<const OpSeqDurationEvent *>(&evt))
+ {
+ std::string subg_label("$" + std::to_string(evt_ptr->subg_index) + " subgraph");
+ std::string op_label("$" + std::to_string(evt_ptr->op_index) + " " + evt_ptr->op_name);
+
+ // Note: at this moment, only one thread runs the EventWriter
+ if (evt_ptr->tracing_ctx->hasMultipleSessions())
+ {
+ std::string session_label("$" + std::to_string(evt_ptr->session_index) + " session");
+ return session_label + " " + subg_label + " " + op_label;
+ }
+ else
+ {
+ // When there is only one session, do not include session info
+ // Refer to https://github.sec.samsung.net/STAR/nnfw/issues/11436#issuecomment-930332
+ return subg_label + " " + op_label;
+ }
+ }
+ else // SubgEvent
+ return "Graph";
+}
+
+std::string getBackend(const DurationEvent &evt)
+{
+ if (auto evt_ptr = dynamic_cast<const OpSeqDurationEvent *>(&evt))
+ return evt_ptr->backend;
+ else // SubgEvent
+ return "runtime";
+}
+
+} // namespace
+
+void SNPEWriter::flush(const std::vector<std::unique_ptr<EventRecorder>> &recorders)
+{
+ struct Stat
+ {
+ uint64_t sum = 0;
+ uint64_t count = 0;
+ uint64_t max = 0;
+ uint64_t min = std::numeric_limits<uint64_t>::max();
+
+ void accumulate(uint64_t val)
+ {
+ sum += val;
+ count++;
+ max = std::max(max, val);
+ min = std::min(min, val);
+ }
+ };
+
+ Json::Value root;
+ root["version"] = SNPE_JSON_SCHEMA_VERSION;
+
+ auto &exec_data = root["Execution_Data"] = Json::Value{Json::objectValue};
+
+ // Memory
+ {
+ std::unordered_map<std::string, Stat> mem_stats;
+ for (auto &recorder : recorders)
+ {
+ for (auto &evt : recorder->counter_events())
+ {
+ auto &mem_stat = mem_stats[evt.name];
+ uint64_t val = std::stoull(evt.values.at("value"));
+ mem_stat.accumulate(val);
+ }
+ }
+
+ auto &mem = exec_data["memory"] = Json::Value{Json::objectValue};
+ for (auto &kv : mem_stats)
+ {
+ auto &key = kv.first;
+ auto &val = kv.second;
+ mem[key]["Avg_Size"] = val.sum / val.count;
+ mem[key]["Max_Size"] = val.max;
+ mem[key]["Min_Size"] = val.min;
+ mem[key]["Runtime"] = "NA";
+ }
+ }
+
+ // Operation Execution Time
+ {
+ // NOTE This assumes _duration_events is sorted by "ts" ascending
+
+ // 2D keys : stats[tid][name]
+ std::unordered_map<std::string, std::unordered_map<std::string, Stat>> stats;
+ std::unordered_map<std::string, std::unordered_map<std::string, uint64_t>> begin_timestamps;
+ for (auto &recorder : recorders)
+ {
+ for (auto &evt : recorder->duration_events())
+ {
+ std::string evt_name = getLabel(*evt);
+ std::string evt_tid = getBackend(*evt);
+
+ auto &stat = stats[evt_tid][evt_name];
+ auto &begin_ts = begin_timestamps[evt_tid][evt_name];
+ uint64_t timestamp = std::stoull(evt->ts);
+ if (evt->ph == "B")
+ {
+ if (begin_ts != 0)
+ throw std::runtime_error{"Invalid Data"};
+ begin_ts = timestamp;
+ }
+ else if (evt->ph == "E")
+ {
+ if (begin_ts == 0 || timestamp < begin_ts)
+ throw std::runtime_error{"Invalid Data"};
+ stat.accumulate(timestamp - begin_ts);
+ begin_ts = 0;
+ }
+ else
+ throw std::runtime_error{"Invalid Data - invalid value for \"ph\" : \"" + evt->ph + "\""};
+ }
+ }
+
+ for (auto &kv : begin_timestamps)
+ for (auto &kv2 : kv.second)
+ if (kv2.second != 0)
+ throw std::runtime_error{"Invalid Data - B and E pair does not match."};
+
+ for (auto &kv : stats)
+ {
+ auto &tid = kv.first;
+ auto &map = kv.second;
+ auto &json_tid = exec_data[tid] = Json::Value{Json::objectValue};
+ for (auto &kv : map)
+ {
+ auto &name = kv.first;
+ auto &val = kv.second;
+ json_tid[name]["Avg_Time"] = val.sum / val.count;
+ json_tid[name]["Max_Time"] = val.max;
+ json_tid[name]["Min_Time"] = val.min;
+ json_tid[name]["Runtime"] = tid;
+ }
+ }
+ }
+
+ _os << root;
+}
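
Note: the resulting JSON groups statistics per backend ("tid") and per label. With one
session, one subgraph, and a single ADD op, the output looks roughly like this
(illustrative numbers; "maxrss" is one of the counter names asserted elsewhere):

    {
      "version" : "2",
      "Execution_Data" : {
        "memory" : {
          "maxrss" : { "Avg_Size" : 1024, "Max_Size" : 2048, "Min_Size" : 512, "Runtime" : "NA" }
        },
        "cpu" : {
          "$0 subgraph $3 ADD" : { "Avg_Time" : 120, "Max_Time" : 150, "Min_Time" : 100, "Runtime" : "cpu" }
        },
        "runtime" : {
          "Graph" : { "Avg_Time" : 130, "Max_Time" : 160, "Min_Time" : 110, "Runtime" : "runtime" }
        }
      }
    }
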
diff --git a/runtime/onert/core/src/util/ShapeInference.cc b/runtime/onert/core/src/util/ShapeInference.cc
index 3ed3080cc..173de29c7 100644
--- a/runtime/onert/core/src/util/ShapeInference.cc
+++ b/runtime/onert/core/src/util/ShapeInference.cc
@@ -111,10 +111,9 @@ std::pair<int, int> calcConvLikeHeightAndWidth(const int in_h, const int in_w, c
break;
case ir::PaddingType::EXPLICIT:
out_h =
- (in_h + pad.param.top + pad.param.bottom - effective_filter_h_size) / stride.vertical + 1;
+ (in_h + pad.param.top + pad.param.bottom - effective_filter_h_size) / stride.vertical + 1;
out_w =
- (in_w + pad.param.left + pad.param.right - effective_filter_w_size) / stride.horizontal +
- 1;
+ (in_w + pad.param.left + pad.param.right - effective_filter_w_size) / stride.horizontal + 1;
break;
default:
assert(false);
@@ -595,9 +594,9 @@ template <typename T> ir::Shape inferRangeShape(T start_val, T limit_val, T delt
ir::Shape out_shape(static_cast<int>(1));
out_shape.dim(0) =
- (std::is_integral<T>::value
- ? ((std::abs(start_val - limit_val) + std::abs(delta_val) - 1) / std::abs(delta_val))
- : std::ceil(std::abs((start_val - limit_val) / delta_val)));
+ (std::is_integral<T>::value
+ ? ((std::abs(start_val - limit_val) + std::abs(delta_val) - 1) / std::abs(delta_val))
+ : std::ceil(std::abs((start_val - limit_val) / delta_val)));
return out_shape;
}
@@ -664,9 +663,9 @@ ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &i
ir::Shape true_shape = input_true_shape;
ir::Shape false_shape = input_false_shape;
int most_rank =
- (cond_shape.rank() >= true_shape.rank()) && (cond_shape.rank() >= false_shape.rank())
- ? cond_shape.rank()
- : (false_shape.rank() >= true_shape.rank() ? false_shape.rank() : true_shape.rank());
+ (cond_shape.rank() >= true_shape.rank()) && (cond_shape.rank() >= false_shape.rank())
+ ? cond_shape.rank()
+ : (false_shape.rank() >= true_shape.rank() ? false_shape.rank() : true_shape.rank());
ir::Shape calculate_shape(most_rank);
@@ -677,9 +676,9 @@ ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &i
for (int i = 0; i < most_rank; ++i)
{
calculate_shape.dim(i) =
- (cond_shape.dim(i) >= true_shape.dim(i)) && (cond_shape.dim(i) >= false_shape.dim(i))
- ? cond_shape.dim(i)
- : (false_shape.dim(i) >= true_shape.dim(i) ? false_shape.dim(i) : true_shape.dim(i));
+ (cond_shape.dim(i) >= true_shape.dim(i)) && (cond_shape.dim(i) >= false_shape.dim(i))
+ ? cond_shape.dim(i)
+ : (false_shape.dim(i) >= true_shape.dim(i) ? false_shape.dim(i) : true_shape.dim(i));
if ((cond_shape.dim(i) != calculate_shape.dim(i) && cond_shape.dim(i) != 1) ||
(true_shape.dim(i) != calculate_shape.dim(i) && true_shape.dim(i) != 1) ||
@@ -711,8 +710,8 @@ ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &i
return new_shape;
}
-ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf,
- const int32_t *sizes_buf)
+template <typename T>
+ir::Shape inferSliceShape(const ir::Shape &input_shape, const T *begins_buf, const T *sizes_buf)
{
const uint32_t rank = input_shape.rank();
ir::Shape out_shape(rank);
@@ -737,14 +736,19 @@ ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_bu
}
else
{
- if (input_dim < begin + size)
+ if (input_dim < static_cast<int32_t>(begin + size))
throw std::runtime_error("shape inference Slice: Invalid begin and size.");
}
- out_shape.dim(idx) = size;
+ out_shape.dim(idx) = static_cast<int32_t>(size);
}
return out_shape;
}
+// Explicit template instantiation
+template ir::Shape inferSliceShape(const ir::Shape &input_shape, const int32_t *begins_buf,
+ const int32_t *sizes_buf);
+template ir::Shape inferSliceShape(const ir::Shape &input_shape, const int64_t *begins_buf,
+ const int64_t *sizes_buf);
ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape &block_shape_shape,
const ir::Shape &padding_shape, const int32_t *block_shape_buf,
@@ -776,7 +780,7 @@ ir::Shape inferSpaceToBatchNDShape(const ir::Shape &input_shape, const ir::Shape
for (int dim = 0; dim < kSpatialDimensionNum; ++dim)
{
int final_dim_size =
- (input_shape.dim(dim + 1) + padding_buf[dim * 2] + padding_buf[dim * 2 + 1]);
+ (input_shape.dim(dim + 1) + padding_buf[dim * 2] + padding_buf[dim * 2 + 1]);
assert(final_dim_size % block_shape_buf[dim] == 0);
@@ -839,7 +843,7 @@ ir::Shape inferSqueezeShape(const ir::Shape &in_shape, const ir::operation::Sque
if (!(current >= 0 && current < shape_rank && in_shape.dim(current) == 1))
{
throw std::runtime_error(
- "The following conditions must be met: 0 <= dim < Shape rank, dim == 1");
+ "The following conditions must be met: 0 <= dim < Shape rank, dim == 1");
}
if (!should_squeeze[current])
@@ -1052,9 +1056,9 @@ ir::Shape inferTileShape(const ir::Shape &in_shape, const int32_t *multiplier_bu
{
if (multiplier_size != in_shape.rank())
{
- throw std::runtime_error("inferTileShape failed, input rank: " +
- std::to_string(in_shape.rank()) + ", bad multipliers size: " +
- std::to_string(multiplier_size) + "");
+ throw std::runtime_error(
+ "inferTileShape failed, input rank: " + std::to_string(in_shape.rank()) +
+ ", bad multipliers size: " + std::to_string(multiplier_size) + "");
}
ir::Shape new_Shape(in_shape.rank());
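
Note: with the explicit instantiations above, callers may pass either 32-bit or 64-bit
begin/size buffers without duplicating the function body. A minimal usage sketch
(hypothetical values; namespace assumed, ir::Shape's initializer-list construction as
used in the tests below):

    ir::Shape in_shape{4, 8, 8, 3};
    const int64_t begins[] = {0, 2, 2, 0};
    const int64_t sizes[] = {4, 4, 4, 3};
    // Every begin + size fits inside its input dim, so each output dim is
    // simply the requested size: out is {4, 4, 4, 3}
    ir::Shape out = onert::shape_inference::inferSliceShape(in_shape, begins, sizes);
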
diff --git a/runtime/onert/core/src/util/TracingCtx.cc b/runtime/onert/core/src/util/TracingCtx.cc
index 08a1b32a7..c05baee60 100644
--- a/runtime/onert/core/src/util/TracingCtx.cc
+++ b/runtime/onert/core/src/util/TracingCtx.cc
@@ -24,6 +24,7 @@ namespace util
// initializing static member var
std::mutex TracingCtx::_session_id_mutex;
+uint32_t TracingCtx::_next_session_id = 0;
} // namespace util
} // namespace onert
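
Note: the added line supplies the single out-of-class definition that a non-inline static
data member requires; the declaration in TracingCtx.h alone would not link. The pattern in
isolation (a generic sketch, not project code):

    // header
    struct Ctx { static uint32_t _next_session_id; };
    // exactly one translation unit
    uint32_t Ctx::_next_session_id = 0;
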
diff --git a/runtime/onert/frontend/.clang-format b/runtime/onert/frontend/.clang-format
deleted file mode 120000
index 83185fee3..000000000
--- a/runtime/onert/frontend/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../../../.clang-format.8
\ No newline at end of file
diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h
index f9c97b41b..c096e705d 100644
--- a/runtime/onert/frontend/base_loader/include/base_loader.h
+++ b/runtime/onert/frontend/base_loader/include/base_loader.h
@@ -78,7 +78,7 @@ public:
*
* @param file_path
*/
- void loadFromFile(const char *file_path);
+ void loadFromFile(const std::string &file_path);
/**
* @brief Load a model from a buffer
*
@@ -98,7 +98,8 @@ protected:
// Create operands from tflite::Tensor
ir::OperandIndex loadOperand(const Tensor *tensor, ir::Graph &subg);
- void loadSparsity(const Tensor *tensor, const ir::Shape &shape, ir::TypeInfo &typeInfo);
+ void loadQuantization(const Tensor *tensor, ir::TypeInfo &typeInfo);
+ void loadSparsity(const Tensor *tensor, ir::TypeInfo &typeInfo);
void loadOperationIO(const Operator *op, ir::OperandIndexSequence &inputs,
ir::OperandIndexSequence &outputs);
// Create operations from Operator
@@ -185,22 +186,24 @@ protected:
std::unique_ptr<Verifier> _verifier;
// Boolean flag to use MMAPED_DATA
bool _use_mmaped_data = false;
+
+ std::unordered_map<uint32_t /* Buffer Index in circle file */, std::shared_ptr<ir::Data>>
+ _buf_to_data;
};
template <typename LoaderDomain>
-void BaseLoader<LoaderDomain>::BaseLoader::loadFromFile(const char *file_path)
+void BaseLoader<LoaderDomain>::BaseLoader::loadFromFile(const std::string &file_path)
{
- _fd = open(file_path, O_RDONLY);
+ _fd = open(file_path.c_str(), O_RDONLY);
if (_fd < 0)
{
- throw std::runtime_error("Failed to open file " + std::string(file_path));
+ throw std::runtime_error("Failed to open file " + file_path);
}
struct stat file_stat;
if (fstat(_fd, &file_stat) != 0)
{
- throw std::runtime_error("Fstat failed or file " + std::string(file_path) +
- " is not a regular file");
+ throw std::runtime_error("Fstat failed or file " + file_path + " is not a regular file");
}
int size = file_stat.st_size;
@@ -324,42 +327,10 @@ ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir:
// If app wants to change the input shape, call nnfw_apply_input_tensorinfo() can
// be used.
- // Type
- ir::DataType data_type = tensorTypeToDataType(tensor->type());
- // Quantization
- auto q_params = tensor->quantization();
- float scale = 0.0;
- long zero_point = 0;
- if (q_params != nullptr)
- {
- if (q_params->scale())
- {
- if (q_params->scale()->size() != 1)
- {
- throw std::runtime_error("Only 1 scale for a tensor is supported.");
- }
- scale = q_params->scale()->Get(0);
- }
-
- if (q_params->zero_point())
- {
- if (q_params->zero_point()->size() != 1)
- {
- throw std::runtime_error("Only 1 zero_point value for a tensor is supported.");
- }
- zero_point = q_params->zero_point()->Get(0);
- // zero_point is long while TypeInfo.zero_point is defined as int32_t.
- assert(zero_point >= std::numeric_limits<int32_t>::min());
- assert(zero_point <= std::numeric_limits<int32_t>::max());
- }
- auto details = q_params->details_as_CustomQuantization();
- if (details != nullptr)
- throw std::runtime_error("Custom Quantization is not supported");
- }
- // Create TypeInfo
- ir::TypeInfo type_info(data_type, scale, zero_point);
- // Sparsity
- loadSparsity(tensor, shape, type_info);
+ // TypeInfo
+ ir::TypeInfo type_info(tensorTypeToDataType(tensor->type()));
+ loadQuantization(tensor, type_info);
+ loadSparsity(tensor, type_info);
// Create operand
const auto operand_index = subg.addOperand(shape, type_info);
@@ -369,10 +340,11 @@ ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir:
if (data != nullptr)
{
using std::ptrdiff_t;
- std::unique_ptr<ir::Data> data_obj;
+ std::shared_ptr<ir::Data> data_obj;
+
if (_fd == -1) // Model is from memory
{
- data_obj = std::make_unique<ir::ExternalData>(data->data(), data->size());
+ data_obj = std::make_shared<ir::ExternalData>(data->data(), data->size());
}
else // Model is loaded(mmap'd) from a file
{
@@ -385,17 +357,30 @@ ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir:
ptrdiff_t aligned_offset_start = (unaligned_offset_start / _pagesize) * _pagesize;
size_t mmap_size = offset_end - aligned_offset_start;
- if (_use_mmaped_data)
+ uint32_t buf_idx = tensor->buffer();
+ auto buffer_found = _buf_to_data.find(buf_idx);
+
+ if (buffer_found != _buf_to_data.end())
+ {
+ // Another tensor points to this buffer, and its matching Data (either CachedData or MMapedData)
+ // was already created. Let's reuse the Data
+ data_obj = buffer_found->second;
+ }
+ else if (_use_mmaped_data)
{
- data_obj = std::make_unique<ir::MMapedData>(_fd, aligned_offset_start, mmap_size,
+ data_obj = std::make_shared<ir::MMapedData>(_fd, aligned_offset_start, mmap_size,
unaligned_offset_start, data_size);
+ _buf_to_data[buf_idx] = data_obj;
}
else
{
size_t offset = unaligned_offset_start - aligned_offset_start;
uint8_t *mmap_base = static_cast<uint8_t *>(
mmap(NULL, mmap_size, PROT_READ, MAP_PRIVATE, _fd, aligned_offset_start));
- data_obj = std::make_unique<ir::CachedData>(mmap_base + offset, data_size);
+
+ data_obj = std::make_shared<ir::CachedData>(mmap_base + offset, data_size);
+ _buf_to_data[buf_idx] = data_obj;
+
munmap(mmap_base, mmap_size);
}
}
@@ -417,8 +402,46 @@ ir::OperandIndex BaseLoader<LoaderDomain>::loadOperand(const Tensor *tensor, ir:
}
template <typename LoaderDomain>
-void BaseLoader<LoaderDomain>::loadSparsity(const Tensor *tensor, const ir::Shape &shape,
- ir::TypeInfo &typeInfo)
+void BaseLoader<LoaderDomain>::loadQuantization(const Tensor *tensor, ir::TypeInfo &typeInfo)
+{
+ auto q_params = tensor->quantization();
+ if (q_params == nullptr || q_params->scale() == nullptr || q_params->scale()->size() == 0)
+ {
+ typeInfo.quantization(0., 0);
+ return;
+ }
+ if (q_params->zero_point() == nullptr)
+ {
+ throw std::runtime_error("Quantization params: scale is not null, but zero_point is null.");
+ }
+ const size_t num_scales = q_params->scale()->size();
+ if (num_scales != q_params->zero_point()->size())
+ {
+ throw std::runtime_error("Quantization params: scale size != zero_point size");
+ }
+ std::vector<float> scales;
+ std::vector<int32_t> zero_points;
+ scales.resize(num_scales);
+ zero_points.resize(num_scales);
+ for (size_t i = 0; i < num_scales; ++i)
+ {
+ scales[i] = q_params->scale()->Get(i);
+ // zero_point is defined as long (i64) in the schema, while TypeInfo's zero_point is int32_t.
+ // int64_t is used instead of long because long is 4 bytes on most 32-bit architectures.
+ int64_t zero_point = q_params->zero_point()->Get(i);
+ if (zero_point < std::numeric_limits<int32_t>::min() ||
+ zero_point > std::numeric_limits<int32_t>::max())
+ throw std::runtime_error("Zero_point is out of int32 range.");
+ zero_points[i] = static_cast<int32_t>(zero_point);
+ }
+ auto details = q_params->details_as_CustomQuantization();
+ if (details != nullptr)
+ throw std::runtime_error("Custom Quantization is not supported");
+ typeInfo.quantization(std::move(scales), std::move(zero_points));
+}
+
+template <typename LoaderDomain>
+void BaseLoader<LoaderDomain>::loadSparsity(const Tensor *tensor, ir::TypeInfo &typeInfo)
{
auto src_sparsity = tensor->sparsity();
if (src_sparsity != nullptr)
@@ -447,8 +470,8 @@ void BaseLoader<LoaderDomain>::loadSparsity(const Tensor *tensor, const ir::Shap
}
}
// load metadata
- const int dim_metadata_size = src_sparsity->dim_metadata()->size();
- auto dense_rank = shape.rank();
+ const auto dim_metadata_size = src_sparsity->dim_metadata()->size();
+ const auto dense_rank = tensor->shape() ? tensor->shape()->size() : 0;
if (dense_rank + block_rank != dim_metadata_size)
throw std::runtime_error("sparsity dim_metadata length is wrong.");
bool random_sparsity = dim_metadata_size == 2 && block_rank == 0;
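
Note: loadQuantization now accepts per-channel parameters as long as scale and zero_point
have the same length. A sketch of the resulting TypeInfo setup for a 3-channel tensor
(hypothetical values, using the same quantization() overload called above):

    std::vector<float> scales{0.5f, 0.25f, 0.125f};
    std::vector<int32_t> zero_points{0, 0, 0};
    type_info.quantization(std::move(scales), std::move(zero_points));
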
diff --git a/runtime/onert/frontend/circle/CMakeLists.txt b/runtime/onert/frontend/circle/CMakeLists.txt
index e89e86142..fffe5cc37 100644
--- a/runtime/onert/frontend/circle/CMakeLists.txt
+++ b/runtime/onert/frontend/circle/CMakeLists.txt
@@ -4,17 +4,11 @@ endif ()
set(CIRCLE_LOADER_SOURCES src/circle_loader.cc)
-add_library(circle_loader SHARED ${CIRCLE_LOADER_SOURCES})
+add_library(circle_loader STATIC ${CIRCLE_LOADER_SOURCES})
+set_target_properties(circle_loader PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(circle_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_link_libraries(circle_loader PRIVATE onert_core)
target_link_libraries(circle_loader PRIVATE base_loader nnfw_common nnfw_coverage)
target_link_libraries(circle_loader PRIVATE circle_schema)
-
-if(CMAKE_BUILD_TYPE_LC STREQUAL "release")
- add_custom_command(TARGET circle_loader POST_BUILD
- COMMAND ${CMAKE_STRIP} "--strip-unneeded" $<TARGET_FILE_NAME:circle_loader>)
-endif()
-
-install(TARGETS circle_loader DESTINATION lib)
diff --git a/runtime/onert/frontend/circle/include/circle_loader.h b/runtime/onert/frontend/circle/include/circle_loader.h
index 675a5b3e7..44bf28056 100644
--- a/runtime/onert/frontend/circle/include/circle_loader.h
+++ b/runtime/onert/frontend/circle/include/circle_loader.h
@@ -25,7 +25,7 @@ namespace onert
{
namespace circle_loader
{
-std::unique_ptr<ir::Subgraphs> loadModel(const char *filename);
+std::unique_ptr<ir::Subgraphs> loadModel(const std::string &filename);
std::unique_ptr<ir::Subgraphs> loadModel(uint8_t *buffer, size_t size);
} // namespace circle_loader
} // namespace onert
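
Note: with the signature change, callers can pass a std::string directly, and a temporary
constructed from a literal binds to the const reference as well (hypothetical file name):

    auto subgraphs = onert::circle_loader::loadModel("model.circle");
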
diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc
index 0d7b3eab4..652fbc778 100644
--- a/runtime/onert/frontend/circle/src/circle_loader.cc
+++ b/runtime/onert/frontend/circle/src/circle_loader.cc
@@ -122,7 +122,7 @@ private:
subg->setLayout(convertDataFormat(circle_subg->data_format()));
- subg->finishBuilding();
+ subg->verify();
return subg;
}
@@ -202,7 +202,7 @@ void CircleLoader::loadBCQFullyConnected(const Operator *op, ir::Graph &subg)
} // namespace
-std::unique_ptr<ir::Subgraphs> loadModel(const char *filename)
+std::unique_ptr<ir::Subgraphs> loadModel(const std::string &filename)
{
auto subgraphs = std::make_unique<ir::Subgraphs>();
CircleLoader loader(subgraphs);
diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc
index 3e2bea114..81ffa26f3 100644
--- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc
+++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.cc
@@ -27,7 +27,8 @@
// ANeuralNetworksModel
//
ANeuralNetworksModel::ANeuralNetworksModel() noexcept
- : _optional_operands{}, _operand_usages{}, _allowFloat32toFloat16{false}
+ : _finished_building{false}, _optional_operands{}, _operand_usages{}, _allowFloat32toFloat16{
+ false}
{
_graph = std::make_shared<onert::ir::Graph>();
}
@@ -208,9 +209,9 @@ bool ANeuralNetworksModel::finish() noexcept
{
fillOptionalOperand();
- _graph->finishBuilding();
-
+ _graph->verify();
_operand_usages.clear();
+ _finished_building = true;
}
catch (const std::exception &e)
{
@@ -222,7 +223,7 @@ bool ANeuralNetworksModel::finish() noexcept
return true;
}
-bool ANeuralNetworksModel::isFinished() noexcept { return !_graph->isBuildingPhase(); }
+bool ANeuralNetworksModel::isFinished() noexcept { return _finished_building; }
bool ANeuralNetworksModel::isExistOperand(uint32_t index) noexcept
{
diff --git a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.h b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.h
index df6c97c44..4301193d6 100644
--- a/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.h
+++ b/runtime/onert/frontend/nnapi/wrapper/ANeuralNetworksModel.h
@@ -67,6 +67,7 @@ private:
private:
std::shared_ptr<onert::ir::Graph> _graph;
+ bool _finished_building;
std::unordered_set<onert::ir::OperandIndex> _optional_operands;
std::vector<OperandUsage> _operand_usages;
bool _allowFloat32toFloat16;
diff --git a/runtime/onert/frontend/tflite/CMakeLists.txt b/runtime/onert/frontend/tflite/CMakeLists.txt
index e84eb3e3e..792feebe5 100644
--- a/runtime/onert/frontend/tflite/CMakeLists.txt
+++ b/runtime/onert/frontend/tflite/CMakeLists.txt
@@ -4,16 +4,10 @@ endif(NOT BUILD_TFLITE_LOADER)
set(TFLITE_LOADER_SOURCES src/tflite_loader.cc)
-add_library(tflite_loader SHARED ${TFLITE_LOADER_SOURCES})
+add_library(tflite_loader STATIC ${TFLITE_LOADER_SOURCES})
+set_target_properties(tflite_loader PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_include_directories(tflite_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_link_libraries(tflite_loader PRIVATE onert_core)
target_link_libraries(tflite_loader PRIVATE base_loader nnfw_common nnfw_coverage)
-
-if(CMAKE_BUILD_TYPE_LC STREQUAL "release")
- add_custom_command(TARGET tflite_loader POST_BUILD
- COMMAND ${CMAKE_STRIP} "--strip-unneeded" $<TARGET_FILE_NAME:tflite_loader>)
-endif()
-
-install(TARGETS tflite_loader DESTINATION lib)
diff --git a/runtime/onert/frontend/tflite/include/tflite_loader.h b/runtime/onert/frontend/tflite/include/tflite_loader.h
index 743c05f9e..dda34cc6a 100644
--- a/runtime/onert/frontend/tflite/include/tflite_loader.h
+++ b/runtime/onert/frontend/tflite/include/tflite_loader.h
@@ -26,7 +26,7 @@ namespace onert
namespace tflite_loader
{
-std::unique_ptr<ir::Subgraphs> loadModel(const char *filename);
+std::unique_ptr<ir::Subgraphs> loadModel(const std::string &filename);
} // namespace tflite_loader
} // namespace onert
diff --git a/runtime/onert/frontend/tflite/src/tflite_loader.cc b/runtime/onert/frontend/tflite/src/tflite_loader.cc
index 91919a0a2..8669bbb44 100644
--- a/runtime/onert/frontend/tflite/src/tflite_loader.cc
+++ b/runtime/onert/frontend/tflite/src/tflite_loader.cc
@@ -107,7 +107,7 @@ private:
loadOperation(op, *subg);
}
- subg->finishBuilding();
+ subg->verify();
return subg;
}
@@ -115,7 +115,7 @@ private:
} // namespace
-std::unique_ptr<ir::Subgraphs> loadModel(const char *filename)
+std::unique_ptr<ir::Subgraphs> loadModel(const std::string &filename)
{
auto subgraphs = std::make_unique<ir::Subgraphs>();
TFLiteLoader loader(subgraphs);
diff --git a/runtime/onert/sample/.clang-format b/runtime/onert/sample/.clang-format
deleted file mode 120000
index 83185fee3..000000000
--- a/runtime/onert/sample/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../../../.clang-format.8
\ No newline at end of file
diff --git a/runtime/onert/test/.clang-format b/runtime/onert/test/.clang-format
deleted file mode 120000
index 83185fee3..000000000
--- a/runtime/onert/test/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../../../.clang-format.8
\ No newline at end of file
diff --git a/runtime/onert/test/core/compiler/HEScheduler.cc b/runtime/onert/test/core/compiler/HEScheduler.cc
index c77ebb895..a7185ca0b 100644
--- a/runtime/onert/test/core/compiler/HEScheduler.cc
+++ b/runtime/onert/test/core/compiler/HEScheduler.cc
@@ -49,13 +49,20 @@ struct MockConfigCPU : public IConfig
bool supportFP16() override { return false; }
};
+class MockBackendContext : public BackendContext
+{
+public:
+ using BackendContext::BackendContext;
+ ITensorRegistry *genTensors() override { return nullptr; }
+ FunctionMap genKernels() override { return {}; }
+};
+
struct MockBackendCPU : public Backend
{
std::shared_ptr<IConfig> config() const override { return std::make_shared<MockConfigCPU>(); }
- std::unique_ptr<BackendContext>
- newContext(const Graph &, const std::shared_ptr<custom::IKernelBuilder> &, bool) const override
+ std::unique_ptr<BackendContext> newContext(ContextData &&data) const override
{
- return std::unique_ptr<BackendContext>(new BackendContext{this, nullptr});
+ return std::make_unique<MockBackendContext>(this, std::move(data), nullptr);
}
};
@@ -75,10 +82,9 @@ struct MockConfigGPU : public IConfig
struct MockBackendGPU : public Backend
{
std::shared_ptr<IConfig> config() const override { return std::make_shared<MockConfigGPU>(); }
- std::unique_ptr<BackendContext>
- newContext(const Graph &, const std::shared_ptr<custom::IKernelBuilder> &, bool) const override
+ std::unique_ptr<BackendContext> newContext(ContextData &&data) const override
{
- return std::unique_ptr<BackendContext>(new BackendContext{this, nullptr});
+ return std::make_unique<MockBackendContext>(this, std::move(data), nullptr);
}
};
@@ -98,10 +104,9 @@ struct MockConfigNPU : public IConfig
struct MockBackendNPU : public Backend
{
std::shared_ptr<IConfig> config() const override { return std::make_shared<MockConfigNPU>(); }
- std::unique_ptr<BackendContext>
- newContext(const Graph &, const std::shared_ptr<custom::IKernelBuilder> &, bool) const override
+ std::unique_ptr<BackendContext> newContext(ContextData &&data) const override
{
- return std::unique_ptr<BackendContext>(new BackendContext{this, nullptr});
+ return std::make_unique<MockBackendContext>(this, std::move(data), nullptr);
}
};
@@ -236,7 +241,7 @@ std::shared_ptr<Graph> createStraightGraph()
BinaryArithmetic::Param mul_op_params{BinaryArithmetic::ArithmeticType::MUL, Activation::NONE};
create<BinaryArithmetic>(graph, OIS{sub_out_idx, mul_const_idx}, OIS{mul_out_idx}, mul_op_params);
- graph->finishBuilding();
+ graph->verify();
return graph;
}
@@ -292,7 +297,7 @@ std::shared_ptr<Graph> createBranchedGraph()
BinaryArithmetic::Param sub_op_params{BinaryArithmetic::ArithmeticType::SUB, Activation::NONE};
create<BinaryArithmetic>(graph, OIS{mul2_out_idx, fc2_out_idx}, OIS{sub_out_idx}, sub_op_params);
- graph->finishBuilding();
+ graph->verify();
return graph;
}
@@ -337,16 +342,6 @@ protected:
setenv("PROFILING_MODE", _original_profiling_mode.c_str(), true);
}
- backend::BackendContexts buildBackendContexts(const Graph &graph)
- {
- backend::BackendContexts contexts;
- for (auto backend : _mock_backends)
- {
- contexts.emplace(backend, backend->newContext(graph, nullptr, false));
- }
- return contexts;
- }
-
const MockBackendCPU *_cpu_backend{nullptr};
const MockBackendGPU *_gpu_backend{nullptr};
const MockBackendNPU *_npu_backend{nullptr};
@@ -392,9 +387,8 @@ TEST_P(HESchedulerTestWithExecutorParam, straight_graph_known_exec_time)
et.storeOperationsExecTime();
// Test scheduler
- auto backend_contexts = buildBackendContexts(*graph);
- auto scheduler = compiler::HEScheduler(backend_contexts,
- compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
+ auto scheduler =
+ compiler::HEScheduler(_mock_backends, compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
const auto br = scheduler.schedule(*graph);
ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "cpu");
ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "gpu");
@@ -408,9 +402,8 @@ TEST_P(HESchedulerTestWithExecutorParam, straight_graph_known_exec_time)
setPermutationsExecutionTime(_mock_backends, OPERAND_SIZE, 1e5);
// Test scheduler
- auto backend_contexts = buildBackendContexts(*graph);
- auto scheduler = compiler::HEScheduler(backend_contexts,
- compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
+ auto scheduler =
+ compiler::HEScheduler(_mock_backends, compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
const auto br = scheduler.schedule(*graph);
ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "cpu");
ASSERT_EQ(br->getBackend(sub_op_idx)->config()->id(), "cpu");
@@ -451,9 +444,8 @@ TEST_P(HESchedulerTestWithExecutorParam, branched_graph_known_exec_time)
et.storeOperationsExecTime();
// Test scheduler
- auto backend_contexts = buildBackendContexts(*graph);
- auto scheduler = compiler::HEScheduler(backend_contexts,
- compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
+ auto scheduler =
+ compiler::HEScheduler(_mock_backends, compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
const auto br = scheduler.schedule(*graph);
std::string branch1_expected_backend("npu"), branch2_expected_backend("npu");
@@ -486,9 +478,8 @@ TEST_P(HESchedulerTestWithExecutorParam, branched_graph_known_exec_time)
et.storeOperationsExecTime();
// Test scheduler
- auto backend_contexts = buildBackendContexts(*graph);
- auto scheduler = compiler::HEScheduler(backend_contexts,
- compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
+ auto scheduler =
+ compiler::HEScheduler(_mock_backends, compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
const auto br = scheduler.schedule(*graph);
ASSERT_EQ(br->getBackend(add_op_idx)->config()->id(), "npu");
ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), "npu");
@@ -537,9 +528,8 @@ TEST_F(HESchedulerTest, branched_graph_profiling_mode)
et.storeOperationsExecTime();
// Test scheduler
- auto backend_contexts = buildBackendContexts(*graph);
- auto scheduler = compiler::HEScheduler(backend_contexts,
- compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
+ auto scheduler =
+ compiler::HEScheduler(_mock_backends, compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
const auto br = scheduler.schedule(*graph);
ASSERT_EQ(br->getBackend(mul1_op_idx)->config()->id(), "npu");
ASSERT_EQ(br->getBackend(mul2_op_idx)->config()->id(), "npu");
@@ -560,9 +550,8 @@ TEST_F(HESchedulerTest, branched_graph_profiling_mode)
et.storeOperationsExecTime();
// Test scheduler
- auto backend_contexts = buildBackendContexts(*graph);
- auto scheduler = compiler::HEScheduler(backend_contexts,
- compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
+ auto scheduler =
+ compiler::HEScheduler(_mock_backends, compiler::fetchCompilerOptionsFromGlobalConfig(subgs));
const auto br = scheduler.schedule(*graph);
ASSERT_NE(br->getBackend(add_op_idx)->config()->id(),
br->getBackend(mul1_op_idx)->config()->id());
diff --git a/runtime/onert/test/core/compiler/pass/UnusedOperandEliminationPass.cc b/runtime/onert/test/core/compiler/pass/UnusedOperandEliminationPass.cc
new file mode 100644
index 000000000..b18dedd15
--- /dev/null
+++ b/runtime/onert/test/core/compiler/pass/UnusedOperandEliminationPass.cc
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include "ir/Graph.h"
+#include "compiler/pass/UnusedOperandEliminationPass.h"
+
+using namespace onert::ir;
+using namespace onert::compiler::pass;
+
+TEST(UnusedOperandEliminationPass, Simple)
+{
+ Graph graph;
+
+ // Add tensors
+ Shape shape{1, 2, 2, 1};
+ TypeInfo type{DataType::FLOAT32};
+ auto in = graph.addOperand(shape, type);
+ auto out = graph.addOperand(shape, type);
+
+ auto unused = graph.addOperand(shape, type);
+
+ // Set model inputs/outputs
+ graph.addInput(in);
+ graph.addOutput(out);
+
+ UnusedOperandEliminationPass{graph}.run();
+
+ ASSERT_TRUE(graph.operands().exist(in));
+ ASSERT_TRUE(graph.operands().exist(out));
+ ASSERT_FALSE(graph.operands().exist(unused));
+}
diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc
index 0e742e1e4..0183b6276 100644
--- a/runtime/onert/test/core/exec/ExecInstance.cc
+++ b/runtime/onert/test/core/exec/ExecInstance.cc
@@ -73,7 +73,7 @@ public:
graph->addInput(operand_lhs);
graph->addInput(operand_rhs1);
graph->addOutput(operand_result2);
- graph->finishBuilding();
+ graph->verify();
// Compile
auto subgs = std::make_shared<onert::ir::Subgraphs>();
diff --git a/runtime/onert/test/core/exec/ExecTime.test.cc b/runtime/onert/test/core/exec/ExecTime.test.cc
index 6b0c35a79..178b61ea5 100644
--- a/runtime/onert/test/core/exec/ExecTime.test.cc
+++ b/runtime/onert/test/core/exec/ExecTime.test.cc
@@ -45,9 +45,7 @@ struct MockBackend : public ::onert::backend::Backend
{
return std::make_shared<MockConfig>();
}
- std::unique_ptr<BackendContext> newContext(const ir::Graph &,
- const std::shared_ptr<custom::IKernelBuilder> &kb,
- bool) const override
+ std::unique_ptr<onert::backend::BackendContext> newContext(ContextData &&) const override
{
return nullptr;
}
diff --git a/runtime/onert/test/core/interp/ExecManager.cc b/runtime/onert/test/core/interp/ExecManager.cc
index 327c38f79..a9f7cd46a 100644
--- a/runtime/onert/test/core/interp/ExecManager.cc
+++ b/runtime/onert/test/core/interp/ExecManager.cc
@@ -71,7 +71,7 @@ protected:
_graph->getInputs().append(operand_rhs);
_graph->getOutputs().append(operand_result);
- _graph->finishBuilding();
+ _graph->verify();
auto subgs = std::make_shared<onert::ir::Subgraphs>();
subgs->push(onert::ir::SubgraphIndex{0}, _graph);
@@ -136,7 +136,7 @@ protected:
_graph->getInputs().append(operand_rhs1);
_graph->getOutputs().append(operand_result2);
- _graph->finishBuilding();
+ _graph->verify();
auto subgs = std::make_shared<onert::ir::Subgraphs>();
subgs->push(onert::ir::SubgraphIndex{0}, _graph);
@@ -189,7 +189,7 @@ protected:
_graph->getInputs().append(operand_rhs);
_graph->getOutputs().append(operand_result);
- _graph->finishBuilding();
+ _graph->verify();
auto subgs = std::make_shared<onert::ir::Subgraphs>();
subgs->push(onert::ir::SubgraphIndex{0}, _graph);
@@ -213,7 +213,7 @@ protected:
TEST_F(InterpExecutorTest, create_empty)
{
Graph graph;
- graph.finishBuilding();
+ graph.verify();
auto executor = std::make_unique<InterpExecutor>(graph);
ASSERT_NE(executor, nullptr);
}
diff --git a/runtime/onert/test/graph/Graph.cc b/runtime/onert/test/core/ir/Graph.cc
index 6461a0821..d6de7c0cc 100644
--- a/runtime/onert/test/graph/Graph.cc
+++ b/runtime/onert/test/core/ir/Graph.cc
@@ -67,7 +67,7 @@ OperationIndex addAddOperation(Graph &graph, const OperandIndexSequence inputs,
return graph.addOperation(std::make_unique<operation::BinaryArithmetic>(inputs, outputs, param));
}
-TEST(Graph, OneOpGraphFinish)
+TEST(Graph, OneOpGraphSimpleValid)
{
// Simple Graph with just one Add operation
@@ -87,12 +87,12 @@ TEST(Graph, OneOpGraphFinish)
graph.addInput(rhs);
graph.addOutput(res);
- graph.finishBuilding();
+ graph.verify();
SUCCEED();
}
-TEST(Graph, neg_InvalidGraphFinish_BadInput)
+TEST(Graph, neg_InvalidGraph_BadInput)
{
Graph graph;
@@ -107,10 +107,10 @@ TEST(Graph, neg_InvalidGraphFinish_BadInput)
graph.addOutput(out);
graph.addInput(OperandIndex{89}); // Non-existing operand!
- EXPECT_ANY_THROW(graph.finishBuilding());
+ EXPECT_ANY_THROW(graph.verify());
}
-TEST(Graph, neg_InvalidGraphFinish_BadOutput)
+TEST(Graph, neg_InvalidGraph_BadOutput)
{
Graph graph;
@@ -125,10 +125,10 @@ TEST(Graph, neg_InvalidGraphFinish_BadOutput)
graph.addOutput(out);
graph.addOutput(OperandIndex{12}); // Non-existing operand!
- EXPECT_ANY_THROW(graph.finishBuilding());
+ EXPECT_ANY_THROW(graph.verify());
}
-TEST(Graph, neg_InvalidGraphFinish_BadInputOutputForOp)
+TEST(Graph, neg_InvalidAddOperation_BadInputIndex)
{
Graph graph;
@@ -139,12 +139,10 @@ TEST(Graph, neg_InvalidGraphFinish_BadInputOutputForOp)
auto rhs = graph.addOperand(shape, type);
auto res = graph.addOperand(shape, type);
- addAddOperation(graph, {lhs, OperandIndex{99}}, {res});
-
// Set model inputs/outputs
graph.addInput(lhs);
graph.addInput(rhs);
graph.addOutput(res);
- EXPECT_ANY_THROW(graph.finishBuilding());
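+ // Adding an operation with a non-existing operand now fails immediately:
+ // addOperation() returns an invalid index instead of throwing later at verification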
+ ASSERT_FALSE(addAddOperation(graph, {lhs, OperandIndex{99}}, {res}).valid());
}
diff --git a/runtime/onert/test/graph/operand/LayoutSet.cc b/runtime/onert/test/core/ir/LayoutSet.cc
index 6aa01b9f1..591710a4d 100644
--- a/runtime/onert/test/graph/operand/LayoutSet.cc
+++ b/runtime/onert/test/core/ir/LayoutSet.cc
@@ -36,6 +36,15 @@ TEST(ir_LayoutSet, neg_add_remove)
ASSERT_EQ(set.size(), 0);
}
+TEST(ir_LayoutSet, neg_add_twice)
+{
+ LayoutSet set;
+ set.add(Layout::NHWC);
+ ASSERT_EQ(set.size(), 1);
+ set.add(Layout::NHWC);
+ ASSERT_EQ(set.size(), 1);
+}
+
TEST(ir_LayoutSet, set_operators)
{
LayoutSet set1{Layout::NCHW};
diff --git a/runtime/onert/test/graph/MockNode.h b/runtime/onert/test/core/ir/MockNode.h
index 0e7ed977b..0e7ed977b 100644
--- a/runtime/onert/test/graph/MockNode.h
+++ b/runtime/onert/test/core/ir/MockNode.h
diff --git a/runtime/onert/test/graph/operand/IndexSet.cc b/runtime/onert/test/core/ir/OperandIndexSet.cc
index c363e5472..c363e5472 100644
--- a/runtime/onert/test/graph/operand/IndexSet.cc
+++ b/runtime/onert/test/core/ir/OperandIndexSet.cc
diff --git a/runtime/onert/test/graph/operand/Set.cc b/runtime/onert/test/core/ir/OperandSet.cc
index 6cf9c8842..6cf9c8842 100644
--- a/runtime/onert/test/graph/operand/Set.cc
+++ b/runtime/onert/test/core/ir/OperandSet.cc
diff --git a/runtime/onert/test/graph/operation/Set.cc b/runtime/onert/test/core/ir/OperationSet.cc
index 50c3b304d..4a17eeb33 100644
--- a/runtime/onert/test/graph/operation/Set.cc
+++ b/runtime/onert/test/core/ir/OperationSet.cc
@@ -16,7 +16,7 @@
#include <gtest/gtest.h>
-#include "../MockNode.h"
+#include "MockNode.h"
#include "ir/Operations.h"
using onert::ir::Operation;
diff --git a/runtime/onert/test/graph/operation/SetIO.cc b/runtime/onert/test/core/ir/SetIO.cc
index 68b477347..68b477347 100644
--- a/runtime/onert/test/graph/operation/SetIO.cc
+++ b/runtime/onert/test/core/ir/SetIO.cc
diff --git a/runtime/onert/test/ir/Shape.cc b/runtime/onert/test/core/ir/Shape.cc
index c24aeda8d..c24aeda8d 100644
--- a/runtime/onert/test/ir/Shape.cc
+++ b/runtime/onert/test/core/ir/Shape.cc
diff --git a/runtime/onert/test/graph/operand/UseDef.cc b/runtime/onert/test/core/ir/UseDef.cc
index 5ef10027e..47c98f939 100644
--- a/runtime/onert/test/graph/operand/UseDef.cc
+++ b/runtime/onert/test/core/ir/UseDef.cc
@@ -19,7 +19,7 @@
#include "ir/Graph.h"
#include "ir/verifier/Verifier.h"
#include <memory>
-#include "../MockNode.h"
+#include "MockNode.h"
#include <typeindex>
@@ -60,7 +60,7 @@ TEST(ir_Operand, neg_usedef)
auto multiinput_index = graph.addOperation(
std::make_unique<Mock>(IndexSet{operand_index1, operand_index2}, IndexSet{output_operand}));
- graph.finishBuilding();
+ graph.verify();
ASSERT_TRUE(verifier.verify(graph));
diff --git a/runtime/onert/test/graph/verifier/Verifier.cc b/runtime/onert/test/core/ir/Verifier.cc
index 3bce2746c..b4be2d9cd 100644
--- a/runtime/onert/test/graph/verifier/Verifier.cc
+++ b/runtime/onert/test/core/ir/Verifier.cc
@@ -21,7 +21,7 @@
#include "ir/verifier/Verifier.h"
#include <memory>
#include "ir/Operand.h"
-#include "../MockNode.h"
+#include "MockNode.h"
using IndexSet = onert::ir::OperandIndexSequence;
using Mock = onert_test::ir::SimpleMock;
@@ -41,8 +41,6 @@ TEST(Verifier, dag_checker)
graph.addOperation(std::make_unique<Mock>(IndexSet{operand1}, IndexSet{operand2}));
- graph.finishBuilding();
-
onert::ir::verifier::DAGChecker verifier;
ASSERT_TRUE(verifier.verify(graph));
@@ -64,11 +62,9 @@ TEST(Verifier, neg_edge_consistency_checker_1)
auto mock_op = std::make_unique<Mock>(IndexSet{operand1}, IndexSet{operand2});
auto op_ind = graph.addOperation(std::move(mock_op));
- graph.finishBuilding();
-
graph.operands().at(operand1).removeUse(op_ind); // Manipulate the operand alone
- onert::ir::verifier::EdgeConsistencyChecker verifier;
+ onert::ir::verifier::EdgeChecker verifier;
ASSERT_FALSE(verifier.verify(graph));
}
@@ -89,10 +85,8 @@ TEST(Verifier, neg_edge_consistency_checker_2)
auto mock_op_ptr = mock_op.get();
auto op_ind = graph.addOperation(std::move(mock_op));
- graph.finishBuilding();
-
mock_op_ptr->setInputs({operand2}); // Manipulate the operation alone
- onert::ir::verifier::EdgeConsistencyChecker verifier;
+ onert::ir::verifier::EdgeChecker verifier;
ASSERT_FALSE(verifier.verify(graph));
}
diff --git a/runtime/onert/test/graph/Index.cc b/runtime/onert/test/core/util/Index.cc
index 2d110e326..2d110e326 100644
--- a/runtime/onert/test/graph/Index.cc
+++ b/runtime/onert/test/core/util/Index.cc
diff --git a/runtime/onert/test/core/util/ObjectManager.cc b/runtime/onert/test/core/util/ObjectManager.cc
new file mode 100644
index 000000000..78f044e56
--- /dev/null
+++ b/runtime/onert/test/core/util/ObjectManager.cc
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+
+#include "util/ObjectManager.h"
+#include "util/Index.h"
+
+using namespace onert;
+
+struct TestTag;
+using Index = typename util::Index<uint32_t, TestTag>;
+
+TEST(ObjectManager, emplace)
+{
+ util::ObjectManager<Index, int> man;
+
+ auto index = man.emplace(100);
+ ASSERT_EQ(man.at(index), 100);
+}
+
+TEST(ObjectManager, neg_remove_1)
+{
+ util::ObjectManager<Index, int> man;
+
+ Index index = man.emplace(100);
+ ASSERT_TRUE(man.exist(index));
+ ASSERT_EQ(man.at(index), 100);
+
+ man.remove(index);
+ ASSERT_FALSE(man.exist(index));
+}
+
+TEST(ObjectManager, neg_remove_2)
+{
+ util::ObjectManager<Index, int> man;
+
+ auto index0 = man.emplace(100);
+ auto index1 = man.emplace(200);
+ ASSERT_TRUE(man.exist(index0));
+ ASSERT_EQ(man.at(index0), 100);
+ ASSERT_TRUE(man.exist(index1));
+ ASSERT_EQ(man.at(index1), 200);
+
+ man.remove(index0);
+ ASSERT_FALSE(man.exist(index0));
+ ASSERT_TRUE(man.exist(index1));
+ ASSERT_EQ(man.at(index1), 200);
+}
+
+TEST(ObjectManager, push)
+{
+ util::ObjectManager<Index, int> man;
+
+ // Without specifying an index
+ auto index = man.push(std::make_unique<int>(100));
+ ASSERT_EQ(man.at(index), 100);
+
+ // Specify an index
+ auto index2 = man.push(std::make_unique<int>(200), Index{33});
+ ASSERT_EQ(index2.value(), 33);
+ ASSERT_EQ(man.at(index2), 200);
+
+ auto index3 = man.push(std::make_unique<int>(300));
+ // NOTE an auto-generated index number is always (the biggest index in the ObjectManager + 1)
+ ASSERT_EQ(index3.value(), 34);
+ ASSERT_EQ(man.at(index3), 300);
+
+ auto index4 = man.push(std::make_unique<int>(400), Index{22});
+ ASSERT_EQ(index4.value(), 22);
+ ASSERT_EQ(man.at(index4), 400);
+
+ auto index5 = man.push(std::make_unique<int>(500));
+ // NOTE an auto-generated index number is always (the biggest index in the ObjectManager + 1)
+ ASSERT_EQ(index5.value(), 35);
+ ASSERT_EQ(man.at(index5), 500);
+}
+
+TEST(ObjectManager, neg_push)
+{
+ util::ObjectManager<Index, int> man;
+
+ // Specify an index
+ auto index = man.push(std::make_unique<int>(100), Index{55});
+ ASSERT_EQ(index.value(), 55);
+ ASSERT_EQ(man.at(index), 100);
+
+ // Specify the same index
+ auto index2 = man.push(std::make_unique<int>(200), Index{55});
+ ASSERT_FALSE(index2.valid());
+}
+
+static const uint32_t kMaxUInt32 = std::numeric_limits<uint32_t>::max();
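+// NOTE Index is assumed to reserve the maximum value of its underlying type as the
+// "undefined" index, so Index{kMaxUInt32} is invalid and kMaxUInt32 - 1 is the largest valid value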
+
+TEST(ObjectManager, neg_push_undefined_index)
+{
+ util::ObjectManager<Index, int> man;
+
+ // Try inserting an invalid (undefined) index
+ auto index = man.push(std::make_unique<int>(100), Index{kMaxUInt32});
+ ASSERT_FALSE(index.valid());
+ ASSERT_EQ(man.size(), 0);
+}
+
+TEST(ObjectManager, neg_push_max_index)
+{
+ util::ObjectManager<Index, int> man;
+
+ // Insert an object with maximum valid index
+ auto index = man.push(std::make_unique<int>(100), Index{kMaxUInt32 - 1});
+ ASSERT_EQ(index.value(), kMaxUInt32 - 1);
+ ASSERT_EQ(man.at(index), 100);
+ ASSERT_EQ(man.size(), 1);
+
+ // The final index has been reached, so the next push/emplace must fail
+ auto index2 = man.push(std::make_unique<int>(200));
+ ASSERT_EQ(man.size(), 1);
+ ASSERT_FALSE(index2.valid());
+}
+
+TEST(ObjectManager, neg_emplace_max_index)
+{
+ util::ObjectManager<Index, int> man;
+
+ // Insert an object with maximum valid index
+ auto index = man.push(std::make_unique<int>(100), Index{kMaxUInt32 - 1});
+ ASSERT_EQ(index.value(), kMaxUInt32 - 1);
+ ASSERT_EQ(man.at(index), 100);
+ ASSERT_EQ(man.size(), 1);
+
+ // The final index has been reached, so the next push/emplace must fail
+ auto index3 = man.emplace(200);
+ ASSERT_EQ(man.size(), 1);
+ ASSERT_FALSE(index3.valid());
+}
+
+TEST(ObjectManager, const_iterate)
+{
+ util::ObjectManager<Index, int> man;
+
+ auto index0 = man.emplace(100);
+ auto index1 = man.emplace(200);
+ auto index2 = man.emplace(300);
+
+ int sum = 0;
+ man.iterate([&](const Index &index, const int &val) { sum += val; });
+ ASSERT_EQ(sum, 600);
+}
+
+TEST(ObjectManager, non_const_iterate)
+{
+ util::ObjectManager<Index, int> man;
+
+ auto index0 = man.emplace(100);
+ auto index1 = man.emplace(200);
+ auto index2 = man.emplace(300);
+
+ man.iterate([&](const Index &index, int &val) { val += 1; });
+ ASSERT_EQ(man.at(index0), 101);
+ ASSERT_EQ(man.at(index1), 201);
+ ASSERT_EQ(man.at(index2), 301);
+}
+
+TEST(ObjectManager, set)
+{
+ util::ObjectManager<Index, int> man;
+ auto index = man.set(Index{1}, std::make_unique<int>(100)); // Insert
+ ASSERT_EQ(index, Index{1});
+ auto index2 = man.set(index, std::make_unique<int>(200)); // Overwrite
+ ASSERT_EQ(index2, index);
+ ASSERT_EQ(man.at(index2), 200);
+}
+
+TEST(ObjectManager, neg_set)
+{
+ auto v = std::make_unique<int>(100);
+ util::ObjectManager<Index, int> man;
+ auto index = man.set(Index{}, std::move(v)); // Try set with an invalid index
+ ASSERT_EQ(index, Index{});
+ ASSERT_FALSE(index.valid());
+ ASSERT_NE(v, nullptr); // v must be kept intact on failure
+}
+
+TEST(ObjectManager, getRawPtr)
+{
+ auto v = std::make_unique<int>(100);
+ auto v_ptr = v.get();
+ util::ObjectManager<Index, int> man;
+ auto index = man.push(std::move(v));
+ ASSERT_EQ(v_ptr, man.getRawPtr(index));
+}
+
+TEST(ObjectManager, neg_getRawPtr)
+{
+ util::ObjectManager<Index, int> man;
+ auto ptr = man.getRawPtr(Index{1});
+ ASSERT_EQ(ptr, nullptr);
+}
diff --git a/runtime/onert/test/util/ShapeInference.cc b/runtime/onert/test/core/util/ShapeInference.cc
index 2ecaa2885..2ecaa2885 100644
--- a/runtime/onert/test/util/ShapeInference.cc
+++ b/runtime/onert/test/core/util/ShapeInference.cc
diff --git a/runtime/onert/test/util/ObjectManager.cc b/runtime/onert/test/util/ObjectManager.cc
deleted file mode 100644
index 24bb9b0c1..000000000
--- a/runtime/onert/test/util/ObjectManager.cc
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <gtest/gtest.h>
-
-#include "util/ObjectManager.h"
-#include "util/Index.h"
-
-using namespace onert;
-
-struct TestTag;
-using Index = typename util::Index<uint32_t, TestTag>;
-
-TEST(ObjectManager, emplace)
-{
- util::ObjectManager<Index, int> man;
-
- auto index = man.emplace(100);
- ASSERT_EQ(man.at(index), 100);
-}
-
-TEST(ObjectManager, neg_remove_1)
-{
- util::ObjectManager<Index, int> man;
-
- Index index = man.emplace(100);
- ASSERT_TRUE(man.exist(index));
- ASSERT_EQ(man.at(index), 100);
-
- man.remove(index);
- ASSERT_FALSE(man.exist(index));
-}
-
-TEST(ObjectManager, neg_remove_2)
-{
- util::ObjectManager<Index, int> man;
-
- auto index0 = man.emplace(100);
- auto index1 = man.emplace(200);
- ASSERT_TRUE(man.exist(index0));
- ASSERT_EQ(man.at(index0), 100);
- ASSERT_TRUE(man.exist(index1));
- ASSERT_EQ(man.at(index1), 200);
-
- man.remove(index0);
- ASSERT_FALSE(man.exist(index0));
- ASSERT_TRUE(man.exist(index1));
- ASSERT_EQ(man.at(index1), 200);
-}
-
-TEST(ObjectManager, push)
-{
- util::ObjectManager<Index, int> man;
-
- auto index = man.push(std::unique_ptr<int>{new int{100}});
- ASSERT_EQ(man.at(index), 100);
-}
-
-TEST(ObjectManager, const_iterate)
-{
- util::ObjectManager<Index, int> man;
-
- auto index0 = man.emplace(100);
- auto index1 = man.emplace(200);
- auto index2 = man.emplace(300);
-
- int sum = 0;
- man.iterate([&](const Index &index, const int &val) { sum += val; });
- ASSERT_EQ(sum, 600);
-}
-
-TEST(ObjectManager, non_const_iterate)
-{
- util::ObjectManager<Index, int> man;
-
- auto index0 = man.emplace(100);
- auto index1 = man.emplace(200);
- auto index2 = man.emplace(300);
-
- man.iterate([&](const Index &index, int &val) { val += 1; });
- ASSERT_EQ(man.at(index0), 101);
- ASSERT_EQ(man.at(index1), 201);
- ASSERT_EQ(man.at(index2), 301);
-}
diff --git a/tests/.clang-format b/tests/.clang-format
deleted file mode 120000
index 0ff66f331..000000000
--- a/tests/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../.clang-format.8 \ No newline at end of file
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon
index d443eba03..03bdf0916 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-android.acl_neon
@@ -74,6 +74,7 @@ GeneratedTests.fill_ex_1D_float
GeneratedTests.fill_ex_4D_float
GeneratedTests.fill_ex_dynamic_nnfw
GeneratedTests.fully_connected_dynamic_nnfw
+GeneratedTests.fully_connected_float_2_weights_as_inputs
GeneratedTests.fully_connected_hybrid_1_nnfw
GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw
GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
index d443eba03..03bdf0916 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
@@ -74,6 +74,7 @@ GeneratedTests.fill_ex_1D_float
GeneratedTests.fill_ex_4D_float
GeneratedTests.fill_ex_dynamic_nnfw
GeneratedTests.fully_connected_dynamic_nnfw
+GeneratedTests.fully_connected_float_2_weights_as_inputs
GeneratedTests.fully_connected_hybrid_1_nnfw
GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw
GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
index 2a169f6ae..a3320998a 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
@@ -74,6 +74,7 @@ GeneratedTests.fill_ex_1D_float
GeneratedTests.fill_ex_4D_float
GeneratedTests.fill_ex_dynamic_nnfw
GeneratedTests.fully_connected_dynamic_nnfw
+GeneratedTests.fully_connected_float_2_weights_as_inputs
GeneratedTests.fusedbatchnorm_ex_dynamic_nnfw
GeneratedTests.fusedbatchnorm_ex_float_fusedbatchnorm_1141
GeneratedTests.gather_dynamic_nnfw
diff --git a/tests/nnfw_api/CMakeLists.txt b/tests/nnfw_api/CMakeLists.txt
index 40142dd15..2e7ef6551 100644
--- a/tests/nnfw_api/CMakeLists.txt
+++ b/tests/nnfw_api/CMakeLists.txt
@@ -34,3 +34,25 @@ target_link_libraries(${RUNTIME_NNFW_API_TEST} ${LIB_PTHREAD} dl)
target_link_libraries(${RUNTIME_NNFW_API_TEST} circle_schema)
install(TARGETS ${RUNTIME_NNFW_API_TEST} DESTINATION unittest_standalone)
+
+# Install nnpackage test model (add)
+set(NNPACKAGE_MODEL_DIR ${NNAS_PROJECT_SOURCE_DIR}/nnpackage/examples/v1.0.0/add)
+set(NNPACKAGE_INSTALL_TARGET unittest_standalone/nnfw_api_gtest_models)
+
+install(DIRECTORY ${NNPACKAGE_MODEL_DIR} DESTINATION ${NNPACKAGE_INSTALL_TARGET}/add)
+
+# Install nnpackage test model (add_no_manifest)
+set(NNPACKAGE_MODEL ${NNPACKAGE_MODEL_DIR}/add.tflite)
+install(FILES ${NNPACKAGE_MODEL} DESTINATION ${NNPACKAGE_INSTALL_TARGET}/add_no_manifest/add_no_manifest)
+
+# Install nnpackage test model (add_invalid_manifest)
+set(NNPACKAGE_MODEL_DIR ${NNAS_PROJECT_SOURCE_DIR}/nnpackage/examples/v1.0.0/add_invalid_manifest)
+install(DIRECTORY ${NNPACKAGE_MODEL_DIR} DESTINATION ${NNPACKAGE_INSTALL_TARGET}/add_invalid_manifest)
+
+# Install nnpackage test model (if_dynamic)
+set(NNPACKAGE_MODEL_DIR ${NNAS_PROJECT_SOURCE_DIR}/nnpackage/examples/v1.0.0/if_dynamic)
+install(DIRECTORY ${NNPACKAGE_MODEL_DIR} DESTINATION ${NNPACKAGE_INSTALL_TARGET}/if_dynamic)
+
+# Install nnpackage test model (while_dynamic)
+set(NNPACKAGE_MODEL_DIR ${NNAS_PROJECT_SOURCE_DIR}/nnpackage/examples/v1.0.0/while_dynamic)
+install(DIRECTORY ${NNPACKAGE_MODEL_DIR} DESTINATION ${NNPACKAGE_INSTALL_TARGET}/while_dynamic)
diff --git a/tests/nnfw_api/src/CircleGen.cc b/tests/nnfw_api/src/CircleGen.cc
index 87b38f238..e3dc57182 100644
--- a/tests/nnfw_api/src/CircleGen.cc
+++ b/tests/nnfw_api/src/CircleGen.cc
@@ -52,6 +52,14 @@ uint32_t CircleGen::addTensor(const TensorParams &params, float scale, int64_t z
return ind;
}
+uint32_t CircleGen::addTensor(const TensorParams &params, std::vector<float> &scale,
+ std::vector<int64_t> &zero_point)
+{
+ uint32_t ind = curSubgCtx().tensors.size();
+ curSubgCtx().tensors.emplace_back(buildTensor(params, scale, zero_point));
+ return ind;
+}
+
uint32_t CircleGen::addTensor(const TensorParams &params, const SparsityParams &sp)
{
uint32_t ind = curSubgCtx().tensors.size();
@@ -260,6 +268,14 @@ uint32_t CircleGen::addOperatorMean(const OperatorParams &params, bool keep_dims
circle::BuiltinOptions_ReducerOptions, options);
}
+uint32_t CircleGen::addOperatorMul(const OperatorParams &params,
+ circle::ActivationFunctionType actfn)
+{
+ auto options = circle::CreateMulOptions(_fbb, actfn).Union();
+ return addOperatorWithOptions(params, circle::BuiltinOperator_MUL,
+ circle::BuiltinOptions_MulOptions, options);
+}
+
uint32_t CircleGen::addOperatorNeg(const OperatorParams &params)
{
auto options = circle::CreatePadOptions(_fbb).Union();
@@ -288,6 +304,13 @@ uint32_t CircleGen::addOperatorPadV2(const OperatorParams &params)
circle::BuiltinOptions_PadV2Options, options);
}
+uint32_t CircleGen::addOperatorQuantize(const OperatorParams &params)
+{
+ auto options = circle::CreateQuantizeOptions(_fbb).Union();
+ return addOperatorWithOptions(params, circle::BuiltinOperator_QUANTIZE,
+ circle::BuiltinOptions_QuantizeOptions, options);
+}
+
uint32_t CircleGen::addOperatorRank(const OperatorParams &params)
{
auto options = circle::CreateRankOptions(_fbb).Union();
@@ -363,12 +386,27 @@ uint32_t CircleGen::addOperatorSelectV2(const OperatorParams &params)
circle::BuiltinOptions_SelectV2Options, options);
}
+uint32_t CircleGen::addOperatorSlice(const OperatorParams &params)
+{
+ auto options = circle::CreateSliceOptions(_fbb).Union();
+ return addOperatorWithOptions(params, circle::BuiltinOperator_SLICE,
+ circle::BuiltinOptions_SliceOptions, options);
+}
+
+uint32_t CircleGen::addOperatorSoftmax(const OperatorParams &params, float beta)
+{
+ auto options = circle::CreateSoftmaxOptions(_fbb, beta).Union();
+ return addOperatorWithOptions(params, circle::BuiltinOperator_SOFTMAX,
+ circle::BuiltinOptions_SoftmaxOptions, options);
+}
+
uint32_t CircleGen::addOperatorSplit(const OperatorParams &params, int32_t num_split)
{
auto options = circle::CreateSplitOptions(_fbb, num_split).Union();
return addOperatorWithOptions(params, circle::BuiltinOperator_SPLIT,
circle::BuiltinOptions_SplitOptions, options);
}
+
uint32_t CircleGen::addOperatorStridedSlice(const OperatorParams &params, int32_t begin_mask,
int32_t end_mask, int32_t ellipsis_mask,
int32_t new_axis_mask, int32_t shrink_axis_mask)
@@ -379,6 +417,15 @@ uint32_t CircleGen::addOperatorStridedSlice(const OperatorParams &params, int32_
return addOperatorWithOptions(params, circle::BuiltinOperator_STRIDED_SLICE,
circle::BuiltinOptions_StridedSliceOptions, options);
}
+
+uint32_t CircleGen::addOperatorSub(const OperatorParams &params,
+ circle::ActivationFunctionType actfn)
+{
+ auto options = circle::CreateSubOptions(_fbb, actfn).Union();
+ return addOperatorWithOptions(params, circle::BuiltinOperator_SUB,
+ circle::BuiltinOptions_SubOptions, options);
+}
+
uint32_t CircleGen::addOperatorTile(const OperatorParams &params)
{
auto options = circle::CreateTileOptions(_fbb).Union();
@@ -496,6 +543,18 @@ flatbuffers::Offset<circle::Tensor> CircleGen::buildTensor(const TensorParams &p
false /* is_variable */, 0 /* sparsity */, 0 /* shape_signature */);
}
+flatbuffers::Offset<circle::Tensor> CircleGen::buildTensor(const TensorParams &params,
+ std::vector<float> &scales,
+ std::vector<int64_t> &zero_points)
+{
+ auto shape = _fbb.CreateVector(params.shape);
+ auto name = _fbb.CreateString(params.name);
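+ // Vector-valued scales/zero points yield per-channel (per-axis) quantization parameters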
+ auto quantization =
+ circle::CreateQuantizationParametersDirect(_fbb, nullptr, nullptr, &scales, &zero_points);
+ return circle::CreateTensor(_fbb, shape, params.tensor_type, params.buffer, name, quantization,
+ false /* is_variable */, 0 /* sparsity */, 0 /* shape_signature */);
+}
+
flatbuffers::Offset<circle::SparsityParameters>
CircleGen::buildSparsityParameters(const SparsityParams &sp)
{
diff --git a/tests/nnfw_api/src/CircleGen.h b/tests/nnfw_api/src/CircleGen.h
index 666218379..2b88af7b7 100644
--- a/tests/nnfw_api/src/CircleGen.h
+++ b/tests/nnfw_api/src/CircleGen.h
@@ -128,6 +128,8 @@ public:
uint32_t addBuffer(const uint8_t *buf, size_t size);
uint32_t addTensor(const TensorParams &params);
uint32_t addTensor(const TensorParams &params, float scale, int64_t zero_point);
+ uint32_t addTensor(const TensorParams &params, std::vector<float> &scale,
+ std::vector<int64_t> &zero_point);
uint32_t addTensor(const TensorParams &params, const SparsityParams &sp);
void setInputsAndOutputs(const std::vector<int> &inputs, const std::vector<int> &outputs);
uint32_t nextSubgraph();
@@ -172,11 +174,13 @@ public:
uint32_t addOperatorLeakyRelu(const OperatorParams &params, float alpha);
uint32_t addOperatorLess(const OperatorParams &params);
uint32_t addOperatorLogSoftmax(const OperatorParams &params);
+ uint32_t addOperatorMul(const OperatorParams &params, circle::ActivationFunctionType actfn);
uint32_t addOperatorMean(const OperatorParams &params, bool keep_dims);
uint32_t addOperatorNeg(const OperatorParams &params);
uint32_t addOperatorOneHot(const OperatorParams &params, int32_t axis);
uint32_t addOperatorPad(const OperatorParams &params);
uint32_t addOperatorPadV2(const OperatorParams &params);
+ uint32_t addOperatorQuantize(const OperatorParams &params);
uint32_t addOperatorRank(const OperatorParams &params);
uint32_t addOperatorReduce(const OperatorParams &params, circle::BuiltinOperator reduce_op,
bool keep_dims);
@@ -193,12 +197,15 @@ public:
circle::TensorType type = circle::TensorType::TensorType_INT32);
uint32_t addOperatorSelect(const OperatorParams &params);
uint32_t addOperatorSelectV2(const OperatorParams &params);
+ uint32_t addOperatorSlice(const OperatorParams &params);
+ uint32_t addOperatorSoftmax(const OperatorParams &params, float beta);
uint32_t addOperatorSplit(const OperatorParams &params, int32_t num_split);
uint32_t addOperatorSqrt(const OperatorParams &params);
uint32_t addOperatorSquare(const OperatorParams &params);
uint32_t addOperatorStridedSlice(const OperatorParams &params, int32_t begin_mask = 0,
int32_t end_mask = 0, int32_t ellipsis_mask = 0,
int32_t new_axis_mask = 0, int32_t shrink_axis_mask = 0);
+ uint32_t addOperatorSub(const OperatorParams &params, circle::ActivationFunctionType actfn);
uint32_t addOperatorTile(const OperatorParams &params);
uint32_t addOperatorTranspose(const OperatorParams &params);
uint32_t addOperatorWhile(const OperatorParams &params, uint32_t cond_subg, uint32_t body_subg);
@@ -215,6 +222,9 @@ private:
flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params);
flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params, float scale,
int64_t zero_point);
+ flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params,
+ std::vector<float> &scales,
+ std::vector<int64_t> &zero_points);
flatbuffers::Offset<circle::SparsityParameters> buildSparsityParameters(const SparsityParams &sp);
flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params,
const SparsityParams &sp);
diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
index 5fbb84443..4c482369f 100644
--- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
+++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
@@ -200,8 +200,6 @@ TEST_F(ValidationTestAddModelLoaded, debug_set_config)
NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "GRAPH_DOT_DUMP", "0"));
NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "GRAPH_DOT_DUMP", "1"));
NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "GRAPH_DOT_DUMP", "2"));
- NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "OP_SEQ_MAX_NODE", "0"));
- NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "OP_SEQ_MAX_NODE", "1"));
NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "EXECUTOR", "Linear"));
NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "OP_BACKEND_ALLOPS", "cpu"));
NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "USE_SCHEDULER", "0"));
diff --git a/tests/nnfw_api/src/ValidationTestMultipleSessions.cc b/tests/nnfw_api/src/ValidationTestMultipleSessions.cc
index 758e1dbd8..ef00dc6bd 100644
--- a/tests/nnfw_api/src/ValidationTestMultipleSessions.cc
+++ b/tests/nnfw_api/src/ValidationTestMultipleSessions.cc
@@ -15,6 +15,7 @@
*/
#include "fixtures.h"
+#include "one_op_tests/WhileTestModel.h"
TEST_F(ValidationTestTwoSessions, neg_two_sessions_create)
{
@@ -41,7 +42,7 @@ public:
CircleBuffer cbuf;
};
-TEST_F(ValidationTestTwoSessionsCreated, two_sessions_run_simple_model)
+TEST_F(ValidationTestTwoSessionsCreated, two_sessions_run_simple_AveragePool_model)
{
constexpr int N = 64, H = 64, W = 64, C = 3;
AveragePoolModel model(N, H, W, C);
@@ -85,4 +86,55 @@ TEST_F(ValidationTestTwoSessionsCreated, two_sessions_run_simple_model)
SUCCEED();
}
+TEST_F(ValidationTestTwoSessionsCreated, neg_two_sessions_model_load)
+{
+ constexpr int N = 64, H = 64, W = 64, C = 3;
+ AveragePoolModel model(N, H, W, C);
+
+ NNFW_ENSURE_SUCCESS(
+ nnfw_load_circle_from_buffer(_session1, model.cbuf.buffer(), model.cbuf.size()));
+ ASSERT_EQ(nnfw_load_circle_from_buffer(nullptr, model.cbuf.buffer(), model.cbuf.size()),
+ NNFW_STATUS_UNEXPECTED_NULL);
+}
+
+TEST_F(ValidationTestTwoSessionsCreated, two_sessions_run_simple_While_model)
+{
+ WhileModelLoop10 model;
+
+ NNFW_ENSURE_SUCCESS(
+ nnfw_load_circle_from_buffer(_session1, model.cbuf.buffer(), model.cbuf.size()));
+ NNFW_ENSURE_SUCCESS(
+ nnfw_load_circle_from_buffer(_session2, model.cbuf.buffer(), model.cbuf.size()));
+
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session1, "cpu"));
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session2, "cpu"));
+
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session1));
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session2));
+
+ std::vector<float> in_buf1(model.inputCount()); // any value
+ std::vector<float> out_buf1(model.outputputCount());
+
+ NNFW_ENSURE_SUCCESS(nnfw_set_input(_session1, 0, NNFW_TYPE_TENSOR_FLOAT32, in_buf1.data(),
+ in_buf1.size() * model.sizeOfDType()));
+ NNFW_ENSURE_SUCCESS(nnfw_set_output(_session1, 0, NNFW_TYPE_TENSOR_FLOAT32, out_buf1.data(),
+ out_buf1.size() * model.sizeOfDType()));
+
+ std::vector<float> in_buf2(model.inputCount()); // any value
+ std::vector<float> out_buf2(model.outputputCount());
+
+ NNFW_ENSURE_SUCCESS(nnfw_set_input(_session2, 0, NNFW_TYPE_TENSOR_FLOAT32, in_buf2.data(),
+ in_buf2.size() * model.sizeOfDType()));
+ NNFW_ENSURE_SUCCESS(nnfw_set_output(_session2, 0, NNFW_TYPE_TENSOR_FLOAT32, out_buf2.data(),
+ out_buf2.size() * model.sizeOfDType()));
+
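+ // Run both sessions asynchronously, then wait for each to finish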
+ NNFW_ENSURE_SUCCESS(nnfw_run_async(_session1));
+ NNFW_ENSURE_SUCCESS(nnfw_run_async(_session2));
+
+ NNFW_ENSURE_SUCCESS(nnfw_await(_session1));
+ NNFW_ENSURE_SUCCESS(nnfw_await(_session2));
+
+ SUCCEED();
+}
+
// TODO Write a two-session test with large models run by threads
diff --git a/tests/nnfw_api/src/fixtures.h b/tests/nnfw_api/src/fixtures.h
index 15f51eb65..e2e793ff3 100644
--- a/tests/nnfw_api/src/fixtures.h
+++ b/tests/nnfw_api/src/fixtures.h
@@ -100,8 +100,9 @@ protected:
ValidationTestSessionCreated::SetUp();
if (PackageNo == NNPackages::ADD)
{
- auto cbuf = genAddModel();
- NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(_session, cbuf.buffer(), cbuf.size()));
+ // NOTE The circle buffer must stay alive until the test finishes, so keep it as a member
+ _cbuf = genAddModel();
+ NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(_session, _cbuf.buffer(), _cbuf.size()));
}
else
{
@@ -112,6 +113,9 @@ protected:
}
void TearDown() override { ValidationTestSessionCreated::TearDown(); }
+
+private:
+ CircleBuffer _cbuf; // Used only for models from buffer, unused for models from files
};
template <int PackageNo>
@@ -185,6 +189,7 @@ protected:
auto cbuf = genAddModel();
NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(obj.session, cbuf.buffer(), cbuf.size()));
ASSERT_EQ(nnfw_prepare(obj.session), NNFW_STATUS_NO_ERROR);
+ _cbufs.push_back(std::move(cbuf)); // Keep the buffer so it can outlive the session
uint32_t num_inputs;
ASSERT_EQ(nnfw_input_size(obj.session, &num_inputs), NNFW_STATUS_NO_ERROR);
@@ -227,6 +232,7 @@ protected:
protected:
std::array<SessionObject, NUM_SESSIONS> _objects;
+ std::vector<CircleBuffer> _cbufs;
};
class ValidationTestTwoSessions : public ValidationTest
diff --git a/tests/nnfw_api/src/one_op_tests/Add.cc b/tests/nnfw_api/src/one_op_tests/Add.cc
index e43f6d239..9c0108b9e 100644
--- a/tests/nnfw_api/src/one_op_tests/Add.cc
+++ b/tests/nnfw_api/src/one_op_tests/Add.cc
@@ -69,6 +69,38 @@ TEST_F(GenModelTest, OneOp_Add_VarToVarUint8)
SUCCEED();
}
+TEST_F(GenModelTest, OneOp_Add_VarToVarInt8)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 1., 2);
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 2., 3);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 0.5, -6);
+ cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(uniformTCD<int8_t>({{1, 3, 2, 4}, {5, -4, -7, 4}}, {{0, -32, -46, 2}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_BroadcastAdd_VarToVarInt8)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 1., 2);
+ int rhs = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_INT8}, 2., 3);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 0.5, -6);
+ cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(uniformTCD<int8_t>({{1, 3, 2, 4}, {5}}, {{0, 4, 2, 6}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
TEST_F(GenModelTest, OneOp_Add_VarToVarSame)
{
CircleGen cgen;
@@ -119,6 +151,22 @@ TEST_F(GenModelTest, neg_OneOp_Add_InvalidType)
SUCCEED();
}
+TEST_F(GenModelTest, neg_OneOp_Add_DifferentQuant8Type)
+{
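+ // Mixing INT8 and UINT8 operands in a single Add is expected to be rejected at model load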
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 0.2, -3);
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 0.1, 2);
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_INT8});
+ cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
TEST_F(GenModelTest, neg_OneOp_Add_InvalidShape)
{
CircleGen cgen;
@@ -234,3 +282,20 @@ TEST_F(GenModelTest, neg_OneOp_Add_VarToVarSize0_InvalidShape)
SUCCEED();
}
+
+TEST_F(GenModelTest, neg_OneOp_Add_VarToVarInt16)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT16}, 1., 2);
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT16}, 2., 3);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT16}, 0.5, -6);
+ cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ // _context->addTestCase(uniformTCD<int8_t>({{1, 3, 2, 4}, {5, -4, -7, 4}}, {{0, -32, -46, 2}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailCompile();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc b/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
index 3e769835e..1a6ded9c7 100644
--- a/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
+++ b/tests/nnfw_api/src/one_op_tests/AveragePool2D.cc
@@ -16,38 +16,96 @@
#include "GenModelTest.h"
-TEST_F(GenModelTest, OneOp_AvgPool2D)
+struct AvgPool2DParam
{
- CircleGen cgen;
- int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
- int out = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
- cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 2, 2, 2, 2,
- circle::ActivationFunctionType_NONE);
- cgen.setInputsAndOutputs({in}, {out});
-
- _context = std::make_unique<GenModelTestContext>(cgen.finish());
- _context->addTestCase(uniformTCD<float>({{1, 3, 2, 4}}, {{2.5}}));
- _context->setBackends({"acl_cl", "acl_neon", "cpu"});
-
- SUCCEED();
-}
+ TestCaseData tcd;
+ std::vector<int32_t> input_shape;
+ std::vector<int32_t> output_shape;
+ struct filter_stride
+ {
+ int32_t filter_w;
+ int32_t filter_h;
+ int32_t stride_w;
+ int32_t stride_h;
+ } param = {1, 1, 1, 1};
+ struct data_type
+ {
+ circle::TensorType data_type;
+ float scale;
+ int64_t zero_point;
+ } type = {circle::TensorType::TensorType_FLOAT32, 0.0f, 0};
+ std::vector<std::string> backend = {"acl_cl", "acl_neon", "cpu"};
+};
+
+class AveragePool2DVariation : public GenModelTest,
+ public ::testing::WithParamInterface<AvgPool2DParam>
+{
+};
-TEST_F(GenModelTest, OneOp_AvgPool2D_Large)
+TEST_P(AveragePool2DVariation, Test)
{
+ auto &param = GetParam();
CircleGen cgen;
- int in = cgen.addTensor({{1, 16, 32, 2}, circle::TensorType::TensorType_FLOAT32});
- int out = cgen.addTensor({{1, 1, 2, 2}, circle::TensorType::TensorType_FLOAT32});
- cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 16, 16, 16, 16,
+
+ int in = cgen.addTensor({param.input_shape, param.type.data_type}, param.type.scale,
+ param.type.zero_point);
+ int out = cgen.addTensor({param.output_shape, param.type.data_type}, param.type.scale,
+ param.type.zero_point);
+ cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, param.param.stride_w,
+ param.param.stride_h, param.param.filter_w, param.param.filter_h,
circle::ActivationFunctionType_NONE);
cgen.setInputsAndOutputs({in}, {out});
_context = std::make_unique<GenModelTestContext>(cgen.finish());
- _context->addTestCase(uniformTCD<float>({std::vector<float>(1024, 99)}, {{99, 99, 99, 99}}));
- _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->addTestCase(param.tcd);
+ _context->setBackends(param.backend);
SUCCEED();
}
+// Test with different input types and values
+INSTANTIATE_TEST_CASE_P(
+ GenModelTest, AveragePool2DVariation,
+ ::testing::Values(
+ // float data
+ AvgPool2DParam{
+ uniformTCD<float>({{1, 3, 2, 4}}, {{2.5}}), {1, 2, 2, 1}, {1, 1, 1, 1}, {2, 2, 2, 2}},
+ // float data - large
+ AvgPool2DParam{uniformTCD<float>({std::vector<float>(18 * 36 * 2, 99)}, {{99, 99, 99, 99}}),
+ {1, 18, 36, 2},
+ {1, 1, 2, 2},
+ {18, 18, 18, 18}},
+ // uint8_t data
+ AvgPool2DParam{uniformTCD<uint8_t>({{2, 6, 4, 8}}, {{5}}),
+ {1, 2, 2, 1},
+ {1, 1, 1, 1},
+ {2, 2, 2, 2},
+ {circle::TensorType::TensorType_UINT8, 1.2, 3}},
+ // uint8_t data - large
+ AvgPool2DParam{
+ uniformTCD<uint8_t>({{std::vector<uint8_t>(18 * 36 * 2, 99)}}, {{99, 99, 99, 99}}),
+ {1, 18, 36, 2},
+ {1, 1, 2, 2},
+ {18, 18, 18, 18},
+ {circle::TensorType::TensorType_UINT8, 1.2, 3}},
+ // int8_t data
+ // TODO enable acl-cl, acl-neon backend
+ AvgPool2DParam{uniformTCD<int8_t>({{2, -6, 4, -8}}, {{-2}}),
+ {1, 2, 2, 1},
+ {1, 1, 1, 1},
+ {2, 2, 2, 2},
+ {circle::TensorType::TensorType_INT8, 2.0, -1},
+ {"cpu"}},
+ // int8_t data - large
+ // TODO enable acl-cl, acl-neon backend
+ AvgPool2DParam{
+ uniformTCD<int8_t>({{std::vector<int8_t>(18 * 36 * 2, -99)}}, {{-99, -99, -99, -99}}),
+ {1, 18, 36, 2},
+ {1, 1, 2, 2},
+ {18, 18, 18, 18},
+ {circle::TensorType::TensorType_INT8, 2.0, -1},
+ {"cpu"}}));
+
TEST_F(GenModelTest, neg_OneOp_AvgPool2D_3DInput)
{
// 3D Tensors are not supported
diff --git a/tests/nnfw_api/src/one_op_tests/Conv2D.cc b/tests/nnfw_api/src/one_op_tests/Conv2D.cc
index 3822263e6..615673892 100644
--- a/tests/nnfw_api/src/one_op_tests/Conv2D.cc
+++ b/tests/nnfw_api/src/one_op_tests/Conv2D.cc
@@ -88,6 +88,54 @@ TEST_F(GenModelTest, OneOp_Conv2D_Dilation)
SUCCEED();
}
+TEST_F(GenModelTest, OneOp_Conv2D_I8)
+{
+ CircleGen cgen;
+ std::vector<int8_t> weight_data{1, 2, 3, 4, 5, 6, 7, 8, 9};
+ uint32_t weight_buf = cgen.addBuffer(weight_data);
+ std::vector<int32_t> bias_data{0, 2, 4};
+ uint32_t bias_buf = cgen.addBuffer(bias_data);
+ int in = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT8}, 0.5, 0);
+ int weight =
+ cgen.addTensor({{3, 1, 1, 3}, circle::TensorType::TensorType_INT8, weight_buf}, 0.5, 0);
+ int bias = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT32, bias_buf}, 1.0, 0);
+ int out = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT8}, 1.0, 0);
+ cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1,
+ circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
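+ // NOTE expected outputs assume the usual quantized-conv convention where the effective
+ // bias scale is input scale * weight scale (0.5 * 0.5 = 0.25 here)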
+ _context->addTestCase(uniformTCD<int8_t>({{10, 10, 10}}, {{15, 38, 61}}));
+ _context->setBackends({"cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Conv2D_I8_PerChannel)
+{
+ CircleGen cgen;
+ std::vector<int8_t> weight_data{1, 2, 3, 1, 2, 3, 7, 8, 9};
+ uint32_t weight_buf = cgen.addBuffer(weight_data);
+ std::vector<int32_t> bias_data{0, 0, 0};
+ uint32_t bias_buf = cgen.addBuffer(bias_data);
+ int in = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT8}, 0.5, 0);
+ std::vector<float> weight_scales = {0.5, 1, 0.5};
+ std::vector<int64_t> weight_zeropoints = {0, 0, 0};
+ int weight = cgen.addTensor({{3, 1, 1, 3}, circle::TensorType::TensorType_INT8, weight_buf},
+ weight_scales, weight_zeropoints);
+ int bias = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT32, bias_buf}, 1.0, 0);
+ int out = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT8}, 1.0, 0);
+ cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1,
+ circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(uniformTCD<int8_t>({{10, 10, 10}}, {{15, 30, 60}}));
+ _context->setBackends({"cpu"});
+
+ SUCCEED();
+}
+
TEST_F(GenModelTest, neg_OneOp_Conv2D_Type)
{
CircleGen cgen;
@@ -150,3 +198,51 @@ TEST_F(GenModelTest, neg_OneOp_Conv2D_Dilation)
SUCCEED();
}
+
+TEST_F(GenModelTest, neg_OneOp_Conv2D_I8_NonZero_ZeroPoint)
+{
+ CircleGen cgen;
+ std::vector<int8_t> weight_data{1, 2, 3, 4, 5, 6, 7, 8, 9};
+ uint32_t weight_buf = cgen.addBuffer(weight_data);
+ std::vector<int32_t> bias_data{0, 2, 4};
+ uint32_t bias_buf = cgen.addBuffer(bias_data);
+ int in = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT8}, 0.5, 0);
+ int weight =
+ cgen.addTensor({{3, 1, 1, 3}, circle::TensorType::TensorType_INT8, weight_buf}, 0.5, 17);
+ int bias = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT32, bias_buf}, 1.0, 0);
+ int out = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT8}, 1.0, 0);
+ cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1,
+ circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Conv2D_I8_NonZero_ZeroPoints)
+{
+ CircleGen cgen;
+ std::vector<int8_t> weight_data{1, 2, 3, 4, 5, 6, 7, 8, 9};
+ uint32_t weight_buf = cgen.addBuffer(weight_data);
+ std::vector<int32_t> bias_data{0, 2, 4};
+ uint32_t bias_buf = cgen.addBuffer(bias_data);
+ int in = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT8}, 0.5, 0);
+ std::vector<float> weight_scales = {0.5, 1, 0.5};
+ std::vector<int64_t> weight_zeropoints = {0, 0, 10};
+ int weight = cgen.addTensor({{3, 1, 1, 3}, circle::TensorType::TensorType_INT8, weight_buf},
+ weight_scales, weight_zeropoints);
+ int bias = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_INT32, bias_buf}, 1.0, 0);
+ int out = cgen.addTensor({{1, 1, 1, 3}, circle::TensorType::TensorType_FLOAT32}, 1.0, 0);
+ cgen.addOperatorConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1,
+ circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc b/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc
index 87c67f10a..57f448b56 100644
--- a/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc
+++ b/tests/nnfw_api/src/one_op_tests/DepthwiseConv2D.cc
@@ -170,8 +170,25 @@ CircleBuffer genNegTestDepthwiseConv2DModel(circle::Padding padding, int stride_
return cgen.finish();
}
-CircleBuffer genSimpleDepthwiseConv2DQuantizedModel(int stride, int input_depth,
- int depth_multiplier)
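+// T is the quantized element type under test (uint8_t or int8_t below)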
+template <typename T> struct DepthwiseConv2DQuantTestParam
+{
+ int stride = 1; // Used for both height and width
+ int input_depth = 1;
+ int depth_multiplier = 1;
+ std::vector<T> ref_output;
+};
+
+template <typename T>
+class DepthwiseConv2DQuantTest
+ : public GenModelTest,
+ public ::testing::WithParamInterface<DepthwiseConv2DQuantTestParam<T>>
+{
+};
+
+using DepthwiseConv2DQuantTestParamU8 = DepthwiseConv2DQuantTestParam<uint8_t>;
+using DepthwiseConv2DQuantTestU8 = DepthwiseConv2DQuantTest<uint8_t>;
+
+CircleBuffer genDepthwiseConv2DQuantU8Model(int stride, int input_depth, int depth_multiplier)
{
assert(1 <= stride && stride <= 2);
assert(1 <= input_depth && input_depth <= 16);
@@ -198,20 +215,7 @@ CircleBuffer genSimpleDepthwiseConv2DQuantizedModel(int stride, int input_depth,
return cgen.finish();
}
-struct DepthwiseConv2DVariationParam
-{
- int stride = 1; // Used for both height and width
- int input_depth = 1;
- int depth_multiplier = 1;
- std::vector<uint8_t> ref_output;
-};
-
-class DepthwiseConv2DVariation : public GenModelTest,
- public ::testing::WithParamInterface<DepthwiseConv2DVariationParam>
-{
-};
-
-TEST_P(DepthwiseConv2DVariation, Test)
+TEST_P(DepthwiseConv2DQuantTestU8, Test)
{
// Same input is used for all tests but output differs
static const std::vector<uint8_t> input64{
@@ -219,8 +223,8 @@ TEST_P(DepthwiseConv2DVariation, Test)
2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 3, 5, 8, 8, 5, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2};
auto &param = GetParam();
- _context = std::make_unique<GenModelTestContext>(genSimpleDepthwiseConv2DQuantizedModel(
- param.stride, param.input_depth, param.depth_multiplier));
+ _context = std::make_unique<GenModelTestContext>(
+ genDepthwiseConv2DQuantU8Model(param.stride, param.input_depth, param.depth_multiplier));
std::vector<uint8_t> ref_input(input64.begin(), input64.begin() + param.input_depth * 4);
_context->addTestCase(uniformTCD<uint8_t>({ref_input}, {param.ref_output}));
_context->setBackends({"acl_cl", "acl_neon", "cpu"});
@@ -231,39 +235,122 @@ TEST_P(DepthwiseConv2DVariation, Test)
// Test with different InputDepth and DepthMultiplier. The values are intended to test optimized CPU
// kernels.
INSTANTIATE_TEST_CASE_P(
- GenModelTest, DepthwiseConv2DVariation,
+ GenModelTest, DepthwiseConv2DQuantTestU8,
::testing::Values(
// Stride == 1
- DepthwiseConv2DVariationParam{1, 8, 1, std::vector<uint8_t>{0, 3, 5, 8, 0, 3, 5, 8}},
- DepthwiseConv2DVariationParam{1, 4, 2, std::vector<uint8_t>{0, 0, 2, 3, 0, 2, 6, 9}},
- DepthwiseConv2DVariationParam{
+ DepthwiseConv2DQuantTestParamU8{1, 8, 1, std::vector<uint8_t>{0, 3, 5, 8, 0, 3, 5, 8}},
+ DepthwiseConv2DQuantTestParamU8{1, 4, 2, std::vector<uint8_t>{0, 0, 2, 3, 0, 2, 6, 9}},
+ DepthwiseConv2DQuantTestParamU8{
1, 2, 8, std::vector<uint8_t>{0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 4, 6, 0, 2, 4, 6}},
- DepthwiseConv2DVariationParam{1, 2, 2, std::vector<uint8_t>{0, 1, 4, 6}},
- DepthwiseConv2DVariationParam{1, 2, 1, std::vector<uint8_t>{2, 5}},
- DepthwiseConv2DVariationParam{1, 1, 2, std::vector<uint8_t>{2, 4}},
- DepthwiseConv2DVariationParam{1, 1, 4, std::vector<uint8_t>{0, 2, 3, 5}},
- DepthwiseConv2DVariationParam{1, 4, 1, std::vector<uint8_t>{0, 1, 4, 9}},
- DepthwiseConv2DVariationParam{
+ DepthwiseConv2DQuantTestParamU8{1, 2, 2, std::vector<uint8_t>{0, 1, 4, 6}},
+ DepthwiseConv2DQuantTestParamU8{1, 2, 1, std::vector<uint8_t>{2, 5}},
+ DepthwiseConv2DQuantTestParamU8{1, 1, 2, std::vector<uint8_t>{2, 4}},
+ DepthwiseConv2DQuantTestParamU8{1, 1, 4, std::vector<uint8_t>{0, 2, 3, 5}},
+ DepthwiseConv2DQuantTestParamU8{1, 4, 1, std::vector<uint8_t>{0, 1, 4, 9}},
+ DepthwiseConv2DQuantTestParamU8{
1, 4, 4, std::vector<uint8_t>{0, 0, 0, 0, 0, 1, 2, 3, 0, 2, 4, 6, 0, 3, 6, 9}},
- DepthwiseConv2DVariationParam{1, 12, 1,
- std::vector<uint8_t>{0, 3, 7, 12, 0, 4, 7, 12, 0, 4, 9, 16}},
+ DepthwiseConv2DQuantTestParamU8{1, 12, 1,
+ std::vector<uint8_t>{0, 3, 7, 12, 0, 4, 7, 12, 0, 4, 9, 16}},
// Stride == 2
- DepthwiseConv2DVariationParam{2, 4, 1, std::vector<uint8_t>{0, 1, 4, 9}},
- DepthwiseConv2DVariationParam{2, 2, 1, std::vector<uint8_t>{2, 5}},
- DepthwiseConv2DVariationParam{2, 1, 8, std::vector<uint8_t>{0, 2, 3, 5, 0, 2, 3, 5}},
- DepthwiseConv2DVariationParam{2, 1, 32, std::vector<uint8_t>{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3,
- 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2,
- 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}},
- DepthwiseConv2DVariationParam{
+ DepthwiseConv2DQuantTestParamU8{2, 4, 1, std::vector<uint8_t>{0, 1, 4, 9}},
+ DepthwiseConv2DQuantTestParamU8{2, 2, 1, std::vector<uint8_t>{2, 5}},
+ DepthwiseConv2DQuantTestParamU8{2, 1, 8, std::vector<uint8_t>{0, 2, 3, 5, 0, 2, 3, 5}},
+ DepthwiseConv2DQuantTestParamU8{2, 1, 32, std::vector<uint8_t>{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3,
+ 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2,
+ 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}},
+ DepthwiseConv2DQuantTestParamU8{
2, 1, 20, std::vector<uint8_t>{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}},
- DepthwiseConv2DVariationParam{
+ DepthwiseConv2DQuantTestParamU8{
2, 1, 16, std::vector<uint8_t>{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}},
- DepthwiseConv2DVariationParam{2, 8, 1, std::vector<uint8_t>{0, 3, 5, 8, 0, 3, 5, 8}},
- DepthwiseConv2DVariationParam{
+ DepthwiseConv2DQuantTestParamU8{2, 8, 1, std::vector<uint8_t>{0, 3, 5, 8, 0, 3, 5, 8}},
+ DepthwiseConv2DQuantTestParamU8{
2, 8, 2, std::vector<uint8_t>{0, 3, 5, 8, 0, 3, 5, 8, 0, 3, 5, 8, 0, 3, 5, 8}},
- DepthwiseConv2DVariationParam{
+ DepthwiseConv2DQuantTestParamU8{
2, 16, 1, std::vector<uint8_t>{0, 3, 8, 16, 0, 4, 7, 12, 0, 3, 7, 13, 0, 4, 7, 12}}));
+using DepthwiseConv2DQuantTestParamI8 = DepthwiseConv2DQuantTestParam<int8_t>;
+using DepthwiseConv2DQuantTestI8 = DepthwiseConv2DQuantTest<int8_t>;
+
+CircleBuffer genDepthwiseConv2DQuantI8Model(int stride, int input_depth, int depth_multiplier)
+{
+ assert(1 <= stride && stride <= 2);
+ assert(1 <= input_depth && input_depth <= 16);
+ assert(1 <= depth_multiplier && depth_multiplier <= 32);
+
+ const int output_depth = input_depth * depth_multiplier;
+ assert(1 <= output_depth && output_depth <= 32);
+
+ CircleGen cgen;
+ uint32_t ker_buf = cgen.addBuffer(std::vector<int8_t>{
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
+ 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1,
+ 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3});
+ uint32_t bias_buf = cgen.addBuffer(std::vector<int32_t>(output_depth, 0));
+ int in = cgen.addTensor({{1, 2, 2, input_depth}, circle::TensorType_INT8}, 0.5, 0);
+ int ker = cgen.addTensor({{1, 2, 2, output_depth}, circle::TensorType_INT8, ker_buf}, 0.5, 0);
+ int bias = cgen.addTensor({{output_depth}, circle::TensorType_INT32, bias_buf}, 0.25, 0);
+ int out = cgen.addTensor({{1, 1, 1, output_depth}, circle::TensorType_INT8}, 1, 0);
+ cgen.addOperatorDepthwiseConv2D({{in, ker, bias}, {out}}, circle::Padding::Padding_VALID, stride,
+ stride, depth_multiplier, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+ return cgen.finish();
+}
+
+TEST_P(DepthwiseConv2DQuantTestI8, Test)
+{
+ // Same input is used for all tests but output differs
+ static const std::vector<int8_t> input64{
+ 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2, 5, 4, 3, 2,
+ 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 4, 6, 8, 2, 3, 5, 8, 8, 5, 3, 2, 1, 2, 3, 4, 5, 4, 3, 2};
+
+ auto &param = GetParam();
+ _context = std::make_unique<GenModelTestContext>(
+ genDepthwiseConv2DQuantI8Model(param.stride, param.input_depth, param.depth_multiplier));
+ std::vector<int8_t> ref_input(input64.begin(), input64.begin() + param.input_depth * 4);
+ _context->addTestCase(uniformTCD<int8_t>({ref_input}, {param.ref_output}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+// Test with different InputDepth and DepthMultiplier. The values are intended to test optimized CPU
+// kernels.
+INSTANTIATE_TEST_CASE_P(
+ GenModelTest, DepthwiseConv2DQuantTestI8,
+ ::testing::Values(
+ // Stride == 1
+ DepthwiseConv2DQuantTestParamI8{1, 8, 1, std::vector<int8_t>{0, 3, 5, 8, 0, 3, 5, 8}},
+ DepthwiseConv2DQuantTestParamI8{1, 4, 2, std::vector<int8_t>{0, 0, 2, 3, 0, 2, 6, 9}},
+ DepthwiseConv2DQuantTestParamI8{
+ 1, 2, 8, std::vector<int8_t>{0, 1, 2, 3, 0, 1, 2, 3, 0, 2, 4, 6, 0, 2, 4, 6}},
+ DepthwiseConv2DQuantTestParamI8{1, 2, 2, std::vector<int8_t>{0, 1, 4, 6}},
+ DepthwiseConv2DQuantTestParamI8{1, 2, 1, std::vector<int8_t>{2, 5}},
+ DepthwiseConv2DQuantTestParamI8{1, 1, 2, std::vector<int8_t>{2, 4}},
+ DepthwiseConv2DQuantTestParamI8{1, 1, 4, std::vector<int8_t>{0, 2, 3, 5}},
+ DepthwiseConv2DQuantTestParamI8{1, 4, 1, std::vector<int8_t>{0, 1, 4, 9}},
+ DepthwiseConv2DQuantTestParamI8{
+ 1, 4, 4, std::vector<int8_t>{0, 0, 0, 0, 0, 1, 2, 3, 0, 2, 4, 6, 0, 3, 6, 9}},
+ DepthwiseConv2DQuantTestParamI8{1, 12, 1,
+ std::vector<int8_t>{0, 3, 7, 12, 0, 4, 7, 12, 0, 4, 9, 16}},
+ // Stride == 2
+ DepthwiseConv2DQuantTestParamI8{2, 4, 1, std::vector<int8_t>{0, 1, 4, 9}},
+ DepthwiseConv2DQuantTestParamI8{2, 2, 1, std::vector<int8_t>{2, 5}},
+ DepthwiseConv2DQuantTestParamI8{2, 1, 8, std::vector<int8_t>{0, 2, 3, 5, 0, 2, 3, 5}},
+ DepthwiseConv2DQuantTestParamI8{2, 1, 32, std::vector<int8_t>{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3,
+ 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2,
+ 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}},
+ DepthwiseConv2DQuantTestParamI8{
+ 2, 1, 20, std::vector<int8_t>{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}},
+ DepthwiseConv2DQuantTestParamI8{
+ 2, 1, 16, std::vector<int8_t>{0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5, 0, 2, 3, 5}},
+ DepthwiseConv2DQuantTestParamI8{2, 8, 1, std::vector<int8_t>{0, 3, 5, 8, 0, 3, 5, 8}},
+ DepthwiseConv2DQuantTestParamI8{
+ 2, 8, 2, std::vector<int8_t>{0, 3, 5, 8, 0, 3, 5, 8, 0, 3, 5, 8, 0, 3, 5, 8}},
+ DepthwiseConv2DQuantTestParamI8{
+ 2, 16, 1, std::vector<int8_t>{0, 3, 8, 16, 0, 4, 7, 12, 0, 3, 7, 13, 0, 4, 7, 12}}));
+
TEST_F(GenModelTest, neg_OneOp_DepthwiseConv2D_InvalidPaddingType)
{
_context = std::make_unique<GenModelTestContext>(genNegTestDepthwiseConv2DModel(
@@ -275,3 +362,27 @@ TEST_F(GenModelTest, neg_OneOp_DepthwiseConv2D_InvalidPaddingType)
}
// TODO add other invalid operation tests like above
+
+TEST_F(GenModelTest, neg_OneOp_DepthwiseConv2D_I8_NonZero_ZeroPoints)
+{
+ CircleGen cgen;
+ std::vector<int8_t> weight_data{1, 2, 3, 4, 5, 6, 7, 8};
+ uint32_t weight_buf = cgen.addBuffer(weight_data);
+ std::vector<int32_t> bias_data{0, 2};
+ uint32_t bias_buf = cgen.addBuffer(bias_data);
+ int in = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_INT8}, 0.5, 0);
+ std::vector<float> weight_scales = {0.5, 1};
+ std::vector<int64_t> weight_zeropoints = {0, 10};
+ int weight = cgen.addTensor({{1, 2, 2, 2}, circle::TensorType::TensorType_INT8, weight_buf},
+ weight_scales, weight_zeropoints);
+ int bias = cgen.addTensor({{1, 1, 1, 2}, circle::TensorType::TensorType_INT32, bias_buf});
+ int out = cgen.addTensor({{1, 2, 2, 2}, circle::TensorType::TensorType_FLOAT32}, 1.0, 0);
+ cgen.addOperatorDepthwiseConv2D({{in, weight, bias}, {out}}, circle::Padding_VALID, 1, 1, 2,
+ circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Mul.cc b/tests/nnfw_api/src/one_op_tests/Mul.cc
new file mode 100644
index 000000000..0c7944613
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Mul.cc
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_Mul_Uint8_VarVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 3);
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 2.0, 1);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 0.5, 2);
+ cgen.addOperatorMul({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
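+ // Expected values follow dequantize-multiply-requantize, e.g. the second
+ // element: (12 - 3) * 1.0 * ((4 - 1) * 2.0) = 54 -> 54 / 0.5 + 2 = 110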
+ _context->addTestCase(uniformTCD<uint8_t>({{3, 12, 5, 2}, {5, 4, 7, 0}}, {{2, 110, 50, 6}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Mul_Int8_VarVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 1.0, 2);
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 2.0, 3);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 0.5, -6);
+ cgen.addOperatorMul({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
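+ // E.g. the first element: (1 - 2) * 1.0 * ((5 - 3) * 2.0) = -4 -> -4 / 0.5 - 6 = -14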
+ _context->addTestCase(uniformTCD<int8_t>({{1, 3, 2, 4}, {5, -4, -7, 4}}, {{-14, -34, -6, 2}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_MulBroadcast_Uint8_VarVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 3);
+ int rhs = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_UINT8}, 2.0, 1);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 0.5, 2);
+ cgen.addOperatorMul({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
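+ // The scalar rhs dequantizes to (5 - 1) * 2.0 = 8,
+ // e.g. the second element: (12 - 3) * 1.0 * 8 = 72 -> 72 / 0.5 + 2 = 146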
+ _context->addTestCase(uniformTCD<uint8_t>({{3, 12, 5, 4}, {5}}, {{2, 146, 34, 18}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_MulBroadcast_Int8_VarVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 1.0, 2);
+ int rhs = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_INT8}, 2.0, 3);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 0.5, -6);
+ cgen.addOperatorMul({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(uniformTCD<int8_t>({{1, 3, 2, 4}, {5}}, {{-14, 2, -6, 10}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Mul_InvalidType)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 0.1, 2);
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorMul({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Mul_InvalidShape)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int rhs = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorMul({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailCompile();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Mul_OneOperand)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorMul({{in}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Mul_ThreeOperands)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorMul({{in, in, in}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Pad.cc b/tests/nnfw_api/src/one_op_tests/Pad.cc
index 380c1a3cd..42971da79 100644
--- a/tests/nnfw_api/src/one_op_tests/Pad.cc
+++ b/tests/nnfw_api/src/one_op_tests/Pad.cc
@@ -16,25 +16,56 @@
#include "GenModelTest.h"
-TEST_F(GenModelTest, OneOp_Pad)
+// Input shape: {1, 2, 2, 1}
+// Padding: {0, 0, 1, 1, 1, 1, 0, 0}
+// Output shape: {1, 4, 4, 1}
+struct PadParam
{
+ TestCaseData tcd;
+ circle::TensorType data_type = circle::TensorType::TensorType_FLOAT32;
+ float scale = 0.0f;
+ int64_t zero_point = 0;
+};
+
+class PadVariation : public GenModelTest, public ::testing::WithParamInterface<PadParam>
+{
+};
+
+TEST_P(PadVariation, Test)
+{
+ auto &param = GetParam();
+
CircleGen cgen;
- int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int in = cgen.addTensor({{1, 2, 2, 1}, param.data_type}, param.scale, param.zero_point);
std::vector<int32_t> padding_data{0, 0, 1, 1, 1, 1, 0, 0};
uint32_t padding_buf = cgen.addBuffer(padding_data);
int padding = cgen.addTensor({{4, 2}, circle::TensorType::TensorType_INT32, padding_buf});
- int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 4, 4, 1}, param.data_type}, param.scale, param.zero_point);
cgen.addOperatorPad({{in, padding}, {out}});
cgen.setInputsAndOutputs({in}, {out});
_context = std::make_unique<GenModelTestContext>(cgen.finish());
- _context->addTestCase(
- uniformTCD<float>({{1, 2, 3, 4}}, {{0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0}}));
+ _context->addTestCase(param.tcd);
_context->setBackends({"acl_cl", "acl_neon", "cpu"});
SUCCEED();
}
+// Test with different value types
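+// For the quantized variants the pad value is the zero point (real 0.0):
+// uint8 (zero point 8) pads with 8, int8 (zero point -5) pads with -5.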
+INSTANTIATE_TEST_CASE_P(
+ GenModelTest, PadVariation,
+ ::testing::Values(
+ // float value
+ PadParam{uniformTCD<float>({{1, 2, 3, 4}}, {{0, 0, 0, 0, 0, 1, 2, 0, 0, 3, 4, 0, 0, 0, 0, 0}})},
+ // uint8 value
+ PadParam{
+ uniformTCD<uint8_t>({{1, 2, 3, 4}}, {{8, 8, 8, 8, 8, 1, 2, 8, 8, 3, 4, 8, 8, 8, 8, 8}}),
+ circle::TensorType::TensorType_UINT8, 1.0, 8},
+ // int8 value
+ PadParam{uniformTCD<int8_t>({{-2, -1, 1, 2}},
+ {{-5, -5, -5, -5, -5, -2, -1, -5, -5, 1, 2, -5, -5, -5, -5, -5}}),
+ circle::TensorType::TensorType_INT8, 1.0, -5}));
+
TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadRank)
{
CircleGen cgen;
@@ -91,3 +122,39 @@ TEST_F(GenModelTest, neg_OneOp_Pad_InvalidPadDim1)
SUCCEED();
}
+
+TEST_F(GenModelTest, neg_OneOp_Pad_Type)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ std::vector<int32_t> padding_data{0, 0, 1, 1, 1, 1, 0, 0};
+ uint32_t padding_buf = cgen.addBuffer(padding_data);
+ int padding = cgen.addTensor({{4, 2}, circle::TensorType::TensorType_INT32, padding_buf});
+ int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 1);
+
+ cgen.addOperatorPad({{in, padding}, {out}});
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Pad_QuantParam)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 1);
+ std::vector<int32_t> padding_data{0, 0, 1, 1, 1, 1, 0, 0};
+ uint32_t padding_buf = cgen.addBuffer(padding_data);
+ int padding = cgen.addTensor({{4, 2}, circle::TensorType::TensorType_INT32, padding_buf});
+ int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 3);
+
+ cgen.addOperatorPad({{in, padding}, {out}});
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/PadV2.cc b/tests/nnfw_api/src/one_op_tests/PadV2.cc
index f9fe5f644..3db2187b2 100644
--- a/tests/nnfw_api/src/one_op_tests/PadV2.cc
+++ b/tests/nnfw_api/src/one_op_tests/PadV2.cc
@@ -112,3 +112,49 @@ TEST_F(GenModelTest, neg_OneOp_PadV2_InvalidPadDim1)
SUCCEED();
}
+
+TEST_F(GenModelTest, neg_OneOp_PadV2_Type)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+ std::vector<int32_t> padding_data{1, 1, 1, 1};
+ uint32_t padding_buf = cgen.addBuffer(padding_data);
+ int padding = cgen.addTensor({{4, 2}, circle::TensorType::TensorType_INT32, padding_buf});
+ std::vector<uint8_t> padding_value_data{3};
+ uint32_t padding_value_buf = cgen.addBuffer(padding_value_data);
+ int padding_value =
+ cgen.addTensor({{1}, circle::TensorType::TensorType_UINT8, padding_value_buf}, 1.0, 1);
+
+ int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_FLOAT32});
+
+ cgen.addOperatorPadV2({{in, padding, padding_value}, {out}});
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_PadV2_QuantParam)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 2);
+ std::vector<int32_t> padding_data{1, 1, 1, 1};
+ uint32_t padding_buf = cgen.addBuffer(padding_data);
+ int padding = cgen.addTensor({{4, 2}, circle::TensorType::TensorType_INT32, padding_buf});
+ std::vector<uint8_t> padding_value_data{3};
+ uint32_t padding_value_buf = cgen.addBuffer(padding_value_data);
+ int padding_value =
+ cgen.addTensor({{1}, circle::TensorType::TensorType_UINT8, padding_value_buf}, 1.0, 1);
+
+ int out = cgen.addTensor({{1, 4, 4, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 1);
+
+ cgen.addOperatorPadV2({{in, padding, padding_value}, {out}});
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Quantize.cc b/tests/nnfw_api/src/one_op_tests/Quantize.cc
new file mode 100644
index 000000000..5ab4d6297
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Quantize.cc
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+CircleGen genSimpleQuantizeModel(circle::TensorType from_t, float input_scale, int input_zeropoint,
+ circle::TensorType to_t, float output_scale, int output_zeropoint)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 4, 4, 1}, from_t}, input_scale, input_zeropoint);
+ int out = cgen.addTensor({{1, 4, 4, 1}, to_t}, output_scale, output_zeropoint);
+ cgen.addOperatorQuantize({{in}, {out}});
+ cgen.setInputsAndOutputs({in}, {out});
+ return cgen;
+}
+
+TEST_F(GenModelTest, OneOp_Quantize_Uint8toInt8)
+{
+ CircleGen cgen =
+ genSimpleQuantizeModel(circle::TensorType_UINT8, 1., 128, circle::TensorType_INT8, 2., -10);
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
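+ // Requantization: out_q = round((in_q - in_zp) * in_scale / out_scale) + out_zp,
+ // e.g. 48 -> (48 - 128) * 1.0 / 2.0 + (-10) = -50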
+ _context->addTestCase(
+ TestCaseData{}
+ .addInput<uint8_t>({127, 48, 151, 232, 56, 176, 47, 37, 51, 52, 39, 94, 15, 108, 142, 243})
+ .addOutput<int8_t>(
+ {-10, -50, 2, 42, -46, 14, -50, -55, -48, -48, -54, -27, -66, -20, -3, 48}));
+ _context->setBackends({"cpu"});
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Quantize_Int8toUint8)
+{
+ CircleGen cgen =
+ genSimpleQuantizeModel(circle::TensorType_INT8, 2., -10, circle::TensorType_UINT8, 1., 128);
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(
+ TestCaseData{}
+ .addInput<int8_t>({-10, -50, 2, 42, -46, 14, -50, -55, -48, -48, -54, -27, -66, -20, -3, 48})
+ .addOutput<uint8_t>({128, 48, 152, 232, 56, 176, 48, 38, 52, 52, 40, 94, 16, 108, 142, 244}));
+ _context->setBackends({"cpu"});
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Quantize_Uint8toInt16)
+{
+ CircleGen cgen =
+ genSimpleQuantizeModel(circle::TensorType_UINT8, 1., 128, circle::TensorType_INT16, 2., -10);
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Quantize_Int8toInt16)
+{
+ CircleGen cgen =
+ genSimpleQuantizeModel(circle::TensorType_INT8, 2., -10, circle::TensorType_INT16, 1., 128);
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc b/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc
index 20320a0d3..5db08f168 100644
--- a/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc
+++ b/tests/nnfw_api/src/one_op_tests/ResizeBilinear.cc
@@ -18,25 +18,52 @@
#include <memory>
-TEST_F(GenModelTest, OneOp_ResizeBilinear_SizeToConst)
+struct ResizeBilinearParam
{
+ TestCaseData tcd;
+ circle::TensorType data_type = circle::TensorType::TensorType_FLOAT32;
+ float scale = 0.0f;
+ int64_t zero_point = 0;
+};
+
+class ResizeBilinearVariation : public GenModelTest,
+ public ::testing::WithParamInterface<ResizeBilinearParam>
+{
+};
+
+TEST_P(ResizeBilinearVariation, Test)
+{
+ auto &param = GetParam();
+
CircleGen cgen;
std::vector<int32_t> size_data{3, 3};
uint32_t size_buf = cgen.addBuffer(size_data);
int size = cgen.addTensor({{1}, circle::TensorType::TensorType_INT32, size_buf});
- int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
- int out = cgen.addTensor({{1, 3, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ int in = cgen.addTensor({{1, 2, 2, 1}, param.data_type}, param.scale, param.zero_point);
+ int out = cgen.addTensor({{1, 3, 3, 1}, param.data_type}, param.scale, param.zero_point);
cgen.addOperatorResizeBilinear({{in, size}, {out}});
cgen.setInputsAndOutputs({in}, {out});
_context = std::make_unique<GenModelTestContext>(cgen.finish());
- _context->addTestCase(
- uniformTCD<float>({{1, 1, 2, 2}}, {{1, 1, 1, 1.666666667, 1.666666667, 1.666666667, 2, 2, 2}}));
+ _context->addTestCase(param.tcd);
_context->setBackends({"acl_cl", "acl_neon", "cpu"});
SUCCEED();
}
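+// Test with different value types. The quantized variants use scale 1.0 and
+// zero point 0, so the quantized values equal the rounded real results.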
+INSTANTIATE_TEST_CASE_P(
+ GenModelTest, ResizeBilinearVariation,
+ ::testing::Values(
+ // float value
+ ResizeBilinearParam{uniformTCD<float>({{1, 1, 2, 2}}, {{1, 1, 1, 1.666666667, 1.666666667,
+ 1.666666667, 2, 2, 2}})},
+ // uint8 value
+ ResizeBilinearParam{uniformTCD<uint8_t>({{3, 6, 9, 12}}, {{3, 5, 6, 7, 9, 10, 9, 11, 12}}),
+ circle::TensorType::TensorType_UINT8, 1.0, 0},
+ // int8 value
+ ResizeBilinearParam{uniformTCD<int8_t>({{-6, -3, 9, 12}}, {{-6, -4, -3, 4, 6, 7, 9, 11, 12}}),
+ circle::TensorType::TensorType_INT8, 1.0, 0}));
+
TEST_F(GenModelTest, OneOp_ResizeBilinear_SizeToVar)
{
CircleGen cgen;
diff --git a/tests/nnfw_api/src/one_op_tests/Slice.cc b/tests/nnfw_api/src/one_op_tests/Slice.cc
new file mode 100644
index 000000000..960cd88e3
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Slice.cc
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+struct SliceVariationParam
+{
+ std::vector<int32_t> input_shape;
+ std::vector<int32_t> begins;
+ std::vector<int32_t> sizes;
+ TestCaseData tcd;
+
+ circle::TensorType input_type = circle::TensorType::TensorType_FLOAT32;
+ float scale = 0.0f;
+ int64_t zero_point = 0;
+ circle::TensorType begins_type = circle::TensorType::TensorType_INT32;
+};
+
+class SliceVariation : public GenModelTest,
+ public ::testing::WithParamInterface<SliceVariationParam>
+{
+};
+
+TEST_P(SliceVariation, Test)
+{
+ auto &param = GetParam();
+
+ CircleGen cgen;
+
+ int in = cgen.addTensor({param.input_shape, param.input_type}, param.scale, param.zero_point);
+ int out = cgen.addTensor({param.sizes, param.input_type}, param.scale, param.zero_point);
+ if (param.begins_type == circle::TensorType::TensorType_INT32)
+ {
+ uint32_t begins_buf = cgen.addBuffer(param.begins);
+ int rank = param.begins.size();
+ int begins = cgen.addTensor({{rank}, param.begins_type, begins_buf});
+
+ uint32_t sizes_buf = cgen.addBuffer(param.sizes);
+ int sizes = cgen.addTensor({{rank}, param.begins_type, sizes_buf});
+
+ cgen.addOperatorSlice({{in, begins, sizes}, {out}});
+ }
+ else if (param.begins_type == circle::TensorType::TensorType_INT64)
+ {
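+ // Widen the int32 begins/sizes test parameters to int64 buffers for the INT64 variant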
+ std::vector<int64_t> begins_64(param.begins.size());
+ std::vector<int64_t> sizes_64(param.sizes.size());
+ for (size_t i = 0; i < param.begins.size(); i++)
+ {
+ begins_64[i] = param.begins[i];
+ sizes_64[i] = param.sizes[i];
+ }
+
+ uint32_t begins_buf = cgen.addBuffer(begins_64);
+ int rank = param.begins.size();
+ int begins = cgen.addTensor({{rank}, param.begins_type, begins_buf});
+
+ uint32_t sizes_buf = cgen.addBuffer(sizes_64);
+ int sizes = cgen.addTensor({{rank}, param.begins_type, sizes_buf});
+
+ cgen.addOperatorSlice({{in, begins, sizes}, {out}});
+ }
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(param.tcd);
+
+ // ACL backends don't support int64 yet
+ if (param.begins_type == circle::TensorType::TensorType_INT64)
+ {
+ _context->setBackends({"cpu"});
+ }
+ else
+ {
+ _context->setBackends({"cpu", "acl_cl", "acl_neon"});
+ }
+
+ SUCCEED();
+}
+
+INSTANTIATE_TEST_CASE_P(
+ GenModelTest, SliceVariation,
+ ::testing::Values(
+ SliceVariationParam{
+ {2, 2, 3, 1},
+ {0, 1, 1, 0},
+ {1, 1, 2, 1},
+ uniformTCD<float>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}})},
+ SliceVariationParam{
+ {2, 2, 3, 1},
+ {0, 1, 1, 0},
+ {1, 1, 2, 1},
+ uniformTCD<uint8_t>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}}),
+ circle::TensorType::TensorType_UINT8,
+ 1,
+ 0},
+ SliceVariationParam{
+ {2, 2, 3, 1},
+ {0, 1, 1, 0},
+ {1, 1, 2, 1},
+ uniformTCD<float>({{1, 2, 3, 11, 12, 13, 21, 22, 23, 31, 32, 33}}, {{12, 13}}),
+ circle::TensorType::TensorType_FLOAT32,
+ 0,
+ 0,
+ circle::TensorType::TensorType_INT64}));
+
+TEST_F(GenModelTest, neg_OneOp_Slice_Type)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+ std::vector<float> begins_data = {0, 0, 1, 0};
+ uint32_t begins_buf = cgen.addBuffer(begins_data);
+ int begins = cgen.addTensor({{4}, circle::TensorType::TensorType_FLOAT32, begins_buf});
+ std::vector<float> sizes_data = {1, 2, 1, 1};
+ uint32_t sizes_buf = cgen.addBuffer(sizes_data);
+ int sizes = cgen.addTensor({{4}, circle::TensorType::TensorType_FLOAT32, sizes_buf});
+ int out = cgen.addTensor({{1, 2, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorSlice({{in, begins, sizes}, {out}});
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Slice_DiffType)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 3, 3, 2}, circle::TensorType::TensorType_FLOAT32});
+ std::vector<int32_t> begins_data = {0, 0, 1, 0};
+ uint32_t begins_buf = cgen.addBuffer(begins_data);
+ int begins = cgen.addTensor({{4}, circle::TensorType::TensorType_INT32, begins_buf});
+ std::vector<int64_t> sizes_data = {1, 2, 1, 1};
+ uint32_t sizes_buf = cgen.addBuffer(sizes_data);
+ int sizes = cgen.addTensor({{4}, circle::TensorType::TensorType_INT64, sizes_buf});
+ int out = cgen.addTensor({{1, 2, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorSlice({{in, begins, sizes}, {out}});
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Softmax.cc b/tests/nnfw_api/src/one_op_tests/Softmax.cc
new file mode 100644
index 000000000..80fd17b12
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Softmax.cc
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+// beta = 0.1
+// input/output shape: {1, 2, 1, 4}
+struct SoftmaxParam
+{
+ TestCaseData tcd;
+ circle::TensorType data_type = circle::TensorType::TensorType_FLOAT32;
+ float input_scale = 0.0f;
+ int64_t input_zero_point = 0;
+};
+
+class SoftmaxVariation : public GenModelTest, public ::testing::WithParamInterface<SoftmaxParam>
+{
+};
+
+TEST_P(SoftmaxVariation, Test)
+{
+ auto &param = GetParam();
+
+ CircleGen cgen;
+
+ // NNAPI spec and tflite test use fixed output scale and zero-point
+ float out_scale = 0.0;
+ int64_t out_zero_point = 0;
+ if (param.data_type == circle::TensorType::TensorType_UINT8)
+ {
+ out_scale = 1.0f / 256;
+ }
+ else if (param.data_type == circle::TensorType::TensorType_INT8)
+ {
+ out_scale = 1.0f / 256;
+ out_zero_point = -128;
+ }
+
+ int input =
+ cgen.addTensor({{1, 2, 1, 4}, param.data_type}, param.input_scale, param.input_zero_point);
+ int out = cgen.addTensor({{1, 2, 1, 4}, param.data_type}, out_scale, out_zero_point);
+ cgen.addOperatorSoftmax({{input}, {out}}, 0.1);
+ cgen.setInputsAndOutputs({input}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(param.tcd);
+ _context->setBackends({"cpu", "acl_neon", "acl_cl"});
+
+ SUCCEED();
+}
+
+// Test with different value types
+INSTANTIATE_TEST_CASE_P(
+ GenModelTest, SoftmaxVariation,
+ ::testing::Values(
+ // float value
+ SoftmaxParam{
+ uniformTCD<float>({{0, -6, 2, 4, 3, -2, 10, 1}},
+ {{.23463, .12877, .28658, .35003, .22528, .13664, .45365, .18443}})},
+ // uint8 value
+ SoftmaxParam{
+ uniformTCD<uint8_t>({{10, 4, 12, 14, 13, 8, 20, 11}}, {{60, 33, 73, 90, 58, 35, 116, 47}}),
+ circle::TensorType::TensorType_UINT8, 1.0, 10},
+ // int8 value
+ SoftmaxParam{
+ uniformTCD<int8_t>({{0, -6, 2, 4, 3, -2, 10, 1}}, {{-68, -95, -55, -38, -70, -93, -12, -81}}),
+ circle::TensorType::TensorType_INT8, 1.0, 0}));
+
+TEST_F(GenModelTest, neg_OneOp_Softmax_Type)
+{
+ CircleGen cgen;
+ int input = cgen.addTensor({{1, 2, 1, 4}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 2, 1, 4}, circle::TensorType::TensorType_INT8}, 1.0, 0);
+ cgen.addOperatorSoftmax({{input}, {out}}, 0.1);
+ cgen.setInputsAndOutputs({input}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/Sub.cc b/tests/nnfw_api/src/one_op_tests/Sub.cc
new file mode 100644
index 000000000..bb4fecd2d
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/Sub.cc
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "GenModelTest.h"
+
+#include <memory>
+
+TEST_F(GenModelTest, OneOp_Sub_Uint8_VarVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 3);
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 2.0, 1);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 0.5, 2);
+ cgen.addOperatorSub({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
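+ // Dequantize-subtract-requantize, e.g. the first element:
+ // (13 - 3) * 1.0 - (5 - 1) * 2.0 = 2 -> 2 / 0.5 + 2 = 6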
+ _context->addTestCase(uniformTCD<uint8_t>({{13, 12, 25, 40}, {5, 4, 7, 0}}, {{6, 8, 22, 80}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_Sub_Int8_VarVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 1.0, 2);
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 2.0, 3);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 0.5, -6);
+ cgen.addOperatorSub({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
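+ // E.g. the first element: (1 - 2) * 1.0 - (5 - 3) * 2.0 = -5 -> -5 / 0.5 - 6 = -16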
+ _context->addTestCase(uniformTCD<int8_t>({{1, 3, 2, 4}, {5, -4, -7, 4}}, {{-16, 24, 34, -6}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_SubBroadcast_Uint8_VarVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 1.0, 3);
+ int rhs = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_UINT8}, 2.0, 1);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 0.5, 2);
+ cgen.addOperatorSub({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(uniformTCD<uint8_t>({{13, 12, 25, 40}, {5}}, {{6, 4, 30, 60}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, OneOp_SubBroadcast_Int8_VarVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 1.0, 2);
+ int rhs = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_INT8}, 2.0, 3);
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_INT8}, 0.5, -6);
+ cgen.addOperatorSub({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->addTestCase(uniformTCD<int8_t>({{1, 3, 2, 4}, {5}}, {{-16, -12, -14, -10}}));
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Sub_InvalidType)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_UINT8}, 0.1, 2);
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorSub({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Sub_InvalidShape)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int rhs = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorSub({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailCompile();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Sub_OneOperand)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorSub({{in}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
+
+TEST_F(GenModelTest, neg_OneOp_Sub_ThreeOperands)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 2, 3, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorSub({{in, in, in}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+
+ _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ _context->setBackends({"acl_cl", "acl_neon", "cpu"});
+ _context->expectFailModelLoad();
+
+ SUCCEED();
+}
diff --git a/tests/nnfw_api/src/one_op_tests/While.cc b/tests/nnfw_api/src/one_op_tests/While.cc
index a5929a514..ee0a9df46 100644
--- a/tests/nnfw_api/src/one_op_tests/While.cc
+++ b/tests/nnfw_api/src/one_op_tests/While.cc
@@ -15,57 +15,14 @@
*/
#include "GenModelTest.h"
+#include "WhileTestModel.h"
#include <memory>
TEST_F(GenModelTest, OneOp_While)
{
- // The model looks just like the below pseudocode
- //
- // function model(x)
- // {
- // while (x < 100.0)
- // {
- // x = x + 10.0;
- // }
- // return x
- // }
-
- CircleGen cgen;
- std::vector<float> incr_data{10};
- uint32_t incr_buf = cgen.addBuffer(incr_data);
- std::vector<float> end_data{100};
- uint32_t end_buf = cgen.addBuffer(end_data);
-
- // primary subgraph
- {
- int x_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
- int x_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
- cgen.addOperatorWhile({{x_in}, {x_out}}, 1, 2);
- cgen.setInputsAndOutputs({x_in}, {x_out});
- }
-
- // cond subgraph
- {
- cgen.nextSubgraph();
- int x = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
- int end = cgen.addTensor({{1}, circle::TensorType_FLOAT32, end_buf});
- int result = cgen.addTensor({{1}, circle::TensorType_BOOL});
- cgen.addOperatorLess({{x, end}, {result}});
- cgen.setInputsAndOutputs({x}, {result});
- }
-
- // body subgraph
- {
- cgen.nextSubgraph();
- int x_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
- int incr = cgen.addTensor({{1}, circle::TensorType_FLOAT32, incr_buf});
- int x_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
- cgen.addOperatorAdd({{x_in, incr}, {x_out}}, circle::ActivationFunctionType_NONE);
- cgen.setInputsAndOutputs({x_in}, {x_out});
- }
-
- _context = std::make_unique<GenModelTestContext>(cgen.finish());
+ WhileModelLoop10 model;
+ _context = std::make_unique<GenModelTestContext>(std::move(model.cbuf));
_context->addTestCase(uniformTCD<float>({{0}}, {{100}}));
_context->addTestCase(uniformTCD<float>({{2}}, {{102}}));
_context->addTestCase(uniformTCD<float>({{22}}, {{102}}));
diff --git a/tests/nnfw_api/src/one_op_tests/WhileTestModel.h b/tests/nnfw_api/src/one_op_tests/WhileTestModel.h
new file mode 100644
index 000000000..a1873cc5a
--- /dev/null
+++ b/tests/nnfw_api/src/one_op_tests/WhileTestModel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_API_TEST_WHILE_TEST_MODEL_H__
+#define __NNFW_API_TEST_WHILE_TEST_MODEL_H__
+
+#include "GenModelTest.h"
+
+#include <memory>
+
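+// Builds a While model that adds 10.0 to x until x >= 100.0;
+// from x = 0 the body executes 10 times, hence the name.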
+class WhileModelLoop10
+{
+public:
+ WhileModelLoop10()
+ {
+ // The model is equivalent to the pseudocode below
+ //
+ // function model(x)
+ // {
+ // while (x < 100.0)
+ // {
+ // x = x + 10.0;
+ // }
+ // return x
+ // }
+ CircleGen cgen;
+ std::vector<float> incr_data{10};
+ uint32_t incr_buf = cgen.addBuffer(incr_data);
+ std::vector<float> end_data{100};
+ uint32_t end_buf = cgen.addBuffer(end_data);
+
+ // primary subgraph
+ {
+ int x_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+ int x_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+ cgen.addOperatorWhile({{x_in}, {x_out}}, 1, 2);
+ cgen.setInputsAndOutputs({x_in}, {x_out});
+ }
+
+ // cond subgraph
+ {
+ cgen.nextSubgraph();
+ int x = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+ int end = cgen.addTensor({{1}, circle::TensorType_FLOAT32, end_buf});
+ int result = cgen.addTensor({{1}, circle::TensorType_BOOL});
+ cgen.addOperatorLess({{x, end}, {result}});
+ cgen.setInputsAndOutputs({x}, {result});
+ }
+
+ // body subgraph
+ {
+ cgen.nextSubgraph();
+ int x_in = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+ int incr = cgen.addTensor({{1}, circle::TensorType_FLOAT32, incr_buf});
+ int x_out = cgen.addTensor({{1}, circle::TensorType_FLOAT32});
+ cgen.addOperatorAdd({{x_in, incr}, {x_out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({x_in}, {x_out});
+ }
+ cbuf = cgen.finish();
+ }
+
+ int inputCount() { return 1; }
+ int outputCount() { return 1; }
+ int sizeOfDType() { return sizeof(float); }
+
+ CircleBuffer cbuf;
+};
+
+#endif // __NNFW_API_TEST_WHILE_TEST_MODEL_H__
diff --git a/tests/scripts/CMakeLists.txt b/tests/scripts/CMakeLists.txt
index 40e0dfdaa..ec319cab2 100644
--- a/tests/scripts/CMakeLists.txt
+++ b/tests/scripts/CMakeLists.txt
@@ -17,10 +17,6 @@ install(PROGRAMS ${MODEL_TEST_SCRIPT} DESTINATION test/models)
file(GLOB TFLITE_CONFIG_DIR models/tflite)
install(DIRECTORY ${TFLITE_CONFIG_DIR} DESTINATION test/models)
-# Install nnpackage test config
-file(GLOB NNPACKAGE_MODEL_CONFIG_DIR models/nnfw_api_gtest)
-install(DIRECTORY ${NNPACKAGE_MODEL_CONFIG_DIR} DESTINATION test/models)
-
# Install test list
file(GLOB TEST_LIST_DIR list)
install(DIRECTORY ${TEST_LIST_DIR} DESTINATION test)
diff --git a/tests/scripts/benchmark.sh b/tests/scripts/benchmark.sh
index a6bb821b7..177941189 100644
--- a/tests/scripts/benchmark.sh
+++ b/tests/scripts/benchmark.sh
@@ -92,7 +92,7 @@ $BRIDGE shell tar -zxf $TEST_ROOT/nnpkg.tar.gz -C $TEST_ROOT/nnpkg
$BRIDGE shell rm $TEST_ROOT/nnpkg.tar.gz
# 1. Run
-$BRIDGE shell LD_LIBRARY_PATH=$TEST_ROOT/Product/out/lib OP_SEQ_MAX_NODE=1 TRACE_FILEPATH=$TEST_ROOT/trace.json BACKENDS=$BACKENDS $TEST_ROOT/Product/out/bin/nnpackage_run --nnpackage $NNPKG_PATH_TARGET -r $NUM_RUNS
+$BRIDGE shell LD_LIBRARY_PATH=$TEST_ROOT/Product/out/lib TRACE_FILEPATH=$TEST_ROOT/trace.json BACKENDS=$BACKENDS $TEST_ROOT/Product/out/bin/nnpackage_run --nnpackage $NNPKG_PATH_TARGET -r $NUM_RUNS
# 2. Pull result file
echo "Pulling data from target to trace.json"
diff --git a/tests/scripts/command/prepare-model b/tests/scripts/command/prepare-model
index 9fd790ebe..5b3340813 100644
--- a/tests/scripts/command/prepare-model
+++ b/tests/scripts/command/prepare-model
@@ -18,7 +18,6 @@ COMMAND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTALL_DIR="$(dirname $(dirname $COMMAND_DIR))"
MD5_CHECK="on"
-DOWNLOAD_MODEL="all"
function Usage()
{
@@ -26,7 +25,7 @@ function Usage()
echo ""
echo "Options:"
echo " --ignoremd5 Ignore MD5 check when download model files"
- echo " --model=(all|nnpackage|tflite) Download test model (default=all)"
+ echo " --model=(all|nnpackage|tflite) Download test model (deprecated option: always all)"
}
for i in "$@"
@@ -40,7 +39,7 @@ do
MD5_CHECK="off"
;;
--model=*)
- DOWNLOAD_MODEL=${i#*=}
+ # deprecated; the value is ignored and all models are downloaded
;;
*)
echo "Unknown option: $i"
@@ -56,15 +55,4 @@ if [[ -z "$MODELFILE_SERVER" ]]; then
fi
echo "Download from $MODELFILE_SERVER"
-if [[ $DOWNLOAD_MODEL == "all" ]] || [[ $DOWNLOAD_MODEL == "tflite" ]]; then
- # Download tflite models
- $INSTALL_DIR/test/models/run_test.sh --download=on --run=off --md5=$MD5_CHECK
-fi
-
-if [[ $DOWNLOAD_MODEL == "all" ]] || [[ $DOWNLOAD_MODEL == "nnpackage" ]]; then
- # Download nnpackage model
- NNPACKAGE_CONFIG_DIR=$INSTALL_DIR/test/models/nnfw_api_gtest/
- NNPACKAGE_CACHE_DIR=$INSTALL_DIR/unittest_standalone/nnfw_api_gtest_models/
- $INSTALL_DIR/test/models/run_test.sh --download=on --run=off --md5=$MD5_CHECK \
- --configdir=$NNPACKAGE_CONFIG_DIR --cachedir=$NNPACKAGE_CACHE_DIR
-fi
+$INSTALL_DIR/test/models/run_test.sh --download=on --run=off --md5=$MD5_CHECK
diff --git a/tests/scripts/command/verify-tflite b/tests/scripts/command/verify-tflite
index 98765cc20..fff1106ec 100644
--- a/tests/scripts/command/verify-tflite
+++ b/tests/scripts/command/verify-tflite
@@ -18,7 +18,7 @@ COMMAND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
INSTALL_DIR="$(dirname $(dirname $COMMAND_DIR))"
MD5_CHECK="on"
-TFLITE_LOADER="nnapi"
+TFLITE_LOADER="loader"
REPORT_DIR="report"
TEST_LIST_FILE=
@@ -78,7 +78,7 @@ if [[ $TFLITE_LOADER == "nnapi" ]]; then
TEST_DRIVER=nnapi_test
elif [[ $TFLITE_LOADER == "loader" ]]; then
TEST_NAME="Loader Verification"
- TEST_DRIVER=tflite_loader_test_tool
+ TEST_DRIVER=tflite_comparator
else
Usage
exit 1
diff --git a/tests/scripts/list/frameworktest_list.aarch64.acl_cl.txt b/tests/scripts/list/nnapi_test.aarch64.list
index dd8d3b710..dd8d3b710 100644
--- a/tests/scripts/list/frameworktest_list.aarch64.acl_cl.txt
+++ b/tests/scripts/list/nnapi_test.aarch64.list
diff --git a/tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt b/tests/scripts/list/nnapi_test.armv7l.list
index dd8d3b710..dd8d3b710 100644
--- a/tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt
+++ b/tests/scripts/list/nnapi_test.armv7l.list
diff --git a/tests/scripts/list/tflite_comparator.aarch64.acl_cl.list b/tests/scripts/list/tflite_comparator.aarch64.acl_cl.list
new file mode 100644
index 000000000..24fa98072
--- /dev/null
+++ b/tests/scripts/list/tflite_comparator.aarch64.acl_cl.list
@@ -0,0 +1,41 @@
+MODELS/inception_module
+MODELS/mobilenet
+add
+average_pool_2d
+batch_to_space_nd2
+cast
+concat
+conv_2d
+depthwise_conv_2d
+div
+exp
+floor
+fullyconnected
+gather
+l2_normalization
+max
+max_pool_2d
+mean
+min
+mul
+neg
+pack
+pad
+reduce_max
+reduce_mean
+reduce_sum/float
+relu
+relu6
+reshape
+resize_bilinear
+rsqrt
+slice
+softmax
+space_to_batch_nd2
+space_to_depth
+squeeze
+strided_slice
+sub
+tanh
+transpose
+transpose_conv
diff --git a/tests/scripts/list/frameworktest_list.aarch64.acl_neon.txt b/tests/scripts/list/tflite_comparator.aarch64.acl_neon.list
index b58d39ab7..0d443a79d 100644
--- a/tests/scripts/list/frameworktest_list.aarch64.acl_neon.txt
+++ b/tests/scripts/list/tflite_comparator.aarch64.acl_neon.list
@@ -7,12 +7,9 @@ concat
conv_2d
depthwise_conv_2d
div
-embedding_lookup
floor
gather
-hashtable_lookup
l2_normalization
-l2_pool_2d
logistic
max
max_pool_2d
diff --git a/tests/scripts/list/frameworktest_list.aarch64.cpu.txt b/tests/scripts/list/tflite_comparator.aarch64.cpu.list
index 4b4b7fb24..4b4b7fb24 100644
--- a/tests/scripts/list/frameworktest_list.aarch64.cpu.txt
+++ b/tests/scripts/list/tflite_comparator.aarch64.cpu.list
diff --git a/tests/scripts/list/tflite_comparator.armv7l.acl_cl.list b/tests/scripts/list/tflite_comparator.armv7l.acl_cl.list
new file mode 100644
index 000000000..24fa98072
--- /dev/null
+++ b/tests/scripts/list/tflite_comparator.armv7l.acl_cl.list
@@ -0,0 +1,41 @@
+MODELS/inception_module
+MODELS/mobilenet
+add
+average_pool_2d
+batch_to_space_nd2
+cast
+concat
+conv_2d
+depthwise_conv_2d
+div
+exp
+floor
+fullyconnected
+gather
+l2_normalization
+max
+max_pool_2d
+mean
+min
+mul
+neg
+pack
+pad
+reduce_max
+reduce_mean
+reduce_sum/float
+relu
+relu6
+reshape
+resize_bilinear
+rsqrt
+slice
+softmax
+space_to_batch_nd2
+space_to_depth
+squeeze
+strided_slice
+sub
+tanh
+transpose
+transpose_conv
diff --git a/tests/scripts/list/frameworktest_list.armv7l.acl_neon.txt b/tests/scripts/list/tflite_comparator.armv7l.acl_neon.list
index 9df071bbb..8cce41af0 100644
--- a/tests/scripts/list/frameworktest_list.armv7l.acl_neon.txt
+++ b/tests/scripts/list/tflite_comparator.armv7l.acl_neon.list
@@ -7,13 +7,10 @@ concat
conv_2d
depthwise_conv_2d
div
-embedding_lookup
floor
fullyconnected
gather
-hashtable_lookup
l2_normalization
-l2_pool_2d
logistic
max
max_pool_2d
diff --git a/tests/scripts/list/frameworktest_list.armv7l.cpu.txt b/tests/scripts/list/tflite_comparator.armv7l.cpu.list
index cf3297795..cf3297795 100644
--- a/tests/scripts/list/frameworktest_list.armv7l.cpu.txt
+++ b/tests/scripts/list/tflite_comparator.armv7l.cpu.list
diff --git a/tests/scripts/list/frameworktest_list.noarch.interp.txt b/tests/scripts/list/tflite_comparator.noarch.interp.list
index 3555ee28e..3555ee28e 100644
--- a/tests/scripts/list/frameworktest_list.noarch.interp.txt
+++ b/tests/scripts/list/tflite_comparator.noarch.interp.list
diff --git a/tests/scripts/list/frameworktest_list.x86_64.cpu.txt b/tests/scripts/list/tflite_comparator.x86_64.cpu.list
index 5750ec4de..5750ec4de 100644
--- a/tests/scripts/list/frameworktest_list.x86_64.cpu.txt
+++ b/tests/scripts/list/tflite_comparator.x86_64.cpu.list
diff --git a/tests/scripts/list/tflite_loader_list.aarch64.txt b/tests/scripts/list/tflite_loader_list.aarch64.txt
deleted file mode 100644
index e04d89d3f..000000000
--- a/tests/scripts/list/tflite_loader_list.aarch64.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-MODELS/inception_module
-MODELS/mobilenet
-add
-average_pool_2d
-batch_to_space_nd2
-concat
-conv_2d/convolution1
-depthwise_conv_2d
-div
-exp
-fullyconnected/fc1
-logistic
-max
-max_pool_2d/maxpool1
-mean
-min
-mul
-pack
-pad
-reduce_max
-reduce_sum/float
-relu
-relu6
-reshape/reshape1
-resize_bilinear
-rsqrt
-slice
-softmax
-space_to_batch_nd2
-sqrt
-squeeze
-sub
-tanh
-transpose
-transpose_conv
diff --git a/tests/scripts/list/tflite_loader_list.armv7l.txt b/tests/scripts/list/tflite_loader_list.armv7l.txt
deleted file mode 100644
index e04d89d3f..000000000
--- a/tests/scripts/list/tflite_loader_list.armv7l.txt
+++ /dev/null
@@ -1,35 +0,0 @@
-MODELS/inception_module
-MODELS/mobilenet
-add
-average_pool_2d
-batch_to_space_nd2
-concat
-conv_2d/convolution1
-depthwise_conv_2d
-div
-exp
-fullyconnected/fc1
-logistic
-max
-max_pool_2d/maxpool1
-mean
-min
-mul
-pack
-pad
-reduce_max
-reduce_sum/float
-relu
-relu6
-reshape/reshape1
-resize_bilinear
-rsqrt
-slice
-softmax
-space_to_batch_nd2
-sqrt
-squeeze
-sub
-tanh
-transpose
-transpose_conv
diff --git a/tests/scripts/models/nnfw_api_gtest/add/config.sh b/tests/scripts/models/nnfw_api_gtest/add/config.sh
deleted file mode 100644
index e6e8677e7..000000000
--- a/tests/scripts/models/nnfw_api_gtest/add/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="add.zip"
diff --git a/tests/scripts/models/nnfw_api_gtest/add_invalid_manifest/config.sh b/tests/scripts/models/nnfw_api_gtest/add_invalid_manifest/config.sh
deleted file mode 100644
index 92c903274..000000000
--- a/tests/scripts/models/nnfw_api_gtest/add_invalid_manifest/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="add_invalid_manifest.zip"
diff --git a/tests/scripts/models/nnfw_api_gtest/add_no_manifest/config.sh b/tests/scripts/models/nnfw_api_gtest/add_no_manifest/config.sh
deleted file mode 100644
index 0d697a2a0..000000000
--- a/tests/scripts/models/nnfw_api_gtest/add_no_manifest/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="add_no_manifest.zip"
diff --git a/tests/scripts/models/nnfw_api_gtest/if_dynamic/config.sh b/tests/scripts/models/nnfw_api_gtest/if_dynamic/config.sh
deleted file mode 100644
index 3b8506c6f..000000000
--- a/tests/scripts/models/nnfw_api_gtest/if_dynamic/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="if_dynamic.zip"
diff --git a/tests/scripts/models/nnfw_api_gtest/while_dynamic/config.sh b/tests/scripts/models/nnfw_api_gtest/while_dynamic/config.sh
deleted file mode 100644
index ff14d4e05..000000000
--- a/tests/scripts/models/nnfw_api_gtest/while_dynamic/config.sh
+++ /dev/null
@@ -1 +0,0 @@
-MODELFILE_NAME="while_dynamic.zip"
diff --git a/tests/tools/nnpackage_run/src/allocation.h b/tests/tools/nnpackage_run/src/allocation.h
index e7f1a9c75..20e21eb59 100644
--- a/tests/tools/nnpackage_run/src/allocation.h
+++ b/tests/tools/nnpackage_run/src/allocation.h
@@ -33,6 +33,6 @@ public:
private:
void *data_;
};
-} // end of namespace
+} // namespace nnpkg_run
#endif // __NNPACKAGE_RUN_ALLOCATION_H__
diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc
index eeedcb77a..e207465d4 100644
--- a/tests/tools/nnpackage_run/src/h5formatter.cc
+++ b/tests/tools/nnpackage_run/src/h5formatter.cc
@@ -137,6 +137,12 @@ void H5Formatter::loadInputs(const std::string &filename, std::vector<Allocation
throw std::runtime_error(
"model input type is qasymm8, bool or uint8. But h5 data type is different.");
break;
+ case NNFW_TYPE_TENSOR_QUANT8_ASYMM_SIGNED:
+ if (type == H5::PredType::STD_I8BE || type == H5::PredType::STD_I8LE)
+ data_set.read(inputs[i].data(), H5::PredType::NATIVE_INT8);
+ else
+ throw std::runtime_error("model input type is int8. But h5 data type is different.");
+ break;
default:
throw std::runtime_error("nnpkg_run can load f32, i32, qasymm8, bool and uint8.");
}
@@ -221,6 +227,13 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector<Allocatio
data_set.write(outputs[i].data(), H5::PredType::NATIVE_INT8);
break;
}
+ case NNFW_TYPE_TENSOR_QUANT8_ASYMM_SIGNED:
+ {
+ H5::DataSet data_set =
+ value_group.createDataSet(std::to_string(i), H5::PredType::STD_I8LE, data_space);
+ data_set.write(outputs[i].data(), H5::PredType::NATIVE_INT8);
+ break;
+ }
default:
throw std::runtime_error("nnpkg_run can dump f32, i32, qasymm8, bool and uint8.");
}
diff --git a/tests/tools/nnpackage_run/src/h5formatter.h b/tests/tools/nnpackage_run/src/h5formatter.h
index 203ba0e72..5c831021b 100644
--- a/tests/tools/nnpackage_run/src/h5formatter.h
+++ b/tests/tools/nnpackage_run/src/h5formatter.h
@@ -38,6 +38,6 @@ public:
private:
nnfw_session *session_;
};
-} // end of namespace
+} // namespace nnpkg_run
#endif // __NNPACKAGE_RUN_H5FORMATTER_H__
diff --git a/tests/tools/nnpackage_run/src/nnfw_util.cc b/tests/tools/nnpackage_run/src/nnfw_util.cc
index 6c37eed45..a57069bd8 100644
--- a/tests/tools/nnpackage_run/src/nnfw_util.cc
+++ b/tests/tools/nnpackage_run/src/nnfw_util.cc
@@ -40,9 +40,9 @@ uint64_t bufsize_for(const nnfw_tensorinfo *ti)
sizeof(bool), /* NNFW_TYPE_TENSOR_BOOL = 3 */
sizeof(uint8_t), /* NNFW_TYPE_TENSOR_UINT8 = 4 */
sizeof(int64_t), /* NNFW_TYPE_TENSOR_INT64 = 5 */
-
+ sizeof(int8_t), /* NNFW_TYPE_TENSOR_QUANT8_ASYMM_SIGNED = 6 */
};
return elmsize[ti->dtype] * num_elems(ti);
}
-} // end of namespace
+} // namespace nnpkg_run
diff --git a/tests/tools/nnpackage_run/src/nnpackage_run.cc b/tests/tools/nnpackage_run/src/nnpackage_run.cc
index 5bde74ff7..1fcab512a 100644
--- a/tests/tools/nnpackage_run/src/nnpackage_run.cc
+++ b/tests/tools/nnpackage_run/src/nnpackage_run.cc
@@ -97,7 +97,7 @@ int main(const int argc, char **argv)
nnfw_tensorinfo ti;
NNPR_ENSURE_STATUS(nnfw_input_tensorinfo(session, i, &ti));
- if (ti.dtype < NNFW_TYPE_TENSOR_FLOAT32 || ti.dtype > NNFW_TYPE_TENSOR_INT64)
+ if (ti.dtype < NNFW_TYPE_TENSOR_FLOAT32 || ti.dtype > NNFW_TYPE_TENSOR_QUANT8_ASYMM_SIGNED)
{
std::cerr << "E: not supported input type" << std::endl;
exit(-1);
@@ -114,7 +114,7 @@ int main(const int argc, char **argv)
nnfw_tensorinfo ti;
NNPR_ENSURE_STATUS(nnfw_output_tensorinfo(session, i, &ti));
- if (ti.dtype < NNFW_TYPE_TENSOR_FLOAT32 || ti.dtype > NNFW_TYPE_TENSOR_INT64)
+ if (ti.dtype < NNFW_TYPE_TENSOR_FLOAT32 || ti.dtype > NNFW_TYPE_TENSOR_QUANT8_ASYMM_SIGNED)
{
std::cerr << "E: not supported output type" << std::endl;
exit(-1);
diff --git a/tests/tools/nnpackage_run/src/randomgen.h b/tests/tools/nnpackage_run/src/randomgen.h
index 9ca51dd11..898df34fc 100644
--- a/tests/tools/nnpackage_run/src/randomgen.h
+++ b/tests/tools/nnpackage_run/src/randomgen.h
@@ -35,6 +35,6 @@ public:
private:
nnfw_session *session_;
};
-} // end of namespace
+} // namespace nnpkg_run
#endif // __NNPACKAGE_RUN_RANDOMGEN_H__
diff --git a/tests/tools/tflite_comparator/CMakeLists.txt b/tests/tools/tflite_comparator/CMakeLists.txt
new file mode 100644
index 000000000..54e3f61fd
--- /dev/null
+++ b/tests/tools/tflite_comparator/CMakeLists.txt
@@ -0,0 +1,23 @@
+if(NOT BUILD_TFLITE_COMPARATOR_TEST_TOOL)
+ message("skipping tflite comparator tool build")
+ return()
+endif(NOT BUILD_TFLITE_COMPARATOR_TEST_TOOL)
+
+if(NOT BUILD_ONERT)
+ message("skipping tflite comparator tool build: onert is not built")
+ return()
+endif(NOT BUILD_ONERT)
+
+list(APPEND SOURCES "src/tflite_comparator.cc")
+list(APPEND SOURCES "src/args.cc")
+
+nnfw_find_package(Boost REQUIRED program_options system filesystem)
+
+add_executable(tflite_comparator ${SOURCES})
+target_include_directories(tflite_comparator PRIVATE ${Boost_INCLUDE_DIRS})
+
+target_link_libraries(tflite_comparator nnfw-dev)
+target_link_libraries(tflite_comparator nnfw_lib_tflite nnfw_lib_misc)
+target_link_libraries(tflite_comparator ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY})
+
+install(TARGETS tflite_comparator DESTINATION bin)
diff --git a/tests/tools/tflite_loader/src/args.cc b/tests/tools/tflite_comparator/src/args.cc
index e9fb141ca..ecab20b17 100644
--- a/tests/tools/tflite_loader/src/args.cc
+++ b/tests/tools/tflite_comparator/src/args.cc
@@ -47,8 +47,8 @@ void Args::Initialize(void)
void Args::print(char **argv)
{
- std::cout << "tflite_loader" << std::endl << std::endl;
- std::cout << "Load tflite model by Loader and TFLite and compare their output" << std::endl;
+ std::cout << "tflite_comparator" << std::endl << std::endl;
+ std::cout << "Load tflite model by onert and TFLite, and compare their output" << std::endl;
std::cout << "Usage:" << std::endl;
std::cout << argv[0] << " --tflite model_file.tflite --data input_data.dat" << std::endl;
std::cout << _options;
diff --git a/tests/tools/tflite_loader/src/args.h b/tests/tools/tflite_comparator/src/args.h
index 4d0e8ff41..4d0e8ff41 100644
--- a/tests/tools/tflite_loader/src/args.h
+++ b/tests/tools/tflite_comparator/src/args.h
diff --git a/tests/tools/tflite_loader/src/tflite_loader.cc b/tests/tools/tflite_comparator/src/tflite_comparator.cc
index f77570c74..7e5190527 100644
--- a/tests/tools/tflite_loader/src/tflite_loader.cc
+++ b/tests/tools/tflite_comparator/src/tflite_comparator.cc
@@ -20,6 +20,7 @@
#include <nnfw_internal.h>
#include <misc/EnvVar.h>
+#include <misc/fp32.h>
#include <misc/RandomGenerator.h>
#include <tflite/Assert.h>
@@ -36,7 +37,6 @@ using namespace tflite;
using namespace nnfw::tflite;
const int FILE_ERROR = 2;
-const float DIFFERENCE_THRESHOLD = 10e-5;
#define NNFW_ASSERT_FAIL(expr, msg) \
if ((expr) != NNFW_STATUS_NO_ERROR) \
@@ -300,7 +300,7 @@ int main(const int argc, char **argv)
std::cerr << e.what() << std::endl;
exit(FILE_ERROR);
}
- interpreter->SetNumThreads(nnfw::misc::EnvVar("THREAD").asInt(-1));
+ interpreter->SetNumThreads(nnfw::misc::EnvVar("THREAD").asInt(1));
auto sess = std::make_shared<nnfw::tflite::InterpreterSession>(interpreter.get());
sess->prepare();
@@ -320,6 +320,7 @@ int main(const int argc, char **argv)
// Calculate max difference over all outputs
float max_float_difference = 0.0f;
bool find_unmatched_output = false;
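+ // The float mismatch tolerance is configurable via the TOLERANCE environment
+ // variable (default 1); it is passed to epsilon_equal, presumably as a
+ // multiplier of the machine epsilon.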
+ auto tolerance = nnfw::misc::EnvVar("TOLERANCE").asInt(1);
for (uint32_t out_idx = 0; out_idx < num_outputs; out_idx++)
{
@@ -356,8 +357,9 @@ int main(const int argc, char **argv)
if (std::abs(refval - val) > max_float_difference)
max_float_difference = std::abs(refval - val);
- if (max_float_difference > DIFFERENCE_THRESHOLD)
- matched = false;
+ matched = nnfw::misc::fp32::absolute_epsilon_equal(refval, val)
+ ? true
+ : nnfw::misc::fp32::epsilon_equal(refval, val, tolerance);
}
break;
case NNFW_TYPE_TENSOR_INT64:
@@ -377,10 +379,6 @@ int main(const int argc, char **argv)
if (find_unmatched_output)
{
std::cout << "[Comparison] outputs is not equal!" << std::endl;
- if (max_float_difference > DIFFERENCE_THRESHOLD)
- {
- std::cout << "[Comparison] Float outputs is not equal!" << std::endl;
- }
ret = 1;
}
else
diff --git a/tests/tools/tflite_loader/CMakeLists.txt b/tests/tools/tflite_loader/CMakeLists.txt
deleted file mode 100644
index 6be315893..000000000
--- a/tests/tools/tflite_loader/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-if(NOT BUILD_TFLITE_LOADER_TEST_TOOL)
- message("skipping tflite loader tool build")
- return()
-endif(NOT BUILD_TFLITE_LOADER_TEST_TOOL)
-
-if(NOT BUILD_ONERT)
- message("skipping tflite loader tool build: onert is not built")
- return()
-endif(NOT BUILD_ONERT)
-
-list(APPEND SOURCES "src/tflite_loader.cc")
-list(APPEND SOURCES "src/args.cc")
-
-nnfw_find_package(Boost REQUIRED program_options system filesystem)
-
-add_executable(tflite_loader_test_tool ${SOURCES})
-target_include_directories(tflite_loader_test_tool PRIVATE ${Boost_INCLUDE_DIRS})
-
-target_link_libraries(tflite_loader_test_tool nnfw-dev)
-target_link_libraries(tflite_loader_test_tool nnfw_lib_tflite nnfw_lib_misc)
-target_link_libraries(tflite_loader_test_tool ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY})
-
-install(TARGETS tflite_loader_test_tool DESTINATION bin)
diff --git a/tests/tools/tflite_run/src/tflite_run.cc b/tests/tools/tflite_run/src/tflite_run.cc
index d42f99234..14f501258 100644
--- a/tests/tools/tflite_run/src/tflite_run.cc
+++ b/tests/tools/tflite_run/src/tflite_run.cc
@@ -26,6 +26,7 @@
#include "tflite/Diff.h"
#include "tflite/Assert.h"
#include "tflite/Session.h"
+#include "tflite/RandomInputInitializer.h"
#include "tflite/InterpreterSession.h"
#include "tflite/NNAPISession.h"
#include "misc/tensor/IndexIterator.h"
@@ -71,7 +72,7 @@ public:
}
};
-} // namespace anonymous
+} // namespace
int main(const int argc, char **argv)
{
@@ -112,7 +113,7 @@ int main(const int argc, char **argv)
BuiltinOpResolver resolver;
InterpreterBuilder builder(*model, resolver);
TFLITE_ENSURE(builder(&interpreter))
- interpreter->SetNumThreads(nnfw::misc::EnvVar("THREAD").asInt(-1));
+ interpreter->SetNumThreads(nnfw::misc::EnvVar("THREAD").asInt(1));
});
}
catch (const std::exception &e)
@@ -196,70 +197,8 @@ int main(const int argc, char **argv)
const int seed = 1; /* TODO Add an option for seed value */
nnfw::misc::RandomGenerator randgen{seed, 0.0f, 2.0f};
- // No input specified. So we fill the input tensors with random values.
- for (const auto &o : interpreter->inputs())
- {
- TfLiteTensor *tensor = interpreter->tensor(o);
- if (tensor->type == kTfLiteInt32)
- {
- // Generate singed 32-bit integer (s32) input
- auto tensor_view = nnfw::tflite::TensorView<int32_t>::make(*interpreter, o);
-
- int32_t value = 0;
-
- nnfw::misc::tensor::iterate(tensor_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- // TODO Generate random values
- // Gather operation: index should be within input coverage.
- tensor_view.at(ind) = value;
- value++;
- };
- }
- else if (tensor->type == kTfLiteUInt8)
- {
- // Generate unsigned 8-bit integer input
- auto tensor_view = nnfw::tflite::TensorView<uint8_t>::make(*interpreter, o);
-
- auto fp = static_cast<uint8_t (nnfw::misc::RandomGenerator::*)(
- const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
- &nnfw::misc::RandomGenerator::generate<uint8_t>);
- const nnfw::misc::tensor::Object<uint8_t> data(tensor_view.shape(),
- std::bind(fp, randgen, _1, _2));
-
- nnfw::misc::tensor::iterate(tensor_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- const auto value = data.at(ind);
- tensor_view.at(ind) = value;
- };
- }
- else if (tensor->type == kTfLiteBool)
- {
- // Generate bool input
- auto tensor_view = nnfw::tflite::TensorView<bool>::make(*interpreter, o);
-
- auto fp = static_cast<bool (nnfw::misc::RandomGenerator::*)(
- const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
- &nnfw::misc::RandomGenerator::generate<bool>);
- const nnfw::misc::tensor::Object<bool> data(tensor_view.shape(),
- std::bind(fp, randgen, _1, _2));
-
- nnfw::misc::tensor::iterate(tensor_view.shape())
- << [&](const nnfw::misc::tensor::Index &ind) {
- const auto value = data.at(ind);
- tensor_view.at(ind) = value;
- };
- }
- else
- {
- assert(tensor->type == kTfLiteFloat32);
-
- const float *end = reinterpret_cast<const float *>(tensor->data.raw_const + tensor->bytes);
- for (float *ptr = tensor->data.f; ptr < end; ptr++)
- {
- *ptr = randgen.generate<float>();
- }
- }
- }
+ RandomInputInitializer initializer{randgen};
+ initializer.run(*(interpreter.get()));
}
TFLiteRun::TensorDumper tensor_dumper;
diff --git a/tests/tools/tflite_vanilla_run/CMakeLists.txt b/tests/tools/tflite_vanilla_run/CMakeLists.txt
index 19e21e923..a673058a4 100644
--- a/tests/tools/tflite_vanilla_run/CMakeLists.txt
+++ b/tests/tools/tflite_vanilla_run/CMakeLists.txt
@@ -6,7 +6,7 @@ if(NOT BUILD_TENSORFLOW_LITE_2_3_0)
set(BUILD_TENSORFLOW_LITE_2_3_0 ON)
endif()
-nnfw_find_package(TensorFlowLite-2.3.0 REQUIRED)
+nnfw_find_package(TensorFlowLite EXACT 2.3.0 REQUIRED)
nnfw_find_package(Boost REQUIRED)
list(APPEND TFLITE_RUN_SRCS "src/tflite_vanilla_run.cc")
diff --git a/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc b/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc
index e9fb04c7d..77b5e7a37 100644
--- a/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc
+++ b/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc
@@ -73,7 +73,7 @@ public:
}
};
-} // namespace anonymous
+} // namespace
int main(const int argc, char **argv)
{
diff --git a/tools/.clang-format b/tools/.clang-format
deleted file mode 120000
index 0ff66f331..000000000
--- a/tools/.clang-format
+++ /dev/null
@@ -1 +0,0 @@
-../.clang-format.8 \ No newline at end of file
diff --git a/tools/cross/aarch64/sources.list.trusty b/tools/cross/aarch64/sources.list.trusty
deleted file mode 100644
index 8aa98a259..000000000
--- a/tools/cross/aarch64/sources.list.trusty
+++ /dev/null
@@ -1,11 +0,0 @@
-deb http://ports.ubuntu.com/ubuntu-ports/ trusty main restricted universe
-deb-src http://ports.ubuntu.com/ubuntu-ports/ trusty main restricted universe
-
-deb http://ports.ubuntu.com/ubuntu-ports/ trusty-updates main restricted universe
-deb-src http://ports.ubuntu.com/ubuntu-ports/ trusty-updates main restricted universe
-
-deb http://ports.ubuntu.com/ubuntu-ports/ trusty-backports main restricted
-deb-src http://ports.ubuntu.com/ubuntu-ports/ trusty-backports main restricted
-
-deb http://ports.ubuntu.com/ubuntu-ports/ trusty-security main restricted universe multiverse
-deb-src http://ports.ubuntu.com/ubuntu-ports/ trusty-security main restricted universe multiverse
diff --git a/tools/cross/arm/sources.list.trusty b/tools/cross/arm/sources.list.trusty
deleted file mode 100644
index 8aa98a259..000000000
--- a/tools/cross/arm/sources.list.trusty
+++ /dev/null
@@ -1,11 +0,0 @@
-deb http://ports.ubuntu.com/ubuntu-ports/ trusty main restricted universe
-deb-src http://ports.ubuntu.com/ubuntu-ports/ trusty main restricted universe
-
-deb http://ports.ubuntu.com/ubuntu-ports/ trusty-updates main restricted universe
-deb-src http://ports.ubuntu.com/ubuntu-ports/ trusty-updates main restricted universe
-
-deb http://ports.ubuntu.com/ubuntu-ports/ trusty-backports main restricted
-deb-src http://ports.ubuntu.com/ubuntu-ports/ trusty-backports main restricted
-
-deb http://ports.ubuntu.com/ubuntu-ports/ trusty-security main restricted universe multiverse
-deb-src http://ports.ubuntu.com/ubuntu-ports/ trusty-security main restricted universe multiverse
diff --git a/tools/cross/install_rootfs.sh b/tools/cross/install_rootfs.sh
index 5a65dac85..fa32c7350 100755
--- a/tools/cross/install_rootfs.sh
+++ b/tools/cross/install_rootfs.sh
@@ -5,7 +5,7 @@ usage()
{
echo "Usage: $0 [BuildArch] [LinuxCodeName] [--setproxy=IP] [--skipunmount]"
echo "BuildArch can be: arm(default), aarch64 and armel"
- echo "LinuxCodeName - optional, Code name for Linux, can be: bionic(default), trusty, xenial, focal"
+ echo "LinuxCodeName - optional, Code name for Linux, can be: xenial, bionic(default), focal"
echo " If BuildArch is armel, this can be tizen(default)"
echo "--setproxy=IP - optional, IP is the proxy server IP address or url with portnumber"
echo " default no proxy. Example: --setproxy=127.1.2.3:8080"
@@ -69,9 +69,6 @@ for i in "$@" ; do
__UbuntuRepo=
__LinuxCodeName=
;;
- trusty)
- __LinuxCodeName=trusty
- ;;
xenial)
__LinuxCodeName=xenial
;;
diff --git a/tools/nnpackage_tool/gen_golden/gen_golden.py b/tools/nnpackage_tool/gen_golden/gen_golden.py
index 125a69cac..79c86e6d7 100755
--- a/tools/nnpackage_tool/gen_golden/gen_golden.py
+++ b/tools/nnpackage_tool/gen_golden/gen_golden.py
@@ -91,6 +91,9 @@ if __name__ == '__main__':
if this_dtype == tf.uint8:
input_values.append(
np.random.randint(0, 255, this_shape).astype(np.uint8))
+        elif this_dtype == tf.int8:
+ input_values.append(
+ np.random.randint(-127, 127, this_shape).astype(np.int8))
elif this_dtype == tf.float32:
input_values.append(
np.random.random_sample(this_shape).astype(np.float32))
@@ -134,6 +137,9 @@ if __name__ == '__main__':
if this_dtype == np.uint8:
input_values.append(
np.random.randint(0, 255, this_shape).astype(np.uint8))
+        elif this_dtype == np.int8:
+ input_values.append(
+ np.random.randint(-127, 127, this_shape).astype(np.int8))
elif this_dtype == np.float32:
input_values.append(
np.random.random_sample(this_shape).astype(np.float32))
@@ -158,10 +164,11 @@ if __name__ == '__main__':
# dump input and output in h5
import h5py
- supported_dtypes = ("float32", "uint8", "bool", "int32", "int64")
+ supported_dtypes = ("float32", "uint8", "int8", "bool", "int32", "int64")
h5dtypes = {
"float32": ">f4",
"uint8": "u1",
+ "int8": "i1",
"bool": "u1",
"int32": "int32",
"int64": "int64"
diff --git a/tools/pareto_profiler/README.md b/tools/pareto_profiler/README.md
new file mode 100644
index 000000000..85d999be1
--- /dev/null
+++ b/tools/pareto_profiler/README.md
@@ -0,0 +1,95 @@
+This folder contains the scripts needed to estimate a Pareto front for machine learning models. Currently, the scripts support target devices running Tizen, as well as `Odroid-XU4`.
+
+The contents of the folder can be categorized into the following groups:
+
+- [Generator scripts to map decision variables to `nnpackage_run` parameters](#mapping-decision-to-parameters)
+- [Estimator scripts to compute the Pareto front](#pareto-estimation)
+
+The following subsections describe the role of each script in detail.
+
+## Mapping Decision to Parameters
+The generator script `gen_oplist.py`, located under the `generator` folder, encodes `nnpackage` backend assignments as large integers. Effectively, it maps each feasible backend assignment to an integer value. For example, a graph with only three operations and two backends has integer representations in the range `[0, 7]`: a value of `0` might imply that all operations run on the `cpu` backend, while `7` might imply that all operations run on the `acl_cl` backend. As described below, this integer representation of `nnpackage` parameters serves as a convenient decision space for Pareto estimation.
+
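+To make the encoding concrete, below is a minimal sketch (illustrative only, not part of the tool) of how a backend assignment maps to and from its integer representation, assuming two backends indexed `0` (`cpu`) and `1` (`acl_cl`):
+
+```
+# Decode an integer id into a per-operation backend vector (base = number of
+# backends), and encode such a vector back. The digit order is illustrative.
+def decode(n, num_backends, num_ops):
+    digits = []
+    for _ in range(num_ops):
+        digits.append(n % num_backends)
+        n //= num_backends
+    return digits  # digits[i] = backend index assigned to operation i
+
+
+def encode(digits, num_backends):
+    n = 0
+    for d in reversed(digits):
+        n = n * num_backends + d
+    return n
+
+
+assert decode(5, 2, 3) == [1, 0, 1]  # ops 0 and 2 on acl_cl, op 1 on cpu
+assert encode([1, 0, 1], 2) == 5
+```
+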
+Setting up parameters for `nnpackage_run` requires knowledge of model-specific operations. To this end, `gen_oplist.py` generates, for each model, an `oplist` of its unique operations. If an exhaustive mapping of backends to operation sequences is preferred, `gen_oplist.py` also generates a so-called `opmap` list of uniquely observed `<operation name, data size>` pairs.
+
+`gen_oplist.py` is run on the development environment (read: *Desktop PC*) as shown below:
+```
+python3 gen_oplist.py <tflite model> <target>
+```
+
+The list of model operations and their mapping to graph node indexes is stored in an *oplist.json* file and transferred to the target device (see the example below). For further details about usage, type `python3 gen_oplist.py --help`.
+
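+For reference, the *oplist.json* written in `name` mode is a small JSON document with a single `oplist` key; in `index` mode it additionally carries `opmap` and `opname_by_indx` entries. A hypothetical `name`-mode example:
+
+```
+{"oplist": ["Conv2D", "DepthwiseConv2D", "Pool2D", "Reshape"]}
+```
+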
+## Pareto Estimation
+Scripts under the `estimator` folder fall into two categories: [exhaustive, brute-force profiling](#exhaustive-profiling) and [on-device Pareto estimation](#on-device-pareto-estimation). These are described in detail below.
+
+### Exhaustive Profiling
+For the sake of testing several Pareto estimation algorithms *offline* on common lookup data, the `estimator` folder includes `brute_force_profiler.py`, which records all solutions in the decision *or* assignment space. `brute_force_profiler.py` is typically run on the target device, with the following syntax:
+
+```
+python brute_force_profiler.py <model> <run_folder> [--dumpfile=<filename>]
+```
+For details, type `python brute_force_profiler.py --help`. Below is an example of the dump generated by the brute-force profiler:
+
+```
+{"oplist": ["Pool2D", "BinaryArithmetic", "DepthwiseConv2D", "Conv2D", "Reshape"],
+ "solutions": [
+ {"memory": 56388, "id": 0, "time": 72.525},
+ {"memory": 63624, "id": 1, "time": 86.532},
+ {"memory": 64320, "id": 2, "time": 69.352},
+ {"memory": 65376, "id": 3, "time": 76.436},
+ {"memory": 73016, "id": 4, "time": 69.634},
+ {"memory": 73492, "id": 5, "time": 47.013},
+ {"memory": 74488, "id": 6, "time": 95.01},
+ {"memory": 74844, "id": 7, "time": 111.329},
+ {"memory": 393324, "id": 8, "time": 98.956},
+ {"memory": 395088, "id": 9, "time": 103.24},
+ {"memory": 396180, "id": 10, "time": 68.107},
+ {"memory": 395932, "id": 11, "time": 86.109},
+ {"memory": 402468, "id": 12, "time": 25.477},
+ {"memory": 402800, "id": 13, "time": 25.42},
+ {"memory": 403904, "id": 14, "time": 9.168},
+ {"memory": 404476, "id": 15, "time": 7.801},
+....
+ {"memory": 403940, "id": 30, "time": 9.145},
+ {"memory": 403568, "id": 31, "time": 8.034}]}
+```
+
+**Note**: At present, the Pareto estimation algorithms run on-device; an *offline* mode will be supported in the near future.
+
+### On-Device Pareto Estimation
+Currently, the `estimator` folder includes `random_sampler.py` and the HLPS-based `hlps_sampler.py`; in the future, it will feature a broader set of Pareto estimation algorithms. Regardless of the algorithm, the following steps must be carried out in sequence:
+
+1. Generate the oplist using `gen_oplist.py` and transfer the JSON file to the target device. This step is performed on the development environment.
+
+2. Copy the contents of the `estimator` folder to the target (*scp* for Odroid, *sdb push* for Tizen), at a preferred location.
+
+3. On the target device, run the Pareto estimation algorithm. The following example shows how to run `random_sampler.py` (see `python random_sampler.py --help` for details):
+```
+python random_sampler.py /root/img_model/mobilenetv2/ /opt/usr/nnfw-test/Product/out/bin --mode=name --dumpfile=/tmp/mobilenetv2_opname_profile.json --iterations=20
+```
+After profiling, the results can be viewed in the file given by the `--dumpfile` argument. Below is an illustrative example for the same model that was brute-forced above:
+
+```
+{"configs": {
+ "4": "BACKENDS=\"acl_cl;cpu\" OP_BACKEND_Pool2D=cpu OP_BACKEND_DepthwiseConv2D=cpu OP_BACKEND_Reshape=acl_cl OP_BACKEND_Conv2D=cpu OP_BACKEND_BinaryArithmetic=cpu ",
+ "10": "BACKENDS=\"acl_cl;cpu\" OP_BACKEND_Pool2D=cpu OP_BACKEND_DepthwiseConv2D=acl_cl OP_BACKEND_Reshape=cpu OP_BACKEND_Conv2D=acl_cl OP_BACKEND_BinaryArithmetic=cpu ",
+ "14": "BACKENDS=\"acl_cl;cpu\" OP_BACKEND_Pool2D=cpu OP_BACKEND_DepthwiseConv2D=acl_cl OP_BACKEND_Reshape=acl_cl OP_BACKEND_Conv2D=acl_cl OP_BACKEND_BinaryArithmetic=cpu ",
+ "16": "BACKENDS=\"acl_cl;cpu\" OP_BACKEND_Pool2D=cpu OP_BACKEND_DepthwiseConv2D=cpu OP_BACKEND_Reshape=cpu OP_BACKEND_Conv2D=cpu OP_BACKEND_BinaryArithmetic=acl_cl ",
+ "20": "BACKENDS=\"acl_cl;cpu\" OP_BACKEND_Pool2D=cpu OP_BACKEND_DepthwiseConv2D=cpu OP_BACKEND_Reshape=acl_cl OP_BACKEND_Conv2D=cpu OP_BACKEND_BinaryArithmetic=acl_cl ",
+ "21": "BACKENDS=\"acl_cl;cpu\" OP_BACKEND_Pool2D=acl_cl OP_BACKEND_DepthwiseConv2D=cpu OP_BACKEND_Reshape=acl_cl OP_BACKEND_Conv2D=cpu OP_BACKEND_BinaryArithmetic=acl_cl ",
+ "31": "BACKENDS=\"acl_cl;cpu\" OP_BACKEND_Pool2D=acl_cl OP_BACKEND_DepthwiseConv2D=acl_cl OP_BACKEND_Reshape=acl_cl OP_BACKEND_Conv2D=acl_cl OP_BACKEND_BinaryArithmetic=acl_cl "},
+ "oplist": ["Pool2D", "DepthwiseConv2D", "Reshape", "Conv2D", "BinaryArithmetic"],
+ "solutions": [
+ {"exec_time": 76.138, "max_rss": 62712, "id": 4},
+ {"exec_time": 72.719, "max_rss": 65272, "id": 16},
+ {"exec_time": 22.409, "max_rss": 403120, "id": 14},
+ {"exec_time": 28.138, "max_rss": 403064, "id": 10},
+ {"exec_time": 70.656, "max_rss": 65536, "id": 20},
+ {"exec_time": 68.805, "max_rss": 66076, "id": 21},
+ {"exec_time": 8.201, "max_rss": 404656, "id": 31}], "mode": "name"}
+```
+**Note**: The Pareto estimation algorithms require the Python `numpy` package, so make sure it is installed beforehand.
diff --git a/tools/pareto_profiler/estimator/Hlps.py b/tools/pareto_profiler/estimator/Hlps.py
new file mode 100644
index 000000000..ba0925d6f
--- /dev/null
+++ b/tools/pareto_profiler/estimator/Hlps.py
@@ -0,0 +1,257 @@
+#! /usr/bin/python
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import sys
+import Queue
+import utils
+import signal
+from pareto import ParetoData
+
+
+class Hlps:
+ """
+ Initialize Runner and Pareto data structure
+ """
+
+ def __init__(self, runner, num_backends, num_samples):
+ self._runner = runner
+ self._num_backends = num_backends
+ self._num_samples = num_samples
+ self._marked = {}
+ self._extended_search = False
+ self._iteration = 0
+ self._pareto_obj = ParetoData()
+
+ """
+ Method to generate new samples from a given sample v_vec.
+    The new samples are at Hamming distance hd from the provided sample.
+ """
+
+ def gen_hamming(self, v_vec, hd=1, nsamples=None):
+ if nsamples is None:
+ nsamples = self._num_backends - 1
+ ret = np.zeros((nsamples, len(v_vec)), dtype=int)
+ v = v_vec
+ marked = np.full(len(v), False, dtype=bool)
+ cnt = 0
+
+ for r in range(nsamples):
+ ret[r] = v
+ rnd_pos = np.random.permutation(range(len(v)))
+ for i in range(hd):
+ pos = rnd_pos[i]
+ marked[pos] = True
+ for r in range(nsamples):
+ ret[r][pos] = (v[pos] - r - 1) % self._num_backends
+
+ return ret
+
+ """
+    Method to generate all samples that are at Hamming distance one
+    from a given sample v_vec.
+ """
+
+ def gen_hamming_one(self, v_vec, invert=False):
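+        # Example with two backends: gen_hamming_one([0, 1]) returns
+        # [[1, 1], [0, 0]], i.e. every vector at Hamming distance one from [0, 1].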
+ ret = np.zeros(((self._num_backends - 1) * len(v_vec), len(v_vec)), dtype=int)
+ if invert == False:
+ v = v_vec
+ else:
+ v = [1 - x for x in v_vec]
+ for nb in range(1, self._num_backends):
+ c = 0
+ for r in range((nb - 1) * len(v), nb * len(v)):
+ ret[r] = v
+ ret[r][c] = (v[c] - nb) % self._num_backends
+ c += 1
+ return ret
+
+ """
+ Enable profiling over extended search space
+ """
+
+ def enable_extended_search(self):
+ self._extended_search = True
+ for key in self._pareto_obj.get_pareto_keys():
+ config = self._pareto_obj.get_config(key)
+ extended_val = self._runner.get_extended_solution(config)
+ self._pareto_obj.set_config(key, extended_val)
+ self._iteration = 0
+
+ """
+    The HLPS algorithm implementation.
+    Description: Starting with a random sample, fill up a sampling
+    queue with Hamming neighbors. Fetch samples from the queue,
+    each time checking for Pareto optimality. Pareto-optimal samples
+    are then explored/exploited to generate new samples, which are added to the queue.
+    An algorithm phase terminates when the queue is empty.
+    Repeat this phase in a multi-shot invocation for better results.
+ """
+
+ def hlps_routine(self, config_ids):
+ # Initialize
+ solution_q = Queue.Queue()
+ visited = {}
+ nbits = self._runner.get_nbits(self._extended_search)
+ is_extended = self._runner.get_mode_extended()
+ nsolutions = self._num_backends**nbits
+
+ stop_insert = False
+
+ cnt = 0
+ q_add_cnt = 0
+ round_cnt = 0
+
+ def extended_solution(s):
+ return self._runner.get_extended_solution(s)
+
+ def mark_solution(s):
+ if is_extended == True and self._extended_search == False:
+ self._marked[extended_solution(s)] = True
+ else:
+ self._marked[s] = True
+
+ def is_marked(s):
+ if is_extended == True and self._extended_search == False:
+ return (extended_solution(s) in self._marked)
+ else:
+ return (s in self._marked)
+
+ def visit_solution(s):
+ if is_extended == True and self._extended_search == False:
+ visited[extended_solution(s)] = True
+ else:
+ visited[s] = True
+
+ def is_visited(s):
+ if is_extended == True and self._extended_search == False:
+ return (extended_solution(s) in visited)
+ else:
+ return (s in visited)
+
+ def sigint_handler(signum, frame):
+ print("Round cnt = ", round_cnt)
+
+ signal.signal(signal.SIGINT, sigint_handler)
+ if len(config_ids) > 0:
+ for solution in config_ids:
+ if is_extended == True and self._extended_search == True and self._iteration == 0:
+ s = extended_solution(solution)
+ else:
+ s = solution
+ s_vec = utils.int_to_vec(s, self._num_backends, nbits)
+
+ candidate = self.gen_hamming_one(s_vec)
+ for hd in range((self._num_backends - 1) * nbits):
+ candidate_int = int(''.join(str(x) for x in reversed(candidate[hd])),
+ self._num_backends)
+ if is_marked(candidate_int) == False:
+ solution_q.put(candidate_int)
+ mark_solution(candidate_int)
+ q_add_cnt += 1
+ else:
+ start_seed = int(np.random.rand() * (nsolutions))
+ solution_q.put(start_seed)
+ q_add_cnt += 1
+
+ self._iteration += 1
+ # Main routine
+ while not solution_q.empty():
+ s = solution_q.get()
+ mark_solution(s)
+ stop_insert = False
+ if (round_cnt % 100 == 0):
+ print("sample count = ", round_cnt)
+ if self._extended_search == True:
+ print("Queue size is ", solution_q.qsize())
+
+ if is_extended == True and self._extended_search == False:
+ time_val, memory_val = self._runner.profile_by_opname(s)
+ elif is_extended == True:
+ time_val, memory_val = self._runner.profile_by_opindex(s)
+ else:
+ time_val, memory_val = self._runner.profile_by_opname(s)
+ round_cnt += 1
+
+ utils.progressbar(round_cnt, nsolutions, prefix="% samples computed. : ")
+ self._pareto_obj.update_pareto_solutions(
+ s, time_val, memory_val, explore_flag=True)
+
+ for key in self._pareto_obj.get_pareto_keys():
+ pareto_sample = self._pareto_obj.get_config(key)
+ explore_sample = self._pareto_obj.get_exploration(key)
+
+ if is_visited(pareto_sample):
+ continue
+ visit_solution(pareto_sample)
+ s_vec = utils.int_to_vec(pareto_sample, self._num_backends, nbits)
+
+ if explore_sample == True:
+ # Explore solutions over a larger range
+ for hd in range(1, nbits + 1):
+ if stop_insert is True:
+ break
+
+ candidate = self.gen_hamming(s_vec, hd=hd)
+ for i in range(self._num_backends - 1):
+ if stop_insert is True:
+ break
+ candidate_int = int(
+ ''.join(str(x) for x in reversed(candidate[i])),
+ self._num_backends)
+ try:
+ if is_marked(candidate_int) == False:
+ solution_q.put(candidate_int)
+ q_add_cnt += 1
+ except IndexError:
+ print("candidate[i] = ", candidate[i],
+ ', candidate_int = ', candidate_int)
+ sys.exit(-1)
+ if (q_add_cnt >= self._num_samples):
+ print("Queue full in explore")
+ stop_insert = True
+ else:
+ # Exploit solutions within immediate neighborhood
+ candidate = self.gen_hamming_one(s_vec)
+
+ for j in range((self._num_backends - 1) * nbits):
+ if stop_insert is True:
+ break
+ candidate_int = int(
+ ''.join(str(x) for x in reversed(candidate[j])),
+ self._num_backends)
+ if is_marked(candidate_int) == False:
+ solution_q.put(candidate_int)
+ q_add_cnt += 1
+ if (q_add_cnt >= self._num_samples):
+ print("Queue full in exploit")
+ stop_insert = True
+ self._pareto_obj.set_exploration(key)
+
+ pfront = set([
+ self._pareto_obj.get_config(key)
+ for key in self._pareto_obj.get_pareto_keys()
+ ])
+ return pfront, q_add_cnt
+
+ """
+ Method to dump results from HLPS
+ """
+
+ def dump_results(self, dumpdata):
+ dumpdata = self._pareto_obj.dump_pareto_solutions(dumpdata)
+ dumpdata = self._runner.dump_config(dumpdata)
+ return dumpdata
diff --git a/tools/pareto_profiler/estimator/brute_force_profiler.py b/tools/pareto_profiler/estimator/brute_force_profiler.py
new file mode 100644
index 000000000..9516fc343
--- /dev/null
+++ b/tools/pareto_profiler/estimator/brute_force_profiler.py
@@ -0,0 +1,71 @@
+#! /usr/bin/python
+import argparse
+import json
+import sys
+from profile_args import ProfileArgs
+from runner import Runner
+from utils import progressbar
+
+if __name__ == "__main__":
+ parser = ProfileArgs(
+ prog="brute_force_profiler.py", description="Profiles nnpackage_run using oplist")
+ # Parse arguments
+ args = parser.parse_args()
+ modelfile = args.model
+ mode = args.mode
+ n_backends = args.backends
+ dumpfile = args.dumpfile
+
+ # Initialize a runner for given model and target
+ runner = Runner(args.model, args.run_folder, args.backends, args.mode)
+ nruns = runner.get_solution_spacelen()
+ profile_results = {}
+ profile_results['solutions'] = []
+ chk_ptr = 0
+
+ # Profile each backend setting, record execution time and peak memory
+ for r in range(nruns):
+ if (r % 100) == 0:
+ # Checkpointing results, in case the runs take too long
+ if chk_ptr > 0:
+ with open("/tmp/solutions.json") as ifile:
+ tmp_results = json.load(ifile)
+
+ with open("/tmp/solutions.json", "w") as ofile:
+ json.dump(tmp_results + profile_results['solutions'][chk_ptr:], ofile)
+ else:
+ with open("/tmp/solutions.json", "w") as ofile:
+ json.dump(profile_results['solutions'], ofile)
+ chk_ptr = r
+
+ if args.mode == "name":
+ exec_time, max_rss = runner.profile_by_opname(r)
+ elif args.mode == "index":
+ exec_time, max_rss = runner.profile_by_opindex(r)
+ else:
+ print("Invalid mode ", mode)
+ sys.exit(-1)
+
+ profile_results['solutions'].append({
+ "time": exec_time,
+ "memory": max_rss,
+ "id": r
+ })
+ progressbar(r, nruns, prefix="% samples computed. : ")
+ progressbar(nruns, nruns, prefix="% samples computed. : ")
+
+ oplist, opmap, opname_by_indx = runner.get_opconfig()
+
+ if args.mode == "index":
+ profile_results['oplist'] = oplist
+ profile_results['opmap'] = opmap
+ profile_results['opname_by_indx'] = opname_by_indx
+ elif args.mode == "name":
+ profile_results['oplist'] = oplist
+ else:
+ print("Invalid mode ", mode)
+ sys.exit(-1)
+
+ with open(dumpfile, "w") as ofile:
+ json.dump(profile_results, ofile)
+    print("\nDone..")
diff --git a/tools/pareto_profiler/estimator/hlps_sampler.py b/tools/pareto_profiler/estimator/hlps_sampler.py
new file mode 100644
index 000000000..a4c1e4fd8
--- /dev/null
+++ b/tools/pareto_profiler/estimator/hlps_sampler.py
@@ -0,0 +1,99 @@
+#! /usr/bin/python
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import utils
+import sys
+import json
+import time
+from Hlps import Hlps
+from profile_args import ProfileArgs
+from runner import Runner
+
+
+def hlps_profiler(modelfile,
+ run_folder,
+ num_backends=2,
+ mode="name",
+ nruns=3,
+ num_samples=2000,
+ dumpfile=None):
+ runner = Runner(modelfile, run_folder, num_backends, mode=mode)
+ hlps = Hlps(runner, num_backends=num_backends, num_samples=num_samples)
+
+ config_set = set()
+ sample_cnt = 0
+ total_reject_list = []
+
+ for r in range(nruns):
+ config_set, sample_cnt_iter = hlps.hlps_routine(config_set)
+ sample_cnt += sample_cnt_iter
+
+    # Search over the extended (index) space, if requested
+    if mode == "index":
+        print("Starting search over extended space")
+        print("\n")
+        hlps.enable_extended_search()
+ for r in range(nruns):
+ config_set, sample_cnt_iter = hlps.hlps_routine(config_set)
+ sample_cnt += sample_cnt_iter
+
+    # Dump profiler results to a JSON file
+ dumpdata = {}
+    dumpdata['mode'] = mode
+ dumpdata['sample_cnt'] = sample_cnt
+ dumpdata = hlps.dump_results(dumpdata)
+ with open(dumpfile, "w") as ofile:
+ json.dump(dumpdata, ofile)
+
+
+if __name__ == "__main__":
+ t_start = time.time()
+ parser = ProfileArgs(
+        "hlps_sampler.py",
+ description="On-Device Optimizing Profiler for TensorFlowLite Models")
+ parser.add_argument(
+ '--iterations',
+ type=int,
+ default=3,
+ help='Number of iterations, less than 10 should be enough')
+ parser.add_argument(
+ '--samples', type=int, default=2000, help='Number of samples per iteration')
+ parser.add_argument(
+ '--offline',
+ type=bool,
+ default=False,
+ help='Set to True for running over profiled data')
+ parser.add_argument('--profiled_data', type=str, help='Profile file with path')
+
+ args = parser.parse_args()
+
+ hlps_profiler(
+ args.model,
+ args.run_folder,
+ num_backends=args.backends,
+ mode=args.mode,
+ nruns=args.iterations,
+ num_samples=args.samples,
+ dumpfile=args.dumpfile)
+ t_end = time.time()
+ with open(args.dumpfile, "r") as ifile:
+ dumpdata = json.load(ifile)
+ dumpdata['profiling time'] = (t_end - t_start)
+ with open(args.dumpfile, "w") as ofile:
+ json.dump(dumpdata, ofile)
+ print("done.., profiling time = ", (t_end - t_start), " seconds")
diff --git a/tools/pareto_profiler/estimator/pareto.py b/tools/pareto_profiler/estimator/pareto.py
new file mode 100644
index 000000000..9c62eb358
--- /dev/null
+++ b/tools/pareto_profiler/estimator/pareto.py
@@ -0,0 +1,84 @@
+#! /usr/bin/python
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class ParetoData:
+ def __init__(self):
+ self._pareto_solutions = {}
+ self._configs = {}
+ self._cnt = 0
+ self._explore = {}
+
+ def add_pareto_entry(self,
+ sample,
+ exec_time,
+ max_rss,
+ key,
+ explore_flag,
+ check_one_hop=True):
+ self._pareto_solutions[key] = [exec_time, max_rss]
+ self._configs[key] = sample
+ if explore_flag == True and check_one_hop == True:
+ self._explore[key] = False
+ elif explore_flag == True and check_one_hop == False:
+ self._explore[key] = True
+
+ def update_pareto_solutions(self, sample, exec_time, max_rss, explore_flag=False):
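+        # A candidate is dropped if any stored solution strictly dominates it
+        # (smaller time AND smaller rss); it overwrites every stored solution
+        # that it strictly dominates; otherwise it is added as a new entry.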
+ new_item = True
+ if self._pareto_solutions:
+ for key in list(self._pareto_solutions):
+ if self._pareto_solutions[key][0] < exec_time and self._pareto_solutions[key][1] < max_rss:
+ new_item = False
+ break
+ elif self._pareto_solutions[key][0] > exec_time and self._pareto_solutions[key][1] > max_rss:
+ self.add_pareto_entry(sample, exec_time, max_rss, key, explore_flag,
+ True)
+ new_item = False
+
+ if new_item is True:
+ self.add_pareto_entry(sample, exec_time, max_rss, self._cnt, explore_flag,
+ False)
+ self._cnt += 1
+
+ def dump_pareto_solutions(self, dumpdata):
+ marked = {}
+ pareto_results = []
+ for i in range(self._cnt):
+ if self._configs[i] not in marked:
+ marked[self._configs[i]] = True
+ pareto_results.append({
+ "id": self._configs[i],
+ "exec_time": self._pareto_solutions[i][0],
+ "max_rss": self._pareto_solutions[i][1]
+ })
+ dumpdata.update({"solutions": pareto_results})
+
+ return dumpdata
+
+ def get_pareto_keys(self):
+ return self._configs.keys()
+
+ def get_config(self, key):
+ return self._configs[key]
+
+ def get_exploration(self, key):
+ return self._explore[key]
+
+ def set_exploration(self, key):
+ self._explore[key] = True
+
+ def set_config(self, key, extended_value):
+ self._configs[key] = extended_value
diff --git a/tools/pareto_profiler/estimator/profile_args.py b/tools/pareto_profiler/estimator/profile_args.py
new file mode 100644
index 000000000..c4e019df8
--- /dev/null
+++ b/tools/pareto_profiler/estimator/profile_args.py
@@ -0,0 +1,37 @@
+#! /usr/bin/python
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+
+class ProfileArgs(argparse.ArgumentParser):
+ def __init__(self, *args, **kwargs):
+        super(ProfileArgs, self).__init__(*args, **kwargs)
+ self.add_argument(
+ 'model', type=str, default=None, help='nnpackage name with path')
+ self.add_argument('run_folder', type=str, help="path to nnpackage_run executable")
+ self.add_argument(
+ '--mode',
+ type=str.lower,
+ choices=["index", "name"],
+ default="name",
+ help='Profile by operation index or name')
+ self.add_argument('--backends', type=int, default=2, help='Number of backends')
+ self.add_argument(
+ '--dumpfile',
+ type=str.lower,
+ default="/tmp/final_result.json",
+ help='JSON Dumpfile name with path')
diff --git a/tools/pareto_profiler/estimator/random_sampler.py b/tools/pareto_profiler/estimator/random_sampler.py
new file mode 100644
index 000000000..7646ea62c
--- /dev/null
+++ b/tools/pareto_profiler/estimator/random_sampler.py
@@ -0,0 +1,60 @@
+#! /usr/bin/python
+import argparse
+import json
+import numpy as np
+import sys
+import subprocess
+import time
+from pareto import ParetoData
+from profile_args import ProfileArgs
+from runner import Runner
+from utils import progressbar
+
+if __name__ == "__main__":
+ t_start = time.time()
+ parser = ProfileArgs("random_sampler.py", description="Random sampler")
+ parser.add_argument(
+ '--iterations', type=int, default=100, help='Number of iterations')
+
+ # Parse arguments
+ args = parser.parse_args()
+ dumpfile = args.dumpfile
+ iterations = args.iterations
+
+ # Initialize a runner and Pareto data structure obj
+ runner = Runner(args.model, args.run_folder, args.backends, args.mode)
+ pareto_obj = ParetoData()
+ # Initialize variables for random sampler
+ n_assignments = runner.get_solution_spacelen()
+ n_iterations = min(iterations, n_assignments)
+ chk_ptr = 0
+ marked_samples = {}
+
+ # Profile at random over solution space
+ for r in range(n_iterations):
+ random_sample = int(np.random.rand() * n_assignments)
+ while random_sample in marked_samples:
+ random_sample = int(np.random.rand() * n_assignments)
+ marked_samples[random_sample] = True
+ if args.mode == "name":
+ exec_time, max_rss = runner.profile_by_opname(random_sample)
+ elif args.mode == "index":
+ exec_time, max_rss = runner.profile_by_opindex(random_sample)
+ else:
+            print("Invalid mode ", args.mode)
+ sys.exit(-1)
+
+ pareto_obj.update_pareto_solutions(random_sample, exec_time, max_rss)
+ progressbar(r, n_assignments, prefix="% samples computed. : ")
+ progressbar(r + 1, n_assignments, prefix="% samples computed. : ")
+
+ # Dump profiler results
+ dumpdata = {}
+ dumpdata['mode'] = args.mode
+ dumpdata = pareto_obj.dump_pareto_solutions(dumpdata)
+ dumpdata = runner.dump_config(dumpdata)
+ with open(dumpfile, "w") as ofile:
+ json.dump(dumpdata, ofile)
+ t_end = time.time()
+ print("\n")
+ print("done.., profiling time = ", (t_end - t_start), " seconds")
diff --git a/tools/pareto_profiler/estimator/runner.py b/tools/pareto_profiler/estimator/runner.py
new file mode 100644
index 000000000..d2b66d6fb
--- /dev/null
+++ b/tools/pareto_profiler/estimator/runner.py
@@ -0,0 +1,148 @@
+#! /usr/bin/python
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import numpy as np
+from utils import fetch_config_by_name
+from utils import fetch_config_by_indx
+from utils import generate_vars
+from utils import generate_vars_for_indx
+from utils import exec_shell
+from utils import import_configs
+from utils import int_to_vec
+import sys
+
+
+class Mapper:
+ def __init__(self, opmap, oplist, opname_by_index):
+ self._opmap = opmap
+ self._oplist = oplist
+ self._opname_by_indx = opname_by_index
+
+ def get_oplist(self):
+ return self._oplist
+
+ def get_opmap(self):
+ return self._opmap
+
+ def get_opname_by_indx(self):
+ return self._opname_by_indx
+
+ def get_indices(self, value):
+ indx_list = []
+ for i in range(len(self._opname_by_indx)):
+ if self._opname_by_indx[i] == value:
+ indx_list.append(i)
+ return indx_list
+
+ def map_to_extended_space(self, n, backends):
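+        # Expand a compact per-opname assignment into the extended
+        # per-<opname, data size> index space: every extended position whose
+        # node shares an opname inherits that opname's backend digit.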
+ n_vec = int_to_vec(n, backends, len(self._oplist))
+ extended_vec = np.zeros(max(self._opmap) + 1, dtype=int)
+ cnt = 0
+
+ for allocation in n_vec:
+ extended_pos = list(
+ set([self._opmap[i] for i in self.get_indices(self._oplist[cnt])]))
+ try:
+ extended_vec[extended_pos] = allocation
+ except IndexError:
+ print("extended_vec size = ", extended_vec.size, ", extended_pos = ",
+ extended_pos)
+ cnt += 1
+        extended_n = int(''.join(str(i) for i in extended_vec[::-1]), backends)
+ return extended_n
+
+
+class Runner:
+ def __init__(self, model, run_folder, num_backends, mode):
+ self._model = model
+ self._run_folder = run_folder
+ self._mode = mode
+ oplist, opmap, opname_by_index = import_configs(mode)
+ self._mapper = Mapper(opmap, oplist, opname_by_index)
+ self._nbackends = num_backends
+ self._extended_map = {}
+
+ def get_solution_spacelen(self):
+ if self._mode == "name":
+ return self._nbackends**len(self._mapper.get_oplist())
+ elif self._mode == "index":
+ return self._nbackends**max(self._mapper.get_opmap())
+ else:
+            print("Unknown mode ", self._mode, ", exiting profiler")
+ sys.exit(-1)
+
+ def get_nbits(self, extended_search_mode):
+ if self._mode == "index" and extended_search_mode == True:
+ return max(self._mapper.get_opmap())
+ else:
+ return len(self._mapper.get_oplist())
+
+ def get_mode_extended(self):
+ return (self._mode == "index")
+
+ def get_extended_solution(self, s):
+ if s in self._extended_map:
+ return self._extended_map[s]
+
+ extended_value = self._mapper.map_to_extended_space(s, self._nbackends)
+ self._extended_map[s] = extended_value
+ return extended_value
+
+ def run_inference(self, solution):
+ cmd_str = [
+ ". /tmp/envvars.sh && " + self._run_folder + "/nnpackage_run -w1 -r1 -m1 -l "
+ + self._model + "/metadata/tc/input.h5 " + self._model + " 2> /dev/null"
+ ]
+ res = exec_shell(cmd_str, newline_split=True)
+ try:
+ exec_time = float(res[4].split(' ')[-2])
+ max_rss = int(res[13].split(' ')[-2])
+ except IndexError:
+ print("got index error at config ", solution)
+ print("result: ", res)
+ print("####")
+ sys.exit(-1)
+ return (exec_time, max_rss)
+
+ def profile_by_opname(self, solution):
+ generate_vars(self._mapper.get_oplist(), solution, self._nbackends)
+ return self.run_inference(solution)
+
+ def profile_by_opindex(self, solution):
+ generate_vars_for_indx(self._mapper.get_opmap(), solution, self._nbackends)
+ return self.run_inference(solution)
+
+ def get_opconfig(self):
+ return self._mapper.get_oplist(), self._mapper.get_opmap(
+ ), self._mapper.get_opname_by_indx()
+
+ def dump_config(self, dumpdata):
+ if self._mode == "name":
+ dumpdata.update({'oplist': self._mapper.get_oplist()})
+ elif self._mode == "index":
+ dumpdata.update({'oplist': self._mapper.get_opmap()})
+
+ configs = {}
+ for solution in dumpdata['solutions']:
+ if self._mode == "name":
+ configs[int(solution["id"])] = fetch_config_by_name(
+ dumpdata['oplist'], solution["id"], self._nbackends)
+ elif self._mode == "index":
+ configs[int(solution["id"])] = fetch_config_by_indx(
+ dumpdata['oplist'], solution["id"], self._nbackends)
+ dumpdata.update({'configs': configs})
+ return dumpdata
diff --git a/tools/pareto_profiler/estimator/utils.py b/tools/pareto_profiler/estimator/utils.py
new file mode 100644
index 000000000..9278674e3
--- /dev/null
+++ b/tools/pareto_profiler/estimator/utils.py
@@ -0,0 +1,201 @@
+#! /usr/bin/python
+import subprocess
+import numpy as np
+import sys
+import os
+import json
+"""
+ General executor for bash-like shell. Supports multiline results.
+"""
+
+
+def exec_shell(command_str, newline_split=False):
+ result = subprocess.Popen(command_str, shell=True, stdout=subprocess.PIPE)
+ out, err = result.communicate()
+ if (newline_split):
+ res = out.decode("utf-8").split('\n')
+ res = res[:-1]
+ return res
+ else:
+ return out.decode("utf-8").split("\n")[0]
+
+
+"""
+ Given a number and its base, return its symbol-wise vector representation
+"""
+
+
+def int_to_vec(n, b, n_operations):
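+    # Digits are returned least-significant first, e.g. with base 2:
+    # int_to_vec(5, 2, 4) -> array([1, 0, 1, 0])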
+ number_arr = np.zeros(n_operations, dtype=int)
+ i = n_operations - 1
+ while (n != 0):
+ number_arr[i] = n % b
+ n = n // b
+ i -= 1
+
+ return number_arr[::-1]
+
+
+"""
+    Generate onert backend mapping for each graph node, given the encoded information in the parameters.
+ The details of the parameters are as follows:
+ 1. oplist - a vector that maps each graph node to a unique <operation name, data size> id
+ that was generated by an earlier script (gen_oplist.py)
+ 2. number - the encoded backend assignment, typically a very long integer
+ 3. base_value - for practical purposes, this is equivalent to the number of backends
+"""
+
+
+def generate_vars_for_indx(oplist, number, base_value):
+ ofile = open('/tmp/envvars.sh', 'w')
+ backend_map = {0: "=cpu", 1: "=acl_cl", 2: "=acl_neon"}
+
+ if (base_value == 2):
+ ofile.write("export BACKENDS=\"acl_cl;cpu\"")
+ elif (base_value == 3):
+ ofile.write("export BACKENDS=\"acl_cl;acl_neon;cpu\"")
+ ofile.write("\n")
+ number_arr = int_to_vec(number, base_value, len(oplist))
+ cnt = 0
+ op_backend_map_str = "export OP_BACKEND_MAP=\""
+ for cnt in range(len(oplist)):
+ backend_str = backend_map[int(number_arr[oplist[cnt]])]
+ op_backend_map_str += ''.join([str(cnt), backend_str])
+
+ if (cnt < (len(oplist) - 1)):
+ op_backend_map_str += ";"
+ else:
+ op_backend_map_str += "\""
+ ofile.write(op_backend_map_str)
+ ofile.write("\n")
+ ofile.close()
+
+
+"""
+    Print onert backend mapping for each graph node, given the encoded information in the parameters.
+ The details of the parameters are as follows:
+ 1. oplist - a vector that maps each graph node to a unique <operation name, data size> id
+ that was generated by an earlier script (gen_oplist.py)
+ 2. number - the encoded backend assignment, typically a very long integer
+ 3. base_value - for practical purposes, this is equivalent to the number of backends
+"""
+
+
+def fetch_config_by_indx(oplist, number, base_value):
+ var_str = ""
+ backend_map = {0: "=cpu", 1: "=acl_cl", 2: "=acl_neon"}
+
+ if (base_value == 2):
+ var_str += "BACKENDS=\"acl_cl;cpu\""
+ elif (base_value == 3):
+ var_str += "BACKENDS=\"acl_cl;acl_neon;cpu\""
+ var_str += " "
+ number_arr = int_to_vec(number, base_value, len(oplist))
+ cnt = 0
+ var_str += "OP_BACKEND_MAP=\""
+ op_backend_map_str = ""
+ for cnt in range(len(oplist)):
+ backend_str = backend_map[int(number_arr[oplist[cnt]])]
+ op_backend_map_str += ''.join([str(cnt), backend_str])
+
+ if (cnt < (len(oplist) - 1)):
+ op_backend_map_str += ";"
+ else:
+ op_backend_map_str += "\""
+ var_str += op_backend_map_str
+ return var_str
+
+
+"""
+    Generate onert backend mapping for each graph operation name, given the encoded information in the parameters.
+ The details of the parameters are as follows:
+ 1. oplist - a vector that maps each graph node to a unique operation name.
+ The list is generated by an earlier script (gen_oplist.py)
+ 2. number - the encoded backend assignment, typically a long integer
+ 3. base_value - for practical purposes, this is equivalent to the number of backends
+"""
+
+
+def generate_vars(oplist, number, base_value):
+ ofile = open('/tmp/envvars.sh', 'w')
+ backend_map = {0: "=cpu", 1: "=acl_cl", 2: "=acl_neon"}
+ if (base_value == 2):
+ ofile.write("export BACKENDS=\"acl_cl;cpu\"")
+ elif (base_value == 3):
+ ofile.write("export BACKENDS=\"acl_cl;acl_neon;cpu\"")
+ ofile.write("\n")
+ number_str = int_to_vec(number, base_value, len(oplist))
+
+ cnt = 0
+ for n in number_str:
+ op_backend_map_str = ''.join(
+ ["export OP_BACKEND_", oplist[cnt], backend_map[int(n)]])
+ ofile.write(op_backend_map_str)
+ ofile.write("\n")
+ cnt += 1
+ ofile.close()
+
+
+"""
+    Print onert backend mapping for each graph operation name, given the encoded information in the parameters.
+ The details of the parameters are as follows:
+ 1. oplist - a vector that maps each graph node to a unique operation name.
+ The list is generated by an earlier script (gen_oplist.py)
+ 2. number - the encoded backend assignment, typically a long integer
+ 3. base_value - for practical purposes, this is equivalent to the number of backends
+"""
+
+
+def fetch_config_by_name(oplist, number, base_value):
+ var_str = ""
+ backend_map = {0: "=cpu", 1: "=acl_cl", 2: "=acl_neon"}
+ if (base_value == 2):
+ var_str += "BACKENDS=\"acl_cl;cpu\""
+ elif (base_value == 3):
+ var_str += "BACKENDS=\"acl_cl;acl_neon;cpu\""
+ var_str += " "
+
+ number_str = int_to_vec(number, base_value, len(oplist))
+
+ cnt = 0
+ for n in number_str:
+ var_str += ''.join(["OP_BACKEND_", oplist[cnt], backend_map[int(n)]])
+ var_str += " "
+ cnt += 1
+ return var_str
+
+
+"""
+    Import the operation list, map and relevant information for profiling. Note: this information should have been
+    dumped under /tmp/oplist.json by the gen_oplist.py script.
+"""
+
+
+def import_configs(mode):
+ if not os.path.isfile('/tmp/oplist.json'):
+ print("No oplist")
+ sys.exit(-1)
+ with open('/tmp/oplist.json', 'r') as ifile:
+ data = json.load(ifile)
+ oplist = data['oplist']
+ if mode == "name":
+ nbits = len(oplist)
+ return oplist, None, None
+ elif mode == "index":
+ opmap = data['opmap']
+ opname_by_indx = data['opname_by_indx']
+ return oplist, opmap, opname_by_indx
+
+ print("mode is incorrect")
+ sys.exit(-1)
+
+
+"""
+ Generic Progress bar display
+"""
+
+
+def progressbar(current_cnt, max_cnt, prefix="", file=sys.stdout):
+ x = int(current_cnt * 100.0 / max_cnt)
+ file.write("%s[%s%s] %i/%i\r" % (prefix, "#" * x, "." * (100 - x), x, 100))
+ file.flush()
diff --git a/tools/pareto_profiler/generator/gen_oplist.py b/tools/pareto_profiler/generator/gen_oplist.py
new file mode 100644
index 000000000..5511937d2
--- /dev/null
+++ b/tools/pareto_profiler/generator/gen_oplist.py
@@ -0,0 +1,165 @@
+#! /usr/bin/python
+import argparse
+import tensorflow as tf
+import sys
+sys.path.append("../estimator")
+import subprocess
+import os
+import json
+from functools import reduce
+from utils import exec_shell
+"""
+ Generates from a tflite model, a list of unique onert operation names used in the model
+"""
+
+
+def generate_oplist_by_name(tflite_file):
+ with open("operations_map.json") as ifile:
+ data = json.load(ifile)
+ op_dict = data['op_dict']
+
+ intr = tf.lite.Interpreter(tflite_file)
+ intr.allocate_tensors()
+ tf_opset = set(op['op_name'] for op in intr._get_ops_details())
+ try:
+ onert_ops = set([op_dict[op] for op in tf_opset])
+ except KeyError:
+ print("Invalid mapping, check your tensorflow ops for new/unknown mappings: ",
+ tf_opset)
+ sys.exit(-1)
+ return onert_ops
+
+
+"""
+ Returns the total data size for the model graph node (inputs + outputs)
+ Params:
+ op: operation instance (obtained from _get_ops_details())
+ tsr: tensor instance (obtained from get_tensor_details())
+"""
+
+
+def get_op_data_size(op, tsr):
+ data_size = 0
+ for idx in op['inputs']:
+ if tsr[idx]['shape'].size > 0:
+ data_size += reduce(lambda x, y: x * y,
+ tsr[idx]['shape']) * tsr[idx]['shape'].dtype.itemsize
+
+ for idx in op['outputs']:
+ if tsr[idx]['shape'].size > 0:
+ data_size += reduce(lambda x, y: x * y,
+ tsr[idx]['shape']) * tsr[idx]['shape'].dtype.itemsize
+ return data_size
+
+
+"""
+ Generates from a tflite model, the following outputs:
+ 1. opmap - a symbol/bit index mapping from every graph operation to a unique <operation name, data size> index identifier. This mapping
+ will be used later when profiling the model at runtime.
+
+ 2. oplist - a list of unique onert operation names used in the model
+
+ 3. opname_by_index - a list of onert operation names, indexed by their topological order in the model
+"""
+
+
+def generate_oplist_by_name_size(tflite_file):
+ intr = tf.lite.Interpreter(tflite_file)
+ intr.allocate_tensors()
+ ops = intr._get_ops_details()
+ tsr = intr.get_tensor_details()
+
+ opset = set()
+ oplist = set()
+ indx = []
+ opname_by_indx = []
+ # Fetch tensorflow operation mapping to onert kernels
+ with open("operations_map.json") as ifile:
+ data = json.load(ifile)
+ op_dict = data['op_dict']
+
+ # Fetch all unique operation names and <operation name, tensordata size> pairs
+ for op in ops:
+ opset.add((op['op_name'], get_op_data_size(op, tsr)))
+ oplist.add(op_dict[op['op_name']])
+ indx.append(op['index'])
+ opname_by_indx = [op_dict[ops[i]['op_name']] for i in indx]
+
+ # Create a 'm' bit/symbol map indexed by <opname, tensordata size> values
+ inv_opset_map = {}
+ i = 0
+ for op in opset:
+ inv_opset_map[op] = i
+ i += 1
+
+ # Map 'n' operation symbol space to 'm' <opname, tensordata size> space
+ op_map = []
+ for op in ops:
+ data_size = get_op_data_size(op, tsr)
+ op_map.append(inv_opset_map[(op['op_name'], data_size)])
+
+ return op_map, oplist, opname_by_indx
+
+
+"""
+Script to generate oplist, given the following details:
+1. Modelfile
+2. target device type
+3. Additional information, such as authentication for file transfer
+
+Info: python gen_oplist.py --help
+"""
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+        description='''gen_oplist: Generates oplist and uploads it to the target''',
+ epilog="""Success.""")
+ parser.add_argument(
+ '--auth', type=str, default=None, help='authentication: <user@host>')
+ parser.add_argument(
+ '--mode',
+ type=str.lower,
+ choices=["index", "name"],
+ default="name",
+ help='Profile by operation index or name')
+ parser.add_argument('model', type=str, default=None, help='tflite name with path')
+ parser.add_argument(
+ 'target',
+ type=str.lower,
+ choices=['tizen', 'odroid'],
+ default="odroid",
+ help='target name')
+
+ # Parse arguments
+ args = parser.parse_args()
+ modelfile = args.model
+ target = args.target
+ mode = args.mode
+ if target == "odroid":
+ auth_str = args.auth
+ if auth_str is None:
+ print("Need valid authentication")
+ sys.exit(-1)
+
+ # Generate oplist
+ if mode == "name":
+ opset = generate_oplist_by_name(modelfile)
+ print(opset)
+ with open('/tmp/oplist.json', 'w') as opfile:
+ data = {}
+ data['oplist'] = list(opset)
+ json.dump(data, opfile)
+ elif mode == "index":
+ data = {}
+ opmap, oplist, opname_by_indx = generate_oplist_by_name_size(modelfile)
+ data['opmap'] = opmap
+ data['oplist'] = list(oplist)
+ data['opname_by_indx'] = opname_by_indx
+ with open('/tmp/oplist.json', 'w') as opfile:
+ json.dump(data, opfile)
+ # Upload oplist to target
+ if target == "tizen":
+ exec_shell("sdb push /tmp/oplist.json /tmp/oplist.json")
+ elif target == "odroid":
+ print("auth_str = ", auth_str)
+ exec_shell("scp /tmp/oplist.json " + auth_str + ":/tmp/oplist.json")
+ print("done...")
diff --git a/tools/pareto_profiler/generator/operations_map.json b/tools/pareto_profiler/generator/operations_map.json
new file mode 100644
index 000000000..c35547ed9
--- /dev/null
+++ b/tools/pareto_profiler/generator/operations_map.json
@@ -0,0 +1,36 @@
+{ "op_dict": {
+ "SUM":"Reduce",
+ "ADD":"BinaryArithmetic",
+ "SUB":"BinaryArithmetic",
+ "DIV":"BinaryArithmetic",
+ "MUL":"BinaryArithmetic",
+ "REDUCE_MAX": "Reduce",
+ "REDUCE_MIN": "Reduce",
+ "CONV_2D": "Conv2D",
+ "PACK":"Pack",
+ "SOFTMAX":"Softmax",
+ "CONCATENATION":"Concat",
+ "EXP":"ElementwiseUnary",
+ "RESHAPE":"Reshape",
+ "SPLIT_V":"SplitV",
+ "ARG_MAX": "ArgMax",
+ "BATCH_TO_SPACE_ND":"BatchToSpaceND",
+ "DEPTHWISE_CONV_2D":"DepthwiseConv2D",
+ "LOGISTIC":"ElementwiseActivation",
+ "MEAN":"Reduce",
+ "RELU6":"ElementwiseActivation",
+ "RELU":"ElementwiseActivation",
+ "RESIZE_BILINEAR":"ResizeBilinear",
+ "REVERSE_V2":"Reverse",
+ "SPACE_TO_BATCH_ND":"SpaceToBatchND",
+ "AVERAGE_POOL_2D": "Pool2D",
+ "MAX_POOL_2D": "Pool2D",
+ "GATHER": "Gather",
+ "CAST": "ElementwiseUnary",
+ "FULLY_CONNECTED": "FullyConnected",
+ "PAD": "Pad",
+ "SLICE" : "Slice",
+ "STRIDED_SLICE": "StridedSlice",
+ "TRANSPOSE": "Transpose",
+ "UNPACK": "Unpack"
+}}
diff --git a/tools/stab/README.md b/tools/stab/README.md
new file mode 100644
index 000000000..c52ba4183
--- /dev/null
+++ b/tools/stab/README.md
@@ -0,0 +1,54 @@
+# Stab - Static Backend Scheduler
+
+`Stab` is a tool that schedules a backend for each operation using profiled data
+
+An nnpackage with the backend configuration will be created at `./tools/stab/nnpkg_sched`
+
+Supported backends: `cpu`, `ruy`, and `xnnpack`
+- Other backends will be supported when `stab` can measure and use permutation time between backends
+
+## Scheduling Process
+
+1. Upload ONE runtime and nnpackage to remote device
+ - Use `/tmp/ONE` folder on remote device
+1. Profile execution time of each backend on remote device
+1. Get profile result from remote device
+ - Profile result is saved at `./tools/stab/traces` on host
+1. Schedule a backend for each operation to get the fastest inference time (see the sketch after this list)
+   - Use the fastest backend for each operation
+1. Generate nnpackage with backend configuration
+ - Generated at `./tools/stab/nnpkg_sched`
+
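+The scheduling step amounts to a per-operation argmin over the profiled execution times. A minimal sketch of that selection rule (names and data shapes are hypothetical, not the tool's API):
+
+```python
+# Pick, for every operation, the backend with the smallest profiled time.
+# op_times: {op_index: {backend_name: exec_time}}
+def schedule(op_times):
+    return {op: min(times, key=times.get) for op, times in op_times.items()}
+
+
+assert schedule({0: {"cpu": 9.0, "xnnpack": 4.5},
+                 1: {"cpu": 2.0, "xnnpack": 3.1}}) == {0: "xnnpack", 1: "cpu"}
+```
+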
+## Prerequisite
+
+- Install Python>=3. Tested on Python 3.6.9 and 3.7.5
+- Register SSH keys to use ssh commands without entering a password
+ ```bash
+ ssh-keygen -t rsa
+ ssh-copy-id -i ~/.ssh/id_rsa.pub remote_user@remote_ip
+ ```
+
+## Usage
+
+```
+Usage: python3 ./tools/stab/stab.py --nnpackage nnpackage_dir --ip <IP>
+Runs the nnpackage on a remote device and creates an nnpackage with scheduled backends
+
+required arguments:
+ --nnpackage NNPACKAGE
+ nnpackage folder to profile
+ --ip IP IP address of remote client
+
+optional arguments:
+ -h, --help show this help message and exit
+ -n NUM_THREADS, --num_threads NUM_THREADS
+ Number of threads used by one runtime
+ -u USER, --user USER User of remote client
+ -v, --verbose Print verbose message
+ --no-profile Disable profiling
+
+Examples:
+ python3 ./tools/stab/stab.py --nnpackage ../nnpkg_tst/inception --ip 1.1.1.1 => Profile on remote device 1.1.1.1 with current user
+  python3 ./tools/stab/stab.py --nnpackage ../nnpkg_tst/inception --ip 1.1.1.1 -n 4 => Profile on remote device 1.1.1.1 using 4 threads for ONE runtime
+ python3 ./tools/stab/stab.py --nnpackage ../nnpkg_tst/inception --ip 1.1.1.1 --user odroid => Profile on remote device 1.1.1.1 with user odroid
+```
diff --git a/tools/stab/backend_profiler.py b/tools/stab/backend_profiler.py
new file mode 100644
index 000000000..c9d71332d
--- /dev/null
+++ b/tools/stab/backend_profiler.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from op_list_parser import OpListParser
+from remote import RemoteSSH
+
+
+class BackendProfiler():
+ """
+    Run ONE runtime on a remote device to create a trace file with per-operation execution times
+
+ TODO : Support Android device profiling
+ """
+
+ def __init__(self, user, ip, nnpackage_dir, num_threads):
+ self.remote_ssh = RemoteSSH(user, ip, nnpackage_dir, num_threads)
+ self.backend_op_list = OpListParser().parse()
+ self.backend_list = ["cpu"]
+        self.backend_list.extend(self.backend_op_list.keys())
+
+ def sync(self):
+ logging.info("Upload ONE runtime and nnpackage to remote device")
+ self.remote_ssh.sync_binary()
+
+ def profile(self):
+ for backend in self.backend_list:
+ logging.info(f"Profiling {backend} backend")
+ self.remote_ssh.profile_backend(backend, self.backend_op_list)
+ self.remote_ssh.sync_trace(backend)
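+
+
+# A minimal usage sketch (user, host, path, and thread count are hypothetical):
+#   profiler = BackendProfiler("odroid", "192.168.0.10", "./nnpkg/inception", 4)
+#   profiler.sync()     # upload ONE runtime and the nnpackage
+#   profiler.profile()  # run each backend remotely and fetch the trace files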
diff --git a/tools/stab/backend_scheduler.py b/tools/stab/backend_scheduler.py
new file mode 100644
index 000000000..e18a1556f
--- /dev/null
+++ b/tools/stab/backend_scheduler.py
@@ -0,0 +1,156 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json, logging
+from pathlib import Path
+from op_list_parser import OpListParser
+from nnpkg_helper import NnpkgHelper
+
+
+class BackendScheduler:
+ """
+    Read profiled data and select the best backend for each operation
+ Scheduled nnpackage is saved at ./tools/stab/nnpkg_sched
+
+ TODO : Use permutation time for better scheduling
+ """
+
+ def __init__(self, nnpkg_dir, num_threads):
+ self.nnpkg_dir = Path(nnpkg_dir).resolve()
+ self.num_threads = num_threads
+ self.root_path = Path(__file__).parents[2]
+ self.nnpkg_helper = NnpkgHelper()
+
+ def read_traces(self, backend_list):
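+        # Expected trace-file layout (only the fields used below are shown;
+        # the exact operation-key text is runtime-defined):
+        #   {"Execution_Data": {
+        #       "memory":  {...},
+        #       "runtime": {"Graph": {"Avg_Time": <whole-graph time>}},
+        #       "<backend>": {"... $<index> ... <OpType>": {"Avg_Time": <time>}}}}
+        # The op index is parsed from the third token ("$<index>") and the op
+        # type from the last token of each operation key.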
+ op_time = {}
+ inference_time = {}
+ for backend in backend_list:
+ try:
+ # Trace file is located at ./tools/stab/traces
+ trace_path = Path(
+ __file__
+ ).parent / 'traces' / f"{self.nnpkg_dir.name}_{backend}_{self.num_threads}"
+ logging.debug(f"Trace path : {trace_path}")
+ with open(trace_path) as f:
+ data = json.load(f)
+ execution_data = data['Execution_Data']
+ for entry in execution_data:
+ if entry == "memory":
+ continue
+ elif entry == "runtime":
+ inference_time[backend] = execution_data['runtime']['Graph'][
+ 'Avg_Time']
+ continue
+ op_backend = entry
+ backend_data = execution_data[op_backend]
+ for op in backend_data:
+ op_index = int(op.split(' ')[2][1:])
+ op_type = op.split(' ')[-1]
+ time = int(backend_data[op]["Avg_Time"])
+ if op_index not in op_time.keys():
+ op_time[op_index] = {op_backend: time}
+ op_time[op_index].update({"type": op_type})
+ else:
+ op_time[op_index].update({op_backend: time})
+ except IOError as e:
+ logging.warning(e)
+ return op_time, inference_time
+
+ def schedule(self):
+ backend_op_list = OpListParser().parse()
+ backend_list = ["cpu"]
+        backend_list.extend(backend_op_list.keys())
+
+ op_time, backend_infer_time = self.read_traces(backend_list)
+
+ backend_mapping = {}
+
+ target_ops = set()
+ for _, v in backend_op_list.items():
+ target_ops.update(v)
+
+ # Find fastest backend for each operation
+ for op_index, value in sorted(op_time.items()):
+ op_type = value['type']
+ if op_type not in target_ops:
+ continue
+
+ logging.debug(f"----- Operation {op_index} -----")
+ op_infer_time = 0
+ for backend in backend_list:
+ if backend not in value:
+ continue
+ backend_time = value[backend]
+
+ logging.debug(f"{backend}[{backend_time}]")
+ if op_infer_time == 0 or backend_time < op_infer_time:
+ op_infer_time = backend_time
+ backend_mapping[op_index] = backend
+
+        # Use the backend with the fastest whole-graph inference time as the default
+ default_backend = min(backend_infer_time, key=backend_infer_time.get)
+
+ # Create OP_BACKEND_MAP string
+ backend_conf = ""
+ for op_index, backend in sorted(backend_mapping.items()):
+ if backend != default_backend:
+ backend_conf += "{}={};".format(op_index, backend)
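+        # e.g. backend_conf == "3=ruy;12=xnnpack;" (illustrative; ops already
+        # fastest on the default backend are omitted)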
+
+        # Estimate the expected inference time for each backend and for the schedule
+ logging.info("-------- Expected inference time ---------")
+ inference_time = {}
+ for backend in backend_list:
+ inference_time[backend] = 0
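+            # Ops with no measurement on this backend fall back to their cpu time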
+ for op_index, value in sorted(op_time.items()):
+ if backend in value:
+ inference_time[backend] += value[backend]
+ else:
+ inference_time[backend] += value["cpu"]
+
+ schedule_time = 0
+ for op_index, value in sorted(op_time.items()):
+ op_type = value['type']
+ if op_type not in target_ops:
+ schedule_time += value["cpu"]
+ continue
+ else:
+ op_backend = backend_mapping[op_index]
+ schedule_time += value[op_backend]
+                if default_backend != op_backend:
+ logging.debug("[{}] {} -> {} : {:.2f} ms decrease".format(
+ op_index, default_backend, op_backend,
+ (value[default_backend] - value[op_backend]) / 1000))
+
+ for backend in backend_list:
+ logging.info(f"{backend} backend : {inference_time[backend]/1000:.2f} ms")
+ logging.info(f"Backend scheduling : {schedule_time / 1000:.2f} ms")
+
+ logging.info("-------- Backend Scheduling --------")
+ cmd = []
+ cmd += [f"OP_BACKEND_MAP={backend_conf}"]
+ for target_backend, op_list in backend_op_list.items():
+ if default_backend == target_backend:
+ for op in op_list:
+ cmd += [f"OP_BACKEND_{op}={default_backend}"]
+ cmd += [f"BACKENDS={';'.join(backend_list)}"]
+ cmd += [f"RUY_THREADS={self.num_threads}"]
+ cmd += [f"XNNPACK_THREADS={self.num_threads}"]
+ logging.info(' '.join(cmd))
+
+ # Create nnpackage with backend mapping
+ dst_dir = Path(__file__).parent / 'nnpkg_sched' / self.nnpkg_dir.name
+ self.nnpkg_helper.copy(self.nnpkg_dir, dst_dir)
+ self.nnpkg_helper.add_config(dst_dir, cmd)
diff --git a/tools/stab/nnpkg_helper.py b/tools/stab/nnpkg_helper.py
new file mode 100644
index 000000000..7e68760ff
--- /dev/null
+++ b/tools/stab/nnpkg_helper.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json, logging
+from distutils.dir_util import copy_tree
+from pathlib import Path
+
+
+class NnpkgHelper:
+ """
+ Helper class for nnpackage
+ """
+
+ def __init__(self):
+ self.config_name = 'config.cfg'
+
+ def copy(self, src, dst):
+ copy_tree(str(src), str(dst))
+
+ def add_config(self, src, configs):
+ manifest_path = Path(src).resolve() / 'metadata' / 'MANIFEST'
+ config_path = Path(src).resolve() / 'metadata' / self.config_name
+
+ try:
+ # Read MANIFEST file
+ with open(manifest_path, 'r') as manifest_file:
+ data = json.load(manifest_file)
+
+ # Add configs to MANIFEST file
+ with open(manifest_path, 'w') as manifest_file:
+ data['configs'] = [self.config_name]
+ json.dump(data, manifest_file, indent=2)
+
+ # Write config.cfg file
+ with open(config_path, 'w') as config_file:
+ config_file.write('\n'.join(configs))
+
+ logging.info(f"Scheduled nnpackage is saved at {src}")
+
+        except IOError as e:
+            logging.warning(e)
+        except Exception as e:
+            logging.warning(f"Failed to add config to nnpackage: {e}")
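+
+
+# A minimal usage sketch (paths and config entries are hypothetical):
+#   helper = NnpkgHelper()
+#   helper.copy('./nnpkg/inception', './nnpkg_sched/inception')
+#   helper.add_config('./nnpkg_sched/inception',
+#                     ['OP_BACKEND_MAP=3=ruy;', 'BACKENDS=cpu;ruy'])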
diff --git a/tools/stab/op_list.txt b/tools/stab/op_list.txt
new file mode 100644
index 000000000..7c5565655
--- /dev/null
+++ b/tools/stab/op_list.txt
@@ -0,0 +1,2 @@
+ruy:Conv2D,FullyConnected
+xnnpack:Conv2D,DepthwiseConv2D,FullyConnected
diff --git a/tools/stab/op_list_parser.py b/tools/stab/op_list_parser.py
new file mode 100644
index 000000000..d9fba508b
--- /dev/null
+++ b/tools/stab/op_list_parser.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from pathlib import Path
+
+
+class OpListParser():
+ """
+ Read op_list.txt to create supported operation list for each backend
+
+    TODO : Read supported tensor types for each operation (FP32 or INT8)
+ """
+
+ def __init__(self):
+ self.file_name = "op_list.txt"
+ self.op_list_file = Path(__file__).parent / self.file_name
+
+ def parse(self):
+ backend_op_list = {}
+ with open(self.op_list_file, 'r') as f:
+ lines = f.readlines()
+ for line in lines:
+ line = line.rstrip()
+ backend, _, op_list_str = line.partition(':')
+ op_list = op_list_str.split(',')
+ backend_op_list[backend] = op_list
+ return backend_op_list
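+
+
+# For the bundled op_list.txt, parse() returns:
+#   {'ruy': ['Conv2D', 'FullyConnected'],
+#    'xnnpack': ['Conv2D', 'DepthwiseConv2D', 'FullyConnected']}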
diff --git a/tools/stab/remote.py b/tools/stab/remote.py
new file mode 100644
index 000000000..d63021152
--- /dev/null
+++ b/tools/stab/remote.py
@@ -0,0 +1,102 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import subprocess, logging, sys
+from pathlib import Path
+
+
+class RemoteSSH():
+    """
+    Execute commands on a remote device using SSH
+
+    TODO : Use an SSH library instead of direct ssh calls
+    """
+
+ def __init__(self, user, ip, nnpkg_dir, num_threads):
+ self.base_dir = Path('/tmp/ONE')
+ self.trace_dir = 'traces'
+        self.host = f"{user}@{ip}" if user is not None else ip
+ self.nnpkg_dir = Path(nnpkg_dir).resolve()
+ self.nnpkg_name = self.nnpkg_dir.name
+ self.root_path = Path(__file__).resolve().parents[2]
+ self.num_threads = num_threads
+
+ def sync_binary(self):
+ bin_dir = self.root_path / 'Product/armv7l-linux.release/out'
+        if not bin_dir.is_dir():
+            logging.warning(f"Build dir [{bin_dir}] does not exist")
+            sys.exit(1)
+        elif not self.nnpkg_dir.is_dir():
+            logging.warning(f"nnpackage dir [{self.nnpkg_dir}] does not exist")
+            sys.exit(1)
+ else:
+ # Create temporary folder
+ subprocess.call(
+ ["ssh", f"{self.host}", "mkdir", "-p", self.base_dir / self.trace_dir])
+            # Sync ONE runtime
+ subprocess.call([
+ "rsync", "-az", "--exclude", "test-suite.tar.gz", bin_dir,
+ self.remote(self.base_dir)
+ ])
+ # Sync target nnpackage
+ subprocess.call(["rsync", "-az", self.nnpkg_dir, self.remote(self.base_dir)])
+
+ def sync_trace(self, backend):
+ remote_trace_path = self.remote_trace_path(backend)
+ local_trace_path = self.local_trace_path(backend)
+ local_trace_path.parent.mkdir(parents=True, exist_ok=True)
+ logging.debug(f"Remote trace path : {self.remote(remote_trace_path)}")
+ logging.debug(f"Local trace path : {local_trace_path}")
+ # Sync trace file
+ subprocess.call(
+ ["rsync", "-az",
+ self.remote(remote_trace_path), local_trace_path])
+
+ def profile_backend(self, backend, backend_op_list):
+ nnpkg_run_path = self.base_dir / 'out/bin/nnpackage_run'
+ nnpkg_path = self.base_dir / self.nnpkg_dir.name
+
+ cmd = ["ssh", f"{self.host}"]
+ cmd += [f"TRACE_FILEPATH={self.remote_trace_path(backend)}"]
+ for target_backend, op_list in backend_op_list.items():
+ if backend == target_backend:
+ for op in op_list:
+ cmd += [f"OP_BACKEND_{op}={backend}"]
+ cmd += [f"XNNPACK_THREADS={self.num_threads}"]
+ cmd += [f"RUY_THREADS={self.num_threads}"]
+ cmd += [f"BACKENDS=\'{';'.join(['cpu', backend])}\'"]
+ cmd += [f"{nnpkg_run_path}"]
+ cmd += [f"--nnpackage"]
+ cmd += [f"{nnpkg_path}"]
+ cmd += [f"-w5 -r50"]
+ logging.debug(f"SSH command : {' '.join(cmd)}")
+ subprocess.call(cmd)
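+        # The assembled command resembles (values illustrative):
+        #   ssh user@ip TRACE_FILEPATH=/tmp/ONE/traces/inception_ruy_4 \
+        #       OP_BACKEND_Conv2D=ruy OP_BACKEND_FullyConnected=ruy \
+        #       XNNPACK_THREADS=4 RUY_THREADS=4 BACKENDS='cpu;ruy' \
+        #       /tmp/ONE/out/bin/nnpackage_run --nnpackage /tmp/ONE/inception -w5 -r50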
+
+ def remote(self, path):
+ return f"{self.host}:{path}"
+
+ # TODO Create class for path generation
+ def trace_name(self, backend):
+ return f"{self.nnpkg_name}_{backend}_{self.num_threads}"
+
+ def remote_trace_path(self, backend):
+ return self.base_dir / self.trace_dir / self.trace_name(backend)
+
+ def local_trace_path(self, backend):
+ return Path(__file__).parent / self.trace_dir / self.trace_name(backend)
diff --git a/tools/stab/stab.py b/tools/stab/stab.py
new file mode 100644
index 000000000..7a069df5d
--- /dev/null
+++ b/tools/stab/stab.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+# Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse, logging, sys
+from backend_profiler import BackendProfiler
+from backend_scheduler import BackendScheduler
+
+
+def main(args):
+ if args.profile:
+ backend_profiler = BackendProfiler(args.user, args.ip, args.nnpackage,
+ args.num_threads)
+ backend_profiler.sync()
+ backend_profiler.profile()
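+    # With --no-profile, scheduling reuses trace files already present in
+    # ./tools/stab/traces instead of re-running the remote profiler.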
+ backend_scheduler = BackendScheduler(args.nnpackage, args.num_threads)
+ backend_scheduler.schedule()
+
+
+if __name__ == "__main__":
+ arg_parser = argparse.ArgumentParser(add_help=False)
+ required = arg_parser.add_argument_group('required arguments')
+ optional = arg_parser.add_argument_group('optional arguments')
+
+ # Add back help
+ optional.add_argument(
+ '-h',
+ '--help',
+ action='help',
+ default=argparse.SUPPRESS,
+ help='show this help message and exit')
+ required.add_argument(
+ "--nnpackage", type=str, required=True, help="nnpackage folder to profile")
+ required.add_argument(
+ "--ip", type=str, required=True, help="IP address of remote client")
+ optional.add_argument(
+ "-n",
+ "--num_threads",
+ type=int,
+ default=1,
+ help="Number of threads used by one runtime")
+ optional.add_argument("-u", "--user", type=str, help="User of remote client")
+ optional.add_argument(
+ "-v",
+ "--verbose",
+ action='store_const',
+ dest="verbose_level",
+ default=logging.INFO,
+ const=logging.DEBUG,
+ help="Print verbose message")
+ optional.add_argument(
+ "--no-profile", dest='profile', action='store_false', help="Disable profiling")
+ optional.set_defaults(profile=True)
+ args = arg_parser.parse_args()
+
+ logging.basicConfig(
+ stream=sys.stdout,
+ level=args.verbose_level,
+ format="[%(levelname).5s] %(message)s")
+
+ main(args)
diff --git a/tools/tflitefile_tool/select_operator.py b/tools/tflitefile_tool/select_operator.py
index a1aa6f263..dccb3454f 100755
--- a/tools/tflitefile_tool/select_operator.py
+++ b/tools/tflitefile_tool/select_operator.py
@@ -188,6 +188,11 @@ def GenerateQuantization(new_builder, selected_quantization):
tflite.QuantizationParameters.QuantizationParametersAddZeroPoint(
new_builder, new_zeropoint)
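+    # Preserve the channel axis used for per-channel quantization; 0 is the
+    # flatbuffer default and can be omitted.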
+ quantized_dimension = selected_quantization.QuantizedDimension()
+ if quantized_dimension != 0:
+ tflite.QuantizationParameters.QuantizationParametersAddQuantizedDimension(
+ new_builder, quantized_dimension)
+
return tflite.QuantizationParameters.QuantizationParametersEnd(new_builder)