author     Chunseok Lee <chunseok.lee@samsung.com>   2020-08-14 15:19:19 +0900
committer  Chunseok Lee <chunseok.lee@samsung.com>   2020-08-14 15:19:19 +0900
commit     042b262b3633b6c0f577aed6cb4b980ad0c1dcf3 (patch)
tree       e79fb9ffe65b21bdc5863306db2757ab187a3306
parent     05e0ec30a632339a8533082476f27bda31ccde16 (diff)
download   nnfw-042b262b3633b6c0f577aed6cb4b980ad0c1dcf3.tar.gz
           nnfw-042b262b3633b6c0f577aed6cb4b980ad0c1dcf3.tar.bz2
           nnfw-042b262b3633b6c0f577aed6cb4b980ad0c1dcf3.zip
Imported Upstream version 1.8.0    (tags: upstream/1.8.0, submit/tizen/20200814.062151)
Diffstat:
-rw-r--r-- .ahub/tcchecker-tca/config.yaml | 43
-rw-r--r-- .ctags | 1
-rw-r--r-- .gitignore | 2
-rw-r--r-- Makefile.template | 15
-rw-r--r-- compiler/.ahub/tcchecker-tca/config.yaml | 54
-rw-r--r-- compiler/bcq-tools/CMakeLists.txt | 27
-rw-r--r-- compiler/bcq-tools/README.md | 78
-rw-r--r-- compiler/bcq-tools/generate_bcq_output_arrays | 90
-rw-r--r-- compiler/bcq-tools/preserve_bcq_info | 116
-rw-r--r-- compiler/circle-quantizer/CMakeLists.txt | 1
-rw-r--r-- compiler/circle-quantizer/requires.cmake | 1
-rw-r--r-- compiler/circle-quantizer/src/CircleQuantizer.cpp | 18
-rw-r--r-- compiler/circle-tensordump/driver/Driver.cpp | 9
-rw-r--r-- compiler/circle-tensordump/src/Dump.cpp | 48
-rw-r--r-- compiler/circle-verify/src/Driver.cpp | 2
-rw-r--r-- compiler/circle2circle-dredd-recipe-test/CMakeLists.txt | 93
-rw-r--r-- compiler/circle2circle-dredd-recipe-test/requires.cmake | 4
-rw-r--r-- compiler/circle2circle-dredd-recipe-test/test.lst | 3
-rwxr-xr-x compiler/circle2circle-dredd-recipe-test/testall.sh | 13
-rw-r--r-- compiler/circle2circle/CMakeLists.txt | 2
-rw-r--r-- compiler/circle2circle/requires.cmake | 1
-rw-r--r-- compiler/circle2circle/src/Circle2Circle.cpp | 14
-rw-r--r-- compiler/circlechef/CMakeLists.txt | 4
-rw-r--r-- compiler/circlechef/circle/src/RecipeChef.cpp | 2
-rw-r--r-- compiler/circlechef/core/src/ModelChef.cpp | 1
-rw-r--r-- compiler/circlechef/proto/circlechef.proto | 1
-rw-r--r-- compiler/circlechef/tools/file/Driver.cpp | 2
-rw-r--r-- compiler/circlechef/tools/reverse/Driver.cpp | 2
-rw-r--r-- compiler/circledump/driver/Driver.cpp | 2
-rw-r--r-- compiler/circledump/src/OpPrinter.cpp | 17
-rw-r--r-- compiler/common-artifacts/CMakeLists.txt | 42
-rw-r--r-- compiler/common-artifacts/exclude.lst | 35
-rw-r--r-- compiler/common-artifacts/requires.cmake | 1
-rw-r--r-- compiler/common-artifacts/src/TestDataGenerator.cpp | 32
-rw-r--r-- compiler/hermes/src/hermes.test.cpp | 25
-rw-r--r-- compiler/locomotiv/src/Node/BiasEncode.test.cpp | 14
-rw-r--r-- compiler/locomotiv/src/Node/MatMul.test.cpp | 4
-rw-r--r-- compiler/locop/src/FormattedGraph.test.cpp | 2
-rw-r--r-- compiler/locop/src/FormattedTensorShape.test.cpp | 2
-rw-r--r-- compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h | 9
-rw-r--r-- compiler/luci-interpreter/src/core/KernelParams.h | 5
-rw-r--r-- compiler/luci-interpreter/src/kernels/Add.cpp | 5
-rw-r--r-- compiler/luci-interpreter/src/kernels/Add.test.cpp | 27
-rw-r--r-- compiler/luci-interpreter/src/kernels/CMakeLists.txt | 9
-rw-r--r-- compiler/luci-interpreter/src/kernels/DepthToSpace.cpp | 90
-rw-r--r-- compiler/luci-interpreter/src/kernels/DepthToSpace.h | 45
-rw-r--r-- compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp | 60
-rw-r--r-- compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp | 9
-rw-r--r-- compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp | 11
-rw-r--r-- compiler/luci-interpreter/src/kernels/Logistic.test.cpp | 6
-rw-r--r-- compiler/luci-interpreter/src/kernels/Reverse.cpp | 81
-rw-r--r-- compiler/luci-interpreter/src/kernels/Reverse.h | 43
-rw-r--r-- compiler/luci-interpreter/src/kernels/Reverse.test.cpp | 66
-rw-r--r-- compiler/luci-interpreter/src/kernels/Slice.cpp | 149
-rw-r--r-- compiler/luci-interpreter/src/kernels/Slice.h | 44
-rw-r--r-- compiler/luci-interpreter/src/kernels/Slice.test.cpp | 64
-rw-r--r-- compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp | 23
-rw-r--r-- compiler/luci-interpreter/src/loader/CMakeLists.txt | 7
-rw-r--r-- compiler/luci-interpreter/src/loader/GraphLoader.cpp | 25
-rw-r--r-- compiler/luci-interpreter/src/loader/GraphLoader.h | 18
-rw-r--r-- compiler/luci-interpreter/src/loader/KernelBuilder.cpp | 113
-rw-r--r-- compiler/luci-interpreter/src/loader/KernelBuilder.h | 17
-rw-r--r-- compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp | 743
-rw-r--r-- compiler/luci-interpreter/src/loader/ModuleLoader.cpp | 7
-rw-r--r-- compiler/luci-interpreter/src/loader/ModuleLoader.h | 5
-rw-r--r-- compiler/luci-value-test/CMakeLists.txt | 2
-rwxr-xr-x compiler/luci-value-test/evalverify.sh | 6
-rwxr-xr-x compiler/luci-value-test/luci_eval_verifier.py | 78
-rw-r--r-- compiler/luci-value-test/test.lst | 140
-rw-r--r-- compiler/luci-value-test/tester/src/EvalTester.cpp | 47
-rw-r--r-- compiler/luci/export/src/CircleOperationExporter.cpp | 90
-rw-r--r-- compiler/luci/export/src/CircleTensorExporter.cpp | 5
-rw-r--r-- compiler/luci/import/include/luci/Import/Nodes.h | 2
-rw-r--r-- compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h | 35
-rw-r--r-- compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h | 35
-rw-r--r-- compiler/luci/import/src/CircleReader.cpp | 2
-rw-r--r-- compiler/luci/import/src/GraphBuilderRegistry.cpp | 4
-rw-r--r-- compiler/luci/import/src/Importer.test.cpp | 7
-rw-r--r-- compiler/luci/import/src/Nodes/CircleAbs.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleAdd.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleArgMax.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleArgMin.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp | 10
-rw-r--r-- compiler/luci/import/src/Nodes/CircleBCQGather.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleBatchToSpaceND.cpp | 12
-rw-r--r-- compiler/luci/import/src/Nodes/CircleCast.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleCeil.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleConv2D.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleCos.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleDiv.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleElu.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleEqual.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleExp.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleExpandDims.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleFill.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleFloor.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleFloorDiv.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleFloorMod.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleFullyConnected.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleGather.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleGatherNd.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleGreater.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleIf.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleL2Normalize.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLess.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLessEqual.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLog.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLogicalNot.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLogicalOr.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleLogistic.cpp | 18
-rw-r--r-- compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleMaximum.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleMean.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleMinimum.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleMirrorPad.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleMul.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleNeg.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp | 123
-rw-r--r-- compiler/luci/import/src/Nodes/CircleNotEqual.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleOneHot.cpp | 16
-rw-r--r-- compiler/luci/import/src/Nodes/CirclePRelu.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CirclePad.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CirclePow.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleRange.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleRank.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleReduceAny.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleReduceMax.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleReduceMin.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleReduceProd.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleRelu.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleRelu6.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleReluN1To1.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleReshape.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleReverseSequence.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleReverseV2.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleRound.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleRsqrt.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleScatterNd.cpp | 12
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSegmentSum.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSelect.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSelectV2.cpp | 12
-rw-r--r-- compiler/luci/import/src/Nodes/CircleShape.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSin.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSlice.cpp | 6
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSoftmax.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSpaceToBatchND.cpp | 12
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSparseToDense.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSqrt.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSquare.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp | 10
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSqueeze.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleStridedSlice.cpp | 8
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSub.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleSum.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleTanh.cpp | 20
-rw-r--r-- compiler/luci/import/src/Nodes/CircleTile.cpp | 10
-rw-r--r-- compiler/luci/import/src/Nodes/CircleTopKV2.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleTranspose.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleTransposeConv.cpp | 24
-rw-r--r-- compiler/luci/import/src/Nodes/CircleUnique.cpp | 89
-rw-r--r-- compiler/luci/import/src/Nodes/CircleUnpack.cpp | 2
-rw-r--r-- compiler/luci/import/src/Nodes/CircleWhere.cpp | 4
-rw-r--r-- compiler/luci/import/src/Nodes/CircleZerosLike.cpp | 2
-rw-r--r-- compiler/luci/lang/include/luci/IR/CircleNodes.h | 5
-rw-r--r-- compiler/luci/lang/include/luci/IR/CircleNodes.lst | 7
-rw-r--r-- compiler/luci/lang/include/luci/IR/CircleQuantParam.h | 1
-rw-r--r-- compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h | 2
-rw-r--r-- compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h | 53
-rw-r--r-- compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h | 51
-rw-r--r-- compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h | 49
-rw-r--r-- compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h | 47
-rw-r--r-- compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h | 51
-rw-r--r-- compiler/luci/lang/src/Module.test.cpp | 2
-rw-r--r-- compiler/luci/lang/src/Nodes/CircleCustom.test.cpp | 7
-rw-r--r-- compiler/luci/lang/src/Nodes/CircleIf.test.cpp | 4
-rw-r--r-- compiler/luci/lang/src/Nodes/CircleNonMaxSuppressionV4.test.cpp | 96
-rw-r--r-- compiler/luci/lang/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp | 32
-rw-r--r-- compiler/luci/lang/src/Nodes/CirclePadV2.test.cpp | 86
-rw-r--r-- compiler/luci/lang/src/Nodes/CircleUnique.test.cpp | 76
-rw-r--r-- compiler/luci/lang/src/Nodes/CircleWhile.test.cpp | 4
-rw-r--r-- compiler/luci/logex/src/FormattedGraph.cpp | 41
-rw-r--r-- compiler/luci/pass/src/CircleOptimizer.cpp | 4
-rw-r--r-- compiler/luci/pass/src/FuseBCQPass.cpp | 435
-rw-r--r-- compiler/luci/pass/src/QuantizationUtils.cpp | 20
-rw-r--r-- compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp | 25
-rw-r--r-- compiler/luci/service/src/CircleShapeInferenceRule.cpp | 59
-rw-r--r-- compiler/luci/service/src/CircleTypeInferenceRule.cpp | 33
-rw-r--r-- compiler/luci/tests/test.lst | 15
-rw-r--r-- compiler/mio-tflite/CMakeLists.txt | 6
-rw-r--r-- compiler/one-cmds/CMakeLists.txt | 3
-rw-r--r-- compiler/one-cmds/how-to-prepare-virtualenv.txt | 4
-rw-r--r-- compiler/one-cmds/how-to-use-one-commands.txt | 25
-rw-r--r-- compiler/one-cmds/one-codegen | 25
-rw-r--r-- compiler/one-cmds/one-import | 25
-rw-r--r-- compiler/one-cmds/one-import-bcq | 150
-rw-r--r-- compiler/one-cmds/one-import-tf | 30
-rw-r--r-- compiler/one-cmds/one-import-tflite | 20
-rw-r--r-- compiler/one-cmds/one-optimize | 20
-rw-r--r-- compiler/one-cmds/one-pack | 32
-rw-r--r-- compiler/one-cmds/one-prepare-venv | 16
-rw-r--r-- compiler/one-cmds/one-quantize | 23
-rw-r--r-- compiler/one-cmds/requires.cmake | 2
-rw-r--r-- compiler/pota-quantization-value-test/CMakeLists.txt | 32
-rwxr-xr-x compiler/pota-quantization-value-test/compare_tensors.py | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/fake_quantization/ker.json | 44
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/bias.json | 14
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ifm.json | 2
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ker.json | 56
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ofm.json | 2
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ifm.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ofm.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/fake_quantization/ker.json | 34
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/bias.json | 12
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ifm.json | 2
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ker.json | 40
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ofm.json | 2
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ifm.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ofm.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/fake_quantization/weight.json | 76
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/bias.json | 9
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/in.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/out.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/weight.json | 80
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/record_minmax/in.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/record_minmax/out.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/fake_quantization/ker.json | 48
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ifm.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ker.json | 52
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ofm.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/record_minmax/ifm.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/record_minmax/ofm.json | 4
-rw-r--r-- compiler/pota-quantization-value-test/test.lst | 2
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/0.txt | 2
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/1.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/2.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/3.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/4.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/0.txt | 2
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/1.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/2.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/3.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/4.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/0.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/1.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/2.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/3.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/4.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/0.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/1.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/2.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/3.txt | 1
-rw-r--r-- compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/4.txt | 1
-rwxr-xr-x compiler/pota-quantization-value-test/test_record_minmax.sh | 6
-rw-r--r-- compiler/record-minmax/CMakeLists.txt | 5
-rw-r--r-- compiler/record-minmax/driver/Driver.cpp | 16
-rw-r--r-- compiler/record-minmax/requires.cmake | 1
-rw-r--r-- compiler/record-minmax/src/HDF5Importer.cpp | 1
-rw-r--r-- compiler/record-minmax/src/MinMaxObserver.cpp | 2
-rw-r--r-- compiler/record-minmax/src/RecordMinMax.cpp | 2
-rw-r--r-- compiler/record-minmax/tests/RecordFunction.test.cpp | 14
-rw-r--r-- compiler/tf2circle-value-pbtxt-remote-test/CMakeLists.txt | 1
-rw-r--r-- compiler/tf2circle-value-pbtxt-remote-test/README.md | 10
-rwxr-xr-x compiler/tf2circle-value-pbtxt-remote-test/testall.sh | 13
-rw-r--r-- compiler/tf2nnpackage-value-remote-test/CMakeLists.txt | 9
-rw-r--r-- compiler/tf2nnpackage-value-remote-test/README.md | 4
-rwxr-xr-x compiler/tf2nnpackage-value-remote-test/testall.sh | 13
-rw-r--r-- compiler/tf2tfliteV2/README.md | 6
-rwxr-xr-x compiler/tf2tfliteV2/tf2tfliteV2.py | 102
-rw-r--r-- compiler/tfl-verify/CMakeLists.txt | 1
-rw-r--r-- compiler/tfl-verify/requires.cmake | 1
-rw-r--r-- compiler/tfl-verify/src/Driver.cpp | 19
-rw-r--r-- compiler/tflchef/core/src/ModelChef.cpp | 1
-rw-r--r-- compiler/tflchef/core/src/Op/NonMaxSuppressionV4.cpp | 30
-rw-r--r-- compiler/tflchef/core/src/Op/NonMaxSuppressionV4.h | 52
-rw-r--r-- compiler/tflchef/core/src/Op/PadV2.cpp | 28
-rw-r--r-- compiler/tflchef/core/src/Op/PadV2.h | 46
-rw-r--r-- compiler/tflchef/core/src/OpChef.def | 2
-rw-r--r-- compiler/tflchef/core/src/OpChefs.h | 2
-rw-r--r-- compiler/tflchef/proto/tflchef.proto | 13
-rw-r--r-- compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.cpp | 56
-rw-r--r-- compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.h | 39
-rw-r--r-- compiler/tflchef/tflite/src/Op/PadV2.cpp | 42
-rw-r--r-- compiler/tflchef/tflite/src/Op/PadV2.h | 39
-rw-r--r-- compiler/tflchef/tflite/src/Op/TransposeConv.cpp | 4
-rw-r--r-- compiler/tflchef/tflite/src/RecipeChef.cpp | 2
-rw-r--r-- compiler/tflchef/tflite/src/TFliteOpChefs.h | 2
-rw-r--r-- compiler/tflchef/tflite/src/TFliteOpRegistry.h | 2
-rw-r--r-- compiler/tflchef/tools/file/Driver.cpp | 2
-rw-r--r-- compiler/tflchef/tools/reverse/Driver.cpp | 2
-rw-r--r-- compiler/tfldump/driver/Driver.cpp | 2
-rw-r--r-- compiler/tfldump/src/OpPrinter.cpp | 1
-rw-r--r-- compiler/tflite2circle/CMakeLists.txt | 1
-rw-r--r-- compiler/tflite2circle/driver/Driver.cpp | 17
-rw-r--r-- compiler/tflite2circle/requires.cmake | 1
-rw-r--r-- compiler/tflite2circle/src/BuildBuiltinOptions.h | 2
-rw-r--r-- compiler/tflite2circle/src/BuildBuiltinOptions/NonMaxSuppressionV4Options.cpp | 30
-rw-r--r-- compiler/tflite2circle/src/BuildBuiltinOptions/NonMaxSuppressionV4Options.h | 32
-rw-r--r-- compiler/tflite2circle/src/BuildBuiltinOptions/PadV2Options.cpp | 29
-rw-r--r-- compiler/tflite2circle/src/BuildBuiltinOptions/PadV2Options.h | 31
-rw-r--r-- compiler/tflite2circle/src/TFLBuiltinOptions.lst | 3
-rw-r--r-- compiler/vconone/CMakeLists.txt | 31
-rw-r--r-- compiler/vconone/README.md | 14
-rw-r--r-- compiler/vconone/driver/driver.cpp | 36
-rw-r--r-- compiler/vconone/include/vconone/vconone.h | 61
-rw-r--r-- compiler/vconone/src/version.cpp | 63
-rw-r--r-- compiler/vconone/src/version.test.cpp | 49
-rw-r--r-- compiler/vconone/version_cfg.h.in | 22
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h | 124
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h | 121
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h | 82
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h | 117
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h | 83
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h | 82
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h | 109
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h | 88
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h | 96
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h | 96
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h | 118
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h | 100
-rw-r--r-- compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h | 97
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h | 11
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h | 129
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h | 69
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h | 75
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h | 68
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h | 201
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h | 142
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h | 62
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h | 64
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h | 103
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h | 120
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h | 68
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h | 81
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h | 176
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h | 102
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h | 65
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h | 7
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h | 79
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h | 78
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h | 70
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h | 4
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h | 170
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h | 63
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h | 130
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h | 99
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h | 136
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h | 79
-rw-r--r-- compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h | 68
-rw-r--r-- compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp | 39
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl | 137
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl | 191
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl | 233
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl | 185
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h | 206
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h | 185
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl | 120
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl | 138
-rw-r--r-- compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl | 185
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp | 181
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp | 132
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp | 140
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp | 372
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp | 210
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp | 3
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp | 1
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp | 148
-rw-r--r-- compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp | 188
-rw-r--r-- compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp | 118
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp | 671
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp | 181
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp | 221
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp | 291
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp | 181
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp | 144
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp | 52
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp | 52
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp | 267
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp | 16
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp | 16
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp | 180
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp | 2
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp | 63
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp | 163
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp | 8
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp | 52
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp | 250
-rw-r--r-- compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp | 92
-rw-r--r-- compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp | 53
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp | 6
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp | 60
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp | 63
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp | 14
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp | 7
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp | 513
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp | 4
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp | 55
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp | 161
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp | 180
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp | 114
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp | 64
-rw-r--r-- compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp | 231
-rw-r--r-- compute/cker/CMakeLists.txt | 3
-rw-r--r-- compute/cker/include/cker/NeonTensorUtils.h | 8
-rw-r--r-- compute/cker/include/cker/PortableTensorUtils.h | 3
-rw-r--r-- compute/cker/include/cker/TensorUtils.h | 4
-rw-r--r-- compute/cker/include/cker/Types.h | 25
-rw-r--r-- compute/cker/include/cker/Utils.h | 62
-rw-r--r-- compute/cker/include/cker/operation/BatchToSpaceND.h | 133
-rw-r--r-- compute/cker/include/cker/operation/FullyConnected.h | 67
-rw-r--r-- compute/cker/include/cker/operation/Helper/PhiloxRandom.h | 276
-rw-r--r-- compute/cker/include/cker/operation/Helper/RandomDistributions.h | 778
-rw-r--r-- compute/cker/include/cker/operation/Helper/RandomOp.h | 52
-rw-r--r-- compute/cker/include/cker/operation/Helper/RandomOpCpu.h | 163
-rw-r--r-- compute/cker/include/cker/operation/L2Normalize.h | 94
-rw-r--r-- compute/cker/include/cker/operation/Logistic.h | 9
-rw-r--r-- compute/cker/include/cker/operation/MatrixBandPart.h | 6
-rw-r--r-- compute/cker/include/cker/operation/Pad.h | 15
-rw-r--r-- compute/cker/include/cker/operation/Quantize.h | 47
-rw-r--r-- compute/cker/include/cker/operation/ReLU6.h | 56
-rw-r--r-- compute/cker/include/cker/operation/Reduce.h | 86
-rw-r--r-- compute/cker/include/cker/operation/ResizeBilinear.h | 270
-rw-r--r-- compute/cker/include/cker/operation/SpaceToDepth.h | 71
-rw-r--r-- compute/cker/include/cker/operation/SplitV.h | 81
-rw-r--r-- compute/cker/include/cker/operation/StatelessRandomUniform.h | 103
-rw-r--r-- compute/cker/include/cker/ruy/RuySupport.h | 41
-rw-r--r-- docs/conf.py | 2
-rw-r--r-- docs/howto/how-to-build-runtime.md | 8
-rw-r--r-- docs/howto/how-to-use-nnfw-api.md | 4
-rw-r--r-- docs/nnfw/howto/CrossBuildForAndroid.md | 4
-rw-r--r-- docs/overview/supported-operations.md | 2
-rw-r--r-- docs/release/1.7/release-note-1.7.0.md | 46
-rw-r--r-- docs/release/1.8/release-note-1.8.0.md | 42
-rw-r--r-- docs/runtime/api-layered-arch.png | bin 0 -> 138968 bytes
-rw-r--r-- docs/runtime/api.md | 34
-rw-r--r-- docs/runtime/core.md | 4
-rw-r--r-- docs/runtime/heterogeneous-execution.md | 4
-rw-r--r-- infra/cmake/packages/ARMComputeSourceConfig.cmake | 2
-rw-r--r-- infra/cmake/packages/FarmhashSourceConfig.cmake | 2
-rw-r--r-- infra/cmake/packages/FlatBuffersConfig.cmake | 3
-rw-r--r-- infra/cmake/packages/HDF5Config.cmake | 1
-rw-r--r-- infra/cmake/packages/Pybind11Config.cmake | 22
-rw-r--r-- infra/cmake/packages/Pybind11SourceConfig.cmake | 18
-rw-r--r-- infra/cmake/packages/TensorFlowEigenSource-2.3.0/TensorFlowEigenSourceConfig.cmake | 21
-rw-r--r-- infra/cmake/packages/TensorFlowEigenSource-2.3.0/TensorFlowEigenSourceConfigVersion.cmake | 10
-rw-r--r-- infra/cmake/packages/TensorFlowSource-2.3.0/TensorFlowSourceConfig.cmake | 18
-rw-r--r-- infra/cmake/packages/TensorFlowSource-2.3.0/TensorFlowSourceConfigVersion.cmake | 10
-rw-r--r-- infra/docker/Dockerfile | 3
-rw-r--r-- infra/docker/Dockerfile.1804 | 47
-rw-r--r-- infra/nncc/CMakeLists.txt | 1
-rw-r--r-- infra/nncc/command/utcount | 2
-rw-r--r-- infra/nnfw/cmake/CfgOptionFlags.cmake | 4
-rw-r--r-- infra/nnfw/cmake/packages/EigenConfig.cmake | 2
-rw-r--r-- infra/nnfw/cmake/packages/TensorFlowLite-2.2.0Config.cmake | 92
-rw-r--r-- infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt (renamed from infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt) | 83
-rw-r--r-- infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake | 100
-rw-r--r-- infra/nnfw/config/gbs.conf | 6
-rw-r--r-- infra/packaging/build | 3
-rw-r--r-- infra/packaging/preset/20200630 | 15
-rw-r--r-- infra/packaging/preset/20200731_windows | 65
-rw-r--r-- infra/packaging/res/tf2nnpkg.20200630 | 19
-rwxr-xr-x infra/scripts/build-tcm.sh | 24
-rwxr-xr-x infra/scripts/common.sh | 47
-rw-r--r-- infra/scripts/compiler_modules.sh | 2
-rwxr-xr-x infra/scripts/docker_build_cross_aarch64_runtime.sh | 2
-rwxr-xr-x infra/scripts/docker_build_cross_arm_runtime.sh | 2
-rwxr-xr-x infra/scripts/docker_build_cross_arm_runtime_release.sh | 2
-rwxr-xr-x infra/scripts/docker_build_cross_coverage.sh | 2
-rwxr-xr-x infra/scripts/docker_build_nncc.sh | 13
-rwxr-xr-x infra/scripts/docker_build_tizen_cross.sh | 2
-rwxr-xr-x infra/scripts/docker_collect_nnpkg_resources.sh | 2
-rwxr-xr-x infra/scripts/test_arm_nnpkg.sh | 4
-rwxr-xr-x infra/scripts/test_coverage.sh | 2
-rwxr-xr-x infra/scripts/test_ubuntu_runtime.sh | 11
-rwxr-xr-x infra/scripts/test_ubuntu_runtime_mixed.sh | 34
-rwxr-xr-x infra/scripts/tizen_xu4_test.sh | 12
-rw-r--r-- nnpackage/spec/30_custom_op.md | 2
-rw-r--r-- packaging/nnapi_test_generated.tar.gz | bin 802276 -> 819008 bytes
-rw-r--r-- packaging/nnfw.spec | 26
-rw-r--r-- res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe | 26
-rw-r--r-- res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe | 44
-rw-r--r-- res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule | 3
-rw-r--r-- res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe | 61
-rw-r--r-- res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/FullyConnected_003/test.recipe | 55
-rw-r--r-- res/TensorFlowLiteRecipes/FullyConnected_003/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe | 22
-rw-r--r-- res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe | 19
-rw-r--r-- res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/Net_TConv_BN_000/test.recipe | 149
-rw-r--r-- res/TensorFlowLiteRecipes/ResizeBilinear_U8_000/test.recipe | 32
-rw-r--r-- res/TensorFlowLiteRecipes/ResizeBilinear_U8_000/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/SpaceToDepth_U8_000/test.recipe | 22
-rw-r--r-- res/TensorFlowLiteRecipes/SpaceToDepth_U8_000/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe | 2
-rw-r--r-- res/TensorFlowLiteRecipes/TransposeConv_001/test.recipe | 45
-rw-r--r-- res/TensorFlowLiteRecipes/TransposeConv_001/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_000/test.recipe | 27
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_000/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_001/test.recipe | 27
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_001/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_002/test.recipe | 27
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_002/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_003/test.recipe | 27
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_003/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe | 28
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe | 28
-rw-r--r-- res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse | 0
-rw-r--r-- res/TensorFlowLiteSchema/2.3.0/schema.fbs | 1094
-rw-r--r-- res/TensorFlowLiteSchema/SCHEMA.lst | 1
-rw-r--r-- res/TensorFlowPythonExamples/examples/while_2/__init__.py | 32
-rw-r--r-- res/TensorFlowPythonExamples/examples/while_3/__init__.py | 33
-rw-r--r-- res/TensorFlowPythonModels/examples/tconv-bn/__init__.py | 27
-rw-r--r-- runtime/contrib/android/api/build.gradle | 2
-rw-r--r-- runtime/contrib/android_benchmark_app/README.md | 2
-rw-r--r-- runtime/libs/benchmark/CMakeLists.txt | 3
-rw-r--r-- runtime/libs/benchmark/src/Result.cpp | 2
-rw-r--r-- runtime/libs/misc/include/misc/polymorphic_downcast.h | 2
-rw-r--r-- runtime/nnapi-header/include/NeuralNetworksEx.h | 21
-rw-r--r-- runtime/onert/api/CMakeLists.txt | 6
-rw-r--r-- runtime/onert/api/include/nnfw.h | 20
-rw-r--r-- runtime/onert/api/include/nnfw_experimental.h (renamed from runtime/onert/api/include/nnfw_dev.h) | 6
-rw-r--r-- runtime/onert/api/include/nnfw_internal.h (renamed from runtime/onert/api/include/nnfw_debug.h) | 18
-rw-r--r-- runtime/onert/api/include/nnfw_version.h | 2
-rw-r--r-- runtime/onert/api/src/CustomKernel.h | 2
-rw-r--r-- runtime/onert/api/src/nnfw_api.cc | 13
-rw-r--r-- runtime/onert/api/src/nnfw_api_internal.cc | 77
-rw-r--r-- runtime/onert/api/src/nnfw_api_internal.h | 8
-rw-r--r-- runtime/onert/backend/acl_cl/KernelGenerator.cc | 880
-rw-r--r-- runtime/onert/backend/acl_common/AclKernelGen.h | 315
-rw-r--r-- runtime/onert/backend/acl_neon/KernelGenerator.cc | 856
-rw-r--r-- runtime/onert/backend/cpu/Backend.h | 10
-rw-r--r-- runtime/onert/backend/cpu/BackendContext.h | 58
-rw-r--r-- runtime/onert/backend/cpu/CMakeLists.txt | 4
-rw-r--r-- runtime/onert/backend/cpu/ConstantInitializer.cc | 35
-rw-r--r-- runtime/onert/backend/cpu/ConstantInitializer.h | 9
-rw-r--r-- runtime/onert/backend/cpu/ExternalContext.h | 64
-rw-r--r-- runtime/onert/backend/cpu/KernelGenerator.cc | 622
-rw-r--r-- runtime/onert/backend/cpu/KernelGenerator.h | 13
-rw-r--r-- runtime/onert/backend/cpu/StaticTensorManager.cc | 106
-rw-r--r-- runtime/onert/backend/cpu/StaticTensorManager.h | 64
-rw-r--r-- runtime/onert/backend/cpu/Tensor.h | 18
-rw-r--r-- runtime/onert/backend/cpu/TensorBuilder.cc | 20
-rw-r--r-- runtime/onert/backend/cpu/TensorBuilder.h | 13
-rw-r--r-- runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc | 83
-rw-r--r-- runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.h | 59
-rw-r--r-- runtime/onert/backend/cpu/ops/CompareLayer.cc | 238
-rw-r--r-- runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc | 85
-rw-r--r-- runtime/onert/backend/cpu/ops/FullyConnectedLayer.h | 9
-rw-r--r-- runtime/onert/backend/cpu/ops/L2NormLayer.cc | 71
-rw-r--r-- runtime/onert/backend/cpu/ops/L2NormLayer.h | 55
-rw-r--r-- runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc | 4
-rw-r--r-- runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h | 7
-rw-r--r-- runtime/onert/backend/cpu/ops/OperationUtils.h | 11
-rw-r--r-- runtime/onert/backend/cpu/ops/PadLayer.cc | 25
-rw-r--r-- runtime/onert/backend/cpu/ops/PadLayer.h | 8
-rw-r--r-- runtime/onert/backend/cpu/ops/QuantizeLayer.cc | 63
-rw-r--r-- runtime/onert/backend/cpu/ops/QuantizeLayer.h | 56
-rw-r--r-- runtime/onert/backend/cpu/ops/ReLU6Layer.cc | 74
-rw-r--r-- runtime/onert/backend/cpu/ops/ReLU6Layer.h | 57
-rw-r--r-- runtime/onert/backend/cpu/ops/ReduceLayer.cc | 38
-rw-r--r-- runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc | 87
-rw-r--r-- runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h | 58
-rw-r--r-- runtime/onert/backend/cpu/ops/SliceLayer.cc | 16
-rw-r--r-- runtime/onert/backend/cpu/ops/SliceLayer.h | 3
-rw-r--r-- runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc | 74
-rw-r--r-- runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h | 54
-rw-r--r-- runtime/onert/backend/cpu/ops/SplitVLayer.cc | 99
-rw-r--r-- runtime/onert/backend/cpu/ops/SplitVLayer.h | 60
-rw-r--r-- runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc | 68
-rw-r--r-- runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.h | 59
-rw-r--r-- runtime/onert/core/include/backend/BackendContext.h | 2
-rw-r--r-- runtime/onert/core/include/backend/IExternalContext.h | 34
-rw-r--r-- runtime/onert/core/include/backend/IPortableTensor.h | 3
-rw-r--r-- runtime/onert/core/include/backend/ITensor.h | 11
-rw-r--r-- runtime/onert/core/include/backend/ITensorBuilder.h | 4
-rw-r--r-- runtime/onert/core/include/backend/ITensorRegistry.h | 68
-rw-r--r-- runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h | 4
-rw-r--r-- runtime/onert/core/include/backend/cpu_common/Tensor.h | 50
-rw-r--r-- runtime/onert/core/include/compiler/StaticShapeInference.h | 2
-rw-r--r-- runtime/onert/core/include/exec/DynamicShapeInference.h | 3
-rw-r--r-- runtime/onert/core/include/ir/Operand.h | 8
-rw-r--r-- runtime/onert/core/include/ir/Operations.Include.h | 3
-rw-r--r-- runtime/onert/core/include/ir/Operations.lst | 3
-rw-r--r-- runtime/onert/core/include/ir/TypeInfo.h | 17
-rw-r--r-- runtime/onert/core/include/ir/operation/BatchToSpaceND.h | 3
-rw-r--r-- runtime/onert/core/include/ir/operation/LogSoftmax.h | 2
-rw-r--r-- runtime/onert/core/include/ir/operation/Pad.h | 2
-rw-r--r-- runtime/onert/core/include/ir/operation/Quantize.h | 49
-rw-r--r-- runtime/onert/core/include/ir/operation/ResizeBilinear.h | 4
-rw-r--r-- runtime/onert/core/include/ir/operation/SplitV.h | 59
-rw-r--r-- runtime/onert/core/include/ir/operation/StatelessRandomUniform.h | 52
-rw-r--r-- runtime/onert/core/include/util/ShapeInference.h | 3
-rw-r--r-- runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc | 17
-rw-r--r-- runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h | 1
-rw-r--r-- runtime/onert/core/src/backend/controlflow/KernelGenerator.cc | 22
-rw-r--r-- runtime/onert/core/src/backend/controlflow/TensorBuilder.cc | 6
-rw-r--r-- runtime/onert/core/src/backend/controlflow/UserTensor.h | 13
-rw-r--r-- runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc | 12
-rw-r--r-- runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc | 31
-rw-r--r-- runtime/onert/core/src/compiler/ExecutorFactory.cc | 40
-rw-r--r-- runtime/onert/core/src/compiler/ExecutorFactory.h | 3
-rw-r--r-- runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc | 14
-rw-r--r-- runtime/onert/core/src/compiler/HEScheduler.cc | 5
-rw-r--r-- runtime/onert/core/src/compiler/HEScheduler.h | 10
-rw-r--r-- runtime/onert/core/src/compiler/Linear.cc | 2
-rw-r--r-- runtime/onert/core/src/compiler/OperandContext.h | 55
-rw-r--r-- runtime/onert/core/src/compiler/OperationValidator.cc | 171
-rw-r--r-- runtime/onert/core/src/compiler/OperationValidator.h | 5
-rw-r--r-- runtime/onert/core/src/compiler/StaticShapeInference.cc | 34
-rw-r--r-- runtime/onert/core/src/compiler/TensorBuilders.h | 12
-rw-r--r-- runtime/onert/core/src/exec/DynamicShapeInference.cc | 95
-rw-r--r-- runtime/onert/core/src/exec/ExecutorBase.cc | 4
-rw-r--r-- runtime/onert/core/src/exec/ExecutorBase.h | 1
-rw-r--r-- runtime/onert/core/src/interp/Tensor.h | 1
-rw-r--r-- runtime/onert/core/src/interp/operations/Pad.cc | 4
-rw-r--r-- runtime/onert/core/src/ir/Graph.cc | 2
-rw-r--r-- runtime/onert/core/src/ir/LoweredGraph.cc | 19
-rw-r--r-- runtime/onert/core/src/ir/Operand.cc | 15
-rw-r--r-- runtime/onert/core/src/ir/OperationDumper.cc | 9
-rw-r--r-- runtime/onert/core/src/ir/OperationDumper.h | 1
-rw-r--r-- runtime/onert/core/src/ir/operation/BatchToSpaceND.cc | 2
-rw-r--r-- runtime/onert/core/src/ir/operation/Quantize.cc | 37
-rw-r--r-- runtime/onert/core/src/ir/operation/SplitV.cc | 33
-rw-r--r-- runtime/onert/core/src/ir/operation/StatelessRandomUniform.cc (renamed from runtime/onert/core/src/compiler/OperandContext.cc) | 30
-rw-r--r-- runtime/onert/core/src/ir/pass/ConstantInsertionPass.cc | 4
-rw-r--r-- runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc | 248
-rw-r--r-- runtime/onert/core/src/ir/pass/PermutationEliminationPass.h | 69
-rw-r--r-- runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc | 30
-rw-r--r-- runtime/onert/core/src/ir/pass/PermutationInsertionPass.h | 3
-rw-r--r-- runtime/onert/core/src/ir/pass/PermutationOperationPass.cc | 10
-rw-r--r-- runtime/onert/core/src/ir/verifier/Verifier.cc | 52
-rw-r--r-- runtime/onert/core/src/ir/verifier/Verifier.h | 6
-rw-r--r-- runtime/onert/core/src/util/EventCollector.cc | 9
-rw-r--r-- runtime/onert/core/src/util/EventRecorder.h | 3
-rw-r--r-- runtime/onert/core/src/util/ShapeInference.cc | 14
-rw-r--r-- runtime/onert/frontend/base_loader/include/base_loader.h | 239
-rw-r--r-- runtime/onert/frontend/circle/CMakeLists.txt | 1
-rw-r--r-- runtime/onert/frontend/circle/include/circle_loader.h | 1
-rw-r--r-- runtime/onert/frontend/circle/src/circle_loader.cc | 66
-rw-r--r-- runtime/onert/frontend/circle_schema/CMakeLists.txt | 7
-rw-r--r-- runtime/onert/frontend/circle_schema/include/circle_schema_generated.h (renamed from runtime/onert/frontend/circle/src/circle_schema_generated.h) | 0
-rw-r--r-- runtime/onert/frontend/nnapi/model.cc | 2
-rw-r--r-- runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc | 646
-rw-r--r-- runtime/onert/frontend/tflite/src/tflite_loader.cc | 2
-rw-r--r-- runtime/onert/sample/minimal/CMakeLists.txt | 6
-rw-r--r-- runtime/onert/sample/minimal/src/minimal.cc | 2
-rw-r--r-- runtime/onert/test/core/exec/ExecInstance.cc | 94
-rw-r--r-- runtime/onert/test/graph/operand/UseDef.cc | 10
-rw-r--r-- tests/custom_op/FillFrom/CMakeLists.txt | 6
-rw-r--r-- tests/custom_op/FillFrom/FillFrom_runner.cc | 2
-rw-r--r-- tests/custom_op/FillFrom/kernels/FillFromKernel.cc | 2
-rw-r--r-- tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl | 30
-rw-r--r-- tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon | 31
-rw-r--r-- tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu | 22
-rw-r--r-- tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl | 30
-rw-r--r-- tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon | 31
-rw-r--r-- tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu | 22
-rw-r--r-- tests/nnapi/nnapi_gtest.skip.noarch.interp | 29
-rw-r--r-- tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu | 22
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_1D_float_1_nnfw.mod.py | 47
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_1D_float_2_nnfw.mod.py | 25
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_1D_int32_nnfw.mod.py | 47
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_4D_float_1_nnfw.mod.py | 28
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_4D_float_2_nnfw.mod.py | 27
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_4D_float_3_nnfw.mod.py | 28
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_4D_float_4_nnfw.mod.py | 32
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_4D_int32_1_nnfw.mod.py | 27
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_4D_int32_2_nnfw.mod.py | 28
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_4D_int32_3_nnfw.mod.py | 28
-rw-r--r-- tests/nnapi/specs/Ex/split_v_ex_4D_int32_4_nnfw.mod.py | 28
-rw-r--r-- tests/nnapi/specs/Ex/stateless_random_uniform_ex_nnfw.mod.py | 40
-rw-r--r-- tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py | 30
-rw-r--r-- tests/nnapi/specs/V1_0/resize_bilinear_quant8_nnfw.mod.py | 18
-rw-r--r-- tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py) | 0
-rw-r--r-- tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py) | 0
-rw-r--r-- tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py) | 0
-rw-r--r-- tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py) | 0
-rw-r--r-- tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py) | 0
-rw-r--r-- tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py (renamed from tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py) | 0
-rw-r--r-- tests/nnapi/specs/V1_2/quantize.mod.py (renamed from tests/nnapi/specs/skip/V1_2/quantize.mod.py) | 0
-rw-r--r-- tests/nnfw_api/CMakeLists.txt | 1
-rw-r--r-- tests/nnfw_api/README.md | 2
-rw-r--r-- tests/nnfw_api/src/CircleGen.h | 201
-rw-r--r-- tests/nnfw_api/src/GenModelTests.cc | 152
-rw-r--r-- tests/nnfw_api/src/ModelTestDynamicTensor.cc | 86
-rw-r--r-- tests/nnfw_api/src/ModelTestInputReshaping.cc | 16
-rw-r--r-- tests/nnfw_api/src/RegressionTests.cc | 20
-rw-r--r-- tests/nnfw_api/src/ValidationTestAddModelLoaded.cc | 38
-rw-r--r-- tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc | 36
-rw-r--r-- tests/nnfw_api/src/ValidationTestFourAddModelsSetInput.cc | 10
-rw-r--r-- tests/nnfw_api/src/ValidationTestSessionCreated.cc | 34
-rw-r--r-- tests/nnfw_api/src/ValidationTestSingleSession.cc | 6
-rw-r--r-- tests/nnfw_api/src/fixtures.h | 2
-rw-r--r-- tests/scripts/CMakeLists.txt | 31
-rwxr-xr-x tests/scripts/benchmark_nnapi.sh | 23
-rw-r--r-- [-rwxr-xr-x] tests/scripts/command/nnpkg-test (renamed from tests/scripts/nnpkg_test.sh) | 16
-rw-r--r-- tests/scripts/command/prepare-model | 64
-rw-r--r-- [-rwxr-xr-x] tests/scripts/command/unittest (renamed from tests/scripts/unittest.sh) | 43
-rw-r--r-- tests/scripts/command/verify-tflite | 106
-rwxr-xr-x tests/scripts/common.sh | 11
-rw-r--r-- tests/scripts/list/frameworktest_list.aarch64.acl_cl.txt | 2
-rw-r--r-- tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt | 2
-rw-r--r-- tests/scripts/list/tflite_loader_list.aarch64.txt | 2
-rw-r--r-- tests/scripts/list/tflite_loader_list.armv7l.txt | 2
-rwxr-xr-x tests/scripts/models/config/MODELS/inception_module/config.sh (renamed from tests/scripts/framework/tests/MODELS/inception_module/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/MODELS/inception_nonslim/config.sh (renamed from tests/scripts/framework/tests/MODELS/inception_nonslim/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/MODELS/inception_slim/config.sh (renamed from tests/scripts/framework/tests/MODELS/inception_slim/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/MODELS/mobilenet/config.sh (renamed from tests/scripts/framework/tests/MODELS/mobilenet/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/MODELS/mobilenet_quant8/config.sh (renamed from tests/scripts/framework/tests/MODELS/mobilenet_quant8/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/abs/config.sh (renamed from tests/scripts/framework/tests/abs/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/add/1D/config.sh (renamed from tests/scripts/framework/tests/add/1D/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/add/4D/config.sh (renamed from tests/scripts/framework/tests/add/4D/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/average_pool_2d/aligned/config.sh (renamed from tests/scripts/framework/tests/average_pool_2d/aligned/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/average_pool_2d/avgpool1/config.sh (renamed from tests/scripts/framework/tests/average_pool_2d/avgpool1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/average_pool_2d/avgpool2/config.sh (renamed from tests/scripts/framework/tests/average_pool_2d/avgpool2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/batch_to_space_nd2/config.sh (renamed from tests/scripts/framework/tests/batch_to_space_nd2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/cast/config.sh (renamed from tests/scripts/framework/tests/cast/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/concat/2D/config.sh (renamed from tests/scripts/framework/tests/concat/2D/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/concat/concat1/config.sh (renamed from tests/scripts/framework/tests/concat/concat1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/concat/concat2/config.sh (renamed from tests/scripts/framework/tests/concat/concat2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/conv_2d/convolution1/config.sh (renamed from tests/scripts/framework/tests/conv_2d/convolution1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/conv_2d/convolution2/config.sh (renamed from tests/scripts/framework/tests/conv_2d/convolution2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/custom/squared_difference/config.sh (renamed from tests/scripts/framework/tests/custom/squared_difference/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/depthwise_conv_2d/depthconv1/config.sh (renamed from tests/scripts/framework/tests/depthwise_conv_2d/depthconv1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/depthwise_conv_2d/depthconv2/config.sh (renamed from tests/scripts/framework/tests/depthwise_conv_2d/depthconv2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/depthwise_conv_2d_no_fuse/config.sh (renamed from tests/scripts/framework/tests/depthwise_conv_2d_no_fuse/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/div/broadcast/config.sh (renamed from tests/scripts/framework/tests/div/broadcast/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/embedding_lookup/config.sh (renamed from tests/scripts/framework/tests/embedding_lookup/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/equal/config.sh (renamed from tests/scripts/framework/tests/equal/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/exp/config.sh (renamed from tests/scripts/framework/tests/exp/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/floor/floor1/config.sh (renamed from tests/scripts/framework/tests/floor/floor1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/floor/floor2/config.sh (renamed from tests/scripts/framework/tests/floor/floor2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/fullyconnected/fc1/config.sh (renamed from tests/scripts/framework/tests/fullyconnected/fc1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/fullyconnected/hybrid/config.sh (renamed from tests/scripts/framework/tests/fullyconnected/hybrid/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/fullyconnected/matmul2x2/config.sh (renamed from tests/scripts/framework/tests/fullyconnected/matmul2x2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/fullyconnected/weights_as_input/config.sh (renamed from tests/scripts/framework/tests/fullyconnected/weights_as_input/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/gather/config.sh (renamed from tests/scripts/framework/tests/gather/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/greater/config.sh (renamed from tests/scripts/framework/tests/greater/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/greater_equal/config.sh (renamed from tests/scripts/framework/tests/greater_equal/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/hashtable_lookup/config.sh (renamed from tests/scripts/framework/tests/hashtable_lookup/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/l2_normalization/config.sh (renamed from tests/scripts/framework/tests/l2_normalization/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/l2_pool_2d/config.sh (renamed from tests/scripts/framework/tests/l2_pool_2d/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/less/config.sh (renamed from tests/scripts/framework/tests/less/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/less_equal/config.sh (renamed from tests/scripts/framework/tests/less_equal/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/logistic/config.sh (renamed from tests/scripts/framework/tests/logistic/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/max/config.sh (renamed from tests/scripts/framework/tests/max/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/max_pool_2d/maxpool1/config.sh (renamed from tests/scripts/framework/tests/max_pool_2d/maxpool1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/max_pool_2d/maxpool2/config.sh (renamed from tests/scripts/framework/tests/max_pool_2d/maxpool2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/mean/config.sh (renamed from tests/scripts/framework/tests/mean/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/min/config.sh (renamed from tests/scripts/framework/tests/min/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/mul/broadcast/config.sh (renamed from tests/scripts/framework/tests/mul/broadcast/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/neg/config.sh (renamed from tests/scripts/framework/tests/neg/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/not_equal/config.sh (renamed from tests/scripts/framework/tests/not_equal/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/one_hot/config.sh (renamed from tests/scripts/framework/tests/one_hot/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/pack/config.sh (renamed from tests/scripts/framework/tests/pack/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/pad/4D_2D/config.sh (renamed from tests/scripts/framework/tests/pad/4D_2D/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/pad/pad1/config.sh (renamed from tests/scripts/framework/tests/pad/pad1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/pad/pad2/config.sh (renamed from tests/scripts/framework/tests/pad/pad2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/reduce_max/config.sh (renamed from tests/scripts/framework/tests/reduce_max/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/reduce_mean/test1/config.sh (renamed from tests/scripts/framework/tests/reduce_mean/test1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/reduce_mean/test2/config.sh (renamed from tests/scripts/framework/tests/reduce_mean/test2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/reduce_sum/float/config.sh (renamed from tests/scripts/framework/tests/reduce_sum/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/reduce_sum/uint8/config.sh | 1
-rwxr-xr-x tests/scripts/models/config/relu/config.sh (renamed from tests/scripts/framework/tests/relu/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/relu6/config.sh (renamed from tests/scripts/framework/tests/relu6/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/reshape/3D/config.sh (renamed from tests/scripts/framework/tests/reshape/3D/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/reshape/reshape1/config.sh (renamed from tests/scripts/framework/tests/reshape/reshape1/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/reshape/reshape2/config.sh (renamed from tests/scripts/framework/tests/reshape/reshape2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/resize_bilinear/config.sh (renamed from tests/scripts/framework/tests/resize_bilinear/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/rnn/config.sh (renamed from tests/scripts/framework/tests/rnn/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/rsqrt/config.sh (renamed from tests/scripts/framework/tests/rsqrt/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/select/config.sh (renamed from tests/scripts/framework/tests/select/config.sh) | 0
-rw-r--r-- tests/scripts/models/config/shape/config.sh (renamed from tests/scripts/framework/tests/shape/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/sin/config.sh (renamed from tests/scripts/framework/tests/sin/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/slice/config.sh (renamed from tests/scripts/framework/tests/slice/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/softmax/config.sh (renamed from tests/scripts/framework/tests/softmax/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/space_to_batch_nd2/config.sh (renamed from tests/scripts/framework/tests/space_to_batch_nd2/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/space_to_depth/config.sh (renamed from tests/scripts/framework/tests/space_to_depth/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/sqrt/config.sh (renamed from tests/scripts/framework/tests/sqrt/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/squeeze/config.sh (renamed from tests/scripts/framework/tests/squeeze/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/strided_slice/config.sh (renamed from tests/scripts/framework/tests/strided_slice/config.sh) | 0
-rwxr-xr-x tests/scripts/models/config/sub/broadcast/config.sh (renamed from tests/scripts/framework/tests/sub/broadcast/config.sh) | 0
-rwxr-xr-xtests/scripts/models/config/tanh/config.sh (renamed from tests/scripts/framework/tests/tanh/config.sh)0
-rw-r--r--tests/scripts/models/config/tile/config.sh (renamed from tests/scripts/framework/tests/tile/config.sh)0
-rwxr-xr-xtests/scripts/models/config/topk_v2/config.sh (renamed from tests/scripts/framework/tests/topk_v2/config.sh)0
-rwxr-xr-xtests/scripts/models/config/transpose/config.sh (renamed from tests/scripts/framework/tests/transpose/config.sh)0
-rwxr-xr-xtests/scripts/models/config/transpose_conv/same/config.sh (renamed from tests/scripts/framework/tests/transpose_conv/same/config.sh)0
-rwxr-xr-xtests/scripts/models/config/transpose_conv/valid/config.sh (renamed from tests/scripts/framework/tests/transpose_conv/valid/config.sh)0
-rwxr-xr-xtests/scripts/models/config/zeros_like/config.sh (renamed from tests/scripts/framework/tests/zeros_like/config.sh)0
-rwxr-xr-xtests/scripts/models/run_test.sh (renamed from tests/scripts/framework/run_test.sh)79
-rw-r--r--tests/scripts/onert-test49
-rwxr-xr-xtests/scripts/test-driver.sh37
-rwxr-xr-xtests/scripts/test_framework.sh10
-rwxr-xr-xtests/scripts/test_scheduler_with_profiling.sh2
-rw-r--r--tests/tools/nnpackage_run/CMakeLists.txt2
-rw-r--r--tests/tools/nnpackage_run/src/args.cc246
-rw-r--r--tests/tools/nnpackage_run/src/h5formatter.cc8
-rw-r--r--tests/tools/nnpackage_run/src/nnpackage_run.cc2
-rw-r--r--tests/tools/tflite_loader/CMakeLists.txt2
-rw-r--r--tests/tools/tflite_run/CMakeLists.txt2
-rw-r--r--tests/tools/tflite_run/src/args.cc147
-rw-r--r--tests/tools/tflite_run_2_2_0/CMakeLists.txt23
-rw-r--r--tests/tools/tflite_vanilla_run/CMakeLists.txt23
-rw-r--r--tests/tools/tflite_vanilla_run/src/args.cc (renamed from tests/tools/tflite_run_2_2_0/src/args.cc)4
-rw-r--r--tests/tools/tflite_vanilla_run/src/args.h (renamed from tests/tools/tflite_run_2_2_0/src/args.h)10
-rw-r--r--tests/tools/tflite_vanilla_run/src/tensor_view.h (renamed from tests/tools/tflite_run_2_2_0/src/tensor_view.h)10
-rw-r--r--tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc (renamed from tests/tools/tflite_run_2_2_0/src/tflite_run_2_2_0.cc)8
-rwxr-xr-xtools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh5
-rw-r--r--tools/nnpackage_tool/sth2nnpkgtc/pb2nnpkgtc.md2
-rw-r--r--tools/nnpackage_tool/sth2nnpkgtc/tflite2nnpkgtc.md2
-rwxr-xr-xtools/tflitefile_tool/select_operator.py21
-rw-r--r--tools/tflkit/README.md12
-rw-r--r--tools/update_version/update-version11
852 files changed, 17904 insertions, 15233 deletions
diff --git a/.ahub/tcchecker-tca/config.yaml b/.ahub/tcchecker-tca/config.yaml
new file mode 100644
index 000000000..cd34d792f
--- /dev/null
+++ b/.ahub/tcchecker-tca/config.yaml
@@ -0,0 +1,43 @@
+version: 2
+test:
+ - name: NN Runtime
+ testCaseLanguage: CPP
+ testFW: GTEST
+ testCaseFolder:
+ - ./compute/test/cker
+ - ./runtime/onert/core/src/backend/cpu_common
+ - ./runtime/onert/frontend/nnapi
+ - ./runtime/onert/test/core/compiler
+ - ./runtime/onert/test/core/exec
+ - ./runtime/onert/test/core/interp
+ - ./runtime/onert/test/graph
+ - ./runtime/onert/test/graph/operand
+ - ./runtime/onert/test/graph/operation
+ - ./runtime/onert/test/graph/verifier
+ - ./runtime/onert/test/ir
+ - ./runtime/onert/test/util
+ - ./tests/nnapi/src
+ - ./tests/nnfw_api/src
+ - ./tests/tools/tflite_run/src
+
+ testFile:
+ - extension: cpp
+ any: true
+ - extension: cc
+ any: true
+
+ testCase:
+ - condition:
+ - functionName:
+ starts:
+ - TEST
+
+ negativeTestCase:
+ - condition:
+ - testName:
+ starts:
+ - neg_
+
+ positiveTestCase:
+ - condition:
+ - inverse: negativeTestCase
diff --git a/.ctags b/.ctags
index 13c27abbe..8815f1fc6 100644
--- a/.ctags
+++ b/.ctags
@@ -3,5 +3,6 @@
--exclude=build
--exclude=tags
--exclude=tests/scripts/framework/cache
+--exclude=tests/scripts/models/cache
--exclude=tools/cross/rootfs
--exclude=doxygen
diff --git a/.gitignore b/.gitignore
index d0931912a..263856b3f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -8,7 +8,7 @@
*.pyc
# Test cache for model download
-/tests/scripts/framework/cache
+/tests/scripts/**/cache
# Test report
/report
diff --git a/Makefile.template b/Makefile.template
index 6c919f3cb..a21937dea 100644
--- a/Makefile.template
+++ b/Makefile.template
@@ -108,7 +108,9 @@ build: build_internal
install: $(TIMESTAMP_INSTALL)
-create_tar: runtime_tar_internal
+create_package: runtime_tar_internal
+
+create_acl_tar: acl_tar_internal
clean:
rm -rf $(WORKSPACE)
@@ -149,10 +151,13 @@ install_internal:
touch $(TIMESTAMP_INSTALL)
runtime_tar_internal: $(TIMESTAMP_BUILD) install_internal
- tar -zcf nnfw-package.tar.gz -C $(INSTALL_PATH) lib
- tar -zcf nnfw-dev-package.tar.gz -C $(INSTALL_PATH) include/nnfw
- tar -zcf nnfw-internal-dev-package.tar.gz -C $(INSTALL_PATH) include/onert
- mv nnfw-*package.tar.gz $(INSTALL_PATH)/.
+ tar -zcf $(WORKSPACE)/nnfw-package.tar.gz -C $(INSTALL_PATH) lib
+ tar -zcf $(WORKSPACE)/nnfw-devel-package.tar.gz -C $(INSTALL_PATH) include/nnfw
+ tar -zcf $(WORKSPACE)/nnfw-plugin-devel-package.tar.gz -C $(INSTALL_PATH) include/onert
+ tar -zcf $(WORKSPACE)/nnfw-test-package.tar.gz -C ${INSTALL_PATH} bin test unittest unittest_standalone
+
+acl_tar_internal: $(BUILD_FOLDER)
+ tar -zcf $(WORKSPACE)/nnfw-acl.tar.gz -C ${OVERLAY_FOLDER} lib
install_internal_acl:
# Workaround to install acl for test (ignore error when there is no file to copy)
diff --git a/compiler/.ahub/tcchecker-tca/config.yaml b/compiler/.ahub/tcchecker-tca/config.yaml
new file mode 100644
index 000000000..ef681de1a
--- /dev/null
+++ b/compiler/.ahub/tcchecker-tca/config.yaml
@@ -0,0 +1,54 @@
+version: 2
+test:
+ - name: NN Compiler
+ testCaseLanguage: CPP
+ testFW: GTEST
+ testCaseFolder:
+ - ./angkor
+ - ./arser
+ - ./circle2circle
+ - ./circle-quantizer
+ - ./cwrap
+ - ./foder
+ - ./hermes
+ - ./hermes-std
+ - ./loco
+ - ./locomotiv
+ - ./locop
+ - ./logo
+ - ./logo-core
+ - ./luci
+ - ./luci-interpreter
+ - ./luci-value-test
+ - ./mio-circle
+ - ./mio-tflite
+ - ./oops
+ - ./pepper-assert
+ - ./pepper-str
+ - ./pepper-strcast
+ - ./pp
+ - ./record-minmax
+ - ./safemain
+ - ./souschef
+ - ./stdex
+ - ./tflite2circle
+
+ testFile:
+ - extension: .test.cpp
+ any: true
+
+ testCase:
+ - condition:
+ - functionName:
+ starts:
+ - TEST
+
+ negativeTestCase:
+ - condition:
+ - testName:
+ ends:
+ - _NEG
+
+ positiveTestCase:
+ - condition:
+ - inverse: negativeTestCase
diff --git a/compiler/bcq-tools/CMakeLists.txt b/compiler/bcq-tools/CMakeLists.txt
new file mode 100644
index 000000000..fcf01de7d
--- /dev/null
+++ b/compiler/bcq-tools/CMakeLists.txt
@@ -0,0 +1,27 @@
+set(BCQ_TOOLS_FILES
+ generate_bcq_output_arrays
+ preserve_bcq_info
+)
+
+foreach(BCQ_TOOLS IN ITEMS ${BCQ_TOOLS_FILES})
+
+ set(BCQ_TOOLS_FILE ${BCQ_TOOLS})
+ set(BCQ_TOOLS_SRC "${CMAKE_CURRENT_SOURCE_DIR}/${BCQ_TOOLS_FILE}")
+ set(BCQ_TOOLS_BIN "${CMAKE_CURRENT_BINARY_DIR}/${BCQ_TOOLS_FILE}")
+ set(BCQ_TOOLS_TARGET "${BCQ_TOOLS}_target")
+
+ add_custom_command(OUTPUT ${BCQ_TOOLS_BIN}
+ COMMAND ${CMAKE_COMMAND} -E copy "${BCQ_TOOLS_SRC}" "${BCQ_TOOLS_BIN}"
+ DEPENDS ${BCQ_TOOLS_SRC}
+ COMMENT "Generate ${BCQ_TOOLS_BIN}"
+ )
+
+ add_custom_target(${BCQ_TOOLS_TARGET} ALL DEPENDS ${BCQ_TOOLS_BIN})
+
+ install(FILES ${BCQ_TOOLS_BIN}
+ PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
+ GROUP_READ GROUP_EXECUTE
+ WORLD_READ WORLD_EXECUTE
+ DESTINATION bin)
+
+endforeach(BCQ_TOOLS)
diff --git a/compiler/bcq-tools/README.md b/compiler/bcq-tools/README.md
new file mode 100644
index 000000000..18b0f4826
--- /dev/null
+++ b/compiler/bcq-tools/README.md
@@ -0,0 +1,78 @@
+# BCQ Tools
+
+This directory includes some tools related to BCQ.
+
+## preserve_bcq_info
+
+### Purpose
+
+`preserve_bcq_info` is for preserving constant nodes which include BCQ information.
+When a `.pb` file is converted to a `.tflite` file by the TFLite converter, constant nodes whose values are exactly the same are deduplicated and linked to a single representative node.
+This makes it impossible to know which constant node should be linked to a node to which we want to apply BCQ.
+One solution is to make all identical constant nodes distinct by inserting unique values, and then to ignore the newly generated unique values when BCQ fusing is applied.
+`preserve_bcq_info` generates and inserts unique dummy values into constant nodes with identical values so that they are not removed by the TensorFlow Lite converter.
+As a result, BCQ information is preserved.
+
+### How to use
+
+```bash
+preserve_bcq_info \
+--input_path /path/to/original_model.pb \
+--output_path /path/to/preserved_model.pb
+```
+
+### How it works
+
+If we append a unique dummy value to the end of each constant node, all constant nodes become distinct. The following is an example.
+
+```
+[Original Constant Nodes]
+const(value=[1, 2, 3], name='const1')
+const(value=[1, 2, 3], name='const2')
+const(value=[1, 2, 3], name='const3')
+
+[After BCQ information preserved]
+const(value=[1, 2, 3, -1], name='const1')
+const(value=[1, 2, 3, -2], name='const2')
+const(value=[1, 2, 3, -3], name='const3')
+```
+
+Negative values are used for the dummy values instead of positive ones.
+This is because positive values could be confused with original constant node values.
+For reference, the unique dummy values start at -1 and continue with -2, -3, ..., -N, where N is the number of preserved constant nodes.
+
+### Caution
+
+- Newly generated dummy values should be ignored when the constant nodes are used.
+
+## generate_bcq_output_arrays
+
+### Purpose
+
+To apply BCQ, the BCQ information nodes must be designated as model outputs so that they survive TFLite conversion.
+However, there are many such nodes, and sometimes they cannot all be copied and pasted by hand because the resulting string is too long.
+`generate_bcq_output_arrays` generates output_arrays, which include the BCQ information nodes.
+
+### How to use
+
+```bash
+generate_bcq_output_arrays \
+--input_path /path/to/original_model.pb \
+--output_path /path/to/output_arrays.txt
+```
+
+### How it works
+
+```
+[Original BCQ information nodes]
+const(value=[1, 2, 3, -1], name='const1')
+const(value=[1, 2, 3, -2], name='const2')
+const(value=[1, 2, 3, -3], name='const3')
+
+[Generated output_arrays]
+,const1,const2,const3
+```
+
+### Caution
+
+- The generated output_arrays string starts with a comma.
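
To see why the leading comma matters, here is a minimal consumer-side sketch (illustrative only; `read_output_arrays` is a hypothetical helper, not part of these tools):

```python
# Hypothetical helper: read the output_arrays file produced by
# generate_bcq_output_arrays and return a clean list of node names.
# The leading comma yields an empty first element, which is dropped.
def read_output_arrays(path):
    with open(path) as f:
        text = f.read()
    return [name for name in text.split(",") if name]

# e.g. ",const1,const2,const3" -> ["const1", "const2", "const3"]
```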
diff --git a/compiler/bcq-tools/generate_bcq_output_arrays b/compiler/bcq-tools/generate_bcq_output_arrays
new file mode 100644
index 000000000..48e8a9373
--- /dev/null
+++ b/compiler/bcq-tools/generate_bcq_output_arrays
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+
+import tensorflow as tf
+
+import argparse
+import sys
+
+
+def _get_parser():
+ """
+ Returns an ArgumentParser for generating output_arrays.
+ """
+ parser = argparse.ArgumentParser(
+ description=("Command line tool to generate output_arrays of BCQ nodes"))
+
+ # Input and output path.
+ parser.add_argument(
+ "-i",
+ "--input_path",
+ type=str,
+ help="Full filepath of the input file.",
+ required=True)
+ parser.add_argument(
+ "-o",
+ "--output_path",
+ type=str,
+ help="Full filepath of the output file.",
+ required=True)
+
+ return parser
+
+
+def load_graph(frozen_graph_filename):
+ """
+ Load graph from frozen pb file
+ """
+ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f:
+ graph_def = tf.compat.v1.GraphDef()
+ graph_def.ParseFromString(f.read())
+ with tf.Graph().as_default() as graph:
+ tf.import_graph_def(graph_def, name='')
+ return graph
+
+
+def dtype2str(dtype):
+ if dtype == "int32":
+ return "TF_INT32"
+ elif dtype == "int64":
+ return "TF_INT64"
+ elif dtype == "float32":
+ return "TF_FLOAT"
+ elif dtype == "bool":
+ return "TF_BOOL"
+ else:
+ raise Exception("Unsupported dtype")
+
+
+def print_output_arrays(flags):
+ graph_model = load_graph(flags.input_path)
+ graph_model_def = graph_model.as_graph_def()
+ ops = graph_model.get_operations()
+
+ output_names = [op.outputs[0].name for op in ops
+ if op.type == "Const" and "bcqinfo_" in op.outputs[0].name]
+
+ output_arrays = ""
+ for output_name in output_names:
+ output_arrays += ","
+
+ colon_index = output_name.find(":")
+ if colon_index == -1:
+ output_arrays += output_name
+ else:
+ output_arrays += output_name[:colon_index]
+
+ f = open(flags.output_path, 'w')
+ f.write(output_arrays)
+ f.close()
+
+
+def main():
+ # Parse argument.
+ parser = _get_parser()
+ flags = parser.parse_known_args(args=sys.argv[1:])
+
+ print_output_arrays(flags[0])
+
+
+if __name__ == "__main__":
+ main()
diff --git a/compiler/bcq-tools/preserve_bcq_info b/compiler/bcq-tools/preserve_bcq_info
new file mode 100644
index 000000000..2ede8d4d0
--- /dev/null
+++ b/compiler/bcq-tools/preserve_bcq_info
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+import tensorflow as tf
+import numpy as np
+
+import argparse
+import sys
+
+
+def _get_parser():
+ """
+ Returns an ArgumentParser for preserving BCQ information.
+ """
+ parser = argparse.ArgumentParser(
+ description=("Command line tool to preserve BCQ information"))
+
+ # Input and output path.
+ parser.add_argument(
+ "-i",
+ "--input_path",
+ type=str,
+ help="Full filepath of the input file.",
+ required=True)
+ parser.add_argument(
+ "-o",
+ "--output_path",
+ type=str,
+ help="Full filepath of the output file.",
+ required=True)
+
+ return parser
+
+
+def load_graph(frozen_graph_filename):
+ """
+ Load graph from frozen pb file
+ """
+ with tf.compat.v1.gfile.GFile(frozen_graph_filename, "rb") as f:
+ graph_def = tf.compat.v1.GraphDef()
+ graph_def.ParseFromString(f.read())
+ with tf.Graph().as_default() as graph:
+ tf.import_graph_def(graph_def, name='')
+ return graph
+
+
+def preserve_bcq_info(flags):
+ """
+ Generate unique dummy value from -1 to -N.
+
+ We use negative values to preserve BCQ information because
+ positive values may cause some confusion with real BCQ information values.
+ """
+
+ class UniqueValueGen:
+ def __init__(self):
+ self.unique_value = -1
+
+ def gen(self):
+ val = self.unique_value
+ self.unique_value = val - 1
+ return val
+
+ unique_value = UniqueValueGen()
+
+ original_graph_model = load_graph(flags.input_path)
+ original_graph_model_def = original_graph_model.as_graph_def()
+
+ new_graph = tf.compat.v1.GraphDef()
+ substitution_dict = {}
+
+ DT_INT32 = None # Just for copying DT_INT32 attribute value
+
+ for node in original_graph_model_def.node:
+ if node.op == "Const":
+ # Because bcqinfo_do_w_x is of BOOL type, we cannot append a dummy value at the end.
+ # Therefore we convert the type to INT32.
+ if "/bcqinfo_do_w_x" in node.name:
+ original_tensor = tf.make_ndarray(node.attr["value"].tensor)
+ substitution_dict[node.name] = tf.make_tensor_proto(
+ [int(original_tensor[0]), unique_value.gen()], tf.int32)
+
+ preserved_bcqinfo_list = ["/bcqinfo_number_of_clusters", "/bcqinfo_size_of_clusters",
+ "/bcqinfo_qbits_of_clusters"]
+
+ if any(name in node.name for name in preserved_bcqinfo_list):
+ original_tensor = tf.make_ndarray(
+ node.attr["value"].tensor) # variable name change
+ substitution_dict[node.name] = tf.make_tensor_proto(
+ np.append(original_tensor, unique_value.gen()), tf.int32)
+ DT_INT32 = node.attr["dtype"]
+
+ for node in original_graph_model_def.node:
+ if node.name in substitution_dict:
+ new_node = new_graph.node.add()
+ new_node.op = "Const"
+ new_node.name = node.name
+ new_node.attr["dtype"].CopyFrom(DT_INT32)
+ new_node.attr["value"].tensor.CopyFrom(substitution_dict[node.name])
+ else:
+ new_node = new_graph.node.add()
+ new_node.CopyFrom(node)
+
+ tf.io.write_graph(new_graph, '.', flags.output_path, False)
+
+
+def main():
+ # Parse argument.
+ parser = _get_parser()
+ flags = parser.parse_known_args(args=sys.argv[1:])
+
+ # Generate a new pb file in which BCQ information is preserved.
+ preserve_bcq_info(flags[0])
+
+
+if __name__ == "__main__":
+ main()
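
The dummy-value scheme implemented above can be summarized standalone; a minimal sketch using plain numpy arrays in place of TensorFlow tensor protos (illustrative only):

```python
import numpy as np

def make_constants_unique(constants):
    """Append -1, -2, ... to otherwise identical arrays so that a
    deduplicating converter keeps them as separate nodes."""
    unique_value = -1
    result = []
    for tensor in constants:
        result.append(np.append(tensor, unique_value))
        unique_value -= 1
    return result

consts = [np.array([1, 2, 3])] * 3
for t in make_constants_unique(consts):
    print(t)  # [1 2 3 -1], [1 2 3 -2], [1 2 3 -3]
```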
diff --git a/compiler/circle-quantizer/CMakeLists.txt b/compiler/circle-quantizer/CMakeLists.txt
index 1335057eb..009bfabea 100644
--- a/compiler/circle-quantizer/CMakeLists.txt
+++ b/compiler/circle-quantizer/CMakeLists.txt
@@ -13,5 +13,6 @@ target_link_libraries(circle-quantizer luci_service)
target_link_libraries(circle-quantizer luci_pass)
target_link_libraries(circle-quantizer luci_export)
target_link_libraries(circle-quantizer arser)
+target_link_libraries(circle-quantizer vconone)
install(TARGETS circle-quantizer DESTINATION bin)
diff --git a/compiler/circle-quantizer/requires.cmake b/compiler/circle-quantizer/requires.cmake
index 2293e53f8..c21e28e8d 100644
--- a/compiler/circle-quantizer/requires.cmake
+++ b/compiler/circle-quantizer/requires.cmake
@@ -5,3 +5,4 @@ require("safemain")
require("luci")
require("oops")
require("arser")
+require("vconone")
diff --git a/compiler/circle-quantizer/src/CircleQuantizer.cpp b/compiler/circle-quantizer/src/CircleQuantizer.cpp
index b56b547a9..8d3a80c91 100644
--- a/compiler/circle-quantizer/src/CircleQuantizer.cpp
+++ b/compiler/circle-quantizer/src/CircleQuantizer.cpp
@@ -25,6 +25,7 @@
#include <oops/InternalExn.h>
#include <arser/arser.h>
+#include <vconone/vconone.h>
#include <functional>
#include <iostream>
@@ -36,6 +37,12 @@ using OptionHook = std::function<int(const char **)>;
using Algorithms = luci::CircleOptimizer::Options::Algorithm;
using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
+void print_version(void)
+{
+ std::cout << "circle-quantizer version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
+
int entry(int argc, char **argv)
{
// Simple argument parser (based on map)
@@ -49,13 +56,20 @@ int entry(int argc, char **argv)
arser::Arser arser("circle-quantizer provides circle model quantization");
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
arser.add_argument(qdqw)
.nargs(3)
.type(arser::DataType::STR_VEC)
.required(false)
.help("Quantize-dequantize weight values required action before quantization. "
"Three arguments required: input_dtype(float32) "
- "output_dtype(uint8) granularity(layer)");
+ "output_dtype(uint8) granularity(layer, channel)");
arser.add_argument(qwmm)
.nargs(3)
@@ -63,7 +77,7 @@ int entry(int argc, char **argv)
.required(false)
.help("Quantize with min/max values. "
"Three arguments required: input_dtype(float32) "
- "output_dtype(uint8) granularity(layer)");
+ "output_dtype(uint8) granularity(layer, channel)");
arser.add_argument("input").nargs(1).type(arser::DataType::STR).help("Input circle model");
arser.add_argument("output").nargs(1).type(arser::DataType::STR).help("Output circle model");
diff --git a/compiler/circle-tensordump/driver/Driver.cpp b/compiler/circle-tensordump/driver/Driver.cpp
index a55cd4574..5bab9f59e 100644
--- a/compiler/circle-tensordump/driver/Driver.cpp
+++ b/compiler/circle-tensordump/driver/Driver.cpp
@@ -46,7 +46,14 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
+ }
+
+ if (arser["--tensors_to_hdf5"] == arser["--tensors"])
+ {
+ std::cout << "[Error] You must specify one option for how to print." << std::endl;
+ std::cout << arser;
+ return 255;
}
std::unique_ptr<circletensordump::DumpInterface> dump;
diff --git a/compiler/circle-tensordump/src/Dump.cpp b/compiler/circle-tensordump/src/Dump.cpp
index dfa78f031..a8d32564f 100644
--- a/compiler/circle-tensordump/src/Dump.cpp
+++ b/compiler/circle-tensordump/src/Dump.cpp
@@ -136,6 +136,7 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s
auto max = quant_param->max();
auto scale = quant_param->scale();
auto zero_point = quant_param->zero_point();
+ auto quantized_dimension = quant_param->quantized_dimension();
os << " " + print_format2 + "   ├── min : ";
::print_comma_sepearted(os, min);
@@ -146,9 +147,11 @@ void DumpTensors::run(std::ostream &os, const circle::Model *model, const std::s
os << " " + print_format2 + "   ├── scale : ";
::print_comma_sepearted(os, scale);
os << std::endl;
- os << " " + print_format2 + "   └── zero_point : ";
+ os << " " + print_format2 + "   ├── zero_point : ";
::print_comma_sepearted(os, zero_point);
os << std::endl;
+ os << " " + print_format2 + "   └── quantized_dimension : " << quantized_dimension;
+ os << std::endl;
}
// buffer
@@ -229,7 +232,7 @@ std::vector<hsize_t> hdf5_dims_cast(const flatbuffers::Vector<T> *data,
}
/**
- * This function writes data to given hdf5 file like below.
+ * This function writes vector data to the given hdf5 file as shown below.
*
* GROUP "group_name"
* ㄴDATATYPE "type"
@@ -238,9 +241,9 @@ std::vector<hsize_t> hdf5_dims_cast(const flatbuffers::Vector<T> *data,
* ㄴDATA "data"
*/
template <typename T>
-void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
- const H5::PredType &type, const flatbuffers::Vector<T> *data,
- std::vector<hsize_t> dims)
+void write_vector_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
+ const H5::PredType &type, const flatbuffers::Vector<T> *data,
+ std::vector<hsize_t> dims)
{
if (data == nullptr)
return;
@@ -250,6 +253,17 @@ void write_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string d
dataset->write(data->data(), type);
}
+/// @brief This function writes scalar data to the given hdf5 file
+template <typename T>
+void write_scalar_data_to_hdf5(H5::H5File &file, std::string &group_name, std::string dataset_name,
+ const H5::PredType &type, T data)
+{
+ auto dataspace = std::make_unique<H5::DataSpace>(H5S_SCALAR);
+ auto dataset = std::make_unique<H5::DataSet>(
+ file.createDataSet(group_name + "/" + dataset_name, type, *dataspace));
+ dataset->write(&data, type);
+}
+
} // namespace
namespace circletensordump
@@ -297,8 +311,9 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model,
auto buff_data_ptr = reader.buffers()->Get(buff_idx)->data();
if (buff_data_ptr)
{
- ::write_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()),
- buff_data_ptr, ::hdf5_dims_cast(buff_data_ptr, tensor->shape()));
+ ::write_vector_data_to_hdf5(file, group_name, "weights", ::hdf5_dtype_cast(tensor->type()),
+ buff_data_ptr,
+ ::hdf5_dims_cast(buff_data_ptr, tensor->shape()));
}
// write quantization parameters
@@ -306,17 +321,20 @@ void DumpTensorsToHdf5::run(std::ostream &os, const circle::Model *model,
if (quant_param)
{
auto min = quant_param->min();
- ::write_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min,
- ::hdf5_dims_cast(min));
+ ::write_vector_data_to_hdf5(file, group_name, "min", H5::PredType::NATIVE_FLOAT, min,
+ ::hdf5_dims_cast(min));
auto max = quant_param->max();
- ::write_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max,
- ::hdf5_dims_cast(max));
+ ::write_vector_data_to_hdf5(file, group_name, "max", H5::PredType::NATIVE_FLOAT, max,
+ ::hdf5_dims_cast(max));
auto scale = quant_param->scale();
- ::write_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale,
- ::hdf5_dims_cast(scale));
+ ::write_vector_data_to_hdf5(file, group_name, "scale", H5::PredType::NATIVE_FLOAT, scale,
+ ::hdf5_dims_cast(scale));
auto zero_point = quant_param->zero_point();
- ::write_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64, zero_point,
- ::hdf5_dims_cast(zero_point));
+ ::write_vector_data_to_hdf5(file, group_name, "zero_point", H5::PredType::NATIVE_INT64,
+ zero_point, ::hdf5_dims_cast(zero_point));
+ auto quantized_dimension = quant_param->quantized_dimension();
+ ::write_scalar_data_to_hdf5(file, group_name, "quantized_dimension",
+ H5::PredType::NATIVE_INT32, quantized_dimension);
}
}
}
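
The vector/scalar split above mirrors HDF5's distinction between 1-D and scalar dataspaces; a sketch of the resulting file layout using h5py (an assumed stand-in for illustration; the tool itself uses the HDF5 C++ API):

```python
import h5py
import numpy as np

with h5py.File("tensors.h5", "w") as f:
    g = f.create_group("tensor_0")
    # vector quantization parameters: 1-D datasets
    g.create_dataset("scale", data=np.array([0.5], dtype=np.float32))
    g.create_dataset("zero_point", data=np.array([0], dtype=np.int64))
    # quantized_dimension: a scalar dataset (the H5S_SCALAR equivalent)
    g.create_dataset("quantized_dimension", data=np.int32(0))
```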
diff --git a/compiler/circle-verify/src/Driver.cpp b/compiler/circle-verify/src/Driver.cpp
index 1af31d986..7a44c65b9 100644
--- a/compiler/circle-verify/src/Driver.cpp
+++ b/compiler/circle-verify/src/Driver.cpp
@@ -35,7 +35,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
auto verifier = std::make_unique<VerifyFlatbuffers>();
diff --git a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
index 6663cb938..4bcaae347 100644
--- a/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
+++ b/compiler/circle2circle-dredd-recipe-test/CMakeLists.txt
@@ -1,25 +1,12 @@
nnas_include(TargetRequire)
unset(REQUIRED_TARGETS)
-list(APPEND REQUIRED_TARGETS circlechef)
list(APPEND REQUIRED_TARGETS circle-inspect)
list(APPEND REQUIRED_TARGETS circle-verify)
list(APPEND REQUIRED_TARGETS circle2circle)
list(APPEND REQUIRED_TARGETS dredd_rule_lib)
-list(APPEND REQUIRED_TARGETS tflchef)
-list(APPEND REQUIRED_TARGETS tflite2circle)
TargetRequire_Return(${REQUIRED_TARGETS})
-nncc_find_resource(TensorFlowLiteRecipes)
-nncc_find_resource(CircleRecipes)
-
-set(TFLITE_RECIPE_REPO "${TensorFlowLiteRecipes_DIR}")
-set(CIRCLE_RECIPE_REPO "${CircleRecipes_DIR}")
-unset(RECIPE_REPO)
-
-set(TEST_RECIPE_FILENAME "test.recipe")
-set(TEST_RULE_FILENAME "test.rule")
-
unset(TEST_DEPS)
unset(TEST_NAMES)
@@ -27,21 +14,9 @@ set(options "")
set(oneValueArgs "")
set(multiValueArgs PASS)
-macro(Add RECIPE)
- if(NOT EXISTS "${TFLITE_RECIPE_REPO}/${RECIPE}/test.recipe")
- if(NOT EXISTS "${CIRCLE_RECIPE_REPO}/${RECIPE}/test.recipe")
- message(FATAL_ERROR "Missing recipe of '${RECIPE}' test")
- else()
- set(RECIPE_REPO ${CIRCLE_RECIPE_REPO})
- endif()
- else()
- set(RECIPE_REPO ${TFLITE_RECIPE_REPO})
- endif()
-
- if(NOT EXISTS "${RECIPE_REPO}/${RECIPE}/test.rule")
- message(FATAL_ERROR "Missing rule of '${RECIPE}' test")
- endif()
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
+macro(Add RECIPE)
cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
unset(OPT_OPTIONS)
foreach(src ${ARG_PASS})
@@ -49,71 +24,20 @@ macro(Add RECIPE)
list(APPEND OPT_OPTIONS "--${src}")
endforeach(src ${ARG_PASS})
- set(RECIPE_FILE "${RECIPE}.recipe")
- set(RECIPE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RECIPE_FILENAME}")
- set(RECIPE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RECIPE_FILE}")
-
- set(RULE_FILE "${RECIPE}.rule")
- set(RULE_SOURCE_PATH "${RECIPE_REPO}/${RECIPE}/${TEST_RULE_FILENAME}")
- set(RULE_BINARY_PATH "${CMAKE_CURRENT_BINARY_DIR}/${RULE_FILE}")
-
- set(TFLITE_FILE "${RECIPE}.tflite")
- set(TFLITE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${TFLITE_FILE}")
-
set(CIRCLE_FILE "${RECIPE}.circle")
- set(CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${CIRCLE_FILE}")
+ set(CIRCLE_PATH "${ARTIFACTS_BIN_PATH}/${CIRCLE_FILE}")
set(OPT_CIRCLE_FILE "${RECIPE}.opt.circle")
set(OPT_CIRCLE_OUTPUT_PATH "${CMAKE_CURRENT_BINARY_DIR}/${OPT_CIRCLE_FILE}")
- # Copy .recipe
- add_custom_command(OUTPUT ${RECIPE_BINARY_PATH}
- COMMAND ${CMAKE_COMMAND} -E copy "${RECIPE_SOURCE_PATH}" "${RECIPE_BINARY_PATH}"
- DEPENDS ${RECIPE_SOURCE_PATH}
- COMMENT "Generate ${RECIPE_FILE}"
- )
-
- # Copy .rule
- add_custom_command(OUTPUT ${RULE_BINARY_PATH}
- COMMAND ${CMAKE_COMMAND} -E copy "${RULE_SOURCE_PATH}" "${RULE_BINARY_PATH}"
- DEPENDS ${RULE_SOURCE_PATH}
- COMMENT "Generate ${RULE_FILE}"
- )
-
- if(${RECIPE_REPO} STREQUAL ${TFLITE_RECIPE_REPO})
- # Generate .tflite
- add_custom_command(OUTPUT ${TFLITE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH} ${TFLITE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:tflchef-file> ${RECIPE_BINARY_PATH}
- COMMENT "Generate ${TFLITE_FILE}"
- )
-
- # Generate .circle
- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH} ${CIRCLE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:tflite2circle> ${TFLITE_OUTPUT_PATH}
- COMMENT "Generate ${CIRCLE_FILE}"
- )
-
- list(APPEND TEST_DEPS ${TFLITE_OUTPUT_PATH})
- else()
- # Generate .circle
- add_custom_command(OUTPUT ${CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH} ${CIRCLE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:circlechef-file> ${RECIPE_BINARY_PATH}
- COMMENT "Generate ${CIRCLE_FILE}"
- )
- endif()
-
# Generate optimized .circle
add_custom_command(OUTPUT ${OPT_CIRCLE_OUTPUT_PATH}
- COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
- DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_OUTPUT_PATH}
+ COMMAND $<TARGET_FILE:circle2circle> ${OPT_OPTIONS} ${CIRCLE_PATH} ${OPT_CIRCLE_OUTPUT_PATH}
+ DEPENDS $<TARGET_FILE:circle2circle> ${CIRCLE_PATH}
COMMENT "Generate ${OPT_CIRCLE_FILE}"
)
- list(APPEND TEST_DEPS ${RECIPE_BINARY_PATH} ${RULE_BINARY_PATH}
- ${CIRCLE_OUTPUT_PATH} ${OPT_CIRCLE_OUTPUT_PATH})
+ list(APPEND TEST_DEPS ${OPT_CIRCLE_OUTPUT_PATH})
list(APPEND TEST_NAMES ${RECIPE})
endmacro(Add)
@@ -174,12 +98,15 @@ list(APPEND TEST_DEPS "${RULE_LIB_BINARY_PATH}")
# Generate dependencies
add_custom_target(circle2circle_dredd_recipe_test ALL DEPENDS ${TEST_DEPS})
+add_dependencies(circle2circle_dredd_recipe_test common_artifacts_deps)
+
+get_target_property(ARTIFACTS_BIN_PATH testDataGenerator BINARY_DIR)
# Run tests
add_test(
NAME circle2circle_dredd_recipe_test
COMMAND "${TEST_RUNNER}"
"${TEST_CONFIG}"
- "${CMAKE_CURRENT_BINARY_DIR}"
+ "${ARTIFACTS_BIN_PATH}"
${TEST_NAMES}
)
diff --git a/compiler/circle2circle-dredd-recipe-test/requires.cmake b/compiler/circle2circle-dredd-recipe-test/requires.cmake
index e4a5b71a7..70e7c5295 100644
--- a/compiler/circle2circle-dredd-recipe-test/requires.cmake
+++ b/compiler/circle2circle-dredd-recipe-test/requires.cmake
@@ -1,7 +1,5 @@
-require("circlechef")
require("circle2circle")
require("circle-inspect")
require("circle-verify")
+require("common-artifacts")
require("dredd-rule-lib")
-require("tflchef")
-require("tflite2circle")
diff --git a/compiler/circle2circle-dredd-recipe-test/test.lst b/compiler/circle2circle-dredd-recipe-test/test.lst
index 202f66938..6328a64db 100644
--- a/compiler/circle2circle-dredd-recipe-test/test.lst
+++ b/compiler/circle2circle-dredd-recipe-test/test.lst
@@ -11,9 +11,10 @@
## TFLITE RECIPE
Add(Net_InstanceNorm_001 PASS fuse_instnorm)
-# Add(Net_InstanceNorm_002 PASS fuse_instnorm)
+Add(Net_InstanceNorm_002 PASS fuse_instnorm)
Add(BatchMatMulV2_000 PASS resolve_customop_batchmatmul)
Add(MatMul_000 PASS resolve_customop_matmul)
+Add(DepthwiseConv2D_003 PASS)
## CIRCLE RECIPE
diff --git a/compiler/circle2circle-dredd-recipe-test/testall.sh b/compiler/circle2circle-dredd-recipe-test/testall.sh
index 33a2036bb..2899587ba 100755
--- a/compiler/circle2circle-dredd-recipe-test/testall.sh
+++ b/compiler/circle2circle-dredd-recipe-test/testall.sh
@@ -13,21 +13,22 @@ if [[ $# -lt 2 ]]; then
exit 255
fi
+WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
CONFIG_PATH="$1"; shift
-WORKDIR="$1"; shift
+RESOURCE_DIR="$1"; shift
source "${CONFIG_PATH}"
echo "-- Found circle-inspect: ${CIRCLE_INSPECT_PATH}"
echo "-- Found circle-verify: ${CIRCLE_VERIFY_PATH}"
echo "-- Found circle2circle: ${CIRCLE2CIRCLE_PATH}"
-echo "-- Found workdir: ${WORKDIR}"
+echo "-- Found common-artifacts: ${RESOURCE_DIR}"
TESTED=()
PASSED=()
FAILED=()
-pushd "${WORKDIR}"
+pushd "${WORKDIR}"
while [[ $# -ne 0 ]]; do
PREFIX="$1"; shift
@@ -40,7 +41,7 @@ while [[ $# -ne 0 ]]; do
cat > "${PREFIX}.log" <(
exec 2>&1
- echo "-- Found tflite: ${PREFIX}.tflite"
+ echo "-- Found circle: ${PREFIX}.opt.circle"
# Exit immediately if any command fails
set -e
@@ -55,7 +56,7 @@ while [[ $# -ne 0 ]]; do
set +x
# (COMPILED_FILE, INSPECT_PROG_PATH, VERIFY_PROG_PATH, ERROR_LOG) must be set for rule-lib.sh
- COMPILED_FILE="${WORKDIR}/${PREFIX}.opt.circle"
+ COMPILED_FILE="${PREFIX}.opt.circle"
INSPECT_PROG_PATH=${CIRCLE_INSPECT_PATH}
VERIFY_PROG_PATH=${CIRCLE_VERIFY_PATH}
ERROR_LOG="${PREFIX}.error"
@@ -66,7 +67,7 @@ while [[ $# -ne 0 ]]; do
trap 'echo "** ERROR **" ; cat "${ERROR_LOG}"' ERR
source rule-lib.sh
- source "${PREFIX}.rule"
+ source "${RESOURCE_DIR}/${PREFIX}.rule"
# unset
trap - ERR
diff --git a/compiler/circle2circle/CMakeLists.txt b/compiler/circle2circle/CMakeLists.txt
index 7b2bf9b02..f60c896d8 100644
--- a/compiler/circle2circle/CMakeLists.txt
+++ b/compiler/circle2circle/CMakeLists.txt
@@ -19,6 +19,7 @@ target_link_libraries(circle2circle luci_service)
target_link_libraries(circle2circle luci_pass)
target_link_libraries(circle2circle luci_export)
target_link_libraries(circle2circle arser)
+target_link_libraries(circle2circle vconone)
install(TARGETS circle2circle DESTINATION bin)
@@ -44,3 +45,4 @@ target_link_libraries(circle2circle_test luci_service)
target_link_libraries(circle2circle_test luci_pass)
target_link_libraries(circle2circle_test luci_export)
target_link_libraries(circle2circle_test arser)
+target_link_libraries(circle2circle_test vconone)
diff --git a/compiler/circle2circle/requires.cmake b/compiler/circle2circle/requires.cmake
index 8cbb90dbf..36a9efd16 100644
--- a/compiler/circle2circle/requires.cmake
+++ b/compiler/circle2circle/requires.cmake
@@ -9,3 +9,4 @@ require("hermes")
require("hermes-std")
require("luci")
require("arser")
+require("vconone")
diff --git a/compiler/circle2circle/src/Circle2Circle.cpp b/compiler/circle2circle/src/Circle2Circle.cpp
index 6888d26e3..849597b46 100644
--- a/compiler/circle2circle/src/Circle2Circle.cpp
+++ b/compiler/circle2circle/src/Circle2Circle.cpp
@@ -26,6 +26,7 @@
#include <oops/InternalExn.h>
#include <arser/arser.h>
+#include <vconone/vconone.h>
#include <functional>
#include <iostream>
@@ -34,6 +35,12 @@
using Algorithms = luci::CircleOptimizer::Options::Algorithm;
using AlgorithmParameters = luci::CircleOptimizer::Options::AlgorithmParameters;
+void print_version(void)
+{
+ std::cout << "circle2circle version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
+
int entry(int argc, char **argv)
{
// Simple argument parser (based on map)
@@ -44,6 +51,13 @@ int entry(int argc, char **argv)
arser::Arser arser("circle2circle provides circle model optimization and transformations");
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
arser.add_argument("--all").nargs(0).required(false).default_value(false).help(
"Enable all optimize options");
diff --git a/compiler/circlechef/CMakeLists.txt b/compiler/circlechef/CMakeLists.txt
index cba7d0a4e..3e2ddcbb3 100644
--- a/compiler/circlechef/CMakeLists.txt
+++ b/compiler/circlechef/CMakeLists.txt
@@ -18,4 +18,6 @@ add_subdirectory(core)
add_subdirectory(circle)
# Tools
add_subdirectory(tools)
-add_subdirectory(tests)
+if(ENABLE_TEST)
+ add_subdirectory(tests)
+endif(ENABLE_TEST)
diff --git a/compiler/circlechef/circle/src/RecipeChef.cpp b/compiler/circlechef/circle/src/RecipeChef.cpp
index 17ef1be6e..51326c7f8 100644
--- a/compiler/circlechef/circle/src/RecipeChef.cpp
+++ b/compiler/circlechef/circle/src/RecipeChef.cpp
@@ -181,6 +181,8 @@ std::unique_ptr<ModelRecipe> generate_recipe(const circle::Model *model)
for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx)
chef_quant->add_zero_point(quant->zero_point()->Get(idx));
}
+ circlechef::TensorQuantization *chef_quant = operand->mutable_quant();
+ chef_quant->set_quantized_dimension(quant->quantized_dimension());
}
}
diff --git a/compiler/circlechef/core/src/ModelChef.cpp b/compiler/circlechef/core/src/ModelChef.cpp
index 76aeacdd9..d81467d68 100644
--- a/compiler/circlechef/core/src/ModelChef.cpp
+++ b/compiler/circlechef/core/src/ModelChef.cpp
@@ -413,6 +413,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
quant_builder.add_min(quant_min);
quant_builder.add_scale(quant_scale);
quant_builder.add_zero_point(quant_zero_point);
+ quant_builder.add_quantized_dimension(quant.quantized_dimension());
// Update QuantizationParameters Index
quant_index = quant_builder.Finish();
diff --git a/compiler/circlechef/proto/circlechef.proto b/compiler/circlechef/proto/circlechef.proto
index b8c009b38..3e5e6b168 100644
--- a/compiler/circlechef/proto/circlechef.proto
+++ b/compiler/circlechef/proto/circlechef.proto
@@ -35,6 +35,7 @@ message TensorQuantization {
repeated float max = 2;
repeated float scale = 3;
repeated int64 zero_point = 4;
+ optional int32 quantized_dimension = 5 [default = 0];
}
message Operand {
diff --git a/compiler/circlechef/tools/file/Driver.cpp b/compiler/circlechef/tools/file/Driver.cpp
index a15da4002..bcc0c7ae9 100644
--- a/compiler/circlechef/tools/file/Driver.cpp
+++ b/compiler/circlechef/tools/file/Driver.cpp
@@ -41,7 +41,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
int32_t model_version = 1;
diff --git a/compiler/circlechef/tools/reverse/Driver.cpp b/compiler/circlechef/tools/reverse/Driver.cpp
index 9c0b9ea24..8a2b85fc7 100644
--- a/compiler/circlechef/tools/reverse/Driver.cpp
+++ b/compiler/circlechef/tools/reverse/Driver.cpp
@@ -38,7 +38,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
std::string circle_path = arser.get<std::string>("circle");
diff --git a/compiler/circledump/driver/Driver.cpp b/compiler/circledump/driver/Driver.cpp
index b8f561fee..657f24fe0 100644
--- a/compiler/circledump/driver/Driver.cpp
+++ b/compiler/circledump/driver/Driver.cpp
@@ -33,7 +33,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << '\n';
std::cout << arser;
- return 0;
+ return 255;
}
std::string circle_path = arser.get<std::string>("circle");
diff --git a/compiler/circledump/src/OpPrinter.cpp b/compiler/circledump/src/OpPrinter.cpp
index 2c0320396..3294bb23d 100644
--- a/compiler/circledump/src/OpPrinter.cpp
+++ b/compiler/circledump/src/OpPrinter.cpp
@@ -593,6 +593,20 @@ public:
}
};
+class UniquePrinter : public OpPrinter
+{
+public:
+ void options(const circle::Operator *op, std::ostream &os) const override
+ {
+ if (auto *params = op->builtin_options_as_UniqueOptions())
+ {
+ os << " ";
+ os << "idx_out_type(" << EnumNameTensorType(params->idx_out_type()) << ") ";
+ os << std::endl;
+ }
+ }
+};
+
class WhilePrinter : public OpPrinter
{
public:
@@ -710,9 +724,11 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[circle::BuiltinOperator_MAX_POOL_2D] = make_unique<Pool2DPrinter>();
_op_map[circle::BuiltinOperator_MIRROR_PAD] = make_unique<MirrorPadPrinter>();
_op_map[circle::BuiltinOperator_MUL] = make_unique<MulPrinter>();
+ // There is no Option for NON_MAX_SUPPRESSION_V4
_op_map[circle::BuiltinOperator_ONE_HOT] = make_unique<OneHotPrinter>();
_op_map[circle::BuiltinOperator_PACK] = make_unique<PackPrinter>();
// There is no Option for PAD
+ // There is no Option for PADV2
// There is no Option for PRELU
// There is no Option for RELU
// There is no Option for RELU6
@@ -744,6 +760,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[circle::BuiltinOperator_SUM] = make_unique<ReducerPrinter>();
_op_map[circle::BuiltinOperator_TRANSPOSE_CONV] = make_unique<TransposeConvPrinter>();
// There is no Option for TOPK_V2
+ _op_map[circle::BuiltinOperator_UNIQUE] = make_unique<UniquePrinter>();
_op_map[circle::BuiltinOperator_WHILE] = make_unique<WhilePrinter>();
_op_map[circle::BuiltinOperator_CUSTOM] = make_unique<CustomOpPrinter>();
diff --git a/compiler/common-artifacts/CMakeLists.txt b/compiler/common-artifacts/CMakeLists.txt
index ee4191d73..ef50e8d43 100644
--- a/compiler/common-artifacts/CMakeLists.txt
+++ b/compiler/common-artifacts/CMakeLists.txt
@@ -13,30 +13,47 @@ if(${PYTHON_VERSION_MINOR} LESS 3)
endif()
# Create python virtual environment with tensorflow 1.13.2
-set(VIRTUALENV_OVERLAY "${NNCC_OVERLAY_DIR}/venv_1_13_2")
+set(VIRTUALENV_OVERLAY_TF_1_13_2 "${NNCC_OVERLAY_DIR}/venv_1_13_2")
+
+# Create python virtual environment with tensorflow 2.3.0
+set(VIRTUALENV_OVERLAY_TF_2_3_0 "${NNCC_OVERLAY_DIR}/venv_2_3_0")
+
+add_custom_command(
+ OUTPUT ${VIRTUALENV_OVERLAY_TF_1_13_2}
+ COMMAND ${PYTHON_EXECUTABLE} -m venv ${VIRTUALENV_OVERLAY_TF_1_13_2}
+)
add_custom_command(
- OUTPUT ${VIRTUALENV_OVERLAY}
- COMMAND ${PYTHON_EXECUTABLE} -m venv ${VIRTUALENV_OVERLAY}
+ OUTPUT ${VIRTUALENV_OVERLAY_TF_2_3_0}
+ COMMAND ${PYTHON_EXECUTABLE} -m venv ${VIRTUALENV_OVERLAY_TF_2_3_0}
)
# Create requirements.txt and install required pip packages
set(REQUIREMENTS_FILE "requirements.txt")
-set(REQUIREMENTS_OVERLAY_PATH "${NNCC_OVERLAY_DIR}/${REQUIREMENTS_FILE}")
+set(REQUIREMENTS_OVERLAY_PATH_TF_1_13_2 "${VIRTUALENV_OVERLAY_TF_1_13_2}/${REQUIREMENTS_FILE}")
+set(REQUIREMENTS_OVERLAY_PATH_TF_2_3_0 "${VIRTUALENV_OVERLAY_TF_2_3_0}/${REQUIREMENTS_FILE}")
add_custom_command(
- OUTPUT ${REQUIREMENTS_OVERLAY_PATH}
- COMMAND ${CMAKE_COMMAND} -E echo "tensorflow==1.13.2" > ${REQUIREMENTS_OVERLAY_PATH}
- COMMAND ${VIRTUALENV_OVERLAY}/bin/python -m pip --default-timeout=1000 install --upgrade pip setuptools
- COMMAND ${VIRTUALENV_OVERLAY}/bin/python -m pip --default-timeout=1000 install -r ${REQUIREMENTS_OVERLAY_PATH} --upgrade
- DEPENDS ${VIRTUALENV_OVERLAY} ${REQUIREMENTS_OVERLAY_PATH}
+ OUTPUT ${REQUIREMENTS_OVERLAY_PATH_TF_1_13_2}
+ COMMAND ${CMAKE_COMMAND} -E echo "tensorflow==1.13.2" > ${REQUIREMENTS_OVERLAY_PATH_TF_1_13_2}
+ COMMAND ${VIRTUALENV_OVERLAY_TF_1_13_2}/bin/python -m pip --default-timeout=1000 install --upgrade pip setuptools
+ COMMAND ${VIRTUALENV_OVERLAY_TF_1_13_2}/bin/python -m pip --default-timeout=1000 install -r ${REQUIREMENTS_OVERLAY_PATH_TF_1_13_2} --upgrade
+ DEPENDS ${VIRTUALENV_OVERLAY_TF_1_13_2}
)
-add_custom_target(common_artifacts_python_deps ALL
- DEPENDS ${VIRTUALENV_OVERLAY} ${REQUIREMENTS_OVERLAY_PATH}
+add_custom_command(
+ OUTPUT ${REQUIREMENTS_OVERLAY_PATH_TF_2_3_0}
+ COMMAND ${CMAKE_COMMAND} -E remove -f ${REQUIREMENTS_OVERLAY_PATH_TF_2_3_0}
+ COMMAND ${CMAKE_COMMAND} -E echo "tensorflow-cpu==2.3.0" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_3_0}
+ COMMAND ${CMAKE_COMMAND} -E echo "flatbuffers==1.12" >> ${REQUIREMENTS_OVERLAY_PATH_TF_2_3_0}
+ COMMAND ${VIRTUALENV_OVERLAY_TF_2_3_0}/bin/python -m pip --default-timeout=1000 install --upgrade pip setuptools
+ COMMAND ${VIRTUALENV_OVERLAY_TF_2_3_0}/bin/python -m pip --default-timeout=1000 install -r ${REQUIREMENTS_OVERLAY_PATH_TF_2_3_0} --upgrade
+ DEPENDS ${VIRTUALENV_OVERLAY_TF_2_3_0}
)
-# TODO Create python virtual environment with tensorflow 2.3.0-rc0
+add_custom_target(common_artifacts_python_deps ALL
+ DEPENDS ${VIRTUALENV_OVERLAY_TF_1_13_2} ${VIRTUALENV_OVERLAY_TF_2_3_0} ${REQUIREMENTS_OVERLAY_PATH_TF_1_13_2} ${REQUIREMENTS_OVERLAY_PATH_TF_2_3_0}
+)
#[[ Generate common resources ]]
# TODO add pbtxt
@@ -52,6 +69,7 @@ set(SOURCES src/TestDataGenerator.cpp)
add_executable(testDataGenerator ${SOURCES})
target_include_directories(testDataGenerator PRIVATE ${HDF5_INCLUDE_DIRS})
target_link_libraries(testDataGenerator PRIVATE ${HDF5_CXX_LIBRARIES})
+target_link_libraries(testDataGenerator PRIVATE arser)
target_link_libraries(testDataGenerator PRIVATE foder)
target_link_libraries(testDataGenerator PRIVATE luci_import)
target_link_libraries(testDataGenerator PRIVATE luci_interpreter)
diff --git a/compiler/common-artifacts/exclude.lst b/compiler/common-artifacts/exclude.lst
index b614b7182..fe9933ae0 100644
--- a/compiler/common-artifacts/exclude.lst
+++ b/compiler/common-artifacts/exclude.lst
@@ -5,9 +5,12 @@
#[[ optimize : Exclude from circle optimization(circle2circle) ]]
## TensorFlowLiteRecipes
-optimize(ReLU6_000)
-optimize(Where_000)
-optimize(Where_001)
+optimize(Unique_000)
+optimize(Unique_001)
+optimize(Unique_002)
+optimize(Unique_003)
+optimize(Unique_U8_000)
+optimize(Unique_U8_001)
## CircleRecipes
@@ -46,6 +49,7 @@ tcgenerate(DepthToSpace_000)
tcgenerate(DepthwiseConv2D_001) # runtime doesn't support dilation
tcgenerate(DepthwiseConv2D_003) # runtime doesn't support dilation
tcgenerate(DepthwiseConv2D_U8_000)
+tcgenerate(DepthwiseConv2D_U8_001) # luci-interpreter doesn't support channel-wise quantization yet
tcgenerate(Div_000)
tcgenerate(ELU_000)
tcgenerate(Equal_000)
@@ -90,13 +94,15 @@ tcgenerate(Maximum_000)
tcgenerate(MaxPool2D_U8_000)
tcgenerate(Mean_U8_000)
tcgenerate(Minimum_000)
+tcgenerate(NonMaxSuppressionV4_000)
+tcgenerate(NonMaxSuppressionV4_001)
tcgenerate(MirrorPad_000)
tcgenerate(Mul_U8_000)
tcgenerate(Neg_000)
tcgenerate(Net_Dangle_001)
tcgenerate(Net_InstanceNorm_001)
tcgenerate(Net_InstanceNorm_002)
-tcgenerate(Net_ZeroDim_001) # fix luci
+tcgenerate(Net_ZeroDim_001) # luci-interpreter doesn't support zero dim
tcgenerate(NotEqual_000)
tcgenerate(OneHot_000)
tcgenerate(OneHot_001)
@@ -105,6 +111,7 @@ tcgenerate(OneHot_003)
tcgenerate(Pack_000)
tcgenerate(Pack_U8_000)
tcgenerate(Pad_U8_000)
+tcgenerate(PadV2_000)
tcgenerate(Pow_000)
tcgenerate(PRelu_000)
tcgenerate(Range_000)
@@ -120,11 +127,12 @@ tcgenerate(ReduceProd_001)
tcgenerate(ReduceProd_002)
tcgenerate(ReduceProd_003)
tcgenerate(ReLU_000)
-tcgenerate(ReLU6_000) # luci NYI
+tcgenerate(ReLU6_000)
tcgenerate(ReLUN1To1_000)
-tcgenerate(Reshape_003) # fix luci
+tcgenerate(Reshape_003) # luci-interpreter doesn't support reshape without built-in option
tcgenerate(Reshape_U8_000)
tcgenerate(ResizeBilinear_000)
+tcgenerate(ResizeBilinear_U8_000) # luci-interpreter
tcgenerate(ResizeNearestNeighbor_000)
tcgenerate(ReverseSequence_000)
tcgenerate(ReverseV2_000)
@@ -148,7 +156,7 @@ tcgenerate(SpaceToBatchND_002)
tcgenerate(SpaceToBatchND_003)
tcgenerate(SpaceToDepth_000)
tcgenerate(SparseToDense_000)
-tcgenerate(SplitV_000) # fix luci
+tcgenerate(SplitV_000)
tcgenerate(Sqrt_000)
tcgenerate(Square_000)
tcgenerate(SquaredDifference_000)
@@ -164,22 +172,21 @@ tcgenerate(Sum_001)
tcgenerate(Tanh_000)
tcgenerate(Tile_000)
tcgenerate(Tile_U8_000)
-tcgenerate(TopKV2_000) # fix luci
-tcgenerate(TopKV2_001) # fix luci
-tcgenerate(TransposeConv_000) # fix interpreter
+tcgenerate(TopKV2_000)
+tcgenerate(TopKV2_001)
tcgenerate(Unique_000)
tcgenerate(Unique_001)
tcgenerate(Unique_002)
tcgenerate(Unique_003)
tcgenerate(Unique_U8_000)
tcgenerate(Unique_U8_001)
-tcgenerate(Where_000) # luci NYI
-tcgenerate(Where_001) # luci NYI
-tcgenerate(While_000) # fix luci
+tcgenerate(Where_000)
+tcgenerate(Where_001)
+tcgenerate(While_000)
tcgenerate(While_001)
tcgenerate(While_002)
tcgenerate(While_003)
-tcgenerate(YUV_TO_RGB_000) # fix luci
+tcgenerate(YUV_TO_RGB_000)
tcgenerate(YUV_TO_RGB_U8_000)
tcgenerate(ZerosLike_000)
diff --git a/compiler/common-artifacts/requires.cmake b/compiler/common-artifacts/requires.cmake
index 8c27565cf..d7bed21fe 100644
--- a/compiler/common-artifacts/requires.cmake
+++ b/compiler/common-artifacts/requires.cmake
@@ -1,3 +1,4 @@
+require("arser")
require("circle2circle")
require("circlechef")
require("foder")
diff --git a/compiler/common-artifacts/src/TestDataGenerator.cpp b/compiler/common-artifacts/src/TestDataGenerator.cpp
index 739300d18..7a07dd88e 100644
--- a/compiler/common-artifacts/src/TestDataGenerator.cpp
+++ b/compiler/common-artifacts/src/TestDataGenerator.cpp
@@ -14,6 +14,7 @@
* limitations under the License.
*/
+#include <arser/arser.h>
#include <foder/FileLoader.h>
#include <luci/Importer.h>
#include <luci_interpreter/Interpreter.h>
@@ -62,10 +63,9 @@ template <typename T> void geneate_random_data(std::mt19937 &gen, void *data, ui
}
}
-void fill_random_data(void *data, uint32_t size, loco::DataType dtype)
+void fill_random_data(void *data, uint32_t size, loco::DataType dtype, uint32_t seed)
{
- std::random_device rd; // used to obtain a seed for the random number engine
- std::mt19937 gen(rd()); // standard mersenne_twister_engine seeded with rd()
+ std::mt19937 gen(seed); // standard mersenne_twister_engine seeded with the given seed
switch (dtype)
{
@@ -90,7 +90,25 @@ void fill_random_data(void *data, uint32_t size, loco::DataType dtype)
int entry(int argc, char **argv)
{
- std::string circle_file{argv[1]};
+ arser::Arser arser;
+ arser.add_argument("circle").type(arser::DataType::STR).help("Circle file you want to test");
+ arser.add_argument("--fixed_seed")
+ .required(false)
+ .nargs(0)
+ .help("Put a fixed seed into the random number generator");
+
+ try
+ {
+ arser.parse(argc, argv);
+ }
+ catch (const std::runtime_error &err)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
+ return 255;
+ }
+
+ std::string circle_file = arser.get<std::string>("circle");
size_t last_dot_index = circle_file.find_last_of(".");
std::string prefix = circle_file.substr(0, last_dot_index);
@@ -136,6 +154,7 @@ int entry(int argc, char **argv)
std::unique_ptr<H5::Group> output_value_group =
std::make_unique<H5::Group>(output_file.createGroup("value"));
+ std::random_device rd; // used to obtain a seed for the random number engine
uint32_t input_index = 0;
for (uint32_t g = 0; g < circle_model->subgraphs()->size(); g++)
{
@@ -174,7 +193,10 @@ int entry(int argc, char **argv)
std::vector<int8_t> data(byte_size);
// generate random data
- fill_random_data(data.data(), data_size, input_node->dtype());
+ if (arser["--fixed_seed"])
+ fill_random_data(data.data(), data_size, input_node->dtype(), 0);
+ else
+ fill_random_data(data.data(), data_size, input_node->dtype(), rd());
dataset->write(data.data(), dtype);
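
The `--fixed_seed` path makes the generated test data reproducible across runs; the same idea in a short Python sketch (illustrative, not the tool's code):

```python
import random

def fill_random_data(size, seed=None):
    # With a fixed seed the sequence is identical on every run;
    # with seed=None, Python seeds from OS entropy, analogous to
    # std::random_device in the C++ tool.
    rng = random.Random(seed)
    return [rng.randint(0, 255) for _ in range(size)]

assert fill_random_data(4, seed=0) == fill_random_data(4, seed=0)
```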
diff --git a/compiler/hermes/src/hermes.test.cpp b/compiler/hermes/src/hermes.test.cpp
index 2cbc0939d..ea7ef65d8 100644
--- a/compiler/hermes/src/hermes.test.cpp
+++ b/compiler/hermes/src/hermes.test.cpp
@@ -18,7 +18,28 @@
#include <gtest/gtest.h>
-TEST(HermesTest, simple_usecase)
+namespace
{
- // TO BE FILLED
+
+class Logger final : public hermes::Source
+{
+public:
+ Logger(hermes::Context *ctx);
+ ~Logger();
+};
+
+Logger::Logger(hermes::Context *ctx) { activate(ctx->sources(), ctx->bus()); }
+Logger::~Logger() { deactivate(); }
+
+} // namespace
+
+TEST(HermesTest, logger_constructor_NEG)
+{
+ hermes::Context context;
+ // we expect a segmentation fault from nullptr->sources()
+ ASSERT_DEATH(Logger logger(&context), "");
+
+ SUCCEED();
}
+
+// TODO add HermesTest simple_usecase
diff --git a/compiler/locomotiv/src/Node/BiasEncode.test.cpp b/compiler/locomotiv/src/Node/BiasEncode.test.cpp
index cdb255ccb..4680f5c5a 100644
--- a/compiler/locomotiv/src/Node/BiasEncode.test.cpp
+++ b/compiler/locomotiv/src/Node/BiasEncode.test.cpp
@@ -90,6 +90,16 @@ template <typename T> void test()
}
} // namespace
-TEST(NodeExecution_BiasEncode, s32) { test<int32_t>(); }
+TEST(NodeExecution_BiasEncode, s32)
+{
+ test<int32_t>();
+
+ SUCCEED();
+}
-TEST(NodeExecution_BiasEncode, f32) { test<float>(); }
+TEST(NodeExecution_BiasEncode, f32)
+{
+ test<float>();
+
+ SUCCEED();
+}
diff --git a/compiler/locomotiv/src/Node/MatMul.test.cpp b/compiler/locomotiv/src/Node/MatMul.test.cpp
index f1f3a52d3..7d942e1d0 100644
--- a/compiler/locomotiv/src/Node/MatMul.test.cpp
+++ b/compiler/locomotiv/src/Node/MatMul.test.cpp
@@ -142,6 +142,8 @@ TEST(NodeExecution_MatMul, f32_2x3_3x3)
};
run_test<float>(lhs, rhs, out, Shape{2, 3}, Shape{3, 3}, Shape{2, 3}, loco::DataType::FLOAT32);
+
+ SUCCEED();
}
/* from the code below:
@@ -183,6 +185,8 @@ TEST(NodeExecution_MatMul, s32_4x2_2x6)
};
run_test<int32_t>(lhs, rhs, out, Shape{4, 2}, Shape{2, 6}, Shape{4, 6}, loco::DataType::S32);
+
+ SUCCEED();
}
// clang-format on
diff --git a/compiler/locop/src/FormattedGraph.test.cpp b/compiler/locop/src/FormattedGraph.test.cpp
index c9808d3a2..aff9ebe5f 100644
--- a/compiler/locop/src/FormattedGraph.test.cpp
+++ b/compiler/locop/src/FormattedGraph.test.cpp
@@ -28,6 +28,8 @@ TEST(LinearV1FormatterTest, simple)
// TODO Validate the output (when the implementation becomes stable)
std::cout << locop::fmt<locop::LinearV1>(g) << std::endl;
+
+ SUCCEED();
}
TEST(LinearV1FormatterTest, user_defined_node_summary_builder)
diff --git a/compiler/locop/src/FormattedTensorShape.test.cpp b/compiler/locop/src/FormattedTensorShape.test.cpp
index 0f0017ab4..fc85df3a6 100644
--- a/compiler/locop/src/FormattedTensorShape.test.cpp
+++ b/compiler/locop/src/FormattedTensorShape.test.cpp
@@ -30,4 +30,6 @@ TEST(FormattedTensorShapeTest, BracketFormat)
tensor_shape->dim(0) = 4;
std::cout << fmt<TensorShapeFormat::Bracket>(tensor_shape.get()) << std::endl;
+
+ SUCCEED();
}
diff --git a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
index 998789882..4ac3d8660 100644
--- a/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
+++ b/compiler/luci-interpreter/include/luci_interpreter/core/Tensor.h
@@ -79,12 +79,11 @@ private:
//
// Note that due to historical and performance reasons, per-tensor quantization uses unsigned
// integer types, while per-channel uses signed types assuming 'zero_point' == 0.
-//
-// TODO Add 'quantized_dimension' field for per-channel case when IR provides it.
struct AffineQuantization
{
std::vector<float> scale;
std::vector<int32_t> zero_point;
+ int32_t quantized_dimension;
};
class Tensor
@@ -108,6 +107,12 @@ public:
return _quantization.zero_point[0];
}
+ const std::vector<float> &scales() const { return _quantization.scale; }
+
+ const std::vector<int32_t> &zero_points() const { return _quantization.zero_point; }
+
+ int32_t quantized_dimension() const { return _quantization.quantized_dimension; }
+
template <typename T> const T *data() const { return reinterpret_cast<const T *>(_data.get()); }
template <typename T> T *data() { return reinterpret_cast<T *>(_data.get()); }
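
For context, `quantized_dimension` names the axis along which the per-channel `scale`/`zero_point` values apply; a numpy sketch of per-channel dequantization under that convention (an illustration, not the interpreter's implementation):

```python
import numpy as np

def dequantize_per_channel(q, scales, zero_points, quantized_dimension):
    # Reshape scale/zero_point so they broadcast along the quantized axis.
    shape = [1] * q.ndim
    shape[quantized_dimension] = -1
    scale = np.asarray(scales, dtype=np.float32).reshape(shape)
    zp = np.asarray(zero_points, dtype=np.int32).reshape(shape)
    return (q.astype(np.int32) - zp) * scale

# 2 channels along axis 0, per-channel signed quantization, zero_point == 0
q = np.array([[10, 20], [30, 40]], dtype=np.int8)
print(dequantize_per_channel(q, [0.1, 0.2], [0, 0], 0))
```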
diff --git a/compiler/luci-interpreter/src/core/KernelParams.h b/compiler/luci-interpreter/src/core/KernelParams.h
index a32e0d4a5..65d119761 100644
--- a/compiler/luci-interpreter/src/core/KernelParams.h
+++ b/compiler/luci-interpreter/src/core/KernelParams.h
@@ -56,6 +56,11 @@ struct Conv2DParams
Activation activation;
};
+struct DepthToSpaceParams
+{
+ int block_size;
+};
+
struct DepthwiseConv2DParams
{
Padding padding;
diff --git a/compiler/luci-interpreter/src/kernels/Add.cpp b/compiler/luci-interpreter/src/kernels/Add.cpp
index 9b9334792..9ed155e94 100644
--- a/compiler/luci-interpreter/src/kernels/Add.cpp
+++ b/compiler/luci-interpreter/src/kernels/Add.cpp
@@ -36,7 +36,10 @@ Add::Add(const Tensor *input1, const Tensor *input2, Tensor *output, const AddPa
void Add::configure()
{
- assert(input1()->element_type() == input2()->element_type());
+ if (input1()->element_type() != input2()->element_type())
+ {
+ throw std::runtime_error("Input Tensor Data Type Mismatch.");
+ }
output()->resize(calculateShapeForBroadcast(input1()->shape(), input2()->shape()));
}
diff --git a/compiler/luci-interpreter/src/kernels/Add.test.cpp b/compiler/luci-interpreter/src/kernels/Add.test.cpp
index 54e1cc672..705b648c8 100644
--- a/compiler/luci-interpreter/src/kernels/Add.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Add.test.cpp
@@ -169,6 +169,33 @@ TEST(AddTest, Float)
}
}
+TEST(AddTest, Input_Output_Type_NEG)
+{
+ Tensor input1_tensor = makeInputTensor<DataType::FLOAT32>({1}, {1.f});
+ Tensor input2_tensor = makeInputTensor<DataType::S32>({1}, {2});
+ Tensor output_tensor = makeOutputTensor(DataType::FLOAT32);
+
+ AddParams params{};
+ params.activation = Activation::RELU;
+
+ Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+ EXPECT_ANY_THROW(kernel.configure());
+}
+
+TEST(AddTest, Invalid_Input_Type_NEG)
+{
+ Tensor input1_tensor = makeInputTensor<DataType::S64>({1}, {1});
+ Tensor input2_tensor = makeInputTensor<DataType::S64>({1}, {2});
+ Tensor output_tensor = makeOutputTensor(DataType::S64);
+
+ AddParams params{};
+ params.activation = Activation::RELU;
+
+ Add kernel(&input1_tensor, &input2_tensor, &output_tensor, params);
+ kernel.configure();
+ EXPECT_ANY_THROW(kernel.execute());
+}
+
} // namespace
} // namespace kernels
} // namespace luci_interpreter
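
The two negative tests split along the kernel life cycle: a dtype mismatch between inputs is rejected in configure(), while a type with no dispatch case (S64 here) only fails in execute(). Assuming Add::execute() follows the same switch/default shape as the DepthToSpace and Reverse kernels added below (its body is not shown in this patch), the second test exercises a branch like:

  switch (input1()->element_type())
  {
    case DataType::FLOAT32:
      // evalFloat();
      break;
    case DataType::U8:
      // evalQuantized();
      break;
    default:
      // The branch Invalid_Input_Type_NEG reaches with S64 inputs.
      throw std::runtime_error("Unsupported type.");
  }
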
diff --git a/compiler/luci-interpreter/src/kernels/CMakeLists.txt b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
index fe3623135..a1fd1deaf 100644
--- a/compiler/luci-interpreter/src/kernels/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/kernels/CMakeLists.txt
@@ -12,6 +12,8 @@ set(SOURCES
Concatenation.cpp
Conv2D.h
Conv2D.cpp
+ DepthToSpace.h
+ DepthToSpace.cpp
DepthwiseConv2D.h
DepthwiseConv2D.cpp
Elu.h
@@ -40,6 +42,10 @@ set(SOURCES
Pad.cpp
Reshape.h
Reshape.cpp
+ Reverse.h
+ Reverse.cpp
+ Slice.h
+ Slice.cpp
Softmax.h
Softmax.cpp
SpaceToDepth.h
@@ -77,6 +83,7 @@ set(TEST_SOURCES
AveragePool2D.test.cpp
Concatenation.test.cpp
Conv2D.test.cpp
+ DepthToSpace.test.cpp
DepthwiseConv2D.test.cpp
Elu.test.cpp
FullyConnected.test.cpp
@@ -91,6 +98,8 @@ set(TEST_SOURCES
Mul.test.cpp
Pad.test.cpp
Reshape.test.cpp
+ Reverse.test.cpp
+ Slice.test.cpp
Softmax.test.cpp
SpaceToDepth.test.cpp
Split.test.cpp
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
new file mode 100644
index 000000000..cab63e26d
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "DepthToSpace.h"
+#include "Utils.h"
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+DepthToSpace::DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params)
+ : KernelWithParams<DepthToSpaceParams>({input}, {output}, params)
+{
+}
+
+void DepthToSpace::configure()
+{
+ if (input()->shape().num_dims() != 4)
+ {
+ throw std::runtime_error("Invalid input num_dims.");
+ }
+ if (output()->element_type() != DataType::FLOAT32 && output()->element_type() != DataType::U8 &&
+ output()->element_type() != DataType::S8 && output()->element_type() != DataType::S32 &&
+ output()->element_type() != DataType::S64)
+ {
+ throw std::runtime_error("Invalid output type");
+ }
+ if (input()->element_type() != output()->element_type())
+ {
+ throw std::runtime_error("Type mismatch on input and output.");
+ }
+ const int block_size = params().block_size;
+ const int32_t input_height = input()->shape().dim(1);
+ const int32_t input_width = input()->shape().dim(2);
+ const int32_t input_channels = input()->shape().dim(3);
+ int32_t output_height = input_height * block_size;
+ int32_t output_width = input_width * block_size;
+ int32_t output_channels = input_channels / block_size / block_size;
+
+ assert(input_height == output_height / block_size);
+ assert(input_width == output_width / block_size);
+ assert(input_channels == output_channels * block_size * block_size);
+
+ Shape output_shape(4);
+ output_shape.dim(0) = input()->shape().dim(0);
+ output_shape.dim(1) = output_height;
+ output_shape.dim(2) = output_width;
+ output_shape.dim(3) = output_channels;
+
+ output()->resize(output_shape);
+}
+
+void DepthToSpace::execute() const
+{
+ tflite::DepthToSpaceParams op_params;
+ op_params.block_size = params().block_size;
+ switch (input()->element_type())
+ {
+ case DataType::FLOAT32:
+ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()),
+ getTensorData<float>(input()), getTensorShape(output()),
+ getTensorData<float>(output()));
+ break;
+ case DataType::U8:
+ tflite::optimized_ops::DepthToSpace(op_params, getTensorShape(input()),
+ getTensorData<uint8_t>(input()), getTensorShape(output()),
+ getTensorData<uint8_t>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported Type.");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
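
For reference, DepthToSpace with block size B maps shape (N, H, W, C) to (N, H*B, W*B, C/B/B), reading output element (n, y, x, c) from input (n, y/B, x/B, ((y%B)*B + x%B)*C_out + c). A naive sketch of that index math, equivalent in effect to the optimized TFLite call above:

  // Naive NHWC depth-to-space; out must hold n * (h * B) * (w * B) * (c / B / B)
  // elements. Written for clarity, not speed.
  void depthToSpaceRef(const float *in, float *out, int n, int h, int w, int c, int B)
  {
    const int oc = c / (B * B); // output channels
    for (int b = 0; b < n; ++b)
      for (int oy = 0; oy < h * B; ++oy)
        for (int ox = 0; ox < w * B; ++ox)
          for (int ch = 0; ch < oc; ++ch)
          {
            const int iy = oy / B, ix = ox / B;
            const int ic = ((oy % B) * B + ox % B) * oc + ch;
            out[((b * h * B + oy) * (w * B) + ox) * oc + ch] =
                in[((b * h + iy) * w + ix) * c + ic];
          }
  }
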
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.h b/compiler/luci-interpreter/src/kernels/DepthToSpace.h
new file mode 100644
index 000000000..63ce37610
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
+#define LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
+
+#include "core/Kernel.h"
+#include "core/KernelParams.h"
+
+#include <vector>
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class DepthToSpace : public KernelWithParams<DepthToSpaceParams>
+{
+public:
+ DepthToSpace(const Tensor *input, Tensor *output, const DepthToSpaceParams &params);
+
+ const Tensor *input() const { return _inputs[0]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_DEPTHTOSPACE_H
diff --git a/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
new file mode 100644
index 000000000..1b805702d
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/DepthToSpace.test.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/DepthToSpace.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class DepthToSpaceTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(DepthToSpaceTest, DataTypes);
+
+TYPED_TEST(DepthToSpaceTest, SimpleCase)
+{
+ std::vector<TypeParam> input_data{1, 2, 3, 4, 5, 6, 7, 8};
+ Shape input_shape{1, 1, 2, 4};
+ std::vector<TypeParam> output_data{1, 2, 5, 6, 3, 4, 7, 8};
+ std::vector<int32_t> output_shape{1, 2, 4, 1};
+
+ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
+ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+
+ DepthToSpaceParams params{};
+ params.block_size = 2;
+
+ DepthToSpace kernel = DepthToSpace(&input_tensor, &output_tensor, params);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
+ ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
index fad450d66..f53eaca94 100644
--- a/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/L2Normalize.test.cpp
@@ -45,12 +45,9 @@ TEST(L2NormalizeTest, Float)
ElementsAreArray(ArrayFloatNear(ref_output_data)));
}
-TEST(L2NormalizeTest, Uint8Quantized)
-{
- // TODO
- // Implement GetDequantizedOutput Function.
- // Create Test for Uint8 Case
-}
+// TODO Uint8Quantized
+// Implement GetDequantizedOutput Function.
+// Create Test for Uint8 Case
} // namespace
} // namespace kernels
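
Since an empty TEST body still reports as a pass, the unimplemented Uint8 cases here and in the LeakyRelu, Logistic, and TransposeConv tests below are demoted to TODO comments instead of being left as green but meaningless tests.
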
diff --git a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
index b0c06e7a3..c79d3d6bc 100644
--- a/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/LeakyRelu.test.cpp
@@ -61,15 +61,14 @@ TEST(LeakReluTest, FloatSimple)
1.0f, -0.5f, -1.0f, // Row 2
},
/*alpha=*/0.5f, getElementType<float>());
-}
-TEST(LeakReluTest, Uint8Simple)
-{
- // TODO
- // Implement GetDequantizedOutput Function.
- // Create Test for Uint8 Case
+ SUCCEED();
}
+// TODO Uint8Simple
+// Implement GetDequantizedOutput Function.
+// Create Test for Uint8 Case
+
} // namespace
} // namespace kernels
} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
index 17456a4a8..00feddf3d 100644
--- a/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/Logistic.test.cpp
@@ -49,10 +49,8 @@ TEST(LogisticTest, Float)
// TODO make a Shape checking of output_tensor.
}
-TEST(LogisticTest, Uint8)
-{
- // Need to Implement GetDequantizedOutput Function.
-}
+// TODO Uint8
+// Need to Implement GetDequantizedOutput Function.
} // namespace
} // namespace kernels
diff --git a/compiler/luci-interpreter/src/kernels/Reverse.cpp b/compiler/luci-interpreter/src/kernels/Reverse.cpp
new file mode 100644
index 000000000..a46308412
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Reverse.cpp
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Reverse.h"
+#include "kernels/Utils.h"
+#include <tensorflow/lite/kernels/internal/reference/reference_ops.h>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+
+Reverse::Reverse(const Tensor *input, const Tensor *axes, Tensor *output)
+ : Kernel({input, axes}, {output})
+{
+}
+
+void Reverse::configure()
+{
+ assert(axes()->shape().num_dims() == 1);
+ assert(input()->shape().num_dims() >= axes()->shape().num_elements());
+ if (input()->element_type() != DataType::S32 && input()->element_type() != DataType::FLOAT32 &&
+ input()->element_type() != DataType::U8 && input()->element_type() != DataType::S16 &&
+ input()->element_type() != DataType::S64)
+ {
+ throw std::runtime_error("Unsupported input type.");
+ }
+ if (axes()->element_type() != DataType::S32)
+ {
+ throw std::runtime_error("Unsupported axes type.");
+ }
+ if (axes()->shape().num_elements() > 1)
+ {
+ throw std::runtime_error("Current implementation does not support more than 1 axis.");
+ }
+ int axis_value = getTensorData<int32_t>(axes())[0];
+ if (axis_value < 0 || axis_value >= input()->shape().num_dims())
+ {
+ throw std::runtime_error("Invalid axes value");
+ }
+ assert(input()->element_type() == output()->element_type());
+
+ output()->resize(input()->shape());
+}
+
+void Reverse::execute() const
+{
+ int axis_value = getTensorData<int32_t>(axes())[0];
+ switch (output()->element_type())
+ {
+ case DataType::FLOAT32:
+ tflite::reference_ops::Reverse<float>(axis_value, getTensorShape(input()),
+ getTensorData<float>(input()), getTensorShape(output()),
+ getTensorData<float>(output()));
+ break;
+ case DataType::U8:
+ tflite::reference_ops::Reverse<uint8_t>(
+ axis_value, getTensorShape(input()), getTensorData<uint8_t>(input()),
+ getTensorShape(output()), getTensorData<uint8_t>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported output type");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
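
Reversing along one axis leaves every other index unchanged: slice i along the axis moves to slice dim-1-i. A naive flat-index sketch of what the reference call above computes for a single axis:

  #include <algorithm>
  #include <vector>

  // Reverse a row-major tensor along `axis`; dims holds the full shape.
  void reverseRef(const float *in, float *out, const std::vector<int> &dims, int axis)
  {
    int outer = 1, inner = 1;
    for (int i = 0; i < axis; ++i)
      outer *= dims[i];
    for (size_t i = axis + 1; i < dims.size(); ++i)
      inner *= dims[i];
    const int d = dims[axis];
    for (int o = 0; o < outer; ++o)
      for (int i = 0; i < d; ++i)
        // Slice i along the axis lands at position d - 1 - i in the output.
        std::copy(in + (o * d + i) * inner, in + (o * d + i + 1) * inner,
                  out + (o * d + (d - 1 - i)) * inner);
  }
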
diff --git a/compiler/luci-interpreter/src/kernels/Reverse.h b/compiler/luci-interpreter/src/kernels/Reverse.h
new file mode 100644
index 000000000..3489dae28
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Reverse.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_REVERSE_H
+#define LUCI_INTERPRETER_KERNELS_REVERSE_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Reverse : public Kernel
+{
+public:
+ Reverse(const Tensor *input, const Tensor *axes, Tensor *output);
+
+ const Tensor *input() const { return _inputs[0]; }
+ const Tensor *axes() const { return _inputs[1]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_REVERSE_H
diff --git a/compiler/luci-interpreter/src/kernels/Reverse.test.cpp b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp
new file mode 100644
index 000000000..5475a8bd3
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Reverse.test.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Reverse.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class ReverseTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(ReverseTest, DataTypes);
+
+TYPED_TEST(ReverseTest, MultiDimensions)
+{
+  // Input/output data use the TypeParam element type.
+ std::vector<TypeParam> input_data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24};
+ Shape input_shape{4, 3, 2};
+ std::vector<int32_t> axis_data{1};
+ Shape axis_shape{1};
+
+ std::vector<TypeParam> output_data{5, 6, 3, 4, 1, 2, 11, 12, 9, 10, 7, 8,
+ 17, 18, 15, 16, 13, 14, 23, 24, 21, 22, 19, 20};
+ std::vector<int32_t> output_shape{4, 3, 2};
+
+ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
+ Tensor axis_tensor = makeInputTensor<DataType::S32>(axis_shape, axis_data);
+
+ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+
+ Reverse kernel = Reverse(&input_tensor, &axis_tensor, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
+ ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/Slice.cpp b/compiler/luci-interpreter/src/kernels/Slice.cpp
new file mode 100644
index 000000000..c4bc3c57c
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Slice.cpp
@@ -0,0 +1,149 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Slice.h"
+#include "Utils.h"
+#include <tensorflow/lite/kernels/internal/optimized/optimized_ops.h>
+
+#include <cassert>
+#include <cstring>
+
+namespace luci_interpreter
+{
+
+namespace kernels
+{
+const int max_dim = 4;
+
+Slice::Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output)
+ : Kernel({input, begin, size}, {output})
+{
+}
+
+template <typename T>
+Shape calculateOutputShape(const Tensor *input, const Tensor *begin, const Tensor *size)
+{
+ Shape output_shape = Shape(input->shape().num_dims());
+ for (int idx = 0; idx < input->shape().num_dims(); idx++)
+ {
+ T size_value = getTensorData<T>(size)[idx];
+ if (size_value < 0)
+ {
+ if (size_value != -1)
+ {
+ throw std::runtime_error("Invalid size.");
+ }
+ size_value = input->shape().dim(idx) - getTensorData<T>(begin)[idx];
+ }
+ else
+ {
+ if (input->shape().dim(idx) < getTensorData<T>(begin)[idx] + size_value)
+ {
+ throw std::runtime_error("Invalid begin and size.");
+ }
+ }
+ output_shape.dim(idx) = static_cast<int>(size_value);
+ }
+ return output_shape;
+}
+
+template <typename T>
+void getBeginAndSizeVectors(int dimensions, const Tensor *begin, const Tensor *size,
+ std::vector<int> *begins, std::vector<int> *sizes)
+{
+ for (int idx = dimensions - 1; idx >= 0; --idx)
+ {
+ begins->push_back(getTensorData<T>(begin)[idx]);
+ sizes->push_back(getTensorData<T>(size)[idx]);
+ }
+}
+
+void Slice::configure()
+{
+ assert(input()->element_type() == output()->element_type());
+ assert(begin()->element_type() == DataType::S32 || begin()->element_type() == DataType::S64);
+ assert(size()->element_type() == DataType::S32 || size()->element_type() == DataType::S64);
+ assert(begin()->shape().num_dims() == 1);
+ assert(size()->shape().num_dims() == 1);
+ assert(input()->shape().num_dims() <= max_dim);
+
+ if (begin()->element_type() == DataType::S32)
+ {
+ output()->resize(calculateOutputShape<int32_t>(input(), begin(), size()));
+ }
+ else if (begin()->element_type() == DataType::S64)
+ {
+ output()->resize(calculateOutputShape<int64_t>(input(), begin(), size()));
+ }
+ else
+ {
+ throw std::runtime_error("Unsupported type.");
+ }
+}
+
+void Slice::execute() const
+{
+ std::vector<int> begins;
+ begins.reserve(max_dim);
+ std::vector<int> sizes;
+ sizes.reserve(max_dim);
+ if (begin()->element_type() == DataType::S32)
+ {
+ getBeginAndSizeVectors<int32_t>(input()->shape().num_dims(), begin(), size(), &begins, &sizes);
+ }
+ else if (begin()->element_type() == DataType::S64)
+ {
+ getBeginAndSizeVectors<int64_t>(input()->shape().num_dims(), begin(), size(), &begins, &sizes);
+ }
+ else
+ {
+ throw std::runtime_error("Unsupported begin type.");
+ }
+ for (int i = input()->shape().num_dims(); i < max_dim; ++i)
+ {
+ begins.push_back(0);
+ sizes.push_back(1);
+ }
+
+ assert(begins.size() == 4);
+ assert(sizes.size() == 4);
+ tflite::SliceParams op_params{};
+ op_params.begin_count = 4;
+ op_params.size_count = 4;
+ for (int i = 0; i < 4; i++)
+ {
+ op_params.begin[i] = begins[3 - i];
+ op_params.size[i] = sizes[3 - i];
+ }
+ switch (input()->element_type())
+ {
+ case DataType::FLOAT32:
+ tflite::optimized_ops::Slice(op_params, getTensorShape(input()),
+ getTensorData<float>(input()), getTensorShape(output()),
+ getTensorData<float>(output()));
+ break;
+ case DataType::U8:
+ tflite::optimized_ops::Slice(op_params, getTensorShape(input()),
+ getTensorData<uint8_t>(input()), getTensorShape(output()),
+ getTensorData<uint8_t>(output()));
+ break;
+ default:
+ throw std::runtime_error("Unsupported input type.");
+ }
+}
+
+} // namespace kernels
+} // namespace luci_interpreter
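
Two conventions here deserve a note: a size entry of -1 means "from begin to the end of that dimension", and because the TFLite kernel works on exactly four dimensions, shorter inputs are padded with begin 0 / size 1 — getBeginAndSizeVectors copies in reverse so the padding lands on the leading axes, and begins[3 - i] undoes the reversal. A small sketch of the effective-extent rule (the helper name is illustrative):

  #include <stdexcept>

  // Number of elements taken from a dimension of extent `extent`,
  // in TFLite Slice semantics.
  int effectiveSize(int extent, int begin, int size)
  {
    if (size == -1)
      return extent - begin; // -1: run to the end of the dimension
    if (begin + size > extent)
      throw std::runtime_error("Invalid begin and size.");
    return size;
  }
  // In the test below: dimension 2 has extent 3, begin 0, size -1 -> 3 elements,
  // giving the output shape {2, 1, 3, 1}.
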
diff --git a/compiler/luci-interpreter/src/kernels/Slice.h b/compiler/luci-interpreter/src/kernels/Slice.h
new file mode 100644
index 000000000..23c359608
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Slice.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef LUCI_INTERPRETER_KERNELS_SLICE_H
+#define LUCI_INTERPRETER_KERNELS_SLICE_H
+
+#include "core/Kernel.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+
+class Slice : public Kernel
+{
+public:
+ Slice(const Tensor *input, const Tensor *begin, const Tensor *size, Tensor *output);
+
+ const Tensor *input() const { return _inputs[0]; }
+ const Tensor *begin() const { return _inputs[1]; }
+ const Tensor *size() const { return _inputs[2]; }
+ Tensor *output() const { return _outputs[0]; }
+
+ void configure() override;
+ void execute() const override;
+};
+
+} // namespace kernels
+} // namespace luci_interpreter
+
+#endif // LUCI_INTERPRETER_KERNELS_SLICE_H
diff --git a/compiler/luci-interpreter/src/kernels/Slice.test.cpp b/compiler/luci-interpreter/src/kernels/Slice.test.cpp
new file mode 100644
index 000000000..a360a29cc
--- /dev/null
+++ b/compiler/luci-interpreter/src/kernels/Slice.test.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "kernels/Slice.h"
+#include "kernels/TestUtils.h"
+
+namespace luci_interpreter
+{
+namespace kernels
+{
+namespace
+{
+
+using namespace testing;
+
+template <typename T> class SliceTest : public ::testing::Test
+{
+};
+
+using DataTypes = ::testing::Types<float, uint8_t>;
+TYPED_TEST_CASE(SliceTest, DataTypes);
+
+TYPED_TEST(SliceTest, SimpleTest)
+{
+ std::vector<TypeParam> input_data{1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5, 6, 6, 6};
+ Shape input_shape{3, 2, 3, 1};
+ std::vector<int32_t> begin_data{1, 0, 0, 0};
+ Shape begin_shape{4};
+ std::vector<int32_t> size_data{2, 1, -1, 1};
+ Shape size_shape{4};
+ std::vector<TypeParam> output_data{3, 3, 3, 5, 5, 5};
+ std::vector<int32_t> output_shape{2, 1, 3, 1};
+
+ Tensor input_tensor = makeInputTensor<getElementType<TypeParam>()>(input_shape, input_data);
+ Tensor begin_tensor = makeInputTensor<DataType::S32>(begin_shape, begin_data);
+ Tensor size_tensor = makeInputTensor<DataType::S32>(size_shape, size_data);
+
+ Tensor output_tensor = makeOutputTensor(getElementType<TypeParam>());
+
+ Slice kernel(&input_tensor, &begin_tensor, &size_tensor, &output_tensor);
+ kernel.configure();
+ kernel.execute();
+
+ EXPECT_THAT(extractTensorData<TypeParam>(output_tensor),
+ ::testing::ElementsAreArray(output_data));
+ EXPECT_THAT(extractTensorShape(output_tensor), ::testing::ElementsAreArray(output_shape));
+}
+
+} // namespace
+} // namespace kernels
+} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
index 3386d3683..b8c0ac497 100644
--- a/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
+++ b/compiler/luci-interpreter/src/kernels/TransposeConv.test.cpp
@@ -68,6 +68,8 @@ TEST(TransposeConvTest, FloatSimple)
/*output_data=*/{29, 62, 83, 75, 99, 192, 237, 198, 207, 372, 417, 330, 263, 446, 485, 365},
/*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1,
getElementType<float>());
+
+ SUCCEED();
}
TEST(TransposeConvTest, FloatTwoFiltersTest)
@@ -82,21 +84,18 @@ TEST(TransposeConvTest, FloatTwoFiltersTest)
3352, 3652, 2760},
/*params.padding=*/luci::Padding::SAME, /*stride_height=*/1, /*stride_width=*/1,
getElementType<float>());
-}
-TEST(TransposeConvTest, Uint8Simple)
-{
- // TODO
- // Implement GetDequantizedOutput Function.
- // Create Test for Uint8 Case
-}
-TEST(TransposeConvTest, Uint8FiltersTest)
-{
- // TODO
- // Implement GetDequantizedOutput Function.
- // Create Test for Uint8 Case
+ SUCCEED();
}
+// TODO Uint8Simple
+// Implement GetDequantizedOutput Function.
+// Create Test for Uint8 Case
+
+// TODO Uint8FiltersTest
+// Implement GetDequantizedOutput Function.
+// Create Test for Uint8 Case
+
} // namespace
} // namespace kernels
} // namespace luci_interpreter
diff --git a/compiler/luci-interpreter/src/loader/CMakeLists.txt b/compiler/luci-interpreter/src/loader/CMakeLists.txt
index fb36c4ab0..d99485d06 100644
--- a/compiler/luci-interpreter/src/loader/CMakeLists.txt
+++ b/compiler/luci-interpreter/src/loader/CMakeLists.txt
@@ -1,3 +1,5 @@
+nnas_find_package(GTest REQUIRED)
+
set(SOURCES
GraphLoader.h
GraphLoader.cpp
@@ -13,3 +15,8 @@ target_include_directories(luci_interpreter_loader PUBLIC "${LUCI_INTERPRETER_SO
target_link_libraries(luci_interpreter_loader
PUBLIC luci_lang luci_interpreter_core
PRIVATE luci_interpreter_kernels nncc_common)
+
+set(TEST_SOURCES KernelBuilder.test.cpp)
+
+GTest_AddTest(luci_interpreter_loader_test ${TEST_SOURCES})
+target_link_libraries(luci_interpreter_loader_test luci_interpreter_loader)
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.cpp b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
index 779fa0647..95c654769 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.cpp
@@ -16,7 +16,6 @@
#include "loader/GraphLoader.h"
-#include "loader/ModuleLoader.h"
#include "loader/KernelBuilder.h"
#include <loco/IR/Algorithm.h>
@@ -70,9 +69,10 @@ bool isExecutableNode(const luci::CircleNode *node)
switch (node->opcode())
{
// These nodes denote inputs / outputs of a graph.
- case luci::CircleOpcode::CONST:
+ case luci::CircleOpcode::CIRCLECONST:
case luci::CircleOpcode::CIRCLEINPUT:
case luci::CircleOpcode::CIRCLEOUTPUT:
+ case luci::CircleOpcode::CIRCLEOUTPUTEXCLUDE:
// The following nodes denote outputs of multiple-output nodes.
case luci::CircleOpcode::CIRCLEIFOUT:
case luci::CircleOpcode::CIRCLESPLITOUT:
@@ -102,11 +102,12 @@ bool isTensorProducingNode(const luci::CircleNode *node)
} // namespace
-GraphLoader::GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph,
- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
- std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
- : _module_loader(module_loader), _graph(graph), _runtime_graph(runtime_graph),
- _runtime_to_ir(runtime_to_ir), _node_to_tensor(node_to_tensor)
+GraphLoader::GraphLoader(
+ const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+ std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+ : _graph(graph), _runtime_graph(runtime_graph), _runtime_to_ir(runtime_to_ir),
+ _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
{
}
@@ -136,6 +137,7 @@ void GraphLoader::loadTensors()
const luci::CircleQuantParam *params = node->quantparam();
quantization.scale.assign(params->scale.cbegin(), params->scale.cend());
quantization.zero_point.assign(params->zerop.cbegin(), params->zerop.cend());
+ quantization.quantized_dimension = params->quantized_dimension;
}
auto tensor = std::make_unique<Tensor>(node->dtype(), std::move(shape), std::move(quantization),
@@ -178,7 +180,7 @@ void GraphLoader::initInputOutputTensors() const
void GraphLoader::loadOperators()
{
- KernelBuilder kernel_builder(_module_loader, *this);
+ KernelBuilder kernel_builder(_graph_to_runtime_graph, _node_to_tensor);
// Create kernels for executable nodes. This has to be done in execution order.
for (const loco::Node *loco_node :
@@ -195,11 +197,4 @@ void GraphLoader::loadOperators()
}
}
-void GraphLoader::load()
-{
- loadTensors();
- initInputOutputTensors();
- loadOperators();
-}
-
} // namespace luci_interpreter
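
With load() gone, the three loading phases are public and, judging by the removed body, are meant to be run by the owner in the same fixed order, since operators reference tensors created in the first phase. A usage sketch under that assumption:

  GraphLoader loader(graph, runtime_graph, runtime_to_ir, graph_to_runtime_graph,
                     node_to_tensor);
  loader.loadTensors();            // 1. create a Tensor for each tensor-producing node
  loader.initInputOutputTensors(); // 2. wire graph inputs/outputs to those tensors
  loader.loadOperators();          // 3. build kernels, in execution order
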
diff --git a/compiler/luci-interpreter/src/loader/GraphLoader.h b/compiler/luci-interpreter/src/loader/GraphLoader.h
index e0adc0f6c..89c5bcad7 100644
--- a/compiler/luci-interpreter/src/loader/GraphLoader.h
+++ b/compiler/luci-interpreter/src/loader/GraphLoader.h
@@ -27,29 +27,23 @@
namespace luci_interpreter
{
-class ModuleLoader;
-
class GraphLoader
{
public:
- GraphLoader(const ModuleLoader &module_loader, const loco::Graph *graph,
- RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+ GraphLoader(const loco::Graph *graph, RuntimeGraph *runtime_graph, RuntimeToIR &runtime_to_ir,
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor);
- void load();
-
- Tensor *getTensorForNode(const loco::Node *node) const { return _node_to_tensor.at(node); }
-
-private:
- void loadOperators();
- void initInputOutputTensors() const;
void loadTensors();
+ void initInputOutputTensors() const;
+ void loadOperators();
- const ModuleLoader &_module_loader;
+private:
const loco::Graph *_graph;
RuntimeGraph *_runtime_graph;
RuntimeToIR &_runtime_to_ir;
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph;
std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
};
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
index 56da961dd..12c7f4526 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.cpp
@@ -21,6 +21,7 @@
#include "kernels/AveragePool2D.h"
#include "kernels/Concatenation.h"
#include "kernels/Conv2D.h"
+#include "kernels/DepthToSpace.h"
#include "kernels/DepthwiseConv2D.h"
#include "kernels/Elu.h"
#include "kernels/FullyConnected.h"
@@ -35,6 +36,8 @@
#include "kernels/Mul.h"
#include "kernels/Pad.h"
#include "kernels/Reshape.h"
+#include "kernels/Reverse.h"
+#include "kernels/Slice.h"
#include "kernels/Softmax.h"
#include "kernels/SpaceToDepth.h"
#include "kernels/Split.h"
@@ -43,8 +46,6 @@
#include "kernels/Unpack.h"
#include "kernels/Transpose.h"
#include "kernels/TransposeConv.h"
-#include "loader/GraphLoader.h"
-#include "loader/ModuleLoader.h"
#include <stdexcept>
@@ -68,20 +69,23 @@ static std::vector<const loco::Node *> collectOutputNodes(const luci::CircleNode
const Tensor *KernelBuilder::getInputTensor(const loco::Node *node) const
{
- const Tensor *tensor = _graph_loader.getTensorForNode(node);
+ const Tensor *tensor = _node_to_tensor.at(node);
assert(tensor != nullptr);
return tensor;
}
const Tensor *KernelBuilder::getOptionalInputTensor(const loco::Node *node) const
{
- // TODO Revise this when optional inputs are implemented in the IR.
+ if (dynamic_cast<const luci::CircleOutputExclude *>(node))
+ {
+ return nullptr;
+ }
return getInputTensor(node);
}
Tensor *KernelBuilder::getOutputTensor(const loco::Node *node) const
{
- Tensor *tensor = _graph_loader.getTensorForNode(node);
+ Tensor *tensor = _node_to_tensor.at(node);
assert(tensor != nullptr);
return tensor;
}
@@ -98,7 +102,7 @@ KernelBuilder::getOutputTensors(const std::vector<const loco::Node *> &nodes) co
RuntimeGraph *KernelBuilder::getRuntimeGraph(const loco::Graph *graph) const
{
- RuntimeGraph *runtime_graph = _module_loader.getRuntimeGraph(graph);
+ RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph);
assert(runtime_graph != nullptr);
return runtime_graph;
}
@@ -120,14 +124,14 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleAdd *node)
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleArgMax *node)
{
assert(node->arity() == 2);
- const Tensor *input1 = getInputTensor(node->input());
- const Tensor *input2 = getInputTensor(node->dimension());
+ const Tensor *input = getInputTensor(node->input());
+ const Tensor *axis = getInputTensor(node->dimension());
Tensor *output = getOutputTensor(node);
ArgMaxParams params{};
params.output_type = node->output_type();
- return std::make_unique<kernels::ArgMax>(input1, input2, output, params);
+ return std::make_unique<kernels::ArgMax>(input, axis, output, params);
}
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleAveragePool2D *node)
@@ -188,6 +192,19 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleConv2D *node)
return std::make_unique<kernels::Conv2D>(input, filter, bias, output, params);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleDepthToSpace *node)
+{
+ assert(node->arity() == 1);
+
+ const Tensor *input = getInputTensor(node->input());
+ Tensor *output = getOutputTensor(node);
+
+ DepthToSpaceParams params{};
+ params.block_size = node->block_size();
+
+ return std::make_unique<kernels::DepthToSpace>(input, output, params);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleDepthwiseConv2D *node)
{
assert(node->arity() == 3);
@@ -224,14 +241,14 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleFullyConnected *n
assert(node->arity() == 3);
const Tensor *input = getInputTensor(node->input());
- const Tensor *filter = getInputTensor(node->weights());
+ const Tensor *weights = getInputTensor(node->weights());
const Tensor *bias = getOptionalInputTensor(node->bias());
Tensor *output = getOutputTensor(node);
FullyConnectedParams params{};
params.activation = node->fusedActivationFunction();
- return std::make_unique<kernels::FullyConnected>(input, filter, bias, output, params);
+ return std::make_unique<kernels::FullyConnected>(input, weights, bias, output, params);
}
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleIf *node)
@@ -255,6 +272,11 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleIf *node)
else_graph);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleInput *)
+{
+ throw std::runtime_error("Input node cannot be executed.");
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleL2Normalize *node)
{
assert(node->arity() == 1);
@@ -323,11 +345,6 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleLogistic *node)
return std::make_unique<kernels::Logistic>(input, output);
}
-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleInput *)
-{
- throw std::runtime_error("Input node cannot be executed.");
-}
-
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleMaxPool2D *node)
{
assert(node->arity() == 1);
@@ -402,6 +419,30 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReshape *node)
return std::make_unique<kernels::Reshape>(input, shape, output);
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleReverseV2 *node)
+{
+ assert(node->arity() == 2);
+
+ const Tensor *input = getInputTensor(node->tensor());
+ const Tensor *axes = getInputTensor(node->axis());
+ Tensor *output = getOutputTensor(node);
+
+ return std::make_unique<kernels::Reverse>(input, axes, output);
+}
+
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSlice *node)
+{
+ assert(node->arity() == 3);
+
+ const Tensor *input = getInputTensor(node->input());
+ const Tensor *begin = getInputTensor(node->begin());
+ const Tensor *size = getInputTensor(node->size());
+
+ Tensor *output = getOutputTensor(node);
+
+ return std::make_unique<kernels::Slice>(input, begin, size, output);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSoftmax *node)
{
assert(node->arity() == 1);
@@ -442,6 +483,19 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSplit *node)
return std::make_unique<kernels::Split>(axis, input, std::move(outputs));
}
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSqueeze *node)
+{
+ assert(node->arity() == 1);
+
+ const Tensor *input = getInputTensor(node->input());
+ Tensor *output = getOutputTensor(node);
+
+ SqueezeParams params{};
+ params.squeeze_dims = node->squeeze_dims();
+
+ return std::make_unique<kernels::Squeeze>(input, output, params);
+}
+
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleStridedSlice *node)
{
assert(node->arity() == 4);
@@ -463,21 +517,15 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleStridedSlice *nod
return std::make_unique<kernels::StridedSlice>(input, begin, end, strides, output, params);
}
-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleSqueeze *node)
+std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTranspose *node)
{
- assert(node->arity() == 1);
+ assert(node->arity() == 2);
- const Tensor *input = getInputTensor(node->input());
+ const Tensor *input = getInputTensor(node->a());
+ const Tensor *perm = getInputTensor(node->perm());
Tensor *output = getOutputTensor(node);
- SqueezeParams params{};
- assert(node->squeeze_dims().size() <= 4);
- for (size_t i = 0; i < node->squeeze_dims().size(); i++)
- {
- params.squeeze_dims.push_back(node->squeeze_dims().at(i));
- }
-
- return std::make_unique<kernels::Squeeze>(input, output, params);
+ return std::make_unique<kernels::Transpose>(input, perm, output);
}
std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTransposeConv *node)
@@ -515,15 +563,4 @@ std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleUnpack *node)
return std::make_unique<kernels::Unpack>(input, std::move(outputs), params);
}
-std::unique_ptr<Kernel> KernelBuilder::visit(const luci::CircleTranspose *node)
-{
- assert(node->arity() == 2);
-
- const Tensor *input = getInputTensor(node->a());
- const Tensor *perm = getInputTensor(node->perm());
- Tensor *output = getOutputTensor(node);
-
- return std::make_unique<kernels::Transpose>(input, perm, output);
-}
-
} // namespace luci_interpreter
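
The revised getOptionalInputTensor reflects that absent optional inputs are now modeled in the IR as CircleOutputExclude nodes rather than left as a TODO. A visit() overload can therefore do, for example:

  // An absent bias arrives as CircleOutputExclude and resolves to nullptr,
  // so the kernel must tolerate a missing tensor.
  const Tensor *bias = getOptionalInputTensor(node->bias());
  if (bias == nullptr)
  {
    // run the kernel without a bias term
  }
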
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.h b/compiler/luci-interpreter/src/loader/KernelBuilder.h
index 7e30d395b..d5c5a4b56 100644
--- a/compiler/luci-interpreter/src/loader/KernelBuilder.h
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.h
@@ -24,18 +24,18 @@
#include <memory>
#include <vector>
+#include <unordered_map>
namespace luci_interpreter
{
-class GraphLoader;
-class ModuleLoader;
-
class KernelBuilder : public luci::CircleNodeVisitor<std::unique_ptr<Kernel>>
{
public:
- KernelBuilder(const ModuleLoader &module_loader, const GraphLoader &graph_loader)
- : _module_loader(module_loader), _graph_loader(graph_loader)
+ KernelBuilder(
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &graph_to_runtime_graph,
+ const std::unordered_map<const loco::Node *, Tensor *> &node_to_tensor)
+ : _graph_to_runtime_graph(graph_to_runtime_graph), _node_to_tensor(node_to_tensor)
{
}
@@ -45,6 +45,7 @@ public:
std::unique_ptr<Kernel> visit(const luci::CircleConcatenation *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleConv2D *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleConst *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleDepthToSpace *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleDepthwiseConv2D *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleElu *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleFullyConnected *node) override;
@@ -61,6 +62,8 @@ public:
std::unique_ptr<Kernel> visit(const luci::CircleOutput *node) override;
std::unique_ptr<Kernel> visit(const luci::CirclePad *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleReshape *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleReverseV2 *node) override;
+ std::unique_ptr<Kernel> visit(const luci::CircleSlice *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSoftmax *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSpaceToDepth *node) override;
std::unique_ptr<Kernel> visit(const luci::CircleSplit *node) override;
@@ -82,8 +85,8 @@ private:
RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const;
private:
- const ModuleLoader &_module_loader;
- const GraphLoader &_graph_loader;
+ const std::unordered_map<const loco::Graph *, RuntimeGraph *> &_graph_to_runtime_graph;
+ const std::unordered_map<const loco::Node *, Tensor *> &_node_to_tensor;
};
} // namespace luci_interpreter
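
KernelBuilder is a luci::CircleNodeVisitor<std::unique_ptr<Kernel>>, so kernel construction is double-dispatched: accept() routes to the visit() overload matching the node's concrete type, which is exactly how the tests below drive it. In sketch form:

  KernelBuilder builder(graph_to_runtime_graph, node_to_tensor);
  // accept() dispatches to visit(const luci::CircleConv2D *) for a Conv2D
  // node, and so on for every opcode overload declared above.
  std::unique_ptr<Kernel> kernel = node->accept(&builder);
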
diff --git a/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
new file mode 100644
index 000000000..33bc8ec9b
--- /dev/null
+++ b/compiler/luci-interpreter/src/loader/KernelBuilder.test.cpp
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "loader/GraphLoader.h"
+#include "loader/KernelBuilder.h"
+
+#include <kernels/Add.h>
+#include <kernels/ArgMax.h>
+#include <kernels/AveragePool2D.h>
+#include <kernels/Concatenation.h>
+#include <kernels/Conv2D.h>
+#include <kernels/DepthToSpace.h>
+#include <kernels/DepthwiseConv2D.h>
+#include <kernels/Elu.h>
+#include <kernels/FullyConnected.h>
+#include <kernels/L2Normalize.h>
+#include <kernels/L2Pool2D.h>
+#include <kernels/LeakyRelu.h>
+#include <kernels/LocalResponseNormalization.h>
+#include <kernels/Logistic.h>
+#include <kernels/MaxPool2D.h>
+#include <kernels/Mean.h>
+#include <kernels/Mul.h>
+#include <kernels/Pad.h>
+#include <kernels/Reshape.h>
+#include <kernels/Reverse.h>
+#include <kernels/Slice.h>
+#include <kernels/Softmax.h>
+#include <kernels/SpaceToDepth.h>
+#include <kernels/Split.h>
+#include <kernels/Squeeze.h>
+#include <kernels/StridedSlice.h>
+#include <kernels/Transpose.h>
+#include <kernels/TransposeConv.h>
+#include <kernels/Unpack.h>
+
+#include <gmock/gmock.h>
+
+namespace luci_interpreter
+{
+namespace
+{
+
+using namespace testing;
+
+class KernelBuilderTest : public Test
+{
+protected:
+ luci::CircleInput *createInputNode() { return createNode<luci::CircleInput>(); }
+
+ template <typename NodeT, typename... Args> NodeT *createNode(Args &&... args)
+ {
+ auto *node = _graph.nodes()->create<NodeT>(std::forward<Args>(args)...);
+ // The actual type does not matter for the purpose of the tests.
+ // NOTE The type is meaningless for nodes with multiple outputs (corresponding *Out nodes carry
+ // actual output types).
+ node->dtype(loco::DataType::FLOAT32);
+ return node;
+ }
+
+ template <typename NodeOutT> NodeOutT *createNodeOut(loco::Node *node, int index)
+ {
+ auto *node_out = createNode<NodeOutT>();
+ node_out->input(node);
+ node_out->index(index);
+ return node_out;
+ }
+
+ template <typename KernelT> std::unique_ptr<KernelT> buildKernel(const luci::CircleNode *op)
+ {
+ std::unordered_map<const loco::Graph *, RuntimeGraph *> graph_to_runtime_graph;
+
+ RuntimeGraph runtime_graph(nullptr);
+ RuntimeToIR runtime_to_ir;
+ GraphLoader graph_loader(&_graph, &runtime_graph, runtime_to_ir, graph_to_runtime_graph,
+ _node_to_tensor);
+ graph_loader.loadTensors();
+
+ KernelBuilder kernel_builder(graph_to_runtime_graph, _node_to_tensor);
+
+ auto kernel = op->accept(&kernel_builder);
+ return std::unique_ptr<KernelT>(dynamic_cast<KernelT *>(kernel.release()));
+ }
+
+ void checkTensor(const Tensor *tensor, const loco::Node *node)
+ {
+ EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node)));
+ }
+
+private:
+ loco::Graph _graph;
+ std::unordered_map<const loco::Node *, Tensor *> _node_to_tensor;
+};
+
+TEST_F(KernelBuilderTest, Add)
+{
+ auto *input1 = createInputNode();
+ auto *input2 = createInputNode();
+
+ auto *op = createNode<luci::CircleAdd>();
+ op->x(input1);
+ op->y(input2);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::Add>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input1(), input1);
+ checkTensor(kernel->input2(), input2);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, ArgMax)
+{
+ auto *input = createInputNode();
+ auto *axis = createInputNode();
+
+ auto *op = createNode<luci::CircleArgMax>();
+ op->input(input);
+ op->dimension(axis);
+
+ op->output_type(loco::DataType::FLOAT32);
+
+ auto kernel = buildKernel<kernels::ArgMax>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->axis(), axis);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().output_type, Eq(op->output_type()));
+}
+
+TEST_F(KernelBuilderTest, AveragePool2D)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleAveragePool2D>();
+ op->value(input);
+
+ op->padding(luci::Padding::SAME);
+ op->filter()->h(11);
+ op->filter()->w(13);
+ op->stride()->h(17);
+ op->stride()->w(19);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::AveragePool2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
+ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Concatenation)
+{
+ auto *input1 = createInputNode();
+ auto *input2 = createInputNode();
+
+ auto *op = createNode<luci::CircleConcatenation>(2);
+ op->values(0, input1);
+ op->values(1, input2);
+ op->axis(11);
+
+ auto kernel = buildKernel<kernels::Concatenation>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(0), input1);
+ checkTensor(kernel->input(1), input2);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
+}
+
+TEST_F(KernelBuilderTest, Conv2D)
+{
+ auto *input = createInputNode();
+ auto *filter = createInputNode();
+ auto *bias = createInputNode();
+
+ auto *op = createNode<luci::CircleConv2D>();
+ op->input(input);
+ op->filter(filter);
+ op->bias(bias);
+
+ op->padding(luci::Padding::SAME);
+ op->stride()->h(11);
+ op->stride()->w(13);
+ op->dilation()->h(17);
+ op->dilation()->w(19);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::Conv2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->filter(), filter);
+ checkTensor(kernel->bias(), bias);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h()));
+ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, DepthToSpace)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleDepthToSpace>();
+ op->input(input);
+
+ op->block_size(11);
+
+ auto kernel = buildKernel<kernels::DepthToSpace>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().block_size, Eq(op->block_size()));
+}
+
+TEST_F(KernelBuilderTest, DepthwiseConv2D)
+{
+ auto *input = createInputNode();
+ auto *filter = createInputNode();
+ auto *bias = createInputNode();
+
+ auto *op = createNode<luci::CircleDepthwiseConv2D>();
+ op->input(input);
+ op->filter(filter);
+ op->bias(bias);
+
+ op->padding(luci::Padding::SAME);
+ op->depthMultiplier(11);
+ op->stride()->h(13);
+ op->stride()->w(17);
+ op->dilation()->h(19);
+ op->dilation()->w(23);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::DepthwiseConv2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->filter(), filter);
+ checkTensor(kernel->bias(), bias);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().depth_multiplier, Eq(op->depthMultiplier()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().dilation_height_factor, Eq(op->dilation()->h()));
+ EXPECT_THAT(kernel->params().dilation_width_factor, Eq(op->dilation()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Elu)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleElu>();
+ op->features(input);
+
+ auto kernel = buildKernel<kernels::Elu>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, FullyConnected)
+{
+ auto *input = createInputNode();
+ auto *weights = createInputNode();
+ auto *bias = createInputNode();
+
+ auto *op = createNode<luci::CircleFullyConnected>();
+ op->input(input);
+ op->weights(weights);
+ op->bias(bias);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::FullyConnected>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->weights(), weights);
+ checkTensor(kernel->bias(), bias);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, L2Normalize)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleL2Normalize>();
+ op->x(input);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::L2Normalize>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, L2Pool2D)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleL2Pool2D>();
+ op->value(input);
+
+ op->padding(luci::Padding::SAME);
+ op->filter()->h(11);
+ op->filter()->w(13);
+ op->stride()->h(17);
+ op->stride()->w(19);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::L2Pool2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
+ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, LeakyRelu)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleLeakyRelu>();
+ op->features(input);
+
+ op->alpha(11.0f);
+
+ auto kernel = buildKernel<kernels::LeakyRelu>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
+}
+
+TEST_F(KernelBuilderTest, LocalResponseNormalization)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleLocalResponseNormalization>();
+ op->input(input);
+
+ op->radius(11);
+ op->bias(13.0f);
+ op->alpha(15.0f);
+ op->beta(17.0f);
+
+ auto kernel = buildKernel<kernels::LocalResponseNormalization>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().radius, Eq(op->radius()));
+ EXPECT_THAT(kernel->params().bias, Eq(op->bias()));
+ EXPECT_THAT(kernel->params().alpha, Eq(op->alpha()));
+ EXPECT_THAT(kernel->params().beta, Eq(op->beta()));
+}
+
+TEST_F(KernelBuilderTest, Logistic)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleLogistic>();
+ op->x(input);
+
+ auto kernel = buildKernel<kernels::Logistic>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, MaxPool2D)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleMaxPool2D>();
+ op->value(input);
+
+ op->padding(luci::Padding::SAME);
+ op->filter()->h(11);
+ op->filter()->w(13);
+ op->stride()->h(17);
+ op->stride()->w(19);
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::MaxPool2D>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().filter_height, Eq(op->filter()->h()));
+ EXPECT_THAT(kernel->params().filter_width, Eq(op->filter()->w()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Mean)
+{
+ auto *input = createInputNode();
+ auto *axes = createInputNode();
+
+ auto *op = createNode<luci::CircleMean>();
+ op->input(input);
+ op->reduction_indices(axes);
+
+ op->keep_dims(true);
+
+ auto kernel = buildKernel<kernels::Mean>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->axes(), axes);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().keep_dims, Eq(op->keep_dims()));
+}
+
+TEST_F(KernelBuilderTest, Mul)
+{
+ auto *input1 = createInputNode();
+ auto *input2 = createInputNode();
+
+ auto *op = createNode<luci::CircleMul>();
+ op->x(input1);
+ op->y(input2);
+
+ op->fusedActivationFunction(luci::FusedActFunc::RELU);
+
+ auto kernel = buildKernel<kernels::Mul>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input1(), input1);
+ checkTensor(kernel->input2(), input2);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().activation, Eq(op->fusedActivationFunction()));
+}
+
+TEST_F(KernelBuilderTest, Pad)
+{
+ auto *input = createInputNode();
+ auto *paddings = createInputNode();
+
+ auto *op = createNode<luci::CirclePad>();
+ op->input(input);
+ op->paddings(paddings);
+
+ auto kernel = buildKernel<kernels::Pad>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->paddings(), paddings);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Reshape)
+{
+ auto *input = createInputNode();
+ auto *shape = createInputNode();
+
+ auto *op = createNode<luci::CircleReshape>();
+ op->tensor(input);
+ op->shape(shape);
+
+ auto kernel = buildKernel<kernels::Reshape>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->shape(), shape);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, ReverseV2)
+{
+ auto *input = createInputNode();
+ auto *axes = createInputNode();
+
+ auto *op = createNode<luci::CircleReverseV2>();
+ op->tensor(input);
+ op->axis(axes);
+
+ auto kernel = buildKernel<kernels::Reverse>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->axes(), axes);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Slice)
+{
+ auto *input = createInputNode();
+ auto *begin = createInputNode();
+ auto *size = createInputNode();
+
+ auto *op = createNode<luci::CircleSlice>();
+ op->input(input);
+ op->begin(begin);
+ op->size(size);
+
+ auto kernel = buildKernel<kernels::Slice>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->begin(), begin);
+ checkTensor(kernel->size(), size);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, Softmax)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleSoftmax>();
+ op->logits(input);
+
+ op->beta(11.0f);
+
+ auto kernel = buildKernel<kernels::Softmax>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().beta, Eq(op->beta()));
+}
+
+TEST_F(KernelBuilderTest, SpaceToDepth)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleSpaceToDepth>();
+ op->input(input);
+
+ op->block_size(11);
+
+ auto kernel = buildKernel<kernels::SpaceToDepth>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().block_size, Eq(op->block_size()));
+}
+
+TEST_F(KernelBuilderTest, Split)
+{
+ auto *axis = createInputNode();
+ auto *input = createInputNode();
+ auto *op = createNode<luci::CircleSplit>();
+ auto *output1 = createNodeOut<luci::CircleSplitOut>(op, 0);
+ auto *output2 = createNodeOut<luci::CircleSplitOut>(op, 1);
+
+ op->split_dim(axis);
+ op->input(input);
+
+ op->num_split(2);
+
+ auto kernel = buildKernel<kernels::Split>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->axis(), axis);
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(0), output1);
+ checkTensor(kernel->output(1), output2);
+}
+
+TEST_F(KernelBuilderTest, Squeeze)
+{
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleSqueeze>();
+ op->input(input);
+
+ op->squeeze_dims({11, 13});
+
+ auto kernel = buildKernel<kernels::Squeeze>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().squeeze_dims, ElementsAreArray(op->squeeze_dims()));
+}
+
+TEST_F(KernelBuilderTest, StridedSlice)
+{
+ auto *input = createInputNode();
+ auto *begin = createInputNode();
+ auto *end = createInputNode();
+ auto *strides = createInputNode();
+
+ auto *op = createNode<luci::CircleStridedSlice>();
+ op->input(input);
+ op->begin(begin);
+ op->end(end);
+ op->strides(strides);
+
+ op->begin_mask(11);
+ op->ellipsis_mask(13);
+ op->end_mask(17);
+ op->new_axis_mask(19);
+ op->shrink_axis_mask(23);
+
+ auto kernel = buildKernel<kernels::StridedSlice>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->begin(), begin);
+ checkTensor(kernel->end(), end);
+ checkTensor(kernel->strides(), strides);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().begin_mask, Eq(op->begin_mask()));
+ EXPECT_THAT(kernel->params().ellipsis_mask, Eq(op->ellipsis_mask()));
+ EXPECT_THAT(kernel->params().end_mask, Eq(op->end_mask()));
+ EXPECT_THAT(kernel->params().new_axis_mask, Eq(op->new_axis_mask()));
+ EXPECT_THAT(kernel->params().shrink_axis_mask, Eq(op->shrink_axis_mask()));
+}
+
+TEST_F(KernelBuilderTest, Transpose)
+{
+ auto *input = createInputNode();
+ auto *perm = createInputNode();
+
+ auto *op = createNode<luci::CircleTranspose>();
+ op->a(input);
+ op->perm(perm);
+
+ auto kernel = buildKernel<kernels::Transpose>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->perm(), perm);
+ checkTensor(kernel->output(), op);
+}
+
+TEST_F(KernelBuilderTest, TransposeConv)
+{
+ auto *output_shape = createInputNode();
+ auto *filter = createInputNode();
+ auto *input = createInputNode();
+
+ auto *op = createNode<luci::CircleTransposeConv>();
+ op->inputSizes(output_shape);
+ op->filter(filter);
+ op->outBackprop(input);
+
+ op->padding(luci::Padding::SAME);
+ op->stride()->h(11);
+ op->stride()->w(13);
+
+ auto kernel = buildKernel<kernels::TransposeConv>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->output_shape(), output_shape);
+ checkTensor(kernel->filter(), filter);
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(), op);
+ EXPECT_THAT(kernel->params().padding, Eq(op->padding()));
+ EXPECT_THAT(kernel->params().stride_height, Eq(op->stride()->h()));
+ EXPECT_THAT(kernel->params().stride_width, Eq(op->stride()->w()));
+}
+
+TEST_F(KernelBuilderTest, Unpack)
+{
+ auto *input = createInputNode();
+ auto *op = createNode<luci::CircleUnpack>();
+ auto *output1 = createNodeOut<luci::CircleUnpackOut>(op, 0);
+ auto *output2 = createNodeOut<luci::CircleUnpackOut>(op, 1);
+
+ op->value(input);
+
+ op->num(2);
+ op->axis(11);
+
+ auto kernel = buildKernel<kernels::Unpack>(op);
+ ASSERT_THAT(kernel, NotNull());
+
+ checkTensor(kernel->input(), input);
+ checkTensor(kernel->output(0), output1);
+ checkTensor(kernel->output(1), output2);
+ EXPECT_THAT(kernel->params().axis, Eq(op->axis()));
+}
+
+TEST_F(KernelBuilderTest, NonExisting1_NEG)
+{
+ auto *op = createNode<luci::CircleConst>();
+ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
+}
+
+TEST_F(KernelBuilderTest, NonExisting2_NEG)
+{
+ auto *op = createNode<luci::CircleInput>();
+ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
+}
+
+TEST_F(KernelBuilderTest, NonExisting3_NEG)
+{
+ auto *op = createNode<luci::CircleOutput>();
+ ASSERT_ANY_THROW(buildKernel<Kernel>(op));
+}
+
+} // namespace
+} // namespace luci_interpreter
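
The cases above all lean on fixture helpers -- createInputNode, createNode/createNodeOut, buildKernel, and checkTensor -- whose definitions sit earlier in KernelBuilder.test.cpp and are not part of this hunk. As a hypothetical sketch of what a helper like checkTensor has to assert (the _node_to_tensor map is an assumption, not shown in this diff):

// Hypothetical sketch -- the real helper is defined earlier in this file.
// Assumes the fixture keeps the node-to-tensor map filled by the loader.
void checkTensor(const Tensor *tensor, const loco::Node *node)
{
  // The kernel must reference exactly the runtime tensor that the
  // loader registered for this IR node -- the same object, not a copy.
  ASSERT_THAT(tensor, NotNull());
  EXPECT_THAT(tensor, Eq(_node_to_tensor.at(node)));
}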
diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
index 7780a61b6..b9a2ae0a9 100644
--- a/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
+++ b/compiler/luci-interpreter/src/loader/ModuleLoader.cpp
@@ -41,8 +41,11 @@ void ModuleLoader::load()
{
const loco::Graph *graph = _module->graph(i);
RuntimeGraph *runtime_graph = _graph_to_runtime_graph.at(graph);
- GraphLoader loader(*this, graph, runtime_graph, _runtime_to_ir, _node_to_tensor);
- loader.load();
+ GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph,
+ _node_to_tensor);
+ loader.loadTensors();
+ loader.initInputOutputTensors();
+ loader.loadOperators();
}
}
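
The hunk above replaces the single GraphLoader::load() call with three explicit phases, and the ordering is the point: every runtime tensor must exist before any operator is built, because kernels resolve their operands through the shared node-to-tensor map. A condensed sketch of the intent, assuming GraphLoader keeps exactly the interface shown in this hunk:

GraphLoader loader(graph, runtime_graph, _runtime_to_ir, _graph_to_runtime_graph,
                   _node_to_tensor);
loader.loadTensors();            // 1. create a runtime tensor per IR node
loader.initInputOutputTensors(); // 2. mark graph inputs/outputs among them
loader.loadOperators();          // 3. build kernels; operands resolve via the
                                 //    map filled in step 1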
diff --git a/compiler/luci-interpreter/src/loader/ModuleLoader.h b/compiler/luci-interpreter/src/loader/ModuleLoader.h
index 954dbfb61..1af0ed747 100644
--- a/compiler/luci-interpreter/src/loader/ModuleLoader.h
+++ b/compiler/luci-interpreter/src/loader/ModuleLoader.h
@@ -36,11 +36,6 @@ public:
void load();
- RuntimeGraph *getRuntimeGraph(const loco::Graph *graph) const
- {
- return _graph_to_runtime_graph.at(graph);
- }
-
private:
const luci::Module *_module;
RuntimeModule *_runtime_module;
diff --git a/compiler/luci-value-test/CMakeLists.txt b/compiler/luci-value-test/CMakeLists.txt
index 3a5c42b11..ec7463409 100644
--- a/compiler/luci-value-test/CMakeLists.txt
+++ b/compiler/luci-value-test/CMakeLists.txt
@@ -20,6 +20,6 @@ add_test(NAME luci_value_test
COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/evalverify.sh"
"${CMAKE_CURRENT_BINARY_DIR}"
"${ARTIFACTS_BIN_PATH}"
- "${NNCC_OVERLAY_DIR}/venv_1_13_2"
+ "${NNCC_OVERLAY_DIR}/venv_2_3_0"
${LUCI_VALUE_TESTS}
)
diff --git a/compiler/luci-value-test/evalverify.sh b/compiler/luci-value-test/evalverify.sh
index dfd55a691..12c9a459a 100755
--- a/compiler/luci-value-test/evalverify.sh
+++ b/compiler/luci-value-test/evalverify.sh
@@ -4,8 +4,10 @@
#
# HOW TO USE
#
-# ./evalverify.sh <path/to/work_dir> <TEST 1> <TEST 2> ...
-# work_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test)
+# ./evalverify.sh <path/to/bin_dir> <path/to/work_dir> <path/to/venv_dir> <TEST 1> <TEST 2> ...
+# bin_dir : build directory of luci-value-test (ex: build/compiler/luci-value-test)
+# work_dir : artifacts directory where test materials exist
+# venv_dir : python virtual environment home directory
VERIFY_SOURCE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
VERIFY_SCRIPT_PATH="${VERIFY_SOURCE_PATH}/luci_eval_verifier.py"
diff --git a/compiler/luci-value-test/luci_eval_verifier.py b/compiler/luci-value-test/luci_eval_verifier.py
index 6999110b6..7a2cebb91 100755
--- a/compiler/luci-value-test/luci_eval_verifier.py
+++ b/compiler/luci-value-test/luci_eval_verifier.py
@@ -35,6 +35,10 @@ for i in range(num_inputs):
input_data = np.array(
np.random.randint(0, 256, size=input_details["shape"]),
input_details["dtype"])
+ elif input_details["dtype"] == np.bool_:
+ input_data = np.array(
+ np.random.choice(a=[True, False], size=input_details["shape"]),
+ input_details["dtype"])
else:
raise SystemExit("Unsupported input dtype")
@@ -44,11 +48,6 @@ for i in range(num_inputs):
# Do inference
interpreter.invoke()
-# Get reference output data.
-assert len(interpreter.get_output_details()) == 1 # TODO: Support multiple outputs
-output_details = interpreter.get_output_details()[0]
-ref_output_data = interpreter.get_tensor(output_details["index"])
-
# Execute luci interpreter.
subprocess.run(
[
@@ -56,27 +55,56 @@ subprocess.run(
str(num_inputs), circle_model + ".input", circle_model + ".output"
],
check=True)
-output_data = np.fromfile(circle_model + ".output", output_details["dtype"])
-shape_file = open(circle_model + ".output.shape", 'r')
-output_shape = [int(i) for i in shape_file.read().split(',')]
-shape_file.close()
-luci_output_data = np.reshape(output_data, output_shape)
# Compare the results.
-try:
- if output_details["dtype"] == np.uint8:
- if np.allclose(luci_output_data, ref_output_data, rtol=0, atol=0) == False:
- raise SystemExit("Execution result of " + tflite_model +
- " does not match with " + circle_model)
- elif output_details["dtype"] == np.float32:
- if np.allclose(
- luci_output_data, ref_output_data, rtol=1.e-5, atol=1.e-5) == False:
- raise SystemExit("Execution result of " + tflite_model +
- " does not match with " + circle_model)
- else:
- raise SystemExit("Unsupported data type: ", output_details["dtype"])
-except:
- print(traceback.format_exc())
- quit(255)
+for idx in range(len(interpreter.get_output_details())):
+ output_details = interpreter.get_output_details()[idx]
+ output_data = np.fromfile(circle_model + ".output" + str(idx),
+ output_details["dtype"])
+ with open(circle_model + ".output" + str(idx) + ".shape", 'r') as shape_file:
+     output_shape = [int(i) for i in shape_file.read().split(',')]
+ luci_output_data = np.reshape(output_data, output_shape)
+ try:
+ if output_details["dtype"] == np.uint8:
+ if np.allclose(
+ luci_output_data,
+ interpreter.get_tensor(
+ interpreter.get_output_details()[idx]["index"]),
+ rtol=0,
+ atol=0) == False:
+ raise SystemExit("Execution result of " + tflite_model +
+ " does not match with " + circle_model)
+ elif output_details["dtype"] == np.float32:
+ if np.allclose(
+ luci_output_data,
+ interpreter.get_tensor(
+ interpreter.get_output_details()[idx]["index"]),
+ rtol=1.e-5,
+ atol=1.e-5) == False:
+ raise SystemExit("Execution result of " + tflite_model +
+ " does not match with " + circle_model)
+ elif output_details["dtype"] == np.int64:
+ if np.allclose(
+ luci_output_data,
+ interpreter.get_tensor(
+ interpreter.get_output_details()[idx]["index"]),
+ rtol=0,
+ atol=0) == False:
+ raise SystemExit("Execution result of " + tflite_model +
+ " does not match with " + circle_model)
+ elif output_details["dtype"] == np.int32:
+ if np.allclose(
+ luci_output_data,
+ interpreter.get_tensor(
+ interpreter.get_output_details()[idx]["index"]),
+ rtol=0,
+ atol=0) == False:
+ raise SystemExit("Execution result of " + tflite_model +
+ " does not match with " + circle_model)
+ else:
+ raise SystemExit("Unsupported data type: ", output_details["dtype"])
+ except:
+ print(traceback.format_exc())
+ quit(255)
quit(0)
diff --git a/compiler/luci-value-test/test.lst b/compiler/luci-value-test/test.lst
index 6a332f92c..0e5231eca 100644
--- a/compiler/luci-value-test/test.lst
+++ b/compiler/luci-value-test/test.lst
@@ -1,81 +1,183 @@
#addeval(Abs_000)
addeval(Add_000)
+#addeval(Add_001)
addeval(Add_U8_000)
-#addeval(ArgMax_000)
-#addeval(ArgMax_001)
-#addeval(ArgMax_002)
-#addeval(ArgMax_003)
-#addeval(ArgMax_U8_000)
-#addeval(ArgMax_U8_001)
-#addeval(ArgMax_U8_002)
-#addeval(ArgMax_U8_003)
+#addeval(AddN_000)
+addeval(ArgMax_000)
+addeval(ArgMax_001)
+addeval(ArgMax_002)
+addeval(ArgMax_003)
+addeval(ArgMax_U8_000)
+addeval(ArgMax_U8_001)
+addeval(ArgMax_U8_002)
+addeval(ArgMax_U8_003)
+#addeval(ArgMin_000)
+#addeval(ArgMin_001)
+#addeval(ArgMin_002)
+#addeval(ArgMin_003)
+#addeval(ArgMin_U8_000)
+#addeval(ArgMin_U8_001)
+#addeval(ArgMin_U8_002)
+#addeval(ArgMin_U8_003)
addeval(AveragePool2D_000)
+#addeval(BatchMatMul_000)
#addeval(BatchMatMulV2_000)
#addeval(BatchMatMulV2_001)
#addeval(BatchToSpaceND_000)
#addeval(Cast_000)
+#addeval(Cast_001)
+#addeval(Ceil_000)
addeval(Concatenation_000)
addeval(Concatenation_U8_000)
addeval(Conv2D_000)
addeval(Conv2D_001)
addeval(Conv2D_002)
+#addeval(Conv2D_003)
addeval(Conv2D_U8_000)
addeval(Conv2D_U8_001)
#addeval(Cos_000)
+#addeval(DepthToSpace_000)
addeval(DepthwiseConv2D_000)
addeval(DepthwiseConv2D_U8_000)
+#addeval(DepthwiseConv2D_U8_001)
+addeval(DepthwiseConv2D_001)
#addeval(Div_000)
+addeval(ELU_000)
#addeval(Equal_000)
#addeval(Exp_000)
+#addeval(ExpandDims_000)
+#addeval(ExpandDims_001)
+#addeval(ExpandDims_002)
+#addeval(ExpandDims_003)
+#addeval(Fill_000)
+#addeval(Fill_001)
+#addeval(Floor_000)
+#addeval(FloorDiv_000)
+#addeval(FloorDiv_001)
+#addeval(FloorMod_000)
+#addeval(FloorMod_001)
addeval(FullyConnected_000)
addeval(FullyConnected_001)
-#addeval(FullyConnected_002)
+addeval(FullyConnected_002)
#addeval(FullyConnected_U8_000)
#addeval(Gather_000)
-#addeval(If_000)
-#addeval(If_001)
+#addeval(GatherNd_000)
+#addeval(Greater_000)
+#addeval(GreaterEqual_000)
+addeval(If_000)
+addeval(If_001)
+addeval(L2Normalize_000)
+addeval(L2Pool2D_000)
+#addeval(L2Pool2D_U8_000)
+addeval(LeakyRelu_000)
+#addeval(Less_000)
+#addeval(LessEqual_000)
+addeval(LocalResponseNormalization_000)
+#addeval(Log_000)
+#addeval(LogicalAnd_000)
#addeval(LogicalNot_000)
#addeval(LogicalOr_000)
-#addeval(Logistic_000)
+addeval(Logistic_000)
+#addeval(LogSoftmax_000)
+#addeval(MatMul_000)
+#addeval(MatrixDiag_000)
+#addeval(MatrixSetDiag_000)
+#addeval(Maximum_000)
addeval(MaxPool2D_000)
addeval(MaxPool2D_U8_000)
addeval(Mean_000)
addeval(Mean_001)
-addeval(Mean_U8_000)
+#addeval(Mean_U8_000)
+#addeval(Minimum_000)
+#addeval(MirrorPad_000)
addeval(Mul_000)
#addeval(Mul_U8_000)
+#addeval(Neg_000)
+#addeval(NotEqual_000)
+#addeval(OneHot_000)
+#addeval(OneHot_001)
+#addeval(OneHot_002)
+#addeval(OneHot_003)
#addeval(Pack_000)
#addeval(Pack_U8_000)
addeval(Pad_000)
addeval(Pad_U8_000)
+#addeval(Pow_000)
+#addeval(PRelu_000)
+#addeval(Range_000)
+#addeval(Rank_000)
+#addeval(ReduceAny_000)
+#addeval(ReduceAny_001)
+#addeval(ReduceAny_002)
+#addeval(ReduceAny_003)
+#addeval(ReduceMax_000)
+#addeval(ReduceMin_000)
#addeval(ReduceProd_000)
#addeval(ReduceProd_001)
#addeval(ReduceProd_002)
#addeval(ReduceProd_003)
#addeval(ReLU_000)
+#addeval(ReLU6_000)
+#addeval(ReLUN1To1_000)
addeval(Reshape_000)
addeval(Reshape_001)
addeval(Reshape_002)
#addeval(Reshape_003)
addeval(Reshape_U8_000)
+#addeval(ResizeBilinear_000)
+#addeval(ResizeNearestNeighbor_000)
+#addeval(ReverseSequence_000)
+#addeval(ReverseV2_000)
+#addeval(Round_000)
#addeval(Rsqrt_000)
+#addeval(ScatterNd_000)
+#addeval(SegmentSum_000)
+#addeval(Select_000)
+#addeval(Select_001)
+#addeval(Select_002)
+#addeval(SelectV2_000)
+#addeval(SelectV2_001)
+#addeval(SelectV2_002)
+#addeval(Shape_000)
#addeval(Sin_000)
+addeval(Slice_000)
addeval(Softmax_000)
#addeval(Softmax_U8_000)
#addeval(SpaceToBatchND_000)
#addeval(SpaceToBatchND_001)
#addeval(SpaceToBatchND_002)
#addeval(SpaceToBatchND_003)
-#addeval(StridedSlice_000)
-#addeval(StridedSlice_001)
+addeval(SpaceToDepth_000)
+#addeval(SparseToDense_000)
+addeval(Split_000)
+#addeval(SplitV_000)
+#addeval(Sqrt_000)
+#addeval(Square_000)
+#addeval(SquaredDifference_000)
+addeval(Squeeze_000)
+addeval(StridedSlice_000)
+addeval(StridedSlice_001)
+addeval(StridedSlice_002)
#addeval(Sub_000)
#addeval(Sub_U8_000)
+#addeval(Sum_000)
+#addeval(Sum_001)
#addeval(Tanh_000)
#addeval(Tile_000)
#addeval(Tile_U8_000)
-#addeval(Transpose_000)
-#addeval(Unpack_000)
-#addeval(Unpack_001)
-#addeval(Unpack_002)
+#addeval(TopKV2_000)
+#addeval(TopKV2_001)
+addeval(Transpose_000)
+#addeval(TransposeConv_000)
+addeval(Unpack_000)
+addeval(Unpack_001)
+addeval(Unpack_002)
+addeval(Unpack_003)
+#addeval(Where_000)
+#addeval(Where_001)
#addeval(While_000)
#addeval(While_001)
+#addeval(While_002)
+#addeval(While_003)
+#addeval(YUV_TO_RGB_U8_000)
+#addeval(ZerosLike_000)
diff --git a/compiler/luci-value-test/tester/src/EvalTester.cpp b/compiler/luci-value-test/tester/src/EvalTester.cpp
index 58f62f54c..09eef223a 100644
--- a/compiler/luci-value-test/tester/src/EvalTester.cpp
+++ b/compiler/luci-value-test/tester/src/EvalTester.cpp
@@ -129,7 +129,7 @@ int entry(int argc, char **argv)
assert(num_inputs == input_nodes.size());
for (int32_t i = 0; i < num_inputs; i++)
{
- const auto *input_node = dynamic_cast<const luci::CircleInput *>(input_nodes[i]);
+ const auto *input_node = loco::must_cast<const luci::CircleInput *>(input_nodes[i]);
std::vector<char> input_data(getTensorSize(input_node));
readDataFromFile(std::string(input_prefix) + std::to_string(i), input_data.data(),
input_data.size());
@@ -141,24 +141,35 @@ int entry(int argc, char **argv)
// Get output.
const auto output_nodes = loco::output_nodes(module->graph());
- // TODO: Support multiple outputs
- assert(output_nodes.size() == 1);
- const auto *output_node = dynamic_cast<const luci::CircleOutput *>(output_nodes[0]);
- std::vector<char> output_data(getTensorSize(output_node));
- interpreter.readOutputTensor(output_node, output_data.data(), output_data.size());
-
- // Output data is written in ${output_file}
- // (ex: Add.circle.output)
- // Output shape is written in ${output_file}.shape
- // (ex: Add.circle.output.shape)
- // TODO: Use HDF5 file format
- writeDataToFile(output_file, output_data.data(), output_data.size());
- auto shape_str = std::to_string(output_node->dim(0).value());
- for (int i = 1; i < output_node->rank(); i++)
+ for (int i = 0; i < module->graph()->outputs()->size(); i++)
{
- shape_str += ",";
- shape_str += std::to_string(output_node->dim(i).value());
+ const auto *output_node = loco::must_cast<const luci::CircleOutput *>(output_nodes[i]);
+ std::vector<char> output_data(getTensorSize(output_node));
+ interpreter.readOutputTensor(output_node, output_data.data(), output_data.size());
+
+ // Output data is written in ${output_file}
+ // (ex: Add.circle.output0)
+ // Output shape is written in ${output_file}.shape
+ // (ex: Add.circle.output0.shape)
+ writeDataToFile(std::string(output_file) + std::to_string(i), output_data.data(),
+ output_data.size());
+ // The output tensor may be a scalar value.
+ // An output tensor with rank 0 is treated as a scalar with shape (1).
+ if (output_node->rank() == 0)
+ {
+ writeDataToFile(std::string(output_file) + std::to_string(i) + ".shape", "1", 1);
+ }
+ else
+ {
+ auto shape_str = std::to_string(output_node->dim(0).value());
+ for (int j = 1; j < output_node->rank(); j++)
+ {
+ shape_str += ",";
+ shape_str += std::to_string(output_node->dim(j).value());
+ }
+ writeDataToFile(std::string(output_file) + std::to_string(i) + ".shape", shape_str.c_str(),
+ shape_str.size());
+ }
}
- writeDataToFile(std::string(output_file) + ".shape", shape_str.c_str(), shape_str.size());
return EXIT_SUCCESS;
}
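
With this change the tester and luci_eval_verifier.py share a simple per-output file protocol: raw tensor bytes go to ${output_file}<idx> and a comma-separated shape goes to ${output_file}<idx>.shape, with rank-0 outputs stored as the single dimension "1". A minimal reader for the shape part -- a sketch for illustration, not code from this commit:

#include <fstream>
#include <string>
#include <vector>

// Parse "<output_file><idx>.shape" as written by EvalTester above.
// A rank-0 (scalar) output arrives as the single dimension "1".
std::vector<int> readShapeFile(const std::string &path)
{
  std::ifstream file(path);
  std::vector<int> shape;
  std::string dim;
  while (std::getline(file, dim, ','))
    shape.push_back(std::stoi(dim));
  return shape;
}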
diff --git a/compiler/luci/export/src/CircleOperationExporter.cpp b/compiler/luci/export/src/CircleOperationExporter.cpp
index 3c01b676f..bca122050 100644
--- a/compiler/luci/export/src/CircleOperationExporter.cpp
+++ b/compiler/luci/export/src/CircleOperationExporter.cpp
@@ -102,6 +102,7 @@ public:
void visit(luci::CircleMirrorPad *) final;
void visit(luci::CircleMul *) final;
void visit(luci::CircleNeg *) final;
+ void visit(luci::CircleNonMaxSuppressionV4 *) final;
void visit(luci::CircleNotEqual *) final;
void visit(luci::CircleOneHot *) final;
void visit(luci::CirclePack *) final;
@@ -149,6 +150,7 @@ public:
void visit(luci::CircleTopKV2 *) final;
void visit(luci::CircleTranspose *) final;
void visit(luci::CircleTransposeConv *) final;
+ void visit(luci::CircleUnique *) final;
void visit(luci::CircleUnpack *) final;
void visit(luci::CircleWhere *) final;
void visit(luci::CircleWhile *) final;
@@ -165,9 +167,11 @@ public:
// Virtual for multiple-outputs
void visit(luci::CircleCustomOut *) final {}
void visit(luci::CircleIfOut *) final {}
+ void visit(luci::CircleNonMaxSuppressionV4Out *) final {}
void visit(luci::CircleSplitOut *) final {}
void visit(luci::CircleSplitVOut *) final {}
void visit(luci::CircleTopKV2Out *) final {}
+ void visit(luci::CircleUniqueOut *) final {}
void visit(luci::CircleUnpackOut *) final {}
void visit(luci::CircleWhileOut *) final {}
@@ -599,7 +603,9 @@ void OperationExporter::visit(luci::CircleLocalResponseNormalization *node)
{
export_simple(node, circle::BuiltinOperator_LOCAL_RESPONSE_NORMALIZATION,
circle::BuiltinOptions_LocalResponseNormalizationOptions,
- CreateLocalResponseNormalizationOptions(builder).Union());
+ CreateLocalResponseNormalizationOptions(builder, node->radius(), node->bias(),
+ node->alpha(), node->beta())
+ .Union());
}
void OperationExporter::visit(luci::CircleLog *node)
@@ -691,6 +697,49 @@ void OperationExporter::visit(luci::CircleNeg *node)
CreateNegOptions(builder).Union());
}
+void OperationExporter::visit(luci::CircleNonMaxSuppressionV4 *node)
+{
+ auto nms_outs = loco::succs(node);
+ assert(nms_outs.size() == 2);
+
+ uint32_t op_idx =
+ md.registerBuiltinOpcode(circle::BuiltinOperator_NON_MAX_SUPPRESSION_V4, node->op_version());
+ std::vector<int32_t> inputs_vec{
+ get_tensor_index(node->boxes()), get_tensor_index(node->scores()),
+ get_tensor_index(node->max_output_size()), get_tensor_index(node->iou_threshold()),
+ get_tensor_index(node->score_threshold()),
+ };
+ std::vector<int32_t> outputs_vec;
+
+ for (uint32_t idx = 0; idx < nms_outs.size(); ++idx)
+ {
+ // store in order of index
+ bool found = false;
+ for (auto out : nms_outs)
+ {
+ auto nms_out = loco::must_cast<luci::CircleNonMaxSuppressionV4Out *>(out);
+ if (nms_out->index() == static_cast<int32_t>(idx))
+ {
+ outputs_vec.push_back(get_tensor_index(nms_out));
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ {
+ INTERNAL_EXN("Invalid NonMaxSuppressionV4 output");
+ }
+ }
+
+ auto inputs = builder.CreateVector(inputs_vec);
+ auto outputs = builder.CreateVector(outputs_vec);
+ auto options = CreateNonMaxSuppressionV4Options(builder);
+ auto op_offset =
+ CreateOperator(builder, op_idx, inputs, outputs,
+ circle::BuiltinOptions_NonMaxSuppressionV4Options, options.Union());
+ gd._operators.push_back(op_offset);
+}
+
void OperationExporter::visit(luci::CircleNotEqual *node)
{
export_simple(node, circle::BuiltinOperator_NOT_EQUAL, circle::BuiltinOptions_NotEqualOptions,
@@ -890,7 +939,7 @@ void OperationExporter::visit(luci::CircleSpaceToDepth *node)
{
export_simple(node, circle::BuiltinOperator_SPACE_TO_DEPTH,
circle::BuiltinOptions_SpaceToDepthOptions,
- CreateSpaceToDepthOptions(builder).Union());
+ CreateSpaceToDepthOptions(builder, node->block_size()).Union());
}
void OperationExporter::visit(luci::CircleSparseToDense *node)
@@ -1090,6 +1139,43 @@ void OperationExporter::visit(luci::CircleTransposeConv *node)
.Union());
}
+void OperationExporter::visit(luci::CircleUnique *node)
+{
+ auto unique_outs = loco::succs(node);
+ assert(int32_t(unique_outs.size()) == 2);
+ uint32_t op_idx = md.registerBuiltinOpcode(circle::BuiltinOperator_UNIQUE, node->op_version());
+
+ std::vector<int32_t> inputs_vec{get_tensor_index(node->input())};
+ std::vector<int32_t> outputs_vec;
+
+ for (int32_t index = 0; index < 2; index++)
+ {
+ // store in order of index
+ bool found = false;
+ for (auto out : unique_outs)
+ {
+ auto unique_out = loco::must_cast<luci::CircleUniqueOut *>(out);
+ if (unique_out->index() == index)
+ {
+ outputs_vec.push_back(get_tensor_index(unique_out));
+ found = true;
+ break;
+ }
+ }
+ if (!found)
+ {
+ INTERNAL_EXN("Invalid Unique output");
+ }
+ }
+
+ auto inputs = builder.CreateVector(inputs_vec);
+ auto outputs = builder.CreateVector(outputs_vec);
+ auto options = CreateUniqueOptions(builder, to_circle_tensortype(node->idx_out_type()));
+ auto op_offset = CreateOperator(builder, op_idx, inputs, outputs,
+ circle::BuiltinOptions_UniqueOptions, options.Union());
+ gd._operators.push_back(op_offset);
+}
+
void OperationExporter::visit(luci::CircleUnpack *node)
{
LOGGER(l);
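
The two new visits above (CircleNonMaxSuppressionV4 and CircleUnique) hand-roll the same inner loop: walk the node's successors and emit the virtual out-nodes in index order so that the flatbuffer output list matches the operator definition. A generic sketch of that pattern -- a hypothetical refactoring, assuming only the loco/luci helpers already used in this file:

// Hypothetical helper, not part of this commit: gather the multi-output
// successors of `node` in index order. OutNodeT would be e.g.
// luci::CircleUniqueOut or luci::CircleNonMaxSuppressionV4Out.
template <typename OutNodeT>
std::vector<int32_t> ordered_output_indices(luci::CircleNode *node, uint32_t count)
{
  std::vector<int32_t> outputs_vec;
  for (uint32_t idx = 0; idx < count; ++idx)
  {
    bool found = false;
    for (auto out : loco::succs(node))
    {
      auto typed_out = loco::must_cast<OutNodeT *>(out);
      if (typed_out->index() == static_cast<int32_t>(idx))
      {
        outputs_vec.push_back(get_tensor_index(typed_out));
        found = true;
        break;
      }
    }
    if (!found)
      INTERNAL_EXN("Output node with expected index not found");
  }
  return outputs_vec;
}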
diff --git a/compiler/luci/export/src/CircleTensorExporter.cpp b/compiler/luci/export/src/CircleTensorExporter.cpp
index 5cad3920b..dc8c2fbc9 100644
--- a/compiler/luci/export/src/CircleTensorExporter.cpp
+++ b/compiler/luci/export/src/CircleTensorExporter.cpp
@@ -302,7 +302,10 @@ encodeQuantizationParameters(FlatBufferBuilder &builder, luci::CircleQuantParam
scale = builder.CreateVector(quantparam->scale);
zero_point = builder.CreateVector(quantparam->zerop);
}
- return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point);
+ // Note: QuantizationDetails is not supported
+ return circle::CreateQuantizationParameters(builder, min, max, scale, zero_point,
+ circle::QuantizationDetails::QuantizationDetails_NONE,
+ 0, quantparam->quantized_dimension);
}
void exportOpDefinedTensor(const CircleTensoInfo &info, FlatBufferBuilder &builder,
diff --git a/compiler/luci/import/include/luci/Import/Nodes.h b/compiler/luci/import/include/luci/Import/Nodes.h
index 2719a5aec..825c2147d 100644
--- a/compiler/luci/import/include/luci/Import/Nodes.h
+++ b/compiler/luci/import/include/luci/Import/Nodes.h
@@ -73,6 +73,7 @@
#include "Nodes/CircleMirrorPad.h"
#include "Nodes/CircleMul.h"
#include "Nodes/CircleNeg.h"
+#include "Nodes/CircleNonMaxSuppressionV4.h"
#include "Nodes/CircleNotEqual.h"
#include "Nodes/CircleOneHot.h"
#include "Nodes/CirclePack.h"
@@ -120,6 +121,7 @@
#include "Nodes/CircleTopKV2.h"
#include "Nodes/CircleTranspose.h"
#include "Nodes/CircleTransposeConv.h"
+#include "Nodes/CircleUnique.h"
#include "Nodes/CircleUnpack.h"
#include "Nodes/CircleWhere.h"
#include "Nodes/CircleWhile.h"
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h b/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h
new file mode 100644
index 000000000..f193aae35
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleNonMaxSuppressionV4.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_NON_MAX_SUPPRESSION_V4_H__
+#define __LUCI_IMPORT_OP_CIRCLE_NON_MAX_SUPPRESSION_V4_H__
+
+#include "luci/Import/GraphBuilderBase.h"
+
+namespace luci
+{
+
+class CircleNonMaxSuppressionV4GraphBuilder : public GraphBuilderBase
+{
+public:
+ bool validate(const ValidateArgs &args) const final;
+
+ void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_NON_MAX_SUPPRESSION_V4_H__
diff --git a/compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h b/compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h
new file mode 100644
index 000000000..ed5b5035d
--- /dev/null
+++ b/compiler/luci/import/include/luci/Import/Nodes/CircleUnique.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IMPORT_OP_CIRCLE_UNIQUE_H__
+#define __LUCI_IMPORT_OP_CIRCLE_UNIQUE_H__
+
+#include "luci/Import/GraphBuilderBase.h"
+
+namespace luci
+{
+
+class CircleUniqueGraphBuilder : public GraphBuilderBase
+{
+public:
+ bool validate(const ValidateArgs &args) const final;
+
+ void build(const circle::OperatorT &op, GraphBuilderContext *context) const final;
+};
+
+} // namespace luci
+
+#endif // __LUCI_IMPORT_OP_CIRCLE_UNIQUE_H__
diff --git a/compiler/luci/import/src/CircleReader.cpp b/compiler/luci/import/src/CircleReader.cpp
index 81e945dd1..bc7f39762 100644
--- a/compiler/luci/import/src/CircleReader.cpp
+++ b/compiler/luci/import/src/CircleReader.cpp
@@ -156,6 +156,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
const auto &max = quantization->max;
const auto &scale = quantization->scale;
const auto &zero_point = quantization->zero_point;
+ const auto &quantized_dimension = quantization->quantized_dimension;
if ((!min.empty() && !max.empty()) || (!scale.empty() && !zero_point.empty()))
{
@@ -165,6 +166,7 @@ luci_quantparam(const circle::QuantizationParametersT *quantization)
quantparam->max = max;
quantparam->scale = scale;
quantparam->zerop = zero_point;
+ quantparam->quantized_dimension = quantized_dimension;
return quantparam;
}
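
Paired with the CircleTensorExporter change earlier in this diff, this makes quantized_dimension survive an import/export round trip. The field is what channel-wise quantization needs: scale and zerop then hold one entry per slice along the quantized dimension instead of a single tensor-wide pair. An illustration only, with field names as in these hunks and values invented:

// Illustrative values, not from the diff: per-channel parameters for a
// weight tensor with four output channels, quantized along dimension 0.
luci::CircleQuantParam quantparam;
quantparam.scale = {0.5f, 0.25f, 0.125f, 1.0f}; // one scale per channel
quantparam.zerop = {0, 0, 0, 0};                // one zero point per channel
quantparam.quantized_dimension = 0;             // the channel axis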
diff --git a/compiler/luci/import/src/GraphBuilderRegistry.cpp b/compiler/luci/import/src/GraphBuilderRegistry.cpp
index d29557f74..cc328cc16 100644
--- a/compiler/luci/import/src/GraphBuilderRegistry.cpp
+++ b/compiler/luci/import/src/GraphBuilderRegistry.cpp
@@ -82,6 +82,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
CIRCLE_NODE(MIRROR_PAD, CircleMirrorPadGraphBuilder); // 100
CIRCLE_NODE(MUL, CircleMulGraphBuilder); // 18
CIRCLE_NODE(NEG, CircleNegGraphBuilder); // 59
+ CIRCLE_NODE(NON_MAX_SUPPRESSION_V4, CircleNonMaxSuppressionV4GraphBuilder); // 120
CIRCLE_NODE(NOT_EQUAL, CircleNotEqualGraphBuilder); // 72
CIRCLE_NODE(ONE_HOT, CircleOneHotGraphBuilder); // 85
CIRCLE_NODE(PACK, CirclePackGraphBuilder); // 83
@@ -129,6 +130,7 @@ GraphBuilderRegistry::GraphBuilderRegistry()
CIRCLE_NODE(TOPK_V2, CircleTopKV2GraphBuilder); // 48
CIRCLE_NODE(TRANSPOSE, CircleTransposeGraphBuilder); // 39
CIRCLE_NODE(TRANSPOSE_CONV, CircleTransposeConvGraphBuilder); // 67
+ CIRCLE_NODE(UNIQUE, CircleUniqueGraphBuilder); // 103
CIRCLE_NODE(UNPACK, CircleUnpackGraphBuilder); // 88
CIRCLE_NODE(WHERE, CircleWhereGraphBuilder); // 109
CIRCLE_NODE(WHILE, CircleWhileGraphBuilder); // 119
@@ -155,10 +157,8 @@ GraphBuilderRegistry::GraphBuilderRegistry()
// BuiltinOperator_ARG_MAX = 56,
// BuiltinOperator_PADV2 = 60,
// BuiltinOperator_FAKE_QUANT = 80,
- // BuiltinOperator_UNIQUE = 103,
// BuiltinOperator_QUANTIZE = 114,
// BuiltinOperator_HARD_SWISH = 117,
- // BuiltinOperator_NON_MAX_SUPPRESSION_V4 = 120,
// BuiltinOperator_NON_MAX_SUPPRESSION_V5 = 121,
// BuiltinOperator_DENSIFY = 124,
}
diff --git a/compiler/luci/import/src/Importer.test.cpp b/compiler/luci/import/src/Importer.test.cpp
index 4426e15fd..8366546f0 100644
--- a/compiler/luci/import/src/Importer.test.cpp
+++ b/compiler/luci/import/src/Importer.test.cpp
@@ -20,4 +20,9 @@
#include <gtest/gtest.h>
-TEST(TensorFlowLiteImport, Dummy) { luci::Importer import; }
+TEST(TensorFlowLiteImport, Dummy)
+{
+ luci::Importer import;
+
+ SUCCEED();
+}
diff --git a/compiler/luci/import/src/Nodes/CircleAbs.cpp b/compiler/luci/import/src/Nodes/CircleAbs.cpp
index 9054986bd..3556dc7fa 100644
--- a/compiler/luci/import/src/Nodes/CircleAbs.cpp
+++ b/compiler/luci/import/src/Nodes/CircleAbs.cpp
@@ -36,7 +36,7 @@ CircleNode *CircleAbsGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleAbs>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
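
This is the first of many identical substitutions in the importers below: inputs[i] becomes inputs.at(i). The reasoning: operator[] past the end of a std::vector is undefined behavior, while at() throws std::out_of_range, so an operator in a malformed circle file that arrives with too few inputs now fails loudly instead of reading out of bounds. A self-contained demonstration in plain standard C++, independent of luci:

#include <iostream>
#include <stdexcept>
#include <vector>

int main()
{
  std::vector<int> inputs{42};       // an operator that arrived with one input
  std::cout << inputs.at(0) << "\n"; // in range: prints 42

  try
  {
    inputs.at(1); // missing second input: throws instead of undefined behavior
  }
  catch (const std::out_of_range &e)
  {
    std::cout << "malformed model detected: " << e.what() << "\n";
  }
  return 0;
}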
diff --git a/compiler/luci/import/src/Nodes/CircleAdd.cpp b/compiler/luci/import/src/Nodes/CircleAdd.cpp
index 3b1bb734f..b767d4af2 100644
--- a/compiler/luci/import/src/Nodes/CircleAdd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleAdd.cpp
@@ -36,8 +36,8 @@ CircleNode *CircleAddGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleAdd>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
const auto *options = op.builtin_options.AsAddOptions();
node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
diff --git a/compiler/luci/import/src/Nodes/CircleArgMax.cpp b/compiler/luci/import/src/Nodes/CircleArgMax.cpp
index 2679827e2..10e8516f4 100644
--- a/compiler/luci/import/src/Nodes/CircleArgMax.cpp
+++ b/compiler/luci/import/src/Nodes/CircleArgMax.cpp
@@ -36,8 +36,8 @@ CircleNode *CircleArgMaxGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleArgMax>();
- node->input(inputs[0]);
- node->dimension(inputs[1]);
+ node->input(inputs.at(0));
+ node->dimension(inputs.at(1));
const auto *options = op.builtin_options.AsArgMaxOptions();
node->output_type(luci_datatype(options->output_type));
diff --git a/compiler/luci/import/src/Nodes/CircleArgMin.cpp b/compiler/luci/import/src/Nodes/CircleArgMin.cpp
index 4d85bbff0..5ff534dbb 100644
--- a/compiler/luci/import/src/Nodes/CircleArgMin.cpp
+++ b/compiler/luci/import/src/Nodes/CircleArgMin.cpp
@@ -36,8 +36,8 @@ CircleNode *CircleArgMinGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleArgMin>();
- node->input(inputs[0]);
- node->dimension(inputs[1]);
+ node->input(inputs.at(0));
+ node->dimension(inputs.at(1));
const auto *options = op.builtin_options.AsArgMinOptions();
node->output_type(luci_datatype(options->output_type));
diff --git a/compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp b/compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp
index cfc3cf126..ad011f71f 100644
--- a/compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleAveragePool2D.cpp
@@ -34,7 +34,7 @@ CircleNode *CircleAveragePool2DGraphBuilder::build_node(const circle::OperatorT
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleAveragePool2D>();
- node->value(inputs[0]);
+ node->value(inputs.at(0));
const auto *options = op.builtin_options.AsPool2DOptions();
node->padding(luci_padding(options->padding));
diff --git a/compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp b/compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp
index 7cc077ed6..16ecebd5c 100644
--- a/compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp
+++ b/compiler/luci/import/src/Nodes/CircleBCQFullyConnected.cpp
@@ -37,11 +37,11 @@ CircleNode *CircleBCQFullyConnectedGraphBuilder::build_node(const circle::Operat
{
auto *node = graph->nodes()->create<CircleBCQFullyConnected>();
- node->input(inputs[0]);
- node->weights_scales(inputs[1]);
- node->weights_binary(inputs[2]);
- node->bias(inputs[3]);
- node->weights_clusters(inputs[4]);
+ node->input(inputs.at(0));
+ node->weights_scales(inputs.at(1));
+ node->weights_binary(inputs.at(2));
+ node->bias(inputs.at(3));
+ node->weights_clusters(inputs.at(4));
// TODO Find and move to appropriate place for setting optional input
if (auto bias = dynamic_cast<luci::CircleOutputExclude *>(node->bias()))
diff --git a/compiler/luci/import/src/Nodes/CircleBCQGather.cpp b/compiler/luci/import/src/Nodes/CircleBCQGather.cpp
index c6d2ab559..464f1ac18 100644
--- a/compiler/luci/import/src/Nodes/CircleBCQGather.cpp
+++ b/compiler/luci/import/src/Nodes/CircleBCQGather.cpp
@@ -37,10 +37,10 @@ CircleNode *CircleBCQGatherGraphBuilder::build_node(const circle::OperatorT &op,
{
auto *node = graph->nodes()->create<CircleBCQGather>();
- node->input_scales(inputs[0]);
- node->input_binary(inputs[1]);
- node->indices(inputs[2]);
- node->input_clusters(inputs[3]);
+ node->input_scales(inputs.at(0));
+ node->input_binary(inputs.at(1));
+ node->indices(inputs.at(2));
+ node->input_clusters(inputs.at(3));
const auto *options = op.builtin_options.AsBCQGatherOptions();
node->input_hidden_size(options->input_hidden_size);
diff --git a/compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp b/compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp
index 6026b2a72..330775691 100644
--- a/compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp
+++ b/compiler/luci/import/src/Nodes/CircleBatchMatMul.cpp
@@ -34,8 +34,8 @@ CircleNode *CircleBatchMatMulGraphBuilder::build_node(const circle::OperatorT &o
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleBatchMatMul>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
const auto *options = op.builtin_options.AsBatchMatMulOptions();
node->adj_x(options->adjoint_lhs);
diff --git a/compiler/luci/import/src/Nodes/CircleBatchToSpaceND.cpp b/compiler/luci/import/src/Nodes/CircleBatchToSpaceND.cpp
index 4bbfadf64..8c2039fff 100644
--- a/compiler/luci/import/src/Nodes/CircleBatchToSpaceND.cpp
+++ b/compiler/luci/import/src/Nodes/CircleBatchToSpaceND.cpp
@@ -33,7 +33,7 @@ bool CircleBatchToSpaceNDGraphBuilder::validate(const ValidateArgs &args) const
// input 1 and 2 should have INT32/INT64 type
const auto &tensors = args.reader.tensors();
- const auto &tensor_1 = tensors.at(inputs[1]);
+ const auto &tensor_1 = tensors.at(inputs.at(1));
switch (tensor_1->type)
{
case circle::TensorType_INT32:
@@ -42,7 +42,7 @@ bool CircleBatchToSpaceNDGraphBuilder::validate(const ValidateArgs &args) const
default:
return false;
}
- const auto &tensor_2 = tensors.at(inputs[2]);
+ const auto &tensor_2 = tensors.at(inputs.at(2));
switch (tensor_2->type)
{
case circle::TensorType_INT32:
@@ -53,7 +53,7 @@ bool CircleBatchToSpaceNDGraphBuilder::validate(const ValidateArgs &args) const
}
 // Only input shape dimensions 3 and 4 are supported
- const auto &tensor_0 = tensors.at(inputs[0]);
+ const auto &tensor_0 = tensors.at(inputs.at(0));
const auto t_0_s = tensor_0->shape.size();
if (t_0_s != 3 && t_0_s != 4)
return false;
@@ -68,9 +68,9 @@ CircleNode *CircleBatchToSpaceNDGraphBuilder::build_node(const circle::OperatorT
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleBatchToSpaceND>();
- node->input(inputs[0]);
- node->block_shape(inputs[1]);
- node->crops(inputs[2]);
+ node->input(inputs.at(0));
+ node->block_shape(inputs.at(1));
+ node->crops(inputs.at(2));
// No options for BatchToSpaceND
diff --git a/compiler/luci/import/src/Nodes/CircleCast.cpp b/compiler/luci/import/src/Nodes/CircleCast.cpp
index a4d09b505..7bdb63044 100644
--- a/compiler/luci/import/src/Nodes/CircleCast.cpp
+++ b/compiler/luci/import/src/Nodes/CircleCast.cpp
@@ -47,7 +47,7 @@ bool CircleCastGraphBuilder::validate(const ValidateArgs &args) const
const circle::TensorT &output_tensor = *tensors[outputs[0]];
auto name = tensor_name(output_tensor);
- const auto &tensor_in = tensors.at(inputs[0]);
+ const auto &tensor_in = tensors.at(inputs.at(0));
if (tensor_in->type != options->in_data_type)
{
if (settings->get(luci::UserSettings::Key::DisableValidation))
@@ -77,7 +77,7 @@ CircleNode *CircleCastGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleCast>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
const auto *options = op.builtin_options.AsCastOptions();
if (options != nullptr)
@@ -87,7 +87,7 @@ CircleNode *CircleCastGraphBuilder::build_node(const circle::OperatorT &op,
}
else
{
- node->in_data_type(inputs[0]->dtype());
+ node->in_data_type(inputs.at(0)->dtype());
node->out_data_type(loco::DataType::Unknown);
// type inference should use node->dtype() for Unknown
// export should use BuiltinOptions_NONE for Unknown
diff --git a/compiler/luci/import/src/Nodes/CircleCeil.cpp b/compiler/luci/import/src/Nodes/CircleCeil.cpp
index d3d6cd945..2e1aaa295 100644
--- a/compiler/luci/import/src/Nodes/CircleCeil.cpp
+++ b/compiler/luci/import/src/Nodes/CircleCeil.cpp
@@ -42,7 +42,7 @@ CircleNode *CircleCeilGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleCeil>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleConv2D.cpp b/compiler/luci/import/src/Nodes/CircleConv2D.cpp
index 42c5c265a..9516ef16a 100644
--- a/compiler/luci/import/src/Nodes/CircleConv2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleConv2D.cpp
@@ -39,11 +39,11 @@ CircleNode *CircleConv2DGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleConv2D>();
- node->input(inputs[0]);
- node->filter(inputs[1]);
+ node->input(inputs.at(0));
+ node->filter(inputs.at(1));
// For now, bias is required (checked in `verify` method).
assert(inputs.size() == 3);
- node->bias(inputs[2]);
+ node->bias(inputs.at(2));
const auto *options = op.builtin_options.AsConv2DOptions();
node->padding(luci_padding(options->padding));
diff --git a/compiler/luci/import/src/Nodes/CircleCos.cpp b/compiler/luci/import/src/Nodes/CircleCos.cpp
index 5f61cc7f6..27d60c62c 100644
--- a/compiler/luci/import/src/Nodes/CircleCos.cpp
+++ b/compiler/luci/import/src/Nodes/CircleCos.cpp
@@ -36,7 +36,7 @@ CircleNode *CircleCosGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleCos>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
// No options for Cos
diff --git a/compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp b/compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp
index 827b63468..49d31bb99 100644
--- a/compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp
+++ b/compiler/luci/import/src/Nodes/CircleDepthToSpace.cpp
@@ -40,7 +40,7 @@ bool CircleDepthToSpaceGraphBuilder::validate(const ValidateArgs &args) const
const auto &tensors = args.reader.tensors();
- if (tensors[outputs[0]]->type != tensors[inputs[0]]->type)
+ if (tensors[outputs[0]]->type != tensors[inputs.at(0)]->type)
{
return false;
}
@@ -56,7 +56,7 @@ CircleNode *CircleDepthToSpaceGraphBuilder::build_node(const circle::OperatorT &
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleDepthToSpace>();
- node->input(inputs[0]);
+ node->input(inputs.at(0));
const auto *options = op.builtin_options.AsDepthToSpaceOptions();
node->block_size(options->block_size);
diff --git a/compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp b/compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp
index 2b13f9ebb..53f85f2f5 100644
--- a/compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleDepthwiseConv2D.cpp
@@ -40,11 +40,11 @@ CircleNode *CircleDepthwiseConv2DGraphBuilder::build_node(const circle::Operator
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleDepthwiseConv2D>();
- node->input(inputs[0]);
- node->filter(inputs[1]);
+ node->input(inputs.at(0));
+ node->filter(inputs.at(1));
if (inputs.size() != 3)
throw oops::UserExn("DepthwiseConv2d without bias is unsupported");
- node->bias(inputs[2]);
+ node->bias(inputs.at(2));
const auto *options = op.builtin_options.AsDepthwiseConv2DOptions();
node->padding(luci_padding(options->padding));
diff --git a/compiler/luci/import/src/Nodes/CircleDiv.cpp b/compiler/luci/import/src/Nodes/CircleDiv.cpp
index d09cfb815..615c224d7 100644
--- a/compiler/luci/import/src/Nodes/CircleDiv.cpp
+++ b/compiler/luci/import/src/Nodes/CircleDiv.cpp
@@ -37,8 +37,8 @@ CircleNode *CircleDivGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto node = graph->nodes()->create<CircleDiv>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
const auto *options = op.builtin_options.AsDivOptions();
node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
diff --git a/compiler/luci/import/src/Nodes/CircleElu.cpp b/compiler/luci/import/src/Nodes/CircleElu.cpp
index 37a290cb1..919e95ee4 100644
--- a/compiler/luci/import/src/Nodes/CircleElu.cpp
+++ b/compiler/luci/import/src/Nodes/CircleElu.cpp
@@ -35,7 +35,7 @@ bool CircleEluGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
@@ -56,7 +56,7 @@ CircleNode *CircleEluGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleElu>();
- node->features(inputs[0]);
+ node->features(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleEqual.cpp b/compiler/luci/import/src/Nodes/CircleEqual.cpp
index a53f6e94b..1db33b8ac 100644
--- a/compiler/luci/import/src/Nodes/CircleEqual.cpp
+++ b/compiler/luci/import/src/Nodes/CircleEqual.cpp
@@ -34,7 +34,7 @@ bool CircleEqualGraphBuilder::validate(const ValidateArgs &args) const
const auto &tensors = args.reader.tensors();
- return tensors[inputs[0]]->type == tensors[inputs[1]]->type;
+ return tensors[inputs.at(0)]->type == tensors[inputs.at(1)]->type;
}
CircleNode *CircleEqualGraphBuilder::build_node(const circle::OperatorT &,
@@ -42,8 +42,8 @@ CircleNode *CircleEqualGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleEqual>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleExp.cpp b/compiler/luci/import/src/Nodes/CircleExp.cpp
index a32851458..2c031d6b3 100644
--- a/compiler/luci/import/src/Nodes/CircleExp.cpp
+++ b/compiler/luci/import/src/Nodes/CircleExp.cpp
@@ -31,7 +31,7 @@ bool CircleExpGraphBuilder::validate(const ValidateArgs &args) const
// input type check
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
case circle::TensorType_FLOAT16:
@@ -51,7 +51,7 @@ CircleNode *CircleExpGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleExp>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleExpandDims.cpp b/compiler/luci/import/src/Nodes/CircleExpandDims.cpp
index 1cef67a83..ab537c710 100644
--- a/compiler/luci/import/src/Nodes/CircleExpandDims.cpp
+++ b/compiler/luci/import/src/Nodes/CircleExpandDims.cpp
@@ -34,7 +34,7 @@ bool CircleExpandDimsGraphBuilder::validate(const ValidateArgs &args) const
const auto &tensors = args.reader.tensors();
- return tensors[inputs[1]]->type == circle::TensorType_INT32;
+ return tensors[inputs.at(1)]->type == circle::TensorType_INT32;
}
CircleNode *CircleExpandDimsGraphBuilder::build_node(const circle::OperatorT &,
@@ -42,8 +42,8 @@ CircleNode *CircleExpandDimsGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleExpandDims>();
- node->input(inputs[0]);
- node->axis(inputs[1]);
+ node->input(inputs.at(0));
+ node->axis(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleFill.cpp b/compiler/luci/import/src/Nodes/CircleFill.cpp
index 6c3d3a247..95d5b876b 100644
--- a/compiler/luci/import/src/Nodes/CircleFill.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFill.cpp
@@ -37,8 +37,8 @@ CircleNode *CircleFillGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleFill>();
- node->dims(inputs[0]);
- node->value(inputs[1]);
+ node->dims(inputs.at(0));
+ node->value(inputs.at(1));
const auto *options = op.builtin_options.AsFillOptions();
(void)options;
diff --git a/compiler/luci/import/src/Nodes/CircleFloor.cpp b/compiler/luci/import/src/Nodes/CircleFloor.cpp
index 302a9eae3..ce756b3b1 100644
--- a/compiler/luci/import/src/Nodes/CircleFloor.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFloor.cpp
@@ -42,7 +42,7 @@ CircleNode *CircleFloorGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleFloor>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleFloorDiv.cpp b/compiler/luci/import/src/Nodes/CircleFloorDiv.cpp
index 875197890..55f385d60 100644
--- a/compiler/luci/import/src/Nodes/CircleFloorDiv.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFloorDiv.cpp
@@ -39,8 +39,8 @@ bool CircleFloorDivGraphBuilder::validate(const ValidateArgs &args) const
}
const auto &tensors = args.reader.tensors();
- const auto &tensor_in_0 = tensors.at(inputs[0]);
- const auto &tensor_in_1 = tensors.at(inputs[1]);
+ const auto &tensor_in_0 = tensors.at(inputs.at(0));
+ const auto &tensor_in_1 = tensors.at(inputs.at(1));
const auto &tensor_out = tensors.at(outputs[0]);
if (tensor_in_0->type != tensor_in_1->type)
@@ -59,8 +59,8 @@ CircleNode *CircleFloorDivGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleFloorDiv>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleFloorMod.cpp b/compiler/luci/import/src/Nodes/CircleFloorMod.cpp
index 3ccdce0cd..2101e417e 100644
--- a/compiler/luci/import/src/Nodes/CircleFloorMod.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFloorMod.cpp
@@ -33,8 +33,8 @@ bool CircleFloorModGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_in_0 = tensors.at(inputs[0]);
- const auto &tensor_in_1 = tensors.at(inputs[1]);
+ const auto &tensor_in_0 = tensors.at(inputs.at(0));
+ const auto &tensor_in_1 = tensors.at(inputs.at(1));
if (tensor_in_0->type != tensor_in_1->type)
return false;
@@ -48,8 +48,8 @@ CircleNode *CircleFloorModGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleFloorMod>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp b/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp
index 8937e78f1..65a863bde 100644
--- a/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp
+++ b/compiler/luci/import/src/Nodes/CircleFullyConnected.cpp
@@ -38,9 +38,9 @@ CircleNode *CircleFullyConnectedGraphBuilder::build_node(const circle::OperatorT
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleFullyConnected>();
- node->input(inputs[0]);
- node->weights(inputs[1]);
- node->bias(inputs[2]); // bias is optional
+ node->input(inputs.at(0));
+ node->weights(inputs.at(1));
+ node->bias(inputs.at(2)); // bias is optional
// TODO Find and move to appropriate place for setting optional input
if (auto bias = dynamic_cast<luci::CircleOutputExclude *>(node->bias()))
diff --git a/compiler/luci/import/src/Nodes/CircleGather.cpp b/compiler/luci/import/src/Nodes/CircleGather.cpp
index 1caa05ec2..75447a38a 100644
--- a/compiler/luci/import/src/Nodes/CircleGather.cpp
+++ b/compiler/luci/import/src/Nodes/CircleGather.cpp
@@ -56,8 +56,8 @@ CircleNode *CircleGatherGraphBuilder::build_node(const circle::OperatorT &op,
{
auto *node = graph->nodes()->create<CircleGather>();
- node->params(inputs[0]);
- node->indices(inputs[1]);
+ node->params(inputs.at(0));
+ node->indices(inputs.at(1));
const auto *options = op.builtin_options.AsGatherOptions();
node->axis(options->axis);
diff --git a/compiler/luci/import/src/Nodes/CircleGatherNd.cpp b/compiler/luci/import/src/Nodes/CircleGatherNd.cpp
index 621d4ae92..981adbf63 100644
--- a/compiler/luci/import/src/Nodes/CircleGatherNd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleGatherNd.cpp
@@ -36,7 +36,7 @@ bool CircleGatherNdGraphBuilder::validate(const ValidateArgs &args) const
if (outputs.size() != 1)
return false;
- auto &indices_tensor = args.reader.tensors()[inputs[1]];
+ auto &indices_tensor = args.reader.tensors()[inputs.at(1)];
if (!(indices_tensor->type == circle::TensorType::TensorType_INT32 ||
indices_tensor->type == circle::TensorType::TensorType_INT64))
@@ -53,8 +53,8 @@ CircleNode *CircleGatherNdGraphBuilder::build_node(const circle::OperatorT &,
{
auto *node = graph->nodes()->create<CircleGatherNd>();
- node->params(inputs[0]);
- node->indices(inputs[1]);
+ node->params(inputs.at(0));
+ node->indices(inputs.at(1));
// GatherNd options empty
diff --git a/compiler/luci/import/src/Nodes/CircleGreater.cpp b/compiler/luci/import/src/Nodes/CircleGreater.cpp
index 88107589c..1ad0467e4 100644
--- a/compiler/luci/import/src/Nodes/CircleGreater.cpp
+++ b/compiler/luci/import/src/Nodes/CircleGreater.cpp
@@ -43,7 +43,7 @@ bool CircleGreaterGraphBuilder::validate(const ValidateArgs &args) const
const auto &tensors = args.reader.tensors();
- if (tensors[inputs[0]]->type != tensors[inputs[1]]->type)
+ if (tensors[inputs.at(0)]->type != tensors[inputs.at(1)]->type)
return false;
// NOTE: real models do have output dtype NOT BOOL
@@ -67,8 +67,8 @@ CircleNode *CircleGreaterGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleGreater>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp b/compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp
index dff1510c5..0ac63b017 100644
--- a/compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp
+++ b/compiler/luci/import/src/Nodes/CircleGreaterEqual.cpp
@@ -40,7 +40,7 @@ bool CircleGreaterEqualGraphBuilder::validate(const ValidateArgs &args) const
const auto &tensors = args.reader.tensors();
- if (tensors[inputs[0]]->type != tensors[inputs[1]]->type)
+ if (tensors[inputs.at(0)]->type != tensors[inputs.at(1)]->type)
{
return false;
}
@@ -53,8 +53,8 @@ CircleNode *CircleGreaterEqualGraphBuilder::build_node(const circle::OperatorT &
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleGreaterEqual>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleIf.cpp b/compiler/luci/import/src/Nodes/CircleIf.cpp
index d6090640d..db9ffe1cd 100644
--- a/compiler/luci/import/src/Nodes/CircleIf.cpp
+++ b/compiler/luci/import/src/Nodes/CircleIf.cpp
@@ -43,7 +43,7 @@ bool CircleIfGraphBuilder::validate(const ValidateArgs &args) const
// input 0 should be BOOL type
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
if (tensor->type != circle::TensorType_BOOL)
return false;
diff --git a/compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp b/compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp
index b95c54c89..6349fd3b7 100644
--- a/compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp
+++ b/compiler/luci/import/src/Nodes/CircleInstanceNorm.cpp
@@ -38,9 +38,9 @@ CircleNode *CircleInstanceNormGraphBuilder::build_node(const circle::OperatorT &
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleInstanceNorm>();
- node->input(inputs[0]);
- node->gamma(inputs[1]);
- node->beta(inputs[2]);
+ node->input(inputs.at(0));
+ node->gamma(inputs.at(1));
+ node->beta(inputs.at(2));
const auto *options = op.builtin_options.AsInstanceNormOptions();
node->epsilon(options->epsilon);
diff --git a/compiler/luci/import/src/Nodes/CircleL2Normalize.cpp b/compiler/luci/import/src/Nodes/CircleL2Normalize.cpp
index fe10a8572..e4fdc200c 100644
--- a/compiler/luci/import/src/Nodes/CircleL2Normalize.cpp
+++ b/compiler/luci/import/src/Nodes/CircleL2Normalize.cpp
@@ -46,7 +46,7 @@ CircleNode *CircleL2NormalizeGraphBuilder::build_node(const circle::OperatorT &o
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleL2Normalize>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
const auto *options = op.builtin_options.AsL2NormOptions();
node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
diff --git a/compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp b/compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp
index 023206695..202d9d6fb 100644
--- a/compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleL2Pool2D.cpp
@@ -38,7 +38,7 @@ CircleNode *CircleL2Pool2DGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleL2Pool2D>();
- node->value(inputs[0]);
+ node->value(inputs.at(0));
const auto *options = op.builtin_options.AsPool2DOptions();
node->padding(luci_padding(options->padding));
diff --git a/compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp b/compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp
index 4957ceae0..ad4979f39 100644
--- a/compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLeakyRelu.cpp
@@ -39,7 +39,7 @@ CircleNode *CircleLeakyReluGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLeakyRelu>();
- node->features(inputs[0]);
+ node->features(inputs.at(0));
const auto *options = op.builtin_options.AsLeakyReluOptions();
node->alpha(options->alpha);
diff --git a/compiler/luci/import/src/Nodes/CircleLess.cpp b/compiler/luci/import/src/Nodes/CircleLess.cpp
index 40ad28c6e..506036908 100644
--- a/compiler/luci/import/src/Nodes/CircleLess.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLess.cpp
@@ -39,7 +39,7 @@ bool CircleLessGraphBuilder::validate(const ValidateArgs &args) const
}
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
@@ -56,7 +56,7 @@ bool CircleLessGraphBuilder::validate(const ValidateArgs &args) const
return false;
}
- if (tensors[inputs[1]]->type != tensor->type)
+ if (tensors[inputs.at(1)]->type != tensor->type)
{
return false;
}
@@ -69,8 +69,8 @@ CircleNode *CircleLessGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLess>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleLessEqual.cpp b/compiler/luci/import/src/Nodes/CircleLessEqual.cpp
index 13e995069..9b4f934a5 100644
--- a/compiler/luci/import/src/Nodes/CircleLessEqual.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLessEqual.cpp
@@ -40,7 +40,7 @@ bool CircleLessEqualGraphBuilder::validate(const ValidateArgs &args) const
const auto &tensors = args.reader.tensors();
- if (tensors[inputs[0]]->type != tensors[inputs[1]]->type)
+ if (tensors[inputs.at(0)]->type != tensors[inputs.at(1)]->type)
{
return false;
}
@@ -53,8 +53,8 @@ CircleNode *CircleLessEqualGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLessEqual>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp b/compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp
index 7b1f0db56..0e32f62de 100644
--- a/compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLocalResponseNormalization.cpp
@@ -37,7 +37,7 @@ CircleNode *CircleLocalResponseNormalizationGraphBuilder::build_node(
const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLocalResponseNormalization>();
- node->input(inputs[0]);
+ node->input(inputs.at(0));
const auto *options = op.builtin_options.AsLocalResponseNormalizationOptions();
node->radius(options->radius);
diff --git a/compiler/luci/import/src/Nodes/CircleLog.cpp b/compiler/luci/import/src/Nodes/CircleLog.cpp
index 21408327d..346fc43bb 100644
--- a/compiler/luci/import/src/Nodes/CircleLog.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLog.cpp
@@ -35,7 +35,7 @@ bool CircleLogGraphBuilder::validate(const ValidateArgs &args) const
// Must be one of bfloat16, half, float32, float64, complex64, complex128.
// Currently circle supports half(float16), float32, float64, complex64.
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
case circle::TensorType_FLOAT16:
@@ -55,7 +55,7 @@ CircleNode *CircleLogGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLog>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
// No options for Log
diff --git a/compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp b/compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp
index e738c4a0c..ef69e868a 100644
--- a/compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogSoftmax.cpp
@@ -38,7 +38,7 @@ CircleNode *CircleLogSoftmaxGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLogSoftmax>();
- node->logits(inputs[0]);
+ node->logits(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp b/compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp
index 8509dbaf3..7844da0f6 100644
--- a/compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogicalAnd.cpp
@@ -46,8 +46,8 @@ CircleNode *CircleLogicalAndGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLogicalAnd>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleLogicalNot.cpp b/compiler/luci/import/src/Nodes/CircleLogicalNot.cpp
index b1ed3ea37..3758642e4 100644
--- a/compiler/luci/import/src/Nodes/CircleLogicalNot.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogicalNot.cpp
@@ -31,7 +31,7 @@ bool CircleLogicalNotGraphBuilder::validate(const ValidateArgs &args) const
// Only BOOL type is allowed for the input
const auto &inputs = args.op.inputs;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
if (tensor->type != circle::TensorType::TensorType_BOOL)
return false;
@@ -43,7 +43,7 @@ CircleNode *CircleLogicalNotGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLogicalNot>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleLogicalOr.cpp b/compiler/luci/import/src/Nodes/CircleLogicalOr.cpp
index 00eb9c5df..1b87e6f9c 100644
--- a/compiler/luci/import/src/Nodes/CircleLogicalOr.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogicalOr.cpp
@@ -46,8 +46,8 @@ CircleNode *CircleLogicalOrGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLogicalOr>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleLogistic.cpp b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
index 85e7e55b2..9606e19cd 100644
--- a/compiler/luci/import/src/Nodes/CircleLogistic.cpp
+++ b/compiler/luci/import/src/Nodes/CircleLogistic.cpp
@@ -32,22 +32,8 @@ bool CircleLogisticGraphBuilder::validate(const ValidateArgs &args) const
if (outputs.size() != 1)
return false;
- // Must be one of the following types
- // float16, float32, float64, complex64, or complex128
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
- switch (tensor->type)
- {
- case circle::TensorType_FLOAT16:
- case circle::TensorType_FLOAT32:
- case circle::TensorType_FLOAT64:
- case circle::TensorType_COMPLEX64:
- break;
- default:
- return false;
- }
-
- if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type)
+ if (tensors.at(inputs.at(0))->type != tensors.at(outputs[0])->type)
return false;
return true;
@@ -58,7 +44,7 @@ CircleNode *CircleLogisticGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleLogistic>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp b/compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp
index f4ae03c58..a4a21a8b7 100644
--- a/compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMatrixDiag.cpp
@@ -35,7 +35,7 @@ bool CircleMatrixDiagGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
if (tensors[outputs[0]]->type != tensor->type)
return false;
@@ -48,7 +48,7 @@ CircleNode *CircleMatrixDiagGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleMatrixDiag>();
- node->diagonal(inputs[0]);
+ node->diagonal(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp b/compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp
index d6f6aee33..cf0313149 100644
--- a/compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMatrixSetDiag.cpp
@@ -35,7 +35,7 @@ bool CircleMatrixSetDiagGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
if (tensors[outputs[0]]->type != tensor->type)
return false;
@@ -48,8 +48,8 @@ CircleNode *CircleMatrixSetDiagGraphBuilder::build_node(const circle::OperatorT
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleMatrixSetDiag>();
- node->input(inputs[0]);
- node->diagonal(inputs[1]);
+ node->input(inputs.at(0));
+ node->diagonal(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp b/compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp
index 1798819cf..4bca0f40b 100644
--- a/compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMaxPool2D.cpp
@@ -36,7 +36,7 @@ CircleNode *CircleMaxPool2DGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleMaxPool2D>();
- node->value(inputs[0]);
+ node->value(inputs.at(0));
const auto *options = op.builtin_options.AsPool2DOptions();
node->padding(luci_padding(options->padding));
diff --git a/compiler/luci/import/src/Nodes/CircleMaximum.cpp b/compiler/luci/import/src/Nodes/CircleMaximum.cpp
index 6ca7e4079..4d1468f19 100644
--- a/compiler/luci/import/src/Nodes/CircleMaximum.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMaximum.cpp
@@ -35,7 +35,7 @@ bool CircleMaximumGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
@@ -49,7 +49,7 @@ bool CircleMaximumGraphBuilder::validate(const ValidateArgs &args) const
return false;
}
- if (tensors[inputs[1]]->type != tensor->type)
+ if (tensors[inputs.at(1)]->type != tensor->type)
return false;
if (tensors[outputs[0]]->type != tensor->type)
@@ -63,8 +63,8 @@ CircleNode *CircleMaximumGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleMaximum>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleMean.cpp b/compiler/luci/import/src/Nodes/CircleMean.cpp
index 8261c7b38..d8fa9a53d 100644
--- a/compiler/luci/import/src/Nodes/CircleMean.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMean.cpp
@@ -34,8 +34,8 @@ CircleNode *CircleMeanGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleMean>();
- node->input(inputs[0]);
- node->reduction_indices(inputs[1]);
+ node->input(inputs.at(0));
+ node->reduction_indices(inputs.at(1));
const auto *options = op.builtin_options.AsReducerOptions();
node->keep_dims(options->keep_dims);
diff --git a/compiler/luci/import/src/Nodes/CircleMinimum.cpp b/compiler/luci/import/src/Nodes/CircleMinimum.cpp
index b770f365f..8b4daf197 100644
--- a/compiler/luci/import/src/Nodes/CircleMinimum.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMinimum.cpp
@@ -35,7 +35,7 @@ bool CircleMinimumGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
@@ -49,7 +49,7 @@ bool CircleMinimumGraphBuilder::validate(const ValidateArgs &args) const
return false;
}
- if (tensors[inputs[1]]->type != tensor->type)
+ if (tensors[inputs.at(1)]->type != tensor->type)
return false;
if (tensors[outputs[0]]->type != tensor->type)
@@ -63,8 +63,8 @@ CircleNode *CircleMinimumGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleMinimum>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleMirrorPad.cpp b/compiler/luci/import/src/Nodes/CircleMirrorPad.cpp
index 41b5e5d80..e0ddd4c11 100644
--- a/compiler/luci/import/src/Nodes/CircleMirrorPad.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMirrorPad.cpp
@@ -38,8 +38,8 @@ CircleNode *CircleMirrorPadGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleMirrorPad>();
- node->input(inputs[0]);
- node->paddings(inputs[1]);
+ node->input(inputs.at(0));
+ node->paddings(inputs.at(1));
const auto *options = op.builtin_options.AsMirrorPadOptions();
node->mode(luci_mirrorpad_mode(options->mode));
diff --git a/compiler/luci/import/src/Nodes/CircleMul.cpp b/compiler/luci/import/src/Nodes/CircleMul.cpp
index d4412b96b..e3c4a7ee5 100644
--- a/compiler/luci/import/src/Nodes/CircleMul.cpp
+++ b/compiler/luci/import/src/Nodes/CircleMul.cpp
@@ -37,8 +37,8 @@ CircleNode *CircleMulGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleMul>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
const auto *options = op.builtin_options.AsMulOptions();
node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
diff --git a/compiler/luci/import/src/Nodes/CircleNeg.cpp b/compiler/luci/import/src/Nodes/CircleNeg.cpp
index 3d3079ca2..a64a69560 100644
--- a/compiler/luci/import/src/Nodes/CircleNeg.cpp
+++ b/compiler/luci/import/src/Nodes/CircleNeg.cpp
@@ -36,7 +36,7 @@ CircleNode *CircleNegGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleNeg>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp b/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp
new file mode 100644
index 000000000..a4ad4a53d
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleNonMaxSuppressionV4.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleNonMaxSuppressionV4.h"
+
+#include <luci/IR/Nodes/CircleNonMaxSuppressionV4.h>
+#include <luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h>
+
+#include <loco.h>
+#include <oops/UserExn.h>
+
+namespace luci
+{
+
+bool CircleNonMaxSuppressionV4GraphBuilder::validate(const ValidateArgs &args) const
+{
+ const auto &inputs = args.op.inputs;
+ const auto &outputs = args.op.outputs;
+
+ if (inputs.size() != 5)
+ return false;
+ if (outputs.size() != 2)
+ return false;
+
+ const auto &tensors = args.reader.tensors();
+ const auto &boxes_tensor = tensors.at(inputs.at(0));
+ if (boxes_tensor->shape.size() != 2)
+ return false;
+ if (boxes_tensor->shape.at(1) != 4)
+ return false;
+ if (boxes_tensor->shape.at(0) != tensors.at(inputs.at(1))->shape.at(0))
+ return false;
+
+ if (tensors.at(inputs.at(2))->type != circle::TensorType_INT32)
+ return false;
+ if (tensors.at(inputs.at(3))->type != circle::TensorType_FLOAT32)
+ return false;
+ if (tensors.at(inputs.at(4))->type != circle::TensorType_FLOAT32)
+ return false;
+
+ return true;
+}
+
+/**
+ * @brief NonMaxSuppressionV4 Node builder
+ *
+ * @note Current loco does not provide multiple outputs
+ * We will create multiple CircleNonMaxSuppressionV4Out nodes to emulate this
+ */
+
+void CircleNonMaxSuppressionV4GraphBuilder::build(const circle::OperatorT &op,
+ GraphBuilderContext *context) const
+{
+ assert(context != nullptr);
+
+ auto graph = context->graph();
+
+ const std::vector<int32_t> &inputs = op.inputs;
+ const std::vector<int32_t> &outputs = op.outputs;
+ const auto &tensors = context->reader()->tensors();
+ const auto &opcodes = context->reader()->opcodes();
+ auto tensors_ptr = context->reader()->tensors_ptr();
+ assert(tensors_ptr != nullptr);
+
+ std::vector<CircleNode *> input_nodes;
+ for (const int32_t input_tensor_index : inputs)
+ {
+ input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
+ }
+
+ // Create CircleNonMaxSuppressionV4
+ auto node = graph->nodes()->create<CircleNonMaxSuppressionV4>();
+ node->boxes(input_nodes[0]);
+ node->scores(input_nodes[1]);
+ node->max_output_size(input_nodes[2]);
+ node->iou_threshold(input_nodes[3]);
+ node->score_threshold(input_nodes[4]);
+
+ assert(outputs.size() == 2);
+ {
+ // Let's use the name of output 0 as the NonMaxSuppressionV4 name
+ const circle::TensorT &output_tensor = *tensors[outputs[0]];
+ node->name(tensor_name(output_tensor));
+ node->op_version(opcodes[op.opcode_index].get()->version);
+
+ // NOTE We don't set quantization for NonMaxSuppressionV4 itself but for virtual outputs
+ }
+
+ // Create virtual outputs of NonMaxSuppressionV4
+ for (size_t n = 0; n < outputs.size(); ++n)
+ {
+ const circle::TensorT &output_tensor = *tensors[outputs[n]];
+
+ auto *nodeout = graph->nodes()->create<CircleNonMaxSuppressionV4Out>();
+ copy_tensor_attributes(output_tensor, nodeout);
+
+ // mark shape_status
+ if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
+ nodeout->shape_status(ShapeStatus::NOSHAPE);
+ else
+ nodeout->shape_status(ShapeStatus::VALID);
+
+ nodeout->input(node);
+ nodeout->index(n);
+
+ context->nodefinder()->enroll(outputs[n], nodeout);
+ }
+}
+
+} // namespace luci
diff --git a/compiler/luci/import/src/Nodes/CircleNotEqual.cpp b/compiler/luci/import/src/Nodes/CircleNotEqual.cpp
index 5b04856db..77e986de1 100644
--- a/compiler/luci/import/src/Nodes/CircleNotEqual.cpp
+++ b/compiler/luci/import/src/Nodes/CircleNotEqual.cpp
@@ -40,7 +40,7 @@ bool CircleNotEqualGraphBuilder::validate(const ValidateArgs &args) const
const auto &tensors = args.reader.tensors();
- if (tensors[inputs[0]]->type != tensors[inputs[1]]->type)
+ if (tensors[inputs.at(0)]->type != tensors[inputs.at(1)]->type)
{
return false;
}
@@ -53,8 +53,8 @@ CircleNode *CircleNotEqualGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleNotEqual>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleOneHot.cpp b/compiler/luci/import/src/Nodes/CircleOneHot.cpp
index 9fdbfa84d..69294e1ed 100644
--- a/compiler/luci/import/src/Nodes/CircleOneHot.cpp
+++ b/compiler/luci/import/src/Nodes/CircleOneHot.cpp
@@ -38,10 +38,10 @@ bool CircleOneHotGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &indices = tensors.at(inputs[0]);
- const auto &depth = tensors.at(inputs[1]);
- const auto &on_value = tensors.at(inputs[2]);
- const auto &off_value = tensors.at(inputs[3]);
+ const auto &indices = tensors.at(inputs.at(0));
+ const auto &depth = tensors.at(inputs.at(1));
+ const auto &on_value = tensors.at(inputs.at(2));
+ const auto &off_value = tensors.at(inputs.at(3));
if (options->axis < -1 || options->axis > static_cast<int32_t>(indices->shape.size()))
return false;
@@ -63,10 +63,10 @@ CircleNode *CircleOneHotGraphBuilder::build_node(const circle::OperatorT &op,
{
auto *node = graph->nodes()->create<CircleOneHot>();
- node->indices(inputs[0]);
- node->depth(inputs[1]);
- node->on_value(inputs[2]);
- node->off_value(inputs[3]);
+ node->indices(inputs.at(0));
+ node->depth(inputs.at(1));
+ node->on_value(inputs.at(2));
+ node->off_value(inputs.at(3));
const auto *options = op.builtin_options.AsOneHotOptions();
node->axis(options->axis);
diff --git a/compiler/luci/import/src/Nodes/CirclePRelu.cpp b/compiler/luci/import/src/Nodes/CirclePRelu.cpp
index 0d87cd423..c07920f7c 100644
--- a/compiler/luci/import/src/Nodes/CirclePRelu.cpp
+++ b/compiler/luci/import/src/Nodes/CirclePRelu.cpp
@@ -39,8 +39,8 @@ CircleNode *CirclePReluGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CirclePRelu>();
- node->input(inputs[0]);
- node->alpha(inputs[1]);
+ node->input(inputs.at(0));
+ node->alpha(inputs.at(1));
// PRelu options are empty
diff --git a/compiler/luci/import/src/Nodes/CirclePad.cpp b/compiler/luci/import/src/Nodes/CirclePad.cpp
index 6abcf2d6c..999173b90 100644
--- a/compiler/luci/import/src/Nodes/CirclePad.cpp
+++ b/compiler/luci/import/src/Nodes/CirclePad.cpp
@@ -38,8 +38,8 @@ CircleNode *CirclePadGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CirclePad>();
- node->input(inputs[0]);
- node->paddings(inputs[1]);
+ node->input(inputs.at(0));
+ node->paddings(inputs.at(1));
const auto *options = op.builtin_options.AsPadOptions();
(void)options; // There are no options.
diff --git a/compiler/luci/import/src/Nodes/CirclePow.cpp b/compiler/luci/import/src/Nodes/CirclePow.cpp
index ff9833165..def012614 100644
--- a/compiler/luci/import/src/Nodes/CirclePow.cpp
+++ b/compiler/luci/import/src/Nodes/CirclePow.cpp
@@ -39,8 +39,8 @@ CircleNode *CirclePowGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CirclePow>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
// Pow options are empty
diff --git a/compiler/luci/import/src/Nodes/CircleRange.cpp b/compiler/luci/import/src/Nodes/CircleRange.cpp
index c21191605..38dc44ed6 100644
--- a/compiler/luci/import/src/Nodes/CircleRange.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRange.cpp
@@ -36,9 +36,9 @@ CircleNode *CircleRangeGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleRange>();
- node->start(inputs[0]);
- node->limit(inputs[1]);
- node->delta(inputs[2]);
+ node->start(inputs.at(0));
+ node->limit(inputs.at(1));
+ node->delta(inputs.at(2));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleRank.cpp b/compiler/luci/import/src/Nodes/CircleRank.cpp
index 705ae0120..12658b192 100644
--- a/compiler/luci/import/src/Nodes/CircleRank.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRank.cpp
@@ -38,7 +38,7 @@ CircleNode *CircleRankGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleRank>();
- node->input(inputs[0]);
+ node->input(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleReduceAny.cpp b/compiler/luci/import/src/Nodes/CircleReduceAny.cpp
index 030c5304c..21a821951 100644
--- a/compiler/luci/import/src/Nodes/CircleReduceAny.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReduceAny.cpp
@@ -31,8 +31,8 @@ bool CircleReduceAnyGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_0 = tensors.at(inputs[0]);
- const auto &tensor_1 = tensors.at(inputs[1]);
+ const auto &tensor_0 = tensors.at(inputs.at(0));
+ const auto &tensor_1 = tensors.at(inputs.at(1));
const auto &tensor_o = tensors.at(outputs[0]);
if (tensor_0->type != circle::TensorType_BOOL)
@@ -57,8 +57,8 @@ CircleNode *CircleReduceAnyGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleReduceAny>();
- node->input(inputs[0]);
- node->reduction_indices(inputs[1]);
+ node->input(inputs.at(0));
+ node->reduction_indices(inputs.at(1));
const auto *options = op.builtin_options.AsReducerOptions();
node->keep_dims(options->keep_dims);
diff --git a/compiler/luci/import/src/Nodes/CircleReduceMax.cpp b/compiler/luci/import/src/Nodes/CircleReduceMax.cpp
index 8ca8e2e34..05492dbc6 100644
--- a/compiler/luci/import/src/Nodes/CircleReduceMax.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReduceMax.cpp
@@ -33,7 +33,7 @@ bool CircleReduceMaxGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_axis = tensors.at(inputs[1]);
+ const auto &tensor_axis = tensors.at(inputs.at(1));
switch (tensor_axis->type)
{
@@ -52,8 +52,8 @@ CircleNode *CircleReduceMaxGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleReduceMax>();
- node->input(inputs[0]);
- node->reduction_indices(inputs[1]);
+ node->input(inputs.at(0));
+ node->reduction_indices(inputs.at(1));
const auto *options = op.builtin_options.AsReducerOptions();
node->keep_dims(options->keep_dims);
diff --git a/compiler/luci/import/src/Nodes/CircleReduceMin.cpp b/compiler/luci/import/src/Nodes/CircleReduceMin.cpp
index 3020c3778..117d5295a 100644
--- a/compiler/luci/import/src/Nodes/CircleReduceMin.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReduceMin.cpp
@@ -33,7 +33,7 @@ bool CircleReduceMinGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_axis = tensors.at(inputs[1]);
+ const auto &tensor_axis = tensors.at(inputs.at(1));
switch (tensor_axis->type)
{
@@ -52,8 +52,8 @@ CircleNode *CircleReduceMinGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleReduceMin>();
- node->input(inputs[0]);
- node->reduction_indices(inputs[1]);
+ node->input(inputs.at(0));
+ node->reduction_indices(inputs.at(1));
const auto *options = op.builtin_options.AsReducerOptions();
node->keep_dims(options->keep_dims);
diff --git a/compiler/luci/import/src/Nodes/CircleReduceProd.cpp b/compiler/luci/import/src/Nodes/CircleReduceProd.cpp
index 2bb43f6ce..5f054586e 100644
--- a/compiler/luci/import/src/Nodes/CircleReduceProd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReduceProd.cpp
@@ -30,7 +30,7 @@ bool CircleReduceProdGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_1 = tensors.at(inputs[1]);
+ const auto &tensor_1 = tensors.at(inputs.at(1));
// TODO check input types
@@ -52,8 +52,8 @@ CircleNode *CircleReduceProdGraphBuilder::build_node(const circle::OperatorT &op
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleReduceProd>();
- node->input(inputs[0]);
- node->reduction_indices(inputs[1]);
+ node->input(inputs.at(0));
+ node->reduction_indices(inputs.at(1));
const auto *options = op.builtin_options.AsReducerOptions();
node->keep_dims(options->keep_dims);
diff --git a/compiler/luci/import/src/Nodes/CircleRelu.cpp b/compiler/luci/import/src/Nodes/CircleRelu.cpp
index 056268a5b..8e1c32a3a 100644
--- a/compiler/luci/import/src/Nodes/CircleRelu.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRelu.cpp
@@ -39,7 +39,7 @@ CircleNode *CircleReluGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleRelu>();
- node->features(inputs[0]);
+ node->features(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleRelu6.cpp b/compiler/luci/import/src/Nodes/CircleRelu6.cpp
index 5b443993b..0283d7350 100644
--- a/compiler/luci/import/src/Nodes/CircleRelu6.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRelu6.cpp
@@ -39,7 +39,7 @@ CircleNode *CircleRelu6GraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleRelu6>();
- node->features(inputs[0]);
+ node->features(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleReluN1To1.cpp b/compiler/luci/import/src/Nodes/CircleReluN1To1.cpp
index edf662fb9..7f517bc0d 100644
--- a/compiler/luci/import/src/Nodes/CircleReluN1To1.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReluN1To1.cpp
@@ -41,7 +41,7 @@ CircleNode *CircleReluN1To1GraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleReluN1To1>();
- node->features(inputs[0]);
+ node->features(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleReshape.cpp b/compiler/luci/import/src/Nodes/CircleReshape.cpp
index f72c152b1..996ae9d20 100644
--- a/compiler/luci/import/src/Nodes/CircleReshape.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReshape.cpp
@@ -62,7 +62,7 @@ CircleNode *CircleReshapeGraphBuilder::build_node(const circle::OperatorT &op,
{
// If the second input is not provided, generate it based on the value of the attribute.
// TODO Presence of the second input is the current requirement of the IR.
- auto *shape_node = (inputs.size() == 2) ? inputs[1] : nullptr;
+ auto *shape_node = (inputs.size() == 2) ? inputs.at(1) : nullptr;
if (shape_node == nullptr)
{
const auto *options = op.builtin_options.AsReshapeOptions();
@@ -77,7 +77,7 @@ CircleNode *CircleReshapeGraphBuilder::build_node(const circle::OperatorT &op,
}
auto *node = graph->nodes()->create<CircleReshape>();
- node->tensor(inputs[0]);
+ node->tensor(inputs.at(0));
node->shape(shape_node);
const auto *options = op.builtin_options.AsReshapeOptions();
diff --git a/compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp b/compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp
index 6128f1b86..0fccb7b44 100644
--- a/compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp
+++ b/compiler/luci/import/src/Nodes/CircleResizeBilinear.cpp
@@ -38,8 +38,8 @@ CircleNode *CircleResizeBilinearGraphBuilder::build_node(const circle::OperatorT
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleResizeBilinear>();
- node->input(inputs[0]);
- node->size(inputs[1]);
+ node->input(inputs.at(0));
+ node->size(inputs.at(1));
const auto *options = op.builtin_options.AsResizeBilinearOptions();
node->align_corners(options->align_corners);
diff --git a/compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp b/compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp
index a1f1ef0ff..324323f59 100644
--- a/compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp
+++ b/compiler/luci/import/src/Nodes/CircleResizeNearestNeighbor.cpp
@@ -37,8 +37,8 @@ CircleNode *CircleResizeNearestNeighborGraphBuilder::build_node(
const circle::OperatorT &op, const std::vector<CircleNode *> &inputs, loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleResizeNearestNeighbor>();
- node->input(inputs[0]);
- node->size(inputs[1]);
+ node->input(inputs.at(0));
+ node->size(inputs.at(1));
const auto *options = op.builtin_options.AsResizeNearestNeighborOptions();
node->align_corners(options->align_corners);
diff --git a/compiler/luci/import/src/Nodes/CircleReverseSequence.cpp b/compiler/luci/import/src/Nodes/CircleReverseSequence.cpp
index 72d3b153d..ad11d4c63 100644
--- a/compiler/luci/import/src/Nodes/CircleReverseSequence.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReverseSequence.cpp
@@ -34,8 +34,8 @@ bool CircleReverseSequenceGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_in = tensors.at(inputs[0]);
- const auto &tensor_lengths = tensors.at(inputs[1]);
+ const auto &tensor_in = tensors.at(inputs.at(0));
+ const auto &tensor_lengths = tensors.at(inputs.at(1));
const auto &tensor_out = tensors.at(outputs[0]);
switch (tensor_lengths->type)
@@ -58,8 +58,8 @@ CircleNode *CircleReverseSequenceGraphBuilder::build_node(const circle::Operator
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleReverseSequence>();
- node->input(inputs[0]);
- node->seq_lengths(inputs[1]);
+ node->input(inputs.at(0));
+ node->seq_lengths(inputs.at(1));
const auto *options = op.builtin_options.AsReverseSequenceOptions();
node->seq_axis(options->seq_dim);
diff --git a/compiler/luci/import/src/Nodes/CircleReverseV2.cpp b/compiler/luci/import/src/Nodes/CircleReverseV2.cpp
index cd18128a7..e2e53bb4b 100644
--- a/compiler/luci/import/src/Nodes/CircleReverseV2.cpp
+++ b/compiler/luci/import/src/Nodes/CircleReverseV2.cpp
@@ -34,8 +34,8 @@ bool CircleReverseV2GraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_in = tensors.at(inputs[0]);
- const auto &tensor_axis = tensors.at(inputs[1]);
+ const auto &tensor_in = tensors.at(inputs.at(0));
+ const auto &tensor_axis = tensors.at(inputs.at(1));
const auto &tensor_out = tensors.at(outputs[0]);
switch (tensor_axis->type)
@@ -58,8 +58,8 @@ CircleNode *CircleReverseV2GraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleReverseV2>();
- node->tensor(inputs[0]);
- node->axis(inputs[1]);
+ node->tensor(inputs.at(0));
+ node->axis(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleRound.cpp b/compiler/luci/import/src/Nodes/CircleRound.cpp
index 896489521..ad77f9f03 100644
--- a/compiler/luci/import/src/Nodes/CircleRound.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRound.cpp
@@ -37,7 +37,7 @@ bool CircleRoundGraphBuilder::validate(const ValidateArgs &args) const
// bfloat16, half (float16), float32, float64, complex64, complex128
// Currently, circle supports float16, float32, complex64
const auto &tensors = args.reader.tensors();
- const auto &tensor_in = tensors.at(inputs[0]);
+ const auto &tensor_in = tensors.at(inputs.at(0));
const auto &tensor_out = tensors.at(outputs[0]);
switch (tensor_in->type)
@@ -63,7 +63,7 @@ CircleNode *CircleRoundGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleRound>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleRsqrt.cpp b/compiler/luci/import/src/Nodes/CircleRsqrt.cpp
index b5de0b575..ae05fbbf9 100644
--- a/compiler/luci/import/src/Nodes/CircleRsqrt.cpp
+++ b/compiler/luci/import/src/Nodes/CircleRsqrt.cpp
@@ -33,7 +33,7 @@ bool CircleRsqrtGraphBuilder::validate(const ValidateArgs &args) const
// bfloat16, half (float16), float32, float64, complex64, complex128
// Currently, circle supports float16, float32, complex64
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
case circle::TensorType_FLOAT16:
@@ -52,7 +52,7 @@ CircleNode *CircleRsqrtGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleRsqrt>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleScatterNd.cpp b/compiler/luci/import/src/Nodes/CircleScatterNd.cpp
index adcaa0030..7f86aeb74 100644
--- a/compiler/luci/import/src/Nodes/CircleScatterNd.cpp
+++ b/compiler/luci/import/src/Nodes/CircleScatterNd.cpp
@@ -32,12 +32,12 @@ bool CircleScatterNdGraphBuilder::validate(const ValidateArgs &args) const
// indices must have the same type as shape
const auto &tensors = args.reader.tensors();
- if (tensors[inputs[0]]->type != tensors[inputs[2]]->type)
+ if (tensors[inputs.at(0)]->type != tensors[inputs.at(2)]->type)
return false;
// indices must be either int32 or int64
- if (tensors[inputs[0]]->type != circle::TensorType_INT32 &&
- tensors[inputs[0]]->type != circle::TensorType_INT64)
+ if (tensors[inputs.at(0)]->type != circle::TensorType_INT32 &&
+ tensors[inputs.at(0)]->type != circle::TensorType_INT64)
return false;
return true;
@@ -48,9 +48,9 @@ CircleNode *CircleScatterNdGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleScatterNd>();
- node->indices(inputs[0]);
- node->updates(inputs[1]);
- node->shape(inputs[2]);
+ node->indices(inputs.at(0));
+ node->updates(inputs.at(1));
+ node->shape(inputs.at(2));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleSegmentSum.cpp b/compiler/luci/import/src/Nodes/CircleSegmentSum.cpp
index 1122bdca3..fb84e5d52 100644
--- a/compiler/luci/import/src/Nodes/CircleSegmentSum.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSegmentSum.cpp
@@ -33,9 +33,9 @@ bool CircleSegmentSumGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_in = tensors.at(inputs[0]);
+ const auto &tensor_in = tensors.at(inputs.at(0));
const auto &tensor_out = tensors.at(outputs[0]);
- const auto &tensor_ids = tensors.at(inputs[1]);
+ const auto &tensor_ids = tensors.at(inputs.at(1));
switch (tensor_ids->type)
{
@@ -59,8 +59,8 @@ CircleNode *CircleSegmentSumGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSegmentSum>();
- node->input(inputs[0]);
- node->segment_ids(inputs[1]);
+ node->input(inputs.at(0));
+ node->segment_ids(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleSelect.cpp b/compiler/luci/import/src/Nodes/CircleSelect.cpp
index ff94212c3..1e649f1e0 100644
--- a/compiler/luci/import/src/Nodes/CircleSelect.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSelect.cpp
@@ -33,7 +33,7 @@ bool CircleSelectGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
if (tensor->type != circle::TensorType_BOOL)
return false;
// TODO check dtypes for input 1, 2
@@ -46,9 +46,9 @@ CircleNode *CircleSelectGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSelect>();
- node->condition(inputs[0]);
- node->t(inputs[1]);
- node->e(inputs[2]);
+ node->condition(inputs.at(0));
+ node->t(inputs.at(1));
+ node->e(inputs.at(2));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleSelectV2.cpp b/compiler/luci/import/src/Nodes/CircleSelectV2.cpp
index 78b2e6459..e6dd04de0 100644
--- a/compiler/luci/import/src/Nodes/CircleSelectV2.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSelectV2.cpp
@@ -33,12 +33,12 @@ bool CircleSelectV2GraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &condition = tensors.at(inputs[0]);
+ const auto &condition = tensors.at(inputs.at(0));
if (condition->type != circle::TensorType_BOOL)
return false;
- const auto &t = tensors.at(inputs[1]);
- const auto &e = tensors.at(inputs[2]);
+ const auto &t = tensors.at(inputs.at(1));
+ const auto &e = tensors.at(inputs.at(2));
if (t->type != e->type)
return false;
@@ -50,9 +50,9 @@ CircleNode *CircleSelectV2GraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSelectV2>();
- node->condition(inputs[0]);
- node->t(inputs[1]);
- node->e(inputs[2]);
+ node->condition(inputs.at(0));
+ node->t(inputs.at(1));
+ node->e(inputs.at(2));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleShape.cpp b/compiler/luci/import/src/Nodes/CircleShape.cpp
index 864b5eb51..bd7dfc9d9 100644
--- a/compiler/luci/import/src/Nodes/CircleShape.cpp
+++ b/compiler/luci/import/src/Nodes/CircleShape.cpp
@@ -42,7 +42,7 @@ CircleNode *CircleShapeGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleShape>();
- node->input(inputs[0]);
+ node->input(inputs.at(0));
const auto *options = op.builtin_options.AsShapeOptions();
node->out_type(luci_datatype(options->out_type));
diff --git a/compiler/luci/import/src/Nodes/CircleSin.cpp b/compiler/luci/import/src/Nodes/CircleSin.cpp
index 61d60c78f..4b245ef6b 100644
--- a/compiler/luci/import/src/Nodes/CircleSin.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSin.cpp
@@ -33,7 +33,7 @@ bool CircleSinGraphBuilder::validate(const ValidateArgs &args) const
// input type check
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
case circle::TensorType_FLOAT16:
@@ -53,7 +53,7 @@ CircleNode *CircleSinGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSin>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
// No options for Sin
diff --git a/compiler/luci/import/src/Nodes/CircleSlice.cpp b/compiler/luci/import/src/Nodes/CircleSlice.cpp
index 313c35599..8601fbf21 100644
--- a/compiler/luci/import/src/Nodes/CircleSlice.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSlice.cpp
@@ -42,9 +42,9 @@ CircleNode *CircleSliceGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSlice>();
- node->input(inputs[0]);
- node->begin(inputs[1]);
- node->size(inputs[2]);
+ node->input(inputs.at(0));
+ node->begin(inputs.at(1));
+ node->size(inputs.at(2));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleSoftmax.cpp b/compiler/luci/import/src/Nodes/CircleSoftmax.cpp
index 0d316e18c..0ef0b5418 100644
--- a/compiler/luci/import/src/Nodes/CircleSoftmax.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSoftmax.cpp
@@ -38,7 +38,7 @@ CircleNode *CircleSoftmaxGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSoftmax>();
- node->logits(inputs[0]);
+ node->logits(inputs.at(0));
const auto *options = op.builtin_options.AsSoftmaxOptions();
node->beta(options->beta);
diff --git a/compiler/luci/import/src/Nodes/CircleSpaceToBatchND.cpp b/compiler/luci/import/src/Nodes/CircleSpaceToBatchND.cpp
index f1361fb11..c1d508e3e 100644
--- a/compiler/luci/import/src/Nodes/CircleSpaceToBatchND.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSpaceToBatchND.cpp
@@ -33,7 +33,7 @@ bool CircleSpaceToBatchNDGraphBuilder::validate(const ValidateArgs &args) const
// input 1 and 2 should have INT32/INT64 type
const auto &tensors = args.reader.tensors();
- const auto &tensor_1 = tensors.at(inputs[1]);
+ const auto &tensor_1 = tensors.at(inputs.at(1));
switch (tensor_1->type)
{
case circle::TensorType_INT32:
@@ -42,7 +42,7 @@ bool CircleSpaceToBatchNDGraphBuilder::validate(const ValidateArgs &args) const
default:
return false;
}
- const auto &tensor_2 = tensors.at(inputs[2]);
+ const auto &tensor_2 = tensors.at(inputs.at(2));
switch (tensor_2->type)
{
case circle::TensorType_INT32:
@@ -53,7 +53,7 @@ bool CircleSpaceToBatchNDGraphBuilder::validate(const ValidateArgs &args) const
}
// Only support input shape dimension 3 and 4 only
- const auto &tensor_0 = tensors.at(inputs[0]);
+ const auto &tensor_0 = tensors.at(inputs.at(0));
const auto t_0_s = tensor_0->shape.size();
if (t_0_s != 3 && t_0_s != 4)
return false;
@@ -68,9 +68,9 @@ CircleNode *CircleSpaceToBatchNDGraphBuilder::build_node(const circle::OperatorT
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSpaceToBatchND>();
- node->input(inputs[0]);
- node->block_shape(inputs[1]);
- node->paddings(inputs[2]);
+ node->input(inputs.at(0));
+ node->block_shape(inputs.at(1));
+ node->paddings(inputs.at(2));
// No options for SpaceToBatchND
diff --git a/compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp b/compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp
index b612c9a9a..8ccd55dc6 100644
--- a/compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSpaceToDepth.cpp
@@ -41,7 +41,7 @@ CircleNode *CircleSpaceToDepthGraphBuilder::build_node(const circle::OperatorT &
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSpaceToDepth>();
- node->input(inputs[0]);
+ node->input(inputs.at(0));
const auto *options = op.builtin_options.AsSpaceToDepthOptions();
node->block_size(options->block_size);
diff --git a/compiler/luci/import/src/Nodes/CircleSparseToDense.cpp b/compiler/luci/import/src/Nodes/CircleSparseToDense.cpp
index bfe790fc1..26d575e90 100644
--- a/compiler/luci/import/src/Nodes/CircleSparseToDense.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSparseToDense.cpp
@@ -36,10 +36,10 @@ CircleNode *CircleSparseToDenseGraphBuilder::build_node(const circle::OperatorT
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSparseToDense>();
- node->indices(inputs[0]);
- node->output_shape(inputs[1]);
- node->values(inputs[2]);
- node->default_value(inputs[3]);
+ node->indices(inputs.at(0));
+ node->output_shape(inputs.at(1));
+ node->values(inputs.at(2));
+ node->default_value(inputs.at(3));
const auto *options = op.builtin_options.AsSparseToDenseOptions();
node->validate_indices(options->validate_indices);
diff --git a/compiler/luci/import/src/Nodes/CircleSqrt.cpp b/compiler/luci/import/src/Nodes/CircleSqrt.cpp
index 8a90f6691..c8beaee0d 100644
--- a/compiler/luci/import/src/Nodes/CircleSqrt.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSqrt.cpp
@@ -36,7 +36,7 @@ CircleNode *CircleSqrtGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSqrt>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleSquare.cpp b/compiler/luci/import/src/Nodes/CircleSquare.cpp
index 8398548b6..b5ba048d7 100644
--- a/compiler/luci/import/src/Nodes/CircleSquare.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSquare.cpp
@@ -33,7 +33,7 @@ bool CircleSquareGraphBuilder::validate(const ValidateArgs &args) const
// bfloat16, half (float16), float32, float64, complex64, complex128
// Currently, circle supports float16, float32, complex64
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
case circle::TensorType_INT32:
@@ -55,7 +55,7 @@ CircleNode *CircleSquareGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSquare>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp b/compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp
index 93ce959e2..6deae94c5 100644
--- a/compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSquaredDifference.cpp
@@ -37,7 +37,7 @@ bool CircleSquaredDifferenceGraphBuilder::validate(const ValidateArgs &args) con
// Inputs must be one of the following types
// bfloat16, half(float16), float32, float64, int32, int64, complex64, complex128
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
switch (tensor->type)
{
case circle::TensorType_FLOAT16:
@@ -53,11 +53,11 @@ bool CircleSquaredDifferenceGraphBuilder::validate(const ValidateArgs &args) con
}
// Input types must match
- if (tensors.at(inputs[0])->type != tensors.at(inputs[1])->type)
+ if (tensors.at(inputs.at(0))->type != tensors.at(inputs.at(1))->type)
return false;
// Input and output types must match
- if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type)
+ if (tensors.at(inputs.at(0))->type != tensors.at(outputs[0])->type)
return false;
return true;
@@ -68,8 +68,8 @@ CircleNode *CircleSquaredDifferenceGraphBuilder::build_node(const circle::Operat
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSquaredDifference>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleSqueeze.cpp b/compiler/luci/import/src/Nodes/CircleSqueeze.cpp
index a5252d0bb..32792c266 100644
--- a/compiler/luci/import/src/Nodes/CircleSqueeze.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSqueeze.cpp
@@ -38,7 +38,7 @@ CircleNode *CircleSqueezeGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSqueeze>();
- node->input(inputs[0]);
+ node->input(inputs.at(0));
const auto *options = op.builtin_options.AsSqueezeOptions();
assert(options);
diff --git a/compiler/luci/import/src/Nodes/CircleStridedSlice.cpp b/compiler/luci/import/src/Nodes/CircleStridedSlice.cpp
index 95e446704..8f943a682 100644
--- a/compiler/luci/import/src/Nodes/CircleStridedSlice.cpp
+++ b/compiler/luci/import/src/Nodes/CircleStridedSlice.cpp
@@ -42,10 +42,10 @@ CircleNode *CircleStridedSliceGraphBuilder::build_node(const circle::OperatorT &
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleStridedSlice>();
- node->input(inputs[0]);
- node->begin(inputs[1]);
- node->end(inputs[2]);
- node->strides(inputs[3]);
+ node->input(inputs.at(0));
+ node->begin(inputs.at(1));
+ node->end(inputs.at(2));
+ node->strides(inputs.at(3));
const auto *options = op.builtin_options.AsStridedSliceOptions();
node->begin_mask(options->begin_mask);
diff --git a/compiler/luci/import/src/Nodes/CircleSub.cpp b/compiler/luci/import/src/Nodes/CircleSub.cpp
index 968e9f51f..9acf83d40 100644
--- a/compiler/luci/import/src/Nodes/CircleSub.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSub.cpp
@@ -39,8 +39,8 @@ CircleNode *CircleSubGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSub>();
- node->x(inputs[0]);
- node->y(inputs[1]);
+ node->x(inputs.at(0));
+ node->y(inputs.at(1));
const auto *options = op.builtin_options.AsSubOptions();
node->fusedActivationFunction(luci_actfunc(options->fused_activation_function));
diff --git a/compiler/luci/import/src/Nodes/CircleSum.cpp b/compiler/luci/import/src/Nodes/CircleSum.cpp
index b4865de59..bd3cb6239 100644
--- a/compiler/luci/import/src/Nodes/CircleSum.cpp
+++ b/compiler/luci/import/src/Nodes/CircleSum.cpp
@@ -34,8 +34,8 @@ CircleNode *CircleSumGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleSum>();
- node->input(inputs[0]);
- node->reduction_indices(inputs[1]);
+ node->input(inputs.at(0));
+ node->reduction_indices(inputs.at(1));
const auto *options = op.builtin_options.AsReducerOptions();
node->keep_dims(options->keep_dims);
diff --git a/compiler/luci/import/src/Nodes/CircleTanh.cpp b/compiler/luci/import/src/Nodes/CircleTanh.cpp
index 8986378c4..018f5701b 100644
--- a/compiler/luci/import/src/Nodes/CircleTanh.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTanh.cpp
@@ -28,21 +28,13 @@ bool CircleTanhGraphBuilder::validate(const ValidateArgs &args) const
const auto &inputs = args.op.inputs;
if (inputs.size() != 1)
return false;
+ const auto &outputs = args.op.outputs;
+ if (outputs.size() != 1)
+ return false;
- // Must be one of the following types
- // bfloat16, half (float16), float32, float64, complex64, complex128
- // Currently, circle supports float16, float32, complex64
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
- switch (tensor->type)
- {
- case circle::TensorType_FLOAT16:
- case circle::TensorType_FLOAT32:
- case circle::TensorType_COMPLEX64:
- break;
- default:
- return false;
- }
+ if (tensors.at(inputs.at(0))->type != tensors.at(outputs[0])->type)
+ return false;
return true;
}
@@ -52,7 +44,7 @@ CircleNode *CircleTanhGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleTanh>();
- node->x(inputs[0]);
+ node->x(inputs.at(0));
return node;
}
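Note: the rewritten validate() above replaces the per-dtype whitelist with a single rule — the one input and one output must share an element type. A minimal standalone sketch of that rule, assuming the generated circle schema header and the flatbuffer object-API accessors this importer already uses:

    #include <cstdint>
    #include <memory>
    #include <vector>
    #include <mio/circle/schema_generated.h> // assumed include path for the circle schema

    // Sketch only: mirrors the dtype-match rule in CircleTanhGraphBuilder::validate.
    bool same_io_dtype(const std::vector<std::unique_ptr<circle::TensorT>> &tensors,
                       const std::vector<int32_t> &inputs,
                       const std::vector<int32_t> &outputs)
    {
      if (inputs.size() != 1 || outputs.size() != 1)
        return false; // Tanh is unary with a single result
      return tensors.at(inputs.at(0))->type == tensors.at(outputs.at(0))->type;
    }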
diff --git a/compiler/luci/import/src/Nodes/CircleTile.cpp b/compiler/luci/import/src/Nodes/CircleTile.cpp
index 91054ce7f..bc6f320ba 100644
--- a/compiler/luci/import/src/Nodes/CircleTile.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTile.cpp
@@ -34,10 +34,10 @@ bool CircleTileGraphBuilder::validate(const ValidateArgs &args) const
if (outputs.size() != 1)
return false;
- // Multiples (inputs[1]) must be one of the following types
+ // Multiples (inputs.at(1)) must be one of the following types
// int32, int64
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[1]);
+ const auto &tensor = tensors.at(inputs.at(1));
switch (tensor->type)
{
case circle::TensorType_INT32:
@@ -48,7 +48,7 @@ bool CircleTileGraphBuilder::validate(const ValidateArgs &args) const
}
// Type of input and output must be the same
- if (tensors.at(inputs[0])->type != tensors.at(outputs[0])->type)
+ if (tensors.at(inputs.at(0))->type != tensors.at(outputs[0])->type)
return false;
return true;
@@ -59,8 +59,8 @@ CircleNode *CircleTileGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleTile>();
- node->input(inputs[0]);
- node->multiples(inputs[1]);
+ node->input(inputs.at(0));
+ node->multiples(inputs.at(1));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleTopKV2.cpp b/compiler/luci/import/src/Nodes/CircleTopKV2.cpp
index 5c1051c43..f0677de86 100644
--- a/compiler/luci/import/src/Nodes/CircleTopKV2.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTopKV2.cpp
@@ -36,7 +36,7 @@ bool CircleTopKV2GraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[1]);
+ const auto &tensor = tensors.at(inputs.at(1));
if (tensor->type != circle::TensorType_INT32)
return false;
diff --git a/compiler/luci/import/src/Nodes/CircleTranspose.cpp b/compiler/luci/import/src/Nodes/CircleTranspose.cpp
index 8622c8b80..cc3153085 100644
--- a/compiler/luci/import/src/Nodes/CircleTranspose.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTranspose.cpp
@@ -39,8 +39,8 @@ CircleNode *CircleTransposeGraphBuilder::build_node(const circle::OperatorT &op,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleTranspose>();
- node->a(inputs[0]);
- node->perm(inputs[1]);
+ node->a(inputs.at(0));
+ node->perm(inputs.at(1));
const auto *options = op.builtin_options.AsTransposeOptions();
(void)options;
diff --git a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
index 7bdf46daa..ddb196657 100644
--- a/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
+++ b/compiler/luci/import/src/Nodes/CircleTransposeConv.cpp
@@ -30,6 +30,24 @@ bool CircleTransposeConvGraphBuilder::validate(const ValidateArgs &args) const
if (args.op.inputs.size() != 3)
return false;
+ const auto &inputs = args.op.inputs;
+ const auto &tensors = args.reader.tensors();
+ const auto &filter_tensor = tensors.at(inputs.at(1));
+ const auto &filter_shape = filter_tensor.get()->shape;
+ const auto &ifm_tensor = tensors.at(inputs.at(2));
+ const auto &ifm_shape = ifm_tensor.get()->shape;
+
+ // ifm and filters must be 4-D tensor
+ if (ifm_shape.size() != 4)
+ return false;
+ if (filter_shape.size() != 4)
+ return false;
+
+ // input shape : [batch, height, width, in_channels]
+ // filters shape : [output_channels, height, width, in_channels]
+ if (ifm_tensor.get()->shape.at(3) != filter_tensor.get()->shape.at(3))
+ return false;
+
return true;
}
@@ -39,9 +57,9 @@ CircleNode *CircleTransposeConvGraphBuilder::build_node(const circle::OperatorT
{
auto *node = graph->nodes()->create<CircleTransposeConv>();
- node->inputSizes(inputs[0]);
- node->filter(inputs[1]);
- node->outBackprop(inputs[2]);
+ node->inputSizes(inputs.at(0));
+ node->filter(inputs.at(1));
+ node->outBackprop(inputs.at(2));
const auto *options = op.builtin_options.AsTransposeConvOptions();
node->padding(luci_padding(options->padding));
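Note: the new TransposeConv validation encodes two constraints — both the ifm and the filter must be 4-D, and their in_channels (last) dimensions must match. A compact sketch of the same predicate over plain shape vectors:

    #include <cstdint>
    #include <vector>

    // ifm:    [batch, height, width, in_channels]
    // filter: [output_channels, height, width, in_channels]
    bool valid_tconv_shapes(const std::vector<int32_t> &ifm_shape,
                            const std::vector<int32_t> &filter_shape)
    {
      if (ifm_shape.size() != 4 || filter_shape.size() != 4)
        return false;                               // both must be 4-D tensors
      return ifm_shape.at(3) == filter_shape.at(3); // in_channels must agree
    }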
diff --git a/compiler/luci/import/src/Nodes/CircleUnique.cpp b/compiler/luci/import/src/Nodes/CircleUnique.cpp
new file mode 100644
index 000000000..5e79a2920
--- /dev/null
+++ b/compiler/luci/import/src/Nodes/CircleUnique.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/Import/Nodes/CircleUnique.h"
+
+#include <luci/IR/Nodes/CircleUnique.h>
+#include <luci/IR/Nodes/CircleUniqueOut.h>
+
+#include <loco.h>
+
+namespace luci
+{
+
+bool CircleUniqueGraphBuilder::validate(const ValidateArgs &args) const
+{
+ if (args.op.inputs.size() != 1)
+ return false;
+
+ if (args.op.outputs.size() != 2)
+ return false;
+
+ return true;
+}
+
+void CircleUniqueGraphBuilder::build(const circle::OperatorT &op,
+ GraphBuilderContext *context) const
+{
+ assert(context != nullptr);
+
+ auto graph = context->graph();
+
+ const std::vector<int32_t> &inputs = op.inputs;
+ const std::vector<int32_t> &outputs = op.outputs;
+ const auto &tensors = context->reader()->tensors();
+ auto tensors_ptr = context->reader()->tensors_ptr();
+ assert(tensors_ptr != nullptr);
+
+ std::vector<CircleNode *> input_nodes;
+ for (const int32_t input_tensor_index : inputs)
+ {
+ input_nodes.push_back(context->nodefinder()->node(input_tensor_index));
+ }
+
+ // Create CircleUnique
+ auto node = graph->nodes()->create<CircleUnique>();
+ node->input(input_nodes[0]);
+
+ const auto *options = op.builtin_options.AsUniqueOptions();
+ node->output_type(luci_datatype(options->idx_out_type));
+
+ assert(int32_t(outputs.size()) == 2);
+ // Let's use the name of output 0 as the Unique node's name
+ const circle::TensorT &output_tensor = *tensors[outputs[0]];
+ node->name(tensor_name(output_tensor));
+
+ // Create virtual outputs of Unique
+ for (int32_t n = 0; n < 2; ++n)
+ {
+ const circle::TensorT &output_tensor = *tensors[outputs[n]];
+
+ auto *nodeout = graph->nodes()->create<CircleUniqueOut>();
+ copy_tensor_attributes(output_tensor, nodeout);
+ // mark shape_status
+ if (tensors_ptr->Get(outputs[n])->shape() == nullptr)
+ nodeout->shape_status(ShapeStatus::NOSHAPE);
+ else
+ nodeout->shape_status(ShapeStatus::VALID);
+
+ nodeout->input(node);
+ nodeout->index(n);
+
+ context->nodefinder()->enroll(outputs[n], nodeout);
+ }
+}
+
+} // namespace luci
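Note: Unique follows the usual multi-output import pattern — one compute node (CircleUnique) plus one virtual CircleUniqueOut per flatbuffer output, each carrying its output index. A hypothetical traversal showing how a consumer can recover the two outputs:

    #include <iostream>
    #include <loco.h>
    #include <luci/IR/Nodes/CircleUnique.h>
    #include <luci/IR/Nodes/CircleUniqueOut.h>

    // Sketch: walk the successors of a CircleUnique and report its virtual outputs.
    void dump_unique_outputs(luci::CircleUnique *unique)
    {
      for (auto succ : loco::succs(unique))
      {
        if (auto out = dynamic_cast<luci::CircleUniqueOut *>(succ))
          std::cout << "virtual output #" << out->index() << "\n"; // 0: values, 1: indices
      }
    }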
diff --git a/compiler/luci/import/src/Nodes/CircleUnpack.cpp b/compiler/luci/import/src/Nodes/CircleUnpack.cpp
index c4282e24f..9e7f3d3e1 100644
--- a/compiler/luci/import/src/Nodes/CircleUnpack.cpp
+++ b/compiler/luci/import/src/Nodes/CircleUnpack.cpp
@@ -59,7 +59,7 @@ bool CircleUnpackGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor = tensors.at(inputs[0]);
+ const auto &tensor = tensors.at(inputs.at(0));
const auto &shape = tensor->shape;
auto shape_size = static_cast<int32_t>(shape.size());
if (shape_size > 0)
diff --git a/compiler/luci/import/src/Nodes/CircleWhere.cpp b/compiler/luci/import/src/Nodes/CircleWhere.cpp
index a13c4d6c9..f4c5f0c66 100644
--- a/compiler/luci/import/src/Nodes/CircleWhere.cpp
+++ b/compiler/luci/import/src/Nodes/CircleWhere.cpp
@@ -35,7 +35,7 @@ bool CircleWhereGraphBuilder::validate(const ValidateArgs &args) const
return false;
const auto &tensors = args.reader.tensors();
- const auto &tensor_condition = tensors.at(inputs[0]);
+ const auto &tensor_condition = tensors.at(inputs.at(0));
const auto &tensor_out = tensors.at(outputs[0]);
if (tensor_condition->type != circle::TensorType_BOOL)
@@ -52,7 +52,7 @@ CircleNode *CircleWhereGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleWhere>();
- node->condition(inputs[0]);
+ node->condition(inputs.at(0));
return node;
}
diff --git a/compiler/luci/import/src/Nodes/CircleZerosLike.cpp b/compiler/luci/import/src/Nodes/CircleZerosLike.cpp
index 4362925cd..e60424def 100644
--- a/compiler/luci/import/src/Nodes/CircleZerosLike.cpp
+++ b/compiler/luci/import/src/Nodes/CircleZerosLike.cpp
@@ -39,7 +39,7 @@ CircleNode *CircleZerosLikeGraphBuilder::build_node(const circle::OperatorT &,
loco::Graph *graph) const
{
auto *node = graph->nodes()->create<CircleZerosLike>();
- node->input(inputs[0]);
+ node->input(inputs.at(0));
// ZerosLikeOptions are empty
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.h b/compiler/luci/lang/include/luci/IR/CircleNodes.h
index 3b31149b2..e57f5bb3e 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.h
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.h
@@ -70,10 +70,12 @@
#include "Nodes/CircleMirrorPad.h"
#include "Nodes/CircleMul.h"
#include "Nodes/CircleNeg.h"
+#include "Nodes/CircleNonMaxSuppressionV4.h"
#include "Nodes/CircleNotEqual.h"
#include "Nodes/CircleOneHot.h"
#include "Nodes/CirclePack.h"
#include "Nodes/CirclePad.h"
+#include "Nodes/CirclePadV2.h"
#include "Nodes/CirclePow.h"
#include "Nodes/CirclePRelu.h"
#include "Nodes/CircleRange.h"
@@ -117,6 +119,7 @@
#include "Nodes/CircleTopKV2.h"
#include "Nodes/CircleTranspose.h"
#include "Nodes/CircleTransposeConv.h"
+#include "Nodes/CircleUnique.h"
#include "Nodes/CircleUnpack.h"
#include "Nodes/CircleWhere.h"
#include "Nodes/CircleWhile.h"
@@ -130,7 +133,9 @@
#include "Nodes/CircleOutput.h"
#include "Nodes/CircleCustomOut.h"
#include "Nodes/CircleIfOut.h"
+#include "Nodes/CircleNonMaxSuppressionV4Out.h"
#include "Nodes/CircleUnpackOut.h"
+#include "Nodes/CircleUniqueOut.h"
#include "Nodes/CircleSplitOut.h"
#include "Nodes/CircleSplitVOut.h"
#include "Nodes/CircleTopKV2Out.h"
diff --git a/compiler/luci/lang/include/luci/IR/CircleNodes.lst b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
index 488dcfb89..801051848 100644
--- a/compiler/luci/lang/include/luci/IR/CircleNodes.lst
+++ b/compiler/luci/lang/include/luci/IR/CircleNodes.lst
@@ -22,7 +22,6 @@ CIRCLE_NODE(BATCHMATMUL, luci::CircleBatchMatMul)
CIRCLE_NODE(CAST, luci::CircleCast)
CIRCLE_NODE(CEIL, luci::CircleCeil)
CIRCLE_NODE(CONCATENATION, luci::CircleConcatenation)
-CIRCLE_NODE(CONST, luci::CircleConst)
CIRCLE_NODE(CONV_2D, luci::CircleConv2D)
CIRCLE_NODE(COS, luci::CircleCos)
CIRCLE_NODE(CUSTOM, luci::CircleCustom)
@@ -64,10 +63,12 @@ CIRCLE_NODE(MINIMUM, luci::CircleMinimum)
CIRCLE_NODE(MIRROR_PAD, luci::CircleMirrorPad)
CIRCLE_NODE(MUL, luci::CircleMul)
CIRCLE_NODE(NEG, luci::CircleNeg)
+CIRCLE_NODE(NON_MAX_SUPPRESSION_V4, luci::CircleNonMaxSuppressionV4)
CIRCLE_NODE(NOT_EQUAL, luci::CircleNotEqual)
CIRCLE_NODE(ONE_HOT, luci::CircleOneHot)
CIRCLE_NODE(PACK, luci::CirclePack)
CIRCLE_NODE(PAD, luci::CirclePad)
+CIRCLE_NODE(PADV2, luci::CirclePadV2)
CIRCLE_NODE(POW, luci::CirclePow)
CIRCLE_NODE(PRELU, luci::CirclePRelu)
CIRCLE_NODE(RANGE, luci::CircleRange)
@@ -111,6 +112,7 @@ CIRCLE_NODE(TILE, luci::CircleTile)
CIRCLE_NODE(TOPK_V2, luci::CircleTopKV2)
CIRCLE_NODE(TRANSPOSE, luci::CircleTranspose)
CIRCLE_NODE(TRANSPOSE_CONV, luci::CircleTransposeConv)
+CIRCLE_NODE(UNIQUE, luci::CircleUnique)
CIRCLE_NODE(UNPACK, luci::CircleUnpack)
CIRCLE_NODE(WHERE, luci::CircleWhere)
CIRCLE_NODE(WHILE, luci::CircleWhile)
@@ -120,14 +122,17 @@ CIRCLE_NODE(BCQ_FULLY_CONNECTED, luci::CircleBCQFullyConnected)
CIRCLE_NODE(BCQ_GATHER, luci::CircleBCQGather)
CIRCLE_NODE(INSTANCE_NORM, luci::CircleInstanceNorm)
// Virtual node(s)
+CIRCLE_NODE(CIRCLECONST, luci::CircleConst)
CIRCLE_NODE(CIRCLEINPUT, luci::CircleInput)
CIRCLE_NODE(CIRCLEOUTPUT, luci::CircleOutput)
CIRCLE_NODE(CIRCLEOUTPUTDUMMY, luci::CircleOutputDummy)
CIRCLE_NODE(CIRCLEOUTPUTEXCLUDE, luci::CircleOutputExclude)
CIRCLE_NODE(CIRCLECUSTOMOUT, luci::CircleCustomOut)
CIRCLE_NODE(CIRCLEIFOUT, luci::CircleIfOut)
+CIRCLE_NODE(CIRCLENONMAXSUPPRESSIONV4OUT, luci::CircleNonMaxSuppressionV4Out)
CIRCLE_NODE(CIRCLESPLITOUT, luci::CircleSplitOut)
CIRCLE_NODE(CIRCLESPLITVOUT, luci::CircleSplitVOut)
CIRCLE_NODE(CIRCLETOPKV2OUT, luci::CircleTopKV2Out)
+CIRCLE_NODE(CIRCLEUNIQUEOUT, luci::CircleUniqueOut)
CIRCLE_NODE(CIRCLEUNPACKOUT, luci::CircleUnpackOut)
CIRCLE_NODE(CIRCLEWHILEOUT, luci::CircleWhileOut)
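Note: CircleNodes.lst is an X-macro list, so the entries added here (NON_MAX_SUPPRESSION_V4, PADV2, UNIQUE, and the virtual *OUT nodes) propagate to every consumer that expands CIRCLE_NODE. A sketch of the expected expansion pattern — how the opcode enum is typically generated from this file:

    // Each client defines CIRCLE_NODE before including the list, then undefines it.
    enum class CircleOpcode
    {
    #define CIRCLE_NODE(OPCODE, CLASS) OPCODE,
    #include "CircleNodes.lst"
    #undef CIRCLE_NODE
    };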
diff --git a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
index 7253e657b..694437303 100644
--- a/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
+++ b/compiler/luci/lang/include/luci/IR/CircleQuantParam.h
@@ -29,6 +29,7 @@ struct CircleQuantParam
std::vector<float> max;
std::vector<float> scale;
std::vector<int64_t> zerop;
+ int32_t quantized_dimension{0};
};
} // namespace luci
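Note: quantized_dimension records which tensor axis the per-channel scale/zerop vectors are indexed along; it defaults to 0 so existing layer-wise parameters keep their meaning. A hedged example with hypothetical values:

    #include <luci/IR/CircleQuantParam.h>

    // Hypothetical per-channel parameters for a weight quantized along axis 0:
    // scale and zerop hold one entry per channel on that axis.
    luci::CircleQuantParam make_per_channel_qparam()
    {
      luci::CircleQuantParam qp;
      qp.scale = {0.02f, 0.01f};  // example scales, one per channel
      qp.zerop = {0, 0};
      qp.quantized_dimension = 0; // the axis that indexes the vectors above
      return qp;
    }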
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
index fc671746f..250282049 100644
--- a/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleConst.h
@@ -31,7 +31,7 @@ namespace luci
* @brief Class to build tensor data
* @note This will not be exported as a specific op
*/
-class CircleConst final : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CONST>>
+class CircleConst final : public FixedArityNode<0, CircleNodeImpl<CircleOpcode::CIRCLECONST>>
{
public:
CircleConst() = default;
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h
new file mode 100644
index 000000000..69f3368c0
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLE_NON_MAX_SUPPRESSION_V4_H__
+#define __LUCI_IR_CIRCLE_NON_MAX_SUPPRESSION_V4_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/LuciNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief NON_MAX_SUPPRESSION_V4 in Circle
+ */
+class CircleNonMaxSuppressionV4 final
+ : public FixedArityNode<5, CircleNodeImpl<CircleOpcode::NON_MAX_SUPPRESSION_V4>>
+{
+public:
+ loco::Node *boxes(void) const { return at(0)->node(); }
+ void boxes(loco::Node *node) { at(0)->node(node); }
+
+ loco::Node *scores(void) const { return at(1)->node(); }
+ void scores(loco::Node *node) { at(1)->node(node); }
+
+ loco::Node *max_output_size(void) const { return at(2)->node(); }
+ void max_output_size(loco::Node *node) { at(2)->node(node); }
+
+ loco::Node *iou_threshold(void) const { return at(3)->node(); }
+ void iou_threshold(loco::Node *node) { at(3)->node(node); }
+
+ loco::Node *score_threshold(void) const { return at(4)->node(); }
+ void score_threshold(loco::Node *node) { at(4)->node(node); }
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLE_NON_MAX_SUPPRESSION_V4_H__
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h
new file mode 100644
index 000000000..a24dc3e9c
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLE_NONMAXSUPPRESSIONV4OUT_H__
+#define __LUCI_IR_CIRCLE_NONMAXSUPPRESSIONV4OUT_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/LuciNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief Virtual NONMAXSUPPRESSIONV4OUT in Circle
+ */
+class CircleNonMaxSuppressionV4Out final
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLENONMAXSUPPRESSIONV4OUT>>
+{
+public:
+ CircleNonMaxSuppressionV4Out() = default;
+
+public:
+ loco::Node *input(void) const { return at(0)->node(); }
+ void input(loco::Node *node) { at(0)->node(node); }
+
+public:
+ int32_t index(void) const { return _index; }
+ void index(int32_t index) { _index = index; }
+
+private:
+ int32_t _index{-1};
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLE_NONMAXSUPPRESSIONV4OUT_H__
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h b/compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h
new file mode 100644
index 000000000..563cfd9a4
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CirclePadV2.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLEPADV2_H__
+#define __LUCI_IR_CIRCLEPADV2_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/LuciNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief PADV2 in Circle
+ */
+class CirclePadV2 final : public FixedArityNode<3, CircleNodeImpl<CircleOpcode::PADV2>>
+{
+public:
+ CirclePadV2() = default;
+
+public:
+ loco::Node *input(void) const { return at(0)->node(); }
+ void input(loco::Node *node) { at(0)->node(node); }
+
+ loco::Node *paddings(void) const { return at(1)->node(); }
+ void paddings(loco::Node *node) { at(1)->node(node); }
+
+ loco::Node *constant_values(void) const { return at(2)->node(); }
+ void constant_values(loco::Node *node) { at(2)->node(node); }
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLEPADV2_H__
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h
new file mode 100644
index 000000000..719a72362
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleUnique.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLE_UNIQUE_H__
+#define __LUCI_IR_CIRCLE_UNIQUE_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/LuciNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief Unique in Circle
+ */
+class CircleUnique final : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::UNIQUE>>
+{
+public:
+ loco::Node *input(void) const { return at(0)->node(); }
+ void input(loco::Node *node) { at(0)->node(node); }
+
+public:
+ loco::DataType idx_out_type(void) const { return _idx_out_type; }
+ void output_type(loco::DataType ot) { _idx_out_type = ot; }
+
+private:
+ loco::DataType _idx_out_type{loco::DataType::S32};
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLE_UNIQUE_H__
diff --git a/compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h b/compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h
new file mode 100644
index 000000000..f846403e0
--- /dev/null
+++ b/compiler/luci/lang/include/luci/IR/Nodes/CircleUniqueOut.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __LUCI_IR_CIRCLE_UNIQUEOUT_H__
+#define __LUCI_IR_CIRCLE_UNIQUEOUT_H__
+
+#include "luci/IR/CircleNodeDecl.h"
+#include "luci/IR/CircleOpcode.h"
+
+#include "luci/IR/LuciNodeMixins.h"
+
+namespace luci
+{
+
+/**
+ * @brief Virtual CIRCLEUNIQUEOUT in Circle
+ */
+class CircleUniqueOut final
+ : public FixedArityNode<1, CircleNodeImpl<CircleOpcode::CIRCLEUNIQUEOUT>>
+{
+public:
+ CircleUniqueOut() = default;
+
+public:
+ loco::Node *input(void) const { return at(0)->node(); }
+ void input(loco::Node *node) { at(0)->node(node); }
+
+public:
+ int32_t index(void) const { return _index; }
+ void index(int32_t index) { _index = index; }
+
+private:
+ int32_t _index{-1};
+};
+
+} // namespace luci
+
+#endif // __LUCI_IR_CIRCLE_UNIQUEOUT_H__
diff --git a/compiler/luci/lang/src/Module.test.cpp b/compiler/luci/lang/src/Module.test.cpp
index 26bf073be..a5973e52d 100644
--- a/compiler/luci/lang/src/Module.test.cpp
+++ b/compiler/luci/lang/src/Module.test.cpp
@@ -22,7 +22,7 @@ TEST(ModuleTest, constructor)
{
auto gs = luci::make_module();
- GTEST_SUCCEED();
+ SUCCEED();
}
TEST(ModuleTest, add)
diff --git a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
index 74ea82c6c..c07268cbf 100644
--- a/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleCustom.test.cpp
@@ -35,7 +35,12 @@ TEST(CircleCustomTest, constructor)
ASSERT_EQ(0, custom_node.custom_code().size());
}
-TEST(CircleCustomTest, constructor_NEG) { ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, ""); }
+TEST(CircleCustomTest, constructor_NEG)
+{
+ ASSERT_DEBUG_DEATH(luci::CircleCustom{0}, "");
+
+ SUCCEED();
+}
TEST(CircleCustomTest, invalidIndex_NEG)
{
diff --git a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
index e3c8c9f60..35f28e9ac 100644
--- a/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleIf.test.cpp
@@ -41,11 +41,15 @@ TEST(CircleIfTest, constructor)
TEST(CircleIfTestDeath, invalid_arity_NEG)
{
ASSERT_DEBUG_DEATH(luci::CircleIf very_long_name_if_node(0, 1), "");
+
+ SUCCEED();
}
TEST(CircleIfTestDeath, invalid_output_count_NEG)
{
ASSERT_DEBUG_DEATH(luci::CircleIf if_node(2, 0), "");
+
+ SUCCEED();
}
TEST(CircleIfTestDeath, invalid_input_get_index_NEG)
diff --git a/compiler/luci/lang/src/Nodes/CircleNonMaxSuppressionV4.test.cpp b/compiler/luci/lang/src/Nodes/CircleNonMaxSuppressionV4.test.cpp
new file mode 100644
index 000000000..b25ce4d6d
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleNonMaxSuppressionV4.test.cpp
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleNonMaxSuppressionV4.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+TEST(CircleNonMaxSuppressionV4Test, constructor)
+{
+ luci::CircleNonMaxSuppressionV4 nmsv4_node;
+
+ ASSERT_EQ(luci::CircleDialect::get(), nmsv4_node.dialect());
+ ASSERT_EQ(luci::CircleOpcode::NON_MAX_SUPPRESSION_V4, nmsv4_node.opcode());
+
+ ASSERT_EQ(nullptr, nmsv4_node.boxes());
+ ASSERT_EQ(nullptr, nmsv4_node.scores());
+ ASSERT_EQ(nullptr, nmsv4_node.max_output_size());
+ ASSERT_EQ(nullptr, nmsv4_node.iou_threshold());
+ ASSERT_EQ(nullptr, nmsv4_node.score_threshold());
+}
+
+TEST(CircleNonMaxSuppressionV4Test, input_NEG)
+{
+ luci::CircleNonMaxSuppressionV4 nmsv4_node;
+ luci::CircleNonMaxSuppressionV4 node;
+
+ nmsv4_node.boxes(&node);
+ nmsv4_node.scores(&node);
+ nmsv4_node.max_output_size(&node);
+ nmsv4_node.iou_threshold(&node);
+ nmsv4_node.score_threshold(&node);
+ ASSERT_NE(nullptr, nmsv4_node.boxes());
+ ASSERT_NE(nullptr, nmsv4_node.scores());
+ ASSERT_NE(nullptr, nmsv4_node.max_output_size());
+ ASSERT_NE(nullptr, nmsv4_node.iou_threshold());
+ ASSERT_NE(nullptr, nmsv4_node.score_threshold());
+
+ nmsv4_node.boxes(nullptr);
+ nmsv4_node.scores(nullptr);
+ nmsv4_node.max_output_size(nullptr);
+ nmsv4_node.iou_threshold(nullptr);
+ nmsv4_node.score_threshold(nullptr);
+ ASSERT_EQ(nullptr, nmsv4_node.boxes());
+ ASSERT_EQ(nullptr, nmsv4_node.scores());
+ ASSERT_EQ(nullptr, nmsv4_node.max_output_size());
+ ASSERT_EQ(nullptr, nmsv4_node.iou_threshold());
+ ASSERT_EQ(nullptr, nmsv4_node.score_threshold());
+}
+
+TEST(CircleNonMaxSuppressionV4Test, arity_NEG)
+{
+ luci::CircleNonMaxSuppressionV4 nmsv4_node;
+
+ ASSERT_NO_THROW(nmsv4_node.arg(4));
+ ASSERT_THROW(nmsv4_node.arg(5), std::out_of_range);
+}
+
+TEST(CircleNonMaxSuppressionV4Test, visit_mutable_NEG)
+{
+ struct TestVisitor final : public luci::CircleNodeMutableVisitor<void>
+ {
+ };
+
+ luci::CircleNonMaxSuppressionV4 nmsv4_node;
+
+ TestVisitor tv;
+ ASSERT_THROW(nmsv4_node.accept(&tv), std::exception);
+}
+
+TEST(CircleNonMaxSuppressionV4Test, visit_NEG)
+{
+ struct TestVisitor final : public luci::CircleNodeVisitor<void>
+ {
+ };
+
+ luci::CircleNonMaxSuppressionV4 nmsv4_node;
+
+ TestVisitor tv;
+ ASSERT_THROW(nmsv4_node.accept(&tv), std::exception);
+}
diff --git a/compiler/luci/lang/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp b/compiler/luci/lang/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp
new file mode 100644
index 000000000..c6cef4e91
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleNonMaxSuppressionV4Out.test.cpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleNonMaxSuppressionV4Out.h"
+
+#include "luci/IR/CircleDialect.h"
+
+#include <gtest/gtest.h>
+
+TEST(CircleNonMaxSuppressionV4OutTest, constructor)
+{
+ luci::CircleNonMaxSuppressionV4Out vout_node;
+
+ ASSERT_EQ(luci::CircleDialect::get(), vout_node.dialect());
+ ASSERT_EQ(luci::CircleOpcode::CIRCLENONMAXSUPPRESSIONV4OUT, vout_node.opcode());
+
+ ASSERT_EQ(nullptr, vout_node.input());
+ ASSERT_EQ(-1, vout_node.index());
+}
diff --git a/compiler/luci/lang/src/Nodes/CirclePadV2.test.cpp b/compiler/luci/lang/src/Nodes/CirclePadV2.test.cpp
new file mode 100644
index 000000000..e09d517b2
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CirclePadV2.test.cpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CirclePadV2.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+TEST(CirclePadV2Test, constructor_P)
+{
+ luci::CirclePadV2 node;
+
+ ASSERT_EQ(luci::CircleDialect::get(), node.dialect());
+ ASSERT_EQ(luci::CircleOpcode::PADV2, node.opcode());
+
+ ASSERT_EQ(nullptr, node.input());
+ ASSERT_EQ(nullptr, node.paddings());
+ ASSERT_EQ(nullptr, node.constant_values());
+}
+
+TEST(CirclePadV2Test, input_NEG)
+{
+ luci::CirclePadV2 pad_node;
+ luci::CirclePadV2 node;
+
+ pad_node.input(&node);
+ pad_node.paddings(&node);
+ pad_node.constant_values(&node);
+ ASSERT_NE(nullptr, pad_node.input());
+ ASSERT_NE(nullptr, pad_node.paddings());
+ ASSERT_NE(nullptr, pad_node.constant_values());
+
+ pad_node.input(nullptr);
+ pad_node.paddings(nullptr);
+ pad_node.constant_values(nullptr);
+ ASSERT_EQ(nullptr, pad_node.input());
+ ASSERT_EQ(nullptr, pad_node.paddings());
+ ASSERT_EQ(nullptr, pad_node.constant_values());
+}
+
+TEST(CirclePadV2Test, arity_NEG)
+{
+ luci::CirclePadV2 pad_node;
+
+ ASSERT_NO_THROW(pad_node.arg(2));
+ ASSERT_THROW(pad_node.arg(3), std::out_of_range);
+}
+
+TEST(CirclePadV2Test, visit_mutable_NEG)
+{
+ struct TestVisitor final : public luci::CircleNodeMutableVisitor<void>
+ {
+ };
+
+ luci::CirclePadV2 pad_node;
+
+ TestVisitor tv;
+ ASSERT_THROW(pad_node.accept(&tv), std::exception);
+}
+
+TEST(CirclePadV2Test, visit_NEG)
+{
+ struct TestVisitor final : public luci::CircleNodeVisitor<void>
+ {
+ };
+
+ luci::CirclePadV2 pad_node;
+
+ TestVisitor tv;
+ ASSERT_THROW(pad_node.accept(&tv), std::exception);
+}
diff --git a/compiler/luci/lang/src/Nodes/CircleUnique.test.cpp b/compiler/luci/lang/src/Nodes/CircleUnique.test.cpp
new file mode 100644
index 000000000..517ee97d5
--- /dev/null
+++ b/compiler/luci/lang/src/Nodes/CircleUnique.test.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "luci/IR/Nodes/CircleUnique.h"
+
+#include "luci/IR/CircleDialect.h"
+#include "luci/IR/CircleNodeVisitor.h"
+
+#include <gtest/gtest.h>
+
+TEST(CircleUniqueTest, constructor)
+{
+ luci::CircleUnique unique_node;
+
+ ASSERT_EQ(luci::CircleDialect::get(), unique_node.dialect());
+ ASSERT_EQ(luci::CircleOpcode::UNIQUE, unique_node.opcode());
+
+ ASSERT_EQ(nullptr, unique_node.input());
+}
+
+TEST(CircleUniqueTest, input_NEG)
+{
+ luci::CircleUnique unique_node;
+ luci::CircleUnique node;
+
+ unique_node.input(&node);
+ ASSERT_NE(nullptr, unique_node.input());
+
+ unique_node.input(nullptr);
+ ASSERT_EQ(nullptr, unique_node.input());
+}
+
+TEST(CircleUniqueTest, arity_NEG)
+{
+ luci::CircleUnique unique_node;
+
+ ASSERT_NO_THROW(unique_node.arg(0));
+ ASSERT_THROW(unique_node.arg(1), std::out_of_range);
+}
+
+TEST(CircleUniqueTest, visit_mutable_NEG)
+{
+ struct TestVisitor final : public luci::CircleNodeMutableVisitor<void>
+ {
+ };
+
+ luci::CircleUnique unique_node;
+
+ TestVisitor tv;
+ ASSERT_THROW(unique_node.accept(&tv), std::exception);
+}
+
+TEST(CircleUniqueTest, visit_NEG)
+{
+ struct TestVisitor final : public luci::CircleNodeVisitor<void>
+ {
+ };
+
+ luci::CircleUnique unique_node;
+
+ TestVisitor tv;
+ ASSERT_THROW(unique_node.accept(&tv), std::exception);
+}
diff --git a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
index 19290c0a2..913686fbd 100644
--- a/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
+++ b/compiler/luci/lang/src/Nodes/CircleWhile.test.cpp
@@ -41,11 +41,15 @@ TEST(CircleWhileTest, constructor)
TEST(CircleWhileTestDeath, invalid_arity_NEG)
{
ASSERT_DEBUG_DEATH(luci::CircleWhile very_long_name_while_node(0, 1), "");
+
+ SUCCEED();
}
TEST(CircleWhileTestDeath, invalid_output_count_NEG)
{
ASSERT_DEBUG_DEATH(luci::CircleWhile while_node(2, 0), "");
+
+ SUCCEED();
}
TEST(CircleWhileTestDeath, invalid_input_get_index_NEG)
diff --git a/compiler/luci/logex/src/FormattedGraph.cpp b/compiler/luci/logex/src/FormattedGraph.cpp
index 4725ee3df..f04a418ef 100644
--- a/compiler/luci/logex/src/FormattedGraph.cpp
+++ b/compiler/luci/logex/src/FormattedGraph.cpp
@@ -244,6 +244,7 @@ private:
IMPLEMENT(luci::CircleMirrorPad)
IMPLEMENT(luci::CircleMul)
IMPLEMENT(luci::CircleNeg)
+ IMPLEMENT(luci::CircleNonMaxSuppressionV4)
IMPLEMENT(luci::CircleNotEqual)
IMPLEMENT(luci::CircleOneHot)
IMPLEMENT(luci::CirclePack)
@@ -291,6 +292,7 @@ private:
IMPLEMENT(luci::CircleTopKV2)
IMPLEMENT(luci::CircleTranspose)
IMPLEMENT(luci::CircleTransposeConv)
+ IMPLEMENT(luci::CircleUnique)
IMPLEMENT(luci::CircleUnpack)
IMPLEMENT(luci::CircleWhere)
IMPLEMENT(luci::CircleWhile)
@@ -303,9 +305,11 @@ private:
IMPLEMENT(luci::CircleInput)
IMPLEMENT(luci::CircleOutput)
IMPLEMENT(luci::CircleIfOut)
+ IMPLEMENT(luci::CircleNonMaxSuppressionV4Out)
IMPLEMENT(luci::CircleSplitOut)
IMPLEMENT(luci::CircleSplitVOut)
IMPLEMENT(luci::CircleTopKV2Out)
+ IMPLEMENT(luci::CircleUniqueOut)
IMPLEMENT(luci::CircleUnpackOut)
IMPLEMENT(luci::CircleWhileOut)
#undef IMPLEMENT
@@ -823,6 +827,19 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleNeg *node, locop::NodeS
return use_x(tbl(), node, s);
}
+bool CircleNodeSummaryBuilder::summary(const luci::CircleNonMaxSuppressionV4 *node,
+ locop::NodeSummary &s) const
+{
+ s.args().append("boxes", pepper::str(node->boxes()));
+ s.args().append("scores", pepper::str(node->scores()));
+ s.args().append("max_output_size", pepper::str(node->max_output_size()));
+ s.args().append("iou_threshold", pepper::str(node->iou_threshold()));
+ s.args().append("score_threshold", pepper::str(node->score_threshold()));
+
+ s.state(locop::NodeSummary::State::Complete);
+ return true;
+}
+
bool CircleNodeSummaryBuilder::summary(const luci::CircleNotEqual *node,
locop::NodeSummary &s) const
{
@@ -1227,6 +1244,14 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleTransposeConv *node,
return true;
}
+bool CircleNodeSummaryBuilder::summary(const luci::CircleUnique *node, locop::NodeSummary &s) const
+{
+ s.args().append("input", tbl()->lookup(node->input()));
+ s.args().append("idx_out_type", to_str(node->idx_out_type()));
+ s.state(locop::NodeSummary::State::Complete);
+ return true;
+}
+
bool CircleNodeSummaryBuilder::summary(const luci::CircleUnpack *node, locop::NodeSummary &s) const
{
s.args().append("value", tbl()->lookup(node->value()));
@@ -1293,6 +1318,16 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleTopKV2Out *node,
return true;
}
+bool CircleNodeSummaryBuilder::summary(const luci::CircleUniqueOut *node,
+ locop::NodeSummary &s) const
+{
+ s.args().append("unique", tbl()->lookup(node->input()));
+
+ s.state(locop::NodeSummary::State::Complete);
+
+ return true;
+}
+
bool CircleNodeSummaryBuilder::summary(const luci::CircleUnpackOut *node,
locop::NodeSummary &s) const
{
@@ -1308,6 +1343,12 @@ bool CircleNodeSummaryBuilder::summary(const luci::CircleIfOut *node, locop::Nod
return use_input(tbl(), node, s);
}
+bool CircleNodeSummaryBuilder::summary(const luci::CircleNonMaxSuppressionV4Out *node,
+ locop::NodeSummary &s) const
+{
+ return use_input(tbl(), node, s);
+}
+
bool CircleNodeSummaryBuilder::summary(const luci::CircleWhileOut *node,
locop::NodeSummary &s) const
{
diff --git a/compiler/luci/pass/src/CircleOptimizer.cpp b/compiler/luci/pass/src/CircleOptimizer.cpp
index 90fbe9009..2edf7a9c6 100644
--- a/compiler/luci/pass/src/CircleOptimizer.cpp
+++ b/compiler/luci/pass/src/CircleOptimizer.cpp
@@ -145,7 +145,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
{
static const std::vector<std::string> fakeq_supported_input_dtype{"float32"};
static const std::vector<std::string> fakeq_supported_output_dtype{"uint8"};
- static const std::vector<std::string> fakeq_supported_granularity{"layer"};
+ static const std::vector<std::string> fakeq_supported_granularity{"layer", "channel"};
auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
@@ -173,7 +173,7 @@ void CircleOptimizer::quantize(loco::Graph *g) const
{
static const std::vector<std::string> qwmm_supported_input_dtype{"float32"};
static const std::vector<std::string> qwmm_supported_output_dtype{"uint8"};
- static const std::vector<std::string> qwmm_supported_granularity{"layer"};
+ static const std::vector<std::string> qwmm_supported_granularity{"layer", "channel"};
auto input_dtype = _options->param(Options::AlgorithmParameters::Quantize_input_dtype);
auto output_dtype = _options->param(Options::AlgorithmParameters::Quantize_output_dtype);
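Note: both quantization paths now accept "channel" granularity alongside "layer". A hedged usage sketch follows; the Quantize_granularity key is an assumption, inferred by analogy with the dtype keys looked up above:

    // Sketch only; parameter keys other than the two dtypes are assumptions.
    using Options = luci::CircleOptimizer::Options;

    luci::CircleOptimizer optimizer;
    auto options = optimizer.options();
    options->param(Options::AlgorithmParameters::Quantize_input_dtype, "float32");
    options->param(Options::AlgorithmParameters::Quantize_output_dtype, "uint8");
    options->param(Options::AlgorithmParameters::Quantize_granularity, "channel");
    optimizer.quantize(graph); // graph is the loco::Graph* being quantized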
diff --git a/compiler/luci/pass/src/FuseBCQPass.cpp b/compiler/luci/pass/src/FuseBCQPass.cpp
index b81db8827..260de5b30 100644
--- a/compiler/luci/pass/src/FuseBCQPass.cpp
+++ b/compiler/luci/pass/src/FuseBCQPass.cpp
@@ -53,6 +53,11 @@ const std::string node_name_prefix(luci::NodeName node_name)
const auto index = prefix.find("Tensordot/");
prefix = prefix.substr(0, index - 1);
}
+ else if (prefix.find("/MatMul") != std::string::npos)
+ {
+ const auto index = prefix.find("/MatMul");
+ prefix = prefix.substr(0, index);
+ }
else if (prefix.find("kernel/") != std::string::npos)
{
const auto index = prefix.find("kernel/");
@@ -67,14 +72,190 @@ const std::string node_name_prefix(luci::NodeName node_name)
return prefix;
}
+/**
+ * @brief Create a CircleOutputExclude operation, which has the same shape and
+ * dtype as the original circle_node.
+ */
+luci::CircleOutputExclude *createNoOp(luci::CircleNode *circle_node)
+{
+ auto graph = circle_node->graph();
+ auto noOp = graph->nodes()->create<luci::CircleOutputExclude>();
+
+ if (circle_node->shape_status() == luci::ShapeStatus::VALID)
+ {
+ noOp->dtype(circle_node->dtype());
+ noOp->rank(circle_node->rank());
+ for (uint32_t i = 0; i < circle_node->rank(); ++i)
+ noOp->dim(i) = circle_node->dim(i);
+ }
+ else
+ {
+ // For type inference
+ noOp->dtype(loco::DataType::FLOAT32);
+ }
+
+ return noOp;
+};
+
} // namespace
namespace
{
-class BCQConverter final
+// V means the version of BCQ.
+template <int32_t V> class BCQFuser;
+
+template <> class BCQFuser<1>
{
public:
+ bool fuseBCQ(loco::Graph *g)
+ {
+ bool changed = false;
+
+ for (auto node : loco::all_nodes(g))
+ {
+ if (auto circle_const = dynamic_cast<luci::CircleConst *>(node))
+ {
+ add_BCQ_info_node(circle_const);
+ }
+ }
+
+ if (!is_bcqinfo_valid())
+ return false;
+
+ for (auto node : loco::active_nodes(loco::output_nodes(g)))
+ {
+ if (auto gather = dynamic_cast<luci::CircleGather *>(node))
+ {
+ auto params = dynamic_cast<luci::CircleConst *>(gather->params());
+ if (params != nullptr && has_BCQ_info(params))
+ {
+ auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>();
+
+ bcq_gather->op_version(1);
+ bcq_gather->input_scales(get_alpha(params));
+ bcq_gather->input_binary(get_packed_binary_code(params));
+ bcq_gather->indices(gather->indices());
+ bcq_gather->input_clusters(packed_clusters(params));
+
+ // input_binary shape : [output_size, hidden_size]
+ const auto binary_hidden_size =
+ loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32;
+ bcq_gather->input_hidden_size(binary_hidden_size);
+
+ if (do_w_x(params))
+ {
+ bcq_gather->axis(gather->axis());
+ }
+ else
+ {
+ const auto axis_transpose = (gather->axis() == 0) ? 1 : 0;
+ bcq_gather->axis(axis_transpose);
+ }
+
+ loco::replace(gather).with(bcq_gather);
+
+ changed = true;
+ }
+ }
+ else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node))
+ {
+ auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights());
+ if (weights != nullptr && has_BCQ_info(weights))
+ {
+ auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
+
+ bcq_fc->op_version(1);
+ bcq_fc->weights_scales(get_alpha(weights));
+ bcq_fc->weights_binary(get_packed_binary_code(weights));
+ bcq_fc->bias(fully_connected->bias());
+ bcq_fc->weights_clusters(packed_clusters(weights));
+ bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction());
+
+ loco::Node *bcq_input = fully_connected->input();
+ int32_t batch_rank = 0;
+
+ // If the input of BCQFullyConnected has rank greater than 2, reshape it to rank 2
+ const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input());
+ if (original_input->shape_status() == luci::ShapeStatus::VALID &&
+ original_input->rank() > 2)
+ {
+ auto new_shape = g->nodes()->create<luci::CircleConst>();
+ new_shape->dtype(loco::DataType::S32);
+ new_shape->size<loco::DataType::S32>(2);
+ new_shape->rank(1);
+ new_shape->dim(0) = 2;
+
+ auto batch_size = 1;
+ for (uint32_t i = 0; i < original_input->rank() - 1; ++i)
+ batch_size *= original_input->dim(i).value();
+
+ new_shape->at<loco::DataType::S32>(0) = batch_size;
+ new_shape->at<loco::DataType::S32>(1) =
+ original_input->dim(original_input->rank() - 1).value();
+ new_shape->shape_status(luci::ShapeStatus::VALID);
+
+ auto reshape = g->nodes()->create<luci::CircleReshape>();
+ reshape->tensor(original_input);
+ reshape->shape(new_shape);
+
+ bcq_input = reshape;
+ batch_rank = original_input->rank() - 2;
+ }
+
+ // For x_w formation, we should insert Transpose before and after BCQFullyConnected
+ if (do_w_x(weights))
+ {
+ const auto binary_hidden_size =
+ loco::must_cast<luci::CircleNode *>(fully_connected->input())
+ ->dim(batch_rank)
+ .value();
+ bcq_fc->weights_hidden_size(binary_hidden_size);
+ bcq_fc->input(bcq_input);
+ loco::replace(fully_connected).with(bcq_fc);
+ }
+ else
+ {
+ const auto binary_hidden_size =
+ loco::must_cast<luci::CircleNode *>(fully_connected->input())
+ ->dim(1 + batch_rank)
+ .value();
+ bcq_fc->weights_hidden_size(binary_hidden_size);
+
+ auto perm = g->nodes()->create<luci::CircleConst>();
+ perm->dtype(loco::DataType::S32);
+ perm->size<loco::DataType::S32>(2);
+ perm->rank(1);
+ perm->dim(0) = 2;
+ perm->at<loco::DataType::S32>(0) = 1;
+ perm->at<loco::DataType::S32>(1) = 0;
+ perm->shape_status(luci::ShapeStatus::VALID);
+
+ auto input_transpose = g->nodes()->create<luci::CircleTranspose>();
+ input_transpose->a(bcq_input);
+ input_transpose->perm(perm);
+
+ bcq_fc->input(input_transpose);
+
+ auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
+ output_transpose->a(bcq_fc);
+ output_transpose->perm(perm);
+
+ loco::replace(fully_connected).with(output_transpose);
+ }
+
+ changed = true;
+ }
+ }
+ }
+
+ if (changed)
+ clear_BCQ_nodes();
+
+ return changed;
+ }
+
+private:
void add_BCQ_info_node(luci::CircleConst *node)
{
const auto node_name = node->name();
@@ -119,16 +300,65 @@ public:
return has_info;
}
+ /**
+ * @brief Exclude BCQ information nodes which are used for fusing BCQ operations
+ * from graph output by using CircleOutputExclude
+ */
+ void clear_BCQ_nodes()
+ {
+ auto clear_nodes = [](std::map<std::string, luci::CircleConst *> &nodes) {
+ for (auto &n : nodes)
+ {
+ auto node = n.second;
+
+ for (auto s : loco::succs(node))
+ {
+ if (auto outnode = dynamic_cast<luci::CircleOutput *>(s))
+ {
+ outnode->from(createNoOp(node));
+ }
+ else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s))
+ {
+ for (auto o : loco::succs(reshape_node))
+ {
+ auto circle_output = loco::must_cast<luci::CircleOutput *>(o);
+ circle_output->from(createNoOp(reshape_node));
+ }
+ }
+ }
+ }
+ };
+
+ clear_nodes(_do_w_x);
+ clear_nodes(_alpha);
+ clear_nodes(_packed_binary_code);
+ clear_nodes(_number_of_clusters);
+ clear_nodes(_size_of_clusters);
+ clear_nodes(_qbits_of_clusters);
+ clear_nodes(_dequant_weight);
+ }
+
+ bool is_bcqinfo_valid()
+ {
+ // do_w_x should be int32 or bool type
+ for (auto n : _do_w_x)
+ {
+ if (n.second->dtype() != loco::DataType::BOOL && n.second->dtype() != loco::DataType::S32)
+ return false;
+ }
+
+ return true;
+ }
+
+private:
bool do_w_x(luci::CircleConst *node)
{
const auto prefix = node_name_prefix(node->name());
if (_do_w_x[prefix]->dtype() == loco::DataType::S32)
return _do_w_x[prefix]->at<loco::DataType::S32>(0) == 1;
- else if (_do_w_x[prefix]->dtype() == loco::DataType::BOOL)
- return _do_w_x[prefix]->at<loco::DataType::BOOL>(0);
else
- throw std::runtime_error("do_w_x should be int or bool");
+ return _do_w_x[prefix]->at<loco::DataType::BOOL>(0);
}
luci::CircleConst *get_alpha(luci::CircleConst *node)
@@ -187,64 +417,6 @@ public:
return packed_clusters;
}
- /**
- * @brief Exclude BCQ information nodes which are used for fusing BCQ operations
- * from graph output by using CircleOutputExclude
- */
- void clear_BCQ_nodes()
- {
- auto createNoOp = [](luci::CircleNode *circle_node) {
- auto graph = circle_node->graph();
- auto noOp = graph->nodes()->create<luci::CircleOutputExclude>();
-
- if (circle_node->shape_status() == luci::ShapeStatus::VALID)
- {
- noOp->dtype(circle_node->dtype());
- noOp->rank(circle_node->rank());
- for (uint32_t i = 0; i < circle_node->rank(); ++i)
- noOp->dim(i) = circle_node->dim(i);
- }
- else
- {
- // For type inference
- noOp->dtype(loco::DataType::FLOAT32);
- }
-
- return noOp;
- };
-
- auto clear_nodes = [createNoOp](std::map<std::string, luci::CircleConst *> &nodes) {
- for (auto &n : nodes)
- {
- auto node = n.second;
-
- for (auto s : loco::succs(node))
- {
- if (auto outnode = dynamic_cast<luci::CircleOutput *>(s))
- {
- outnode->from(createNoOp(node));
- }
- else if (auto reshape_node = dynamic_cast<luci::CircleReshape *>(s))
- {
- for (auto o : loco::succs(reshape_node))
- {
- auto circle_output = loco::must_cast<luci::CircleOutput *>(o);
- circle_output->from(createNoOp(reshape_node));
- }
- }
- }
- }
- };
-
- clear_nodes(_do_w_x);
- clear_nodes(_alpha);
- clear_nodes(_packed_binary_code);
- clear_nodes(_number_of_clusters);
- clear_nodes(_size_of_clusters);
- clear_nodes(_qbits_of_clusters);
- clear_nodes(_dequant_weight);
- }
-
private:
std::map<std::string, luci::CircleConst *> _do_w_x;
std::map<std::string, luci::CircleConst *> _alpha;
@@ -262,143 +434,42 @@ namespace luci
bool FuseBCQPass::run(loco::Graph *g)
{
- BCQConverter converter;
-
bool changed = false;
+ // Find BCQ version information and check validity.
+ luci::CircleConst *version_node = nullptr;
for (auto node : loco::all_nodes(g))
{
if (auto circle_const = dynamic_cast<luci::CircleConst *>(node))
{
- converter.add_BCQ_info_node(circle_const);
- }
- }
-
- for (auto node : loco::active_nodes(loco::output_nodes(g)))
- {
- if (auto gather = dynamic_cast<luci::CircleGather *>(node))
- {
- auto params = dynamic_cast<luci::CircleConst *>(gather->params());
- if (params != nullptr && converter.has_BCQ_info(params))
+ if (circle_const->name().find("/bcqinfo_version") != std::string::npos)
{
- auto bcq_gather = g->nodes()->create<luci::CircleBCQGather>();
-
- bcq_gather->input_scales(converter.get_alpha(params));
- bcq_gather->input_binary(converter.get_packed_binary_code(params));
- bcq_gather->indices(gather->indices());
- bcq_gather->input_clusters(converter.packed_clusters(params));
-
- const auto binary_hidden_size =
- loco::must_cast<luci::CircleConst *>(bcq_gather->input_binary())->dim(1).value() * 32;
- bcq_gather->input_hidden_size(binary_hidden_size);
-
- if (converter.do_w_x(params))
- {
- bcq_gather->axis(gather->axis());
- }
- else
+ // There should be only one bcqinfo_version in the model
+ if (version_node != nullptr)
{
- const auto axis_transpose = (gather->axis() == 0) ? 1 : 0;
- bcq_gather->axis(axis_transpose);
+ assert(false && "Multiple version information found");
+ return false;
}
- loco::replace(gather).with(bcq_gather);
-
- changed = true;
+ version_node = circle_const;
}
}
- else if (auto fully_connected = dynamic_cast<luci::CircleFullyConnected *>(node))
- {
- auto weights = dynamic_cast<luci::CircleConst *>(fully_connected->weights());
- if (weights != nullptr && converter.has_BCQ_info(weights))
- {
- auto bcq_fc = g->nodes()->create<luci::CircleBCQFullyConnected>();
-
- bcq_fc->weights_scales(converter.get_alpha(weights));
- bcq_fc->weights_binary(converter.get_packed_binary_code(weights));
- bcq_fc->bias(fully_connected->bias());
- bcq_fc->weights_clusters(converter.packed_clusters(weights));
- bcq_fc->fusedActivationFunction(fully_connected->fusedActivationFunction());
-
- loco::Node *bcq_input = fully_connected->input();
- int32_t batch_rank = 0;
+ }
- // If input of BCQFullyConnected has more than rank 2, we should reshape it as rank 2
- const auto original_input = loco::must_cast<luci::CircleNode *>(fully_connected->input());
- if (original_input->shape_status() == ShapeStatus::VALID && original_input->rank() > 2)
- {
- auto new_shape = g->nodes()->create<luci::CircleConst>();
- new_shape->dtype(loco::DataType::S32);
- new_shape->size<loco::DataType::S32>(2);
- new_shape->rank(1);
- new_shape->dim(0) = 2;
-
- auto batch_size = 1;
- for (uint32_t i = 0; i < original_input->rank() - 1; ++i)
- batch_size *= original_input->dim(i).value();
-
- new_shape->at<loco::DataType::S32>(0) = batch_size;
- new_shape->at<loco::DataType::S32>(1) =
- original_input->dim(original_input->rank() - 1).value();
- new_shape->shape_status(ShapeStatus::VALID);
-
- auto reshape = g->nodes()->create<luci::CircleReshape>();
- reshape->tensor(original_input);
- reshape->shape(new_shape);
-
- bcq_input = reshape;
- batch_rank = original_input->rank() - 2;
- }
+ // If no version node is found, regard the model as BCQ version 1.
+ int32_t bcq_version = (version_node != nullptr) ? version_node->at<loco::DataType::S32>(0) : 1;
- // If x_w formation, we should insert Transpose in front and back of BCQFullyConnected
- if (converter.do_w_x(weights))
- {
- const auto binary_hidden_size =
- loco::must_cast<luci::CircleNode *>(fully_connected->input())
- ->dim(batch_rank)
- .value();
- bcq_fc->weights_hidden_size(binary_hidden_size);
- bcq_fc->input(bcq_input);
- loco::replace(fully_connected).with(bcq_fc);
- }
- else
- {
- const auto binary_hidden_size =
- loco::must_cast<luci::CircleNode *>(fully_connected->input())
- ->dim(1 + batch_rank)
- .value();
- bcq_fc->weights_hidden_size(binary_hidden_size);
-
- auto perm = g->nodes()->create<luci::CircleConst>();
- perm->dtype(loco::DataType::S32);
- perm->size<loco::DataType::S32>(2);
- perm->rank(1);
- perm->dim(0) = 2;
- perm->at<loco::DataType::S32>(0) = 1;
- perm->at<loco::DataType::S32>(1) = 0;
- perm->shape_status(ShapeStatus::VALID);
-
- auto input_transpose = g->nodes()->create<luci::CircleTranspose>();
- input_transpose->a(bcq_input);
- input_transpose->perm(perm);
-
- bcq_fc->input(input_transpose);
-
- auto output_transpose = g->nodes()->create<luci::CircleTranspose>();
- output_transpose->a(bcq_fc);
- output_transpose->perm(perm);
-
- loco::replace(fully_connected).with(output_transpose);
- }
+ if (bcq_version == 1)
+ changed = BCQFuser<1>().fuseBCQ(g);
+ else
+ assert(false && "Not supported BCQ version");
- changed = true;
- }
- }
+ if (changed && version_node != nullptr)
+ {
+ // If BCQ is applied and version node was found, remove the node.
+ loco::replace(version_node).with(createNoOp(version_node));
}
- if (changed)
- converter.clear_BCQ_nodes();
-
return changed;
}
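Note: FuseBCQPass::run is now a thin dispatcher — it locates an optional "/bcqinfo_version" constant, defaults to version 1 when it is absent, hands the graph to the matching BCQFuser<V> specialization, and finally detaches the version node itself. The dispatch skeleton, reduced to its shape:

    #include <cassert>
    #include <cstdint>
    #include <loco.h>

    template <int32_t V> class BCQFuser; // specialized per BCQ metadata version

    // Sketch of the version dispatch; unversioned models are treated as version 1.
    bool fuse_by_version(loco::Graph *g, int32_t bcq_version)
    {
      if (bcq_version == 1)
        return BCQFuser<1>().fuseBCQ(g);
      assert(false && "Not supported BCQ version");
      return false;
    }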
diff --git a/compiler/luci/pass/src/QuantizationUtils.cpp b/compiler/luci/pass/src/QuantizationUtils.cpp
index 6726ce746..e18690605 100644
--- a/compiler/luci/pass/src/QuantizationUtils.cpp
+++ b/compiler/luci/pass/src/QuantizationUtils.cpp
@@ -24,6 +24,13 @@
namespace luci
{
+uint8_t fp32_to_uint8_cast(float f)
+{
+ assert(std::numeric_limits<uint8_t>::min() <= f);
+ assert(f <= std::numeric_limits<uint8_t>::max());
+ return static_cast<uint8_t>(f);
+}
+
void compute_sym_scale_zp(float min, float max, float &scaling_factor, int64_t &zp,
float &nudged_min, float &nudged_max)
{
@@ -78,7 +85,7 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t
}
else
zero_point_double = qmin_double - rmin / scale;
- if (zero_point_double <= qmin_double)
+ if (min >= 0)
{
assert(min >= 0 && max >= 0);
nudged_zero_point = kMinScale;
@@ -86,7 +93,7 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t
if (min > 0 && max > 0)
WARN(l) << "The minimum and maximum values are all positive." << std::endl;
}
- else if (zero_point_double >= qmax_double)
+ else if (max < 0)
{
assert(min < 0 && max < 0);
nudged_zero_point = kMaxScale;
@@ -96,7 +103,14 @@ void compute_asym_scale_zp(float min, float max, float &scaling_factor, int64_t
else
{
assert(min < 0 && max >= 0);
- nudged_zero_point = static_cast<uint8_t>(std::round(zero_point_double));
+ nudged_zero_point = fp32_to_uint8_cast(std::round(zero_point_double));
+ }
+
+ // protect against very small scale values, which can overflow the zero-point cast
+ if (scale < 1e-5)
+ {
+ scale = 1e-5;
+ nudged_zero_point = fp32_to_uint8_cast(std::round(qmin_double - rmin / scale));
}
nudged_min = static_cast<float>((qmin_double - nudged_zero_point) * scale);
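Note: the reworked compute_asym_scale_zp branches directly on the signs of min and max, funnels the rounded zero point through the checked fp32_to_uint8_cast, and clamps scale at 1e-5 so a near-zero range cannot overflow that cast. A self-contained worked example for [min, max] = [-1, 3] on uint8:

    #include <cmath>
    #include <cstdint>

    int main()
    {
      const float rmin = -1.0f, rmax = 3.0f;        // min < 0 <= max: final branch
      const double qmin = 0.0, qmax = 255.0;
      double scale = (rmax - rmin) / (qmax - qmin); // 4 / 255 ~= 0.01569
      int64_t zp = static_cast<int64_t>(std::round(qmin - rmin / scale)); // 64
      return (zp == 64) ? 0 : 1;
    }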
diff --git a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
index f8abee751..b335a53b4 100644
--- a/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
+++ b/compiler/luci/pass/src/QuantizeWithMinMaxPass.cpp
@@ -138,7 +138,8 @@ bool is_quantized(const CircleNode *node)
node->dtype() == loco::DataType::S32; // bias
}
-void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor)
+void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_factor,
+ int32_t &channel_dim_index)
{
assert(node->dtype() == loco::DataType::FLOAT32);
@@ -153,7 +154,6 @@ void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_facto
uint32_t indices[4] = {
0,
};
- int channel_dim_index{0};
if (!get_channel_dim_index(node, dimension, channel_dim_index))
{
@@ -189,7 +189,7 @@ void sym_wquant_per_channel(CircleConst *node, std::vector<float> &scaling_facto
}
void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min,
- std::vector<float> &scaling_factor)
+ std::vector<float> &scaling_factor, int32_t &channel_dim_index)
{
assert(node->dtype() == loco::DataType::FLOAT32);
@@ -204,7 +204,6 @@ void asym_wquant_per_channel(CircleConst *node, std::vector<float> &min,
uint32_t indices[4] = {
0,
};
- int channel_dim_index{0};
if (!get_channel_dim_index(node, dimension, channel_dim_index))
{
@@ -282,6 +281,10 @@ bool is_weights(CircleNode *node)
if (dw_conv != nullptr && dw_conv->filter() == circle_const)
return true;
+ auto t_conv = dynamic_cast<CircleTransposeConv *>(out);
+ if (t_conv != nullptr && t_conv->filter() == circle_const && circle_const->rank() == 4)
+ return true;
+
auto fc = dynamic_cast<CircleFullyConnected *>(out);
if (fc != nullptr && fc->weights() == circle_const)
return true;
@@ -350,8 +353,8 @@ struct QuantizeActivation final : public luci::CircleNodeMutableVisitor<bool>
circle_node->dtype(loco::DataType::S16);
}
- circle_node->quantparam()->max[0] = nudged_max;
- circle_node->quantparam()->min[0] = nudged_min;
+ circle_node->quantparam()->min.clear();
+ circle_node->quantparam()->max.clear();
circle_node->quantparam()->scale.push_back(scaling_factor);
circle_node->quantparam()->zerop.push_back(zp);
}
@@ -472,15 +475,19 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
assert(quantparam != nullptr);
auto min = quantparam->min;
auto scaling_factor = quantparam->scale;
+ int32_t channel_dim_index = 0;
if (output_type == loco::DataType::U8)
{
- asym_wquant_per_channel(circle_const, min, scaling_factor);
+ asym_wquant_per_channel(circle_const, min, scaling_factor, channel_dim_index);
}
else
{
- sym_wquant_per_channel(circle_const, scaling_factor);
+ sym_wquant_per_channel(circle_const, scaling_factor, channel_dim_index);
}
+ quantparam->min.clear();
+ quantparam->max.clear();
+ quantparam->quantized_dimension = channel_dim_index;
}
// Find min/max per layer-wise
else
@@ -493,6 +500,8 @@ struct QuantizeWeights final : public luci::CircleNodeMutableVisitor<bool>
auto min = quantparam->min[0];
auto scaling_factor = quantparam->scale[0];
asym_wquant_per_layer(circle_const, min, scaling_factor);
+ quantparam->min.clear();
+ quantparam->max.clear();
}
}
}
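
Taken together, the hunks above change what a quantized weight tensor carries: min/max are cleared once the values are quantized, and the channel axis used for per-channel scales is recorded in quantized_dimension. A stand-in sketch, where QuantParamSketch merely mirrors the shape of luci::CircleQuantParam for illustration:

#include <cstdint>
#include <cstdio>
#include <vector>

// Stand-in for luci::CircleQuantParam, for illustration only.
struct QuantParamSketch
{
  std::vector<float> min, max, scale;
  std::vector<int64_t> zerop;
  int32_t quantized_dimension = 0;
};

int main()
{
  QuantParamSketch qp;
  qp.min = {-1.0f, -2.0f};    // recorded while profiling...
  qp.max = {1.0f, 2.0f};
  qp.scale = {0.01f, 0.02f};  // one scale per output channel
  qp.zerop = {0, 0};
  qp.min.clear();             // ...and dropped once the weights are quantized
  qp.max.clear();
  qp.quantized_dimension = 0; // channel axis found by get_channel_dim_index
  std::printf("quantized along axis %d\n", qp.quantized_dimension);
  return 0;
}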
diff --git a/compiler/luci/service/src/CircleShapeInferenceRule.cpp b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
index a291cfe70..6355ec546 100644
--- a/compiler/luci/service/src/CircleShapeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleShapeInferenceRule.cpp
@@ -1010,6 +1010,12 @@ public:
loco::NodeShape visit(const luci::CircleNeg *node) final { return use_x(node); }
+ loco::NodeShape visit(const luci::CircleNonMaxSuppressionV4 *node) final
+ {
+ const auto boxes_shape = loco::shape_get(node->boxes()).as<loco::TensorShape>();
+ return loco::NodeShape{boxes_shape};
+ }
+
loco::NodeShape visit(const luci::CircleNotEqual *node) final { return broadcast_xy(node); }
loco::NodeShape visit(const luci::CircleOneHot *node) final
@@ -1818,6 +1824,18 @@ public:
return output_shape;
}
+ loco::NodeShape visit(const luci::CircleUnique *node) final
+ {
+ auto input_shape = loco::shape_get(node->input()).as<loco::TensorShape>();
+
+ assert(input_shape.rank() == 1);
+
+ loco::TensorShape shape_output;
+ shape_output = own_shape(node);
+
+ return loco::NodeShape{shape_output};
+ }
+
loco::NodeShape visit(const luci::CircleTransposeConv *node) final
{
// TransposeConv's output shape is written in its 'inputSizes' argument
@@ -2019,6 +2037,34 @@ public:
return loco::NodeShape{*then_graph_output->shape()};
}
+ loco::NodeShape visit(const luci::CircleNonMaxSuppressionV4Out *node) final
+ {
+ const loco::DataType S32 = loco::DataType::S32;
+
+ auto nmsv4 = dynamic_cast<const luci::CircleNonMaxSuppressionV4 *>(node->input());
+ if (nmsv4 == nullptr)
+ INTERNAL_EXN("CircleNonMaxSuppressionV4 IR is not configured correctly");
+
+ auto index = node->index();
+ if (index == 1)
+ return loco::TensorShape({0});
+
+ assert(index == 0);
+
+ auto unknown = loco::TensorShape{loco::Dimension()};
+ auto max_output_size = dynamic_cast<const luci::CircleConst *>(nmsv4->max_output_size());
+ if (max_output_size == nullptr)
+ return unknown; // we need CircleConst for max output size
+
+ LUCI_ASSERT(max_output_size->dtype() == S32, "Only support int32 for max_output_size");
+
+ if (max_output_size->size<S32>() < 1)
+ return unknown;
+
+ auto max_output_size_value = uint32_t(max_output_size->at<S32>(0));
+ return loco::TensorShape{max_output_size_value};
+ }
+
loco::NodeShape visit(const luci::CircleSplitOut *node) final
{
const loco::DataType S32 = loco::DataType::S32;
@@ -2142,6 +2188,19 @@ public:
return loco::NodeShape{output_shape};
}
+ loco::NodeShape visit(const luci::CircleUniqueOut *node) final
+ {
+ auto unique = dynamic_cast<const luci::CircleUnique *>(node->input());
+ if (unique == nullptr)
+ {
+ INTERNAL_EXN("CircleUnique IR is not configured correctly");
+ }
+
+ auto unique_shape = loco::shape_get(unique).as<loco::TensorShape>();
+
+ return loco::NodeShape{unique_shape};
+ }
+
loco::NodeShape visit(const luci::CircleUnpackOut *node) final
{
auto unpack = dynamic_cast<const luci::CircleUnpack *>(node->input());
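
The NonMaxSuppressionV4 output rule above reduces to a few lines; a sketch under assumed types, where the -1 encoding for an unknown dimension is illustrative and not the loco API:

#include <cstdint>
#include <vector>

// Output 1 (valid_outputs) gets shape {0}; output 0 (selected_indices) gets
// {max_output_size} when that operand is a known S32 constant, otherwise a
// single unknown dimension (encoded here as -1).
std::vector<int64_t> nmsv4_out_shape_sketch(int index, const int32_t *max_output_size)
{
  if (index == 1)
    return {0};
  if (max_output_size == nullptr)
    return {-1}; // no CircleConst available: the dimension stays unknown
  return {static_cast<int64_t>(*max_output_size)};
}

int main()
{
  const int32_t k = 10;
  auto s = nmsv4_out_shape_sketch(0, &k); // yields {10}
  (void)s;
  return 0;
}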
diff --git a/compiler/luci/service/src/CircleTypeInferenceRule.cpp b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
index de2ba3ea4..e7910bfc0 100644
--- a/compiler/luci/service/src/CircleTypeInferenceRule.cpp
+++ b/compiler/luci/service/src/CircleTypeInferenceRule.cpp
@@ -252,6 +252,11 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
loco::DataType visit(const luci::CircleNeg *node) final { return loco::dtype_get(node->x()); }
+ loco::DataType visit(const luci::CircleNonMaxSuppressionV4 *node) final
+ {
+ return loco::dtype_get(node->boxes());
+ }
+
loco::DataType visit(const luci::CircleNotEqual *) final { return loco::DataType::BOOL; }
loco::DataType visit(const luci::CirclePack *node) final
@@ -345,7 +350,10 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
return loco::dtype_get(node->tensor());
}
- loco::DataType visit(const luci::CircleResizeBilinear *) final { return loco::DataType::FLOAT32; }
+ loco::DataType visit(const luci::CircleResizeBilinear *node) final
+ {
+ return loco::dtype_get(node->input());
+ }
loco::DataType visit(const luci::CircleResizeNearestNeighbor *node) final
{
@@ -472,6 +480,11 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
return loco::dtype_get(node->outBackprop());
}
+ loco::DataType visit(const luci::CircleUnique *node) final
+ {
+ return loco::dtype_get(node->input());
+ }
+
loco::DataType visit(const luci::CircleUnpack *node) final
{
return loco::dtype_get(node->value());
@@ -569,6 +582,13 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
return then_graph_output->dtype();
}
+ loco::DataType visit(const luci::CircleNonMaxSuppressionV4Out *node) final
+ {
+ (void)node;
+ assert(node->index() == 0 || node->index() == 1);
+ return loco::DataType::S32;
+ }
+
loco::DataType visit(const luci::CircleSplitOut *node) final
{
return loco::dtype_get(node->input());
@@ -589,6 +609,17 @@ struct TypeInferenceAlgorithm final : public luci::CircleNodeVisitor<loco::DataT
return loco::DataType::S32;
}
+ loco::DataType visit(const luci::CircleUniqueOut *node) final
+ {
+ if (node->index() == 0)
+ {
+ return loco::dtype_get(node->input());
+ }
+ assert(node->index() == 1);
+ auto unique = loco::must_cast<luci::CircleUnique *>(node->input());
+ return unique->idx_out_type();
+ }
+
loco::DataType visit(const luci::CircleUnpackOut *node) final
{
return loco::dtype_get(node->input());
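
The CircleUniqueOut rule above can be summarized in a few lines; DType here is a stand-in enum, not the loco type system:

#include <cassert>
#include <cstdio>

enum class DType { FLOAT32, S32, S64 };

// Output 0 (unique values) keeps the input element type; output 1 (indices)
// uses the op's idx_out_type attribute.
DType unique_out_dtype_sketch(int index, DType input_dtype, DType idx_out_type)
{
  assert(index == 0 || index == 1);
  return (index == 0) ? input_dtype : idx_out_type;
}

int main()
{
  auto d = unique_out_dtype_sketch(1, DType::FLOAT32, DType::S32); // -> S32
  std::printf("%d\n", static_cast<int>(d));
  return 0;
}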
diff --git a/compiler/luci/tests/test.lst b/compiler/luci/tests/test.lst
index 188e29828..9fd42ed4e 100644
--- a/compiler/luci/tests/test.lst
+++ b/compiler/luci/tests/test.lst
@@ -20,6 +20,7 @@ addread(ArgMin_U8_001)
addread(ArgMin_U8_002)
addread(ArgMin_U8_003)
addread(AveragePool2D_000)
+addread(AveragePool2D_U8_000)
addread(BatchMatMul_000)
addread(BatchMatMulV2_000)
addread(BatchMatMulV2_001)
@@ -30,13 +31,16 @@ addread(Ceil_000)
addread(Concatenation_000)
addread(Concatenation_U8_000)
addread(Conv2D_000)
+addread(Conv2D_001)
addread(Conv2D_002)
addread(Conv2D_003)
addread(Conv2D_U8_000)
+addread(Conv2D_U8_001)
addread(Cos_000)
addread(DepthToSpace_000)
addread(DepthwiseConv2D_000)
addread(DepthwiseConv2D_U8_000)
+addread(DepthwiseConv2D_U8_001)
addread(DepthwiseConv2D_001)
addread(Div_000)
addread(ELU_000)
@@ -64,6 +68,7 @@ addread(GreaterEqual_000)
addread(If_000)
addread(If_001)
addread(L2Normalize_000)
+addread(L2Normalize_U8_000)
addread(L2Pool2D_000)
addread(L2Pool2D_U8_000)
addread(LeakyRelu_000)
@@ -75,6 +80,7 @@ addread(LogicalAnd_000)
addread(LogicalNot_000)
addread(LogicalOr_000)
addread(Logistic_000)
+addread(Logistic_U8_000)
addread(LogSoftmax_000)
addread(MatMul_000)
addread(MatrixDiag_000)
@@ -84,6 +90,7 @@ addread(MaxPool2D_000)
addread(MaxPool2D_U8_000)
addread(Mean_000)
addread(Mean_001)
+addread(Mean_U8_000)
addread(Minimum_000)
addread(MirrorPad_000)
addread(Mul_000)
@@ -97,6 +104,7 @@ addread(OneHot_003)
addread(Pack_000)
addread(Pack_U8_000)
addread(Pad_000)
+addread(Pad_U8_000)
addread(Pow_000)
addread(PRelu_000)
addread(Range_000)
@@ -212,6 +220,7 @@ addwrite(ArgMin_U8_001)
addwrite(ArgMin_U8_002)
addwrite(ArgMin_U8_003)
addwrite(AveragePool2D_000)
+addwrite(AveragePool2D_U8_000)
addwrite(BatchMatMul_000)
addwrite(BatchMatMulV2_000)
addwrite(BatchMatMulV2_001)
@@ -222,13 +231,16 @@ addwrite(Ceil_000)
addwrite(Concatenation_000)
addwrite(Concatenation_U8_000)
addwrite(Conv2D_000)
+addwrite(Conv2D_001)
addwrite(Conv2D_002)
addwrite(Conv2D_003)
addwrite(Conv2D_U8_000)
+addwrite(Conv2D_U8_001)
addwrite(Cos_000)
addwrite(DepthToSpace_000)
addwrite(DepthwiseConv2D_000)
addwrite(DepthwiseConv2D_U8_000)
+addwrite(DepthwiseConv2D_U8_001)
addwrite(DepthwiseConv2D_001)
addwrite(Div_000)
addwrite(ELU_000)
@@ -256,6 +268,7 @@ addwrite(GreaterEqual_000)
addwrite(If_000)
addwrite(If_001)
addwrite(L2Normalize_000)
+addwrite(L2Normalize_U8_000)
addwrite(L2Pool2D_000)
addwrite(L2Pool2D_U8_000)
addwrite(LeakyRelu_000)
@@ -267,6 +280,7 @@ addwrite(LogicalAnd_000)
addwrite(LogicalNot_000)
addwrite(LogicalOr_000)
addwrite(Logistic_000)
+addwrite(Logistic_U8_000)
addwrite(LogSoftmax_000)
addwrite(MatMul_000)
addwrite(MatrixDiag_000)
@@ -276,6 +290,7 @@ addwrite(MaxPool2D_000)
addwrite(MaxPool2D_U8_000)
addwrite(Mean_000)
addwrite(Mean_001)
+addwrite(Mean_U8_000)
addwrite(Minimum_000)
addwrite(MirrorPad_000)
addwrite(Mul_000)
diff --git a/compiler/mio-tflite/CMakeLists.txt b/compiler/mio-tflite/CMakeLists.txt
index 2cfed1449..9ef2859b9 100644
--- a/compiler/mio-tflite/CMakeLists.txt
+++ b/compiler/mio-tflite/CMakeLists.txt
@@ -5,11 +5,7 @@ if(NOT FlatBuffers_FOUND)
return()
endif(NOT FlatBuffers_FOUND)
-# TODO recover official release version
-# NOTE we cannot use version number like "2.3.0-rc0" for find_package()
-# use TensorFlowSource-2.3.0-rc0 as config itself
-# nnas_find_package(TensorFlowSource EXACT 2.3.0 QUIET)
-nnas_find_package(TensorFlowSource-2.3.0-rc0 QUIET)
+nnas_find_package(TensorFlowSource EXACT 2.3.0 QUIET)
if(NOT TensorFlowSource_FOUND)
return()
diff --git a/compiler/one-cmds/CMakeLists.txt b/compiler/one-cmds/CMakeLists.txt
index 7d73d9b23..173b8b476 100644
--- a/compiler/one-cmds/CMakeLists.txt
+++ b/compiler/one-cmds/CMakeLists.txt
@@ -1,5 +1,6 @@
set(ONE_COMMAND_FILES
one-import
+ one-import-bcq
one-import-tf
one-import-tflite
one-optimize
@@ -26,7 +27,7 @@ foreach(ONE_COMMAND IN ITEMS ${ONE_COMMAND_FILES})
install(FILES ${ONE_COMMAND}
PERMISSIONS OWNER_WRITE OWNER_READ OWNER_EXECUTE
- GROUP_READ GROUP_WRITE GROUP_EXECUTE
+ GROUP_READ GROUP_EXECUTE
WORLD_READ WORLD_EXECUTE
DESTINATION bin)
diff --git a/compiler/one-cmds/how-to-prepare-virtualenv.txt b/compiler/one-cmds/how-to-prepare-virtualenv.txt
index 41fff3aaf..62a94968b 100644
--- a/compiler/one-cmds/how-to-prepare-virtualenv.txt
+++ b/compiler/one-cmds/how-to-prepare-virtualenv.txt
@@ -1,12 +1,12 @@
About
-----
-Last update: 2020-07-14
+Last update: 2020-08-03
This document explains about 'one-prepare-venv' command.
'one-prepare-venv' will prepare python3 virtual environment with tensorflow-cpu
-version 2.3.0rc0, recommanded 2.x version as of now, so that 'one-import-tf'
+version 2.3.0, the recommended 2.x version as of now, so that 'one-import-tf'
command can execute properly.
diff --git a/compiler/one-cmds/how-to-use-one-commands.txt b/compiler/one-cmds/how-to-use-one-commands.txt
index 6c2176afa..0ee69e077 100644
--- a/compiler/one-cmds/how-to-use-one-commands.txt
+++ b/compiler/one-cmds/how-to-use-one-commands.txt
@@ -1,7 +1,7 @@
About
-----
-Last update: 2020-07-14
+Last update: 2020-07-31
This document briefly explains how to use one-* commands.
Detailed options are not explained here. Run the command to see options.
@@ -30,6 +30,27 @@ Syntax: one-import [framework] [options]
Currently supported frameworks are 'tf', 'tflite' for TensorFlow and TensorFlow
lite.
+one-import-bcq
+--------------
+
+This will convert a TensorFlow model file (.pb) to our circle model file while applying BCQ.
+To execute this command, the original TensorFlow model file must include BCQ information.
+
+This command invokes the following scripts internally.
+- preserve_bcq_info : Prevents BCQ information from vanishing during conversion
+- generate_bcq_output_arrays : Designates BCQ information nodes as model outputs automatically
+- tf2tfliteV2 : Converts the TensorFlow model to a tflite model
+- tflite2circle : Converts the TensorFlow Lite model to a circle model
+
+When this command finishes, the BCQ information nodes will have been removed,
+provided the BCQ information was valid and BCQ was applied without errors.
+
+As tf2tfliteV2.py runs the TensorFlow Lite converter, you need to have TensorFlow
+installed on your system. We recommend using 2.3.0 for now.
+
+We provide a Python virtual environment, and one-import-bcq will enter and leave
+this environment so that you don't need to explicitly 'activate' the virtual
+environment.
+
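+For example, with a hypothetical model ./bcq_model.pb whose input is 'Placeholder'
+(shape 1,32,32,3) and whose output is 'MatMul', an invocation could look like:
+
+    one-import-bcq --input_path ./bcq_model.pb --output_path ./bcq_model.circle \
+    --input_arrays Placeholder --input_shapes "1,32,32,3" --output_arrays MatMul
+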
one-import-tf
-------------
@@ -40,7 +61,7 @@ will internally use TensorFlow lite converter and then invoke tflite2circle
converter to convert tflite model to circle model.
As tf2tfliteV2.py runs TensorFlow lite converter, you need to have TensorFlow
-installed in your system. We recommand to use 2.3.0rc0 for now.
+installed in your system. We recommend using 2.3.0 for now.
We provide python virtual environment and one-import-tf will enter and leave
this environment so that you don't need to explictly 'activate' virtual
diff --git a/compiler/one-cmds/one-codegen b/compiler/one-cmds/one-codegen
index 2c80664e2..820b6d8a3 100644
--- a/compiler/one-cmds/one-codegen
+++ b/compiler/one-cmds/one-codegen
@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
function Usage()
{
- echo "Usage: $0 [BACKEND] ..."
+ echo "Usage: one-codegen [BACKEND] ..."
echo "Available BACKEND drivers:"
backend_exist=0
for file in `find $DRIVER_PATH -name *-compile -type f`;
@@ -33,23 +33,34 @@ function Usage()
if [ $backend_exist == 0 ]; then
echo " (There is no available backend drivers)"
fi
+
+ exit 255
}
-# Get command from command-line
-BACKEND=$1; shift
-BACKEND_DRIVER="$BACKEND-compile"
+function version()
+{
+ $DRIVER_PATH/one-version one-codegen
+ exit 255
+}
-if [[ -z "${BACKEND_DRIVER}" ]]; then
+# Get command from command-line
+BACKEND=$1
+if [[ -z ${BACKEND} ]]; then
Usage
- exit 255
fi
+shift
+
+if [[ "${BACKEND}" == "--version" ]]; then
+ version
+fi
+
+BACKEND_DRIVER="${BACKEND}-compile"
BACKEND_DRIVER_CMD="${DRIVER_PATH}/${BACKEND_DRIVER}"
if [[ ! -f "${BACKEND_DRIVER_CMD}" ]]; then
echo "ERROR: '${BACKEND_DRIVER}' is not supported"
Usage
- exit 255
fi
"${BACKEND_DRIVER_CMD}" "$@"
diff --git a/compiler/one-cmds/one-import b/compiler/one-cmds/one-import
index dbf4af534..b1dd8f4c3 100644
--- a/compiler/one-cmds/one-import
+++ b/compiler/one-cmds/one-import
@@ -18,7 +18,7 @@ DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
function Usage()
{
- echo "Usage: $0 [FRAMEWORK] ..."
+ echo "Usage: one-import [FRAMEWORK] ..."
echo "Available FRAMEWORK drivers:"
framework_exist=0
for file in "$DRIVER_PATH"/one-import-*;
@@ -31,23 +31,34 @@ function Usage()
if [ $framework_exist == 0 ]; then
echo " (There is no available import drivers)"
fi
+
+ exit 255
}
-# Get command from command-line
-FRAMEWORK=$1; shift
-FRAMEWORK_DRIVER="one-import-$FRAMEWORK"
+function version()
+{
+ $DRIVER_PATH/one-version one-import-tf
+ exit 255
+}
-if [[ -z "${FRAMEWORK_DRIVER}" ]]; then
+# Get command from command-line
+FRAMEWORK=$1
+if [[ -z ${FRAMEWORK} ]]; then
Usage
- exit 255
+fi
+shift
+
+if [ ${FRAMEWORK} = "--version" ]; then
+ version
fi
+FRAMEWORK_DRIVER="one-import-$FRAMEWORK"
+
FRAMEWORK_DRIVER_CMD="${DRIVER_PATH}/${FRAMEWORK_DRIVER}"
if [[ ! -f "${FRAMEWORK_DRIVER_CMD}" ]]; then
echo "ERROR: '${FRAMEWORK_DRIVER}' is not supported"
Usage
- exit 255
fi
"${FRAMEWORK_DRIVER_CMD}" "$@"
diff --git a/compiler/one-cmds/one-import-bcq b/compiler/one-cmds/one-import-bcq
new file mode 100644
index 000000000..98dd1efed
--- /dev/null
+++ b/compiler/one-cmds/one-import-bcq
@@ -0,0 +1,150 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+
+DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+
+usage()
+{
+ echo "Convert TensorFlow model with BCQ to circle."
+ echo "Usage: one-import-bcq"
+ echo " --version Show version information and exit"
+ echo " --input_path <path/to/tfmodel/with/BCQ>"
+ echo " --output_path <path/to/circle>"
+ echo " --input_arrays <names of the input arrays, comma-separated>"
+ echo " --input_shapes <input shapes, colon-separated>"
+ echo " --output_arrays <names of the output arrays, comma-separated>"
+ echo " --v2 Use TensorFlow 2.x interface (default is 1.x interface)"
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-import-bcq
+ exit 255
+}
+
+TF_INTERFACE="--v1"
+
+# Parse command-line arguments
+#
+while [ "$#" -ne 0 ]; do
+ CUR="$1"
+
+ case $CUR in
+ '--help')
+ usage
+ ;;
+ '--version')
+ version
+ ;;
+ '--input_path')
+ export INPUT_PATH="$2"
+ shift 2
+ ;;
+ '--output_path')
+ export OUTPUT_PATH="$2"
+ shift 2
+ ;;
+ '--input_arrays')
+ export INPUT_ARRAYS="$2"
+ shift 2
+ ;;
+ '--input_shapes')
+ export INPUT_SHAPES="$2"
+ shift 2
+ ;;
+ '--output_arrays')
+ export OUTPUT_ARRAYS="$2"
+ shift 2
+ ;;
+ '--v2')
+ TF_INTERFACE="--v2"
+ shift
+ ;;
+ *)
+ echo "Unknown parameter: ${CUR}"
+ shift
+ ;;
+ esac
+done
+
+if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
+ echo "Error: input model not found"
+ echo ""
+ usage
+fi
+
+FILE_BASE=$(basename ${OUTPUT_PATH})
+MODEL_NAME="${FILE_BASE%.*}"
+
+TMPDIR=$(mktemp -d)
+trap "{ rm -rf $TMPDIR; }" EXIT
+
+# activate python virtual environment
+VIRTUALENV_LINUX="${DRIVER_PATH}/venv/bin/activate"
+VIRTUALENV_WINDOWS="${DRIVER_PATH}/venv/Scripts/activate"
+
+if [ -e ${VIRTUALENV_LINUX} ]; then
+ source ${VIRTUALENV_LINUX}
+elif [ -e ${VIRTUALENV_WINDOWS} ]; then
+ source ${VIRTUALENV_WINDOWS}
+fi
+
+# remove previous log
+rm -rf "${OUTPUT_PATH}.log"
+
+# generate temporary preserved pb file
+echo "${DRIVER_PATH}/preserve_bcq_info" --input_path ${INPUT_PATH} \
+--output_path "${TMPDIR}/${MODEL_NAME}_preserved.pb" > "${OUTPUT_PATH}.log"
+echo " " >> "${OUTPUT_PATH}.log"
+
+"${DRIVER_PATH}/preserve_bcq_info" --input_path ${INPUT_PATH} \
+--output_path "${TMPDIR}/${MODEL_NAME}_preserved.pb" >> "${OUTPUT_PATH}.log" 2>&1
+
+# generate output_arrays automatically
+echo "${DRIVER_PATH}/generate_bcq_output_arrays" \
+--input_path "${TMPDIR}/${MODEL_NAME}_preserved.pb" \
+--output_path "${TMPDIR}/${MODEL_NAME}_output_arrays.txt" > "${OUTPUT_PATH}.log"
+echo " " >> "${OUTPUT_PATH}.log"
+
+"${DRIVER_PATH}/generate_bcq_output_arrays" \
+--input_path "${TMPDIR}/${MODEL_NAME}_preserved.pb" \
+--output_path "${TMPDIR}/${MODEL_NAME}_output_arrays.txt" >> "${OUTPUT_PATH}.log" 2>&1
+
+# generate temporary tflite file
+CONVERT_SCRIPT="python ${DRIVER_PATH}/tf2tfliteV2.py ${TF_INTERFACE} "
+CONVERT_SCRIPT+="--input_path ${TMPDIR}/${MODEL_NAME}_preserved.pb "
+CONVERT_SCRIPT+="--input_arrays ${INPUT_ARRAYS} "
+CONVERT_SCRIPT+="--output_path ${TMPDIR}/${MODEL_NAME}.tflite "
+CONVERT_SCRIPT+="--output_arrays ${OUTPUT_ARRAYS}$(cat ${TMPDIR}/${MODEL_NAME}_output_arrays.txt) "
+if [ ! -z ${INPUT_SHAPES} ]; then
+ CONVERT_SCRIPT+="--input_shapes ${INPUT_SHAPES} "
+fi
+
+echo ${CONVERT_SCRIPT} >> "${OUTPUT_PATH}.log"
+$CONVERT_SCRIPT >> "${OUTPUT_PATH}.log" 2>&1
+
+# convert .tflite to .circle
+echo " " >> "${OUTPUT_PATH}.log"
+echo "${DRIVER_PATH}/tflite2circle" "${TMPDIR}/${MODEL_NAME}.tflite" \
+"${OUTPUT_PATH}" >> "${OUTPUT_PATH}.log"
+echo " " >> "${OUTPUT_PATH}.log"
+
+"${DRIVER_PATH}/tflite2circle" "${TMPDIR}/${MODEL_NAME}.tflite" \
+"${OUTPUT_PATH}" >> "${OUTPUT_PATH}.log" 2>&1
diff --git a/compiler/one-cmds/one-import-tf b/compiler/one-cmds/one-import-tf
index c048a4e0c..d59e1c529 100644
--- a/compiler/one-cmds/one-import-tf
+++ b/compiler/one-cmds/one-import-tf
@@ -22,14 +22,24 @@ usage()
{
echo "Convert TensorFlow model to circle."
echo "Usage: one-import-tf"
+ echo " --version Show version information and exit"
echo " --input_path <path/to/tfmodel>"
echo " --output_path <path/to/circle>"
echo " --input_arrays <names of the input arrays, comma-separated>"
echo " --input_shapes <input shapes, colon-separated>"
echo " --output_arrays <names of the output arrays, comma-separated>"
- exit 0
+ echo " --v2 Use TensorFlow 2.x interface (default is 1.x interface)"
+ exit 255
}
+version()
+{
+ $DRIVER_PATH/one-version one-import-tf
+ exit 255
+}
+
+TF_INTERFACE="--v1"
+
# Parse command-line arguments
#
while [ "$#" -ne 0 ]; do
@@ -39,6 +49,9 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '--version')
+ version
+ ;;
'--input_path')
export INPUT_PATH="$2"
shift 2
@@ -59,6 +72,10 @@ while [ "$#" -ne 0 ]; do
export OUTPUT_ARRAYS="$2"
shift 2
;;
+ '--v2')
+ TF_INTERFACE="--v2"
+ shift
+ ;;
*)
echo "Unknown parameter: ${CUR}"
shift
@@ -92,14 +109,21 @@ fi
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# generate temporary tflite file
-echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \
+echo "python" "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \
--input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \
--output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
--output_arrays ${OUTPUT_ARRAYS} > "${OUTPUT_PATH}.log"
echo " " >> "${OUTPUT_PATH}.log"
-python "${DRIVER_PATH}/tf2tfliteV2.py" --v2 --input_path ${INPUT_PATH} \
+python "${DRIVER_PATH}/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${INPUT_PATH} \
--input_arrays ${INPUT_ARRAYS} --input_shapes ${INPUT_SHAPES} \
--output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
--output_arrays ${OUTPUT_ARRAYS} >> "${OUTPUT_PATH}.log" 2>&1
diff --git a/compiler/one-cmds/one-import-tflite b/compiler/one-cmds/one-import-tflite
index 31ed5af85..053489c92 100644
--- a/compiler/one-cmds/one-import-tflite
+++ b/compiler/one-cmds/one-import-tflite
@@ -22,9 +22,16 @@ usage()
{
echo "Convert TensorFlow lite model to circle."
echo "Usage: one-import-tflite"
+ echo " --version Show version information and exit"
echo " --input_path <path/to/tflitemodel>"
echo " --output_path <path/to/circle>"
- exit 0
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-import-tflite
+ exit 255
}
# Parse command-line arguments
@@ -36,6 +43,9 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '--version')
+ version
+ ;;
'--input_path')
export INPUT_PATH="$2"
shift 2
@@ -55,12 +65,18 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
echo "Error: input model not found"
echo ""
usage
- exit 2
fi
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# convert .tflite to .circle
echo "${DRIVER_PATH}/tflite2circle" "${INPUT_PATH}" "${OUTPUT_PATH}" > "${OUTPUT_PATH}.log"
diff --git a/compiler/one-cmds/one-optimize b/compiler/one-cmds/one-optimize
index 95384c10d..17b6b980e 100644
--- a/compiler/one-cmds/one-optimize
+++ b/compiler/one-cmds/one-optimize
@@ -22,6 +22,7 @@ usage()
{
echo "Optimize circle model."
echo "Usage: one-optimize"
+ echo " --version Show version information and exit"
echo " --all Enable all optimization algorithms"
echo " --fuse_bcq Enable FuseBCQ Pass"
echo " --fuse_instnorm Enable FuseInstanceNormalization Pass"
@@ -33,7 +34,13 @@ usage()
echo " Enable ResolveCustomOpMatMulPass Pass"
echo " --input_path <path/to/input/circle>"
echo " --output_path <path/to/output/circle>"
- exit 0
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-optimize
+ exit 255
}
OPTIMIZE_all=0
@@ -52,6 +59,9 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '--version')
+ version
+ ;;
'--all')
OPTIMIZE_all=1
shift
@@ -96,7 +106,6 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
echo "Error: input model not found"
echo ""
usage
- exit 2
fi
OPTIMIZE_OPTIONS=""
@@ -123,6 +132,13 @@ fi
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# NOTE do not wrap ${OPTIMIZE_OPTIONS} with ""
# optimize circle
echo "${DRIVER_PATH}/circle2circle" ${OPTIMIZE_OPTIONS} \
diff --git a/compiler/one-cmds/one-pack b/compiler/one-cmds/one-pack
index 2bc4c601d..023b0a85f 100644
--- a/compiler/one-cmds/one-pack
+++ b/compiler/one-cmds/one-pack
@@ -22,9 +22,16 @@ usage()
{
echo "Package circle to nnpkg"
echo "Usage: one-pack"
+ echo " -v, --version Show version information and exit"
echo " -i <path/to/circle>"
echo " -o <path/to/nnpackage/folder>"
- exit 0
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-pack
+ exit 255
}
# Parse command-line arguments
@@ -36,6 +43,12 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '-v')
+ version
+ ;;
+ '--version')
+ version
+ ;;
'-i')
export INPUT_PATH="$2"
shift 2
@@ -55,13 +68,22 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
echo "Error: input model not found"
echo ""
usage
- exit 2
fi
+INPUT_FILE=$(basename "${INPUT_PATH}")
+LOG_FILE="${INPUT_FILE%.*}.pack.log"
+
# remove previous log
-rm -rf "${OUTPUT_PATH}.log"
+rm -rf "${LOG_FILE}"
+
+show_err_onexit()
+{
+ cat "${LOG_FILE}"
+}
+
+trap show_err_onexit ERR
# Package circle model file to nnpkg
-echo "${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" > "${OUTPUT_PATH}.log"
+echo "${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" > "${LOG_FILE}"
-"${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" >> "${OUTPUT_PATH}.log" 2>&1
+"${DRIVER_PATH}/model2nnpkg.sh" -o "${OUTPUT_PATH}" "${INPUT_PATH}" >> "${LOG_FILE}" 2>&1
diff --git a/compiler/one-cmds/one-prepare-venv b/compiler/one-cmds/one-prepare-venv
index fce838d81..0a53bd3dd 100644
--- a/compiler/one-cmds/one-prepare-venv
+++ b/compiler/one-cmds/one-prepare-venv
@@ -26,7 +26,19 @@ if [ -f ${VENV_ACTIVATE} ]; then
fi
# Install prerequisites
-python3 -m pip install -U virtualenv
+python3 -m pip install --user -U virtualenv
+
+function error_no_ensurepip ()
+{
+ echo "ERROR: python3 'ensurepip' module is not found."
+ echo " On ubuntu, try following command:"
+ echo
+ echo " apt install python$(python3 --version | awk '{print $2}' | awk -F. '{print $1"."$2}')-venv"
+ echo
+ echo " You may need root privilege for this."
+ exit 1
+}
+python3 -m ensurepip --version > /dev/null 2>&1 || error_no_ensurepip
# Create python virtual enviornment
python3 -m venv "${DRIVER_PATH}/venv"
@@ -37,4 +49,4 @@ source "${VENV_ACTIVATE}"
python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
install -U pip setuptools
python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
- install tensorflow-cpu==2.3.0rc0
+ install tensorflow-cpu==2.3.0
diff --git a/compiler/one-cmds/one-quantize b/compiler/one-cmds/one-quantize
index ff9e26672..c74b2c2d2 100644
--- a/compiler/one-cmds/one-quantize
+++ b/compiler/one-cmds/one-quantize
@@ -22,16 +22,23 @@ usage()
{
echo "Quantize circle model."
echo "Usage: one-quantize"
+ echo " --version Show version information and exit"
echo " --input_dtype Input data type (supported: float32, default=float32)"
echo " --quantized_dtype Output quantized data type (supported: uint8, default=uint8)"
- echo " --granularity Quantize granularity (supported: layer, default=layer)"
+ echo " --granularity Quantize granularity (supported: layer, channel, default=layer)"
echo " --min_percentile Minimum percentile (0.0~100.0, default=1.0)"
echo " --max_percentile Maximum percentile (0.0~100.0, default=99.0)"
echo " --mode Record mode (supported: percentile/moving_average, default=percentile)"
echo " --input_path <path/to/input/circle>"
echo " --input_data <path/to/input/data>"
echo " --output_path <path/to/output/circle>"
- exit 0
+ exit 255
+}
+
+version()
+{
+ $DRIVER_PATH/one-version one-quantize
+ exit 255
}
INPUT_DTYPE=float32
@@ -50,6 +57,9 @@ while [ "$#" -ne 0 ]; do
'--help')
usage
;;
+ '--version')
+ version
+ ;;
'--input_dtype')
INPUT_DTYPE="$2"
@@ -100,13 +110,11 @@ if [ -z ${INPUT_PATH} ] || [ ! -e ${INPUT_PATH} ]; then
echo "Error: input model not found"
echo ""
usage
- exit 2
fi
if [ -z ${INPUT_DATA} ] || [ ! -e ${INPUT_DATA} ]; then
echo "Error: input data not found"
echo ""
usage
- exit 2
fi
FILE_BASE=$(basename ${OUTPUT_PATH})
@@ -118,6 +126,13 @@ trap "{ rm -rf $TMPDIR; }" EXIT
# remove previous log
rm -rf "${OUTPUT_PATH}.log"
+show_err_onexit()
+{
+ cat "${OUTPUT_PATH}.log"
+}
+
+trap show_err_onexit ERR
+
# quantize circle
echo "${DRIVER_PATH}/circle-quantizer" \
--quantize_dequantize_weights ${INPUT_DTYPE} ${QUANTIZED_DTYPE} ${GRANULARITY} \
diff --git a/compiler/one-cmds/requires.cmake b/compiler/one-cmds/requires.cmake
index 9b858ad90..50c24579f 100644
--- a/compiler/one-cmds/requires.cmake
+++ b/compiler/one-cmds/requires.cmake
@@ -3,3 +3,5 @@ require("tflite2circle")
require("circle2circle")
require("circle-quantizer")
require("record-minmax")
+require("vconone")
+require("bcq-tools")
diff --git a/compiler/pota-quantization-value-test/CMakeLists.txt b/compiler/pota-quantization-value-test/CMakeLists.txt
index d97ffc123..73b9ead73 100644
--- a/compiler/pota-quantization-value-test/CMakeLists.txt
+++ b/compiler/pota-quantization-value-test/CMakeLists.txt
@@ -49,21 +49,21 @@ add_test(
${QUANTIZATION_VALUE_TEST_WITH_PARAM}
)
-#add_test(
-# NAME pota_record_minmax_test
-# COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/test_record_minmax.sh"
-# "${TEST_CONFIG}"
-# "${ARTIFACTS_BIN_PATH}"
-# ${QUANTIZATION_VALUE_TEST_WITH_PARAM}
-#)
+add_test(
+ NAME pota_record_minmax_test
+ COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/test_record_minmax.sh"
+ "${TEST_CONFIG}"
+ "${ARTIFACTS_BIN_PATH}"
+ ${QUANTIZATION_VALUE_TEST_WITH_PARAM}
+)
-#add_test(
-# NAME pota_quantization_test
-# COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/test_quantization.sh"
-# "${TEST_CONFIG}"
-# "${ARTIFACTS_BIN_PATH}"
-# ${QUANTIZATION_VALUE_TEST_WITH_PARAM}
-#)
+add_test(
+ NAME pota_quantization_test
+ COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/test_quantization.sh"
+ "${TEST_CONFIG}"
+ "${ARTIFACTS_BIN_PATH}"
+ ${QUANTIZATION_VALUE_TEST_WITH_PARAM}
+)
-#set_tests_properties(pota_record_minmax_test PROPERTIES DEPENDS pota_fake_wquant_test)
-#set_tests_properties(pota_quantization_test PROPERTIES DEPENDS pota_record_minmax_test)
+set_tests_properties(pota_record_minmax_test PROPERTIES DEPENDS pota_fake_wquant_test)
+set_tests_properties(pota_quantization_test PROPERTIES DEPENDS pota_record_minmax_test)
diff --git a/compiler/pota-quantization-value-test/compare_tensors.py b/compiler/pota-quantization-value-test/compare_tensors.py
index 258d46dc9..7d95d182d 100755
--- a/compiler/pota-quantization-value-test/compare_tensors.py
+++ b/compiler/pota-quantization-value-test/compare_tensors.py
@@ -69,7 +69,7 @@ def compare_quantization(tensor, tensor_name, expect_dir):
if key == "weights":
expected_weights = np.array(json_load["weights"])
input_weights = tensor["weights"][:]
- if np.allclose(input_weights, expected_weights, rtol=0, atol=0) == False:
+ if np.allclose(input_weights, expected_weights, rtol=0, atol=1) == False:
print("Quantized weights of " + tensor_name + " (" + str(input_weights) +
") do not match with expected value (" + str(expected_weights) +
").")
@@ -87,7 +87,7 @@ def compare_quantization(tensor, tensor_name, expect_dir):
expected_zero_point = np.array(json_load["zero_point"])
input_zero_point = tensor["zero_point"][:]
if np.allclose(
- input_zero_point, expected_zero_point, rtol=0, atol=0) == False:
+ input_zero_point, expected_zero_point, rtol=0, atol=1) == False:
print("Quantized zero_point of " + tensor_name + " (" +
str(input_zero_point) + ") do not match with expected value (" +
str(expected_zero_point) + ").")
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/fake_quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/fake_quantization/ker.json
index 21b8ecad7..2558bb2be 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/fake_quantization/ker.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/fake_quantization/ker.json
@@ -3,44 +3,44 @@
[
[
[
- 1.003921627998352,
- 2.007843255996704
- ],
+ 1.0039215087890625,
+ 2.007843017578125
+ ],
[
- -3.0117647647857666,
+ -3.0117650032043457,
-4.015686511993408
]
- ],
+ ],
[
[
- -5.019608020782471,
- 6.023529529571533
- ],
+ -5.019608497619629,
+ 6.023530006408691
+ ],
[
- -7.027451038360596,
- 7.968627452850342
+ -7.027451515197754,
+ 7.9686279296875
]
]
- ],
+ ],
[
[
[
- 4.015686511993408,
- -2.007843255996704
- ],
+ 4.01568603515625,
+ -2.007843494415283
+ ],
[
- 3.0117647647857666,
- -1.003921627998352
+ 3.0117645263671875,
+ -1.0039215087890625
]
- ],
+ ],
[
[
- -7.968627452850342,
- -6.023529529571533
- ],
+ -7.9686279296875,
+ -6.023530006408691
+ ],
[
- 7.027451038360596,
- 5.019608020782471
+ 7.027451515197754,
+ 5.019608497619629
]
]
]
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/bias.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/bias.json
index 462d0d3e3..50d44ece7 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/bias.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/bias.json
@@ -1,7 +1,7 @@
- {
- "scale": 0.0059054209919261825,
- "weights": [
- 169.0,
- 339.0
- ]
- }
+{
+ "weights": [
+ 4069,
+ 8138
+ ],
+ "scale": 0.0002457468386200985
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ifm.json
index 107117b80..24508860d 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ifm.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ifm.json
@@ -1,4 +1,4 @@
{
- "scale": 0.09411764705882353,
+ "scale": 0.003916590008884668,
"zero_point": 0.0
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ker.json
index 3a6e171a1..b249a0ce5 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ker.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ker.json
@@ -1,52 +1,52 @@
{
- "max": 7.968627450980392,
- "scale": 0.06274509803921569,
"weights": [
[
[
[
- 144,
- 160
- ],
+ 143,
+ 159
+ ],
[
- 80,
- 64
+ 79,
+ 63
]
- ],
+ ],
[
[
- 48,
- 224
- ],
+ 47,
+ 223
+ ],
[
- 16,
- 255
+ 15,
+ 254
]
]
- ],
+ ],
[
[
[
- 192,
- 96
- ],
+ 191,
+ 95
+ ],
[
- 176,
- 112
+ 175,
+ 111
]
- ],
+ ],
[
[
- 1,
- 32
- ],
+ 0,
+ 31
+ ],
[
- 240,
- 208
+ 239,
+ 207
]
]
]
- ],
- "min": -8.031372549019608,
- "zero_point": 128.0
+ ],
+ "scale": 0.062745101749897,
+ "zero_point": 127.0,
+ "min": -7.9686279296875,
+ "max": 8.031373023986816
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ofm.json
index 2374639b1..a2dd6681f 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ofm.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/quantization/ofm.json
@@ -1,4 +1,4 @@
{
- "scale": 0.17836222929113052,
+ "scale": 0.037479765713214874,
"zero_point": 0.0
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ifm.json
index 563c0424f..42f8b5617 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ifm.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ifm.json
@@ -1,4 +1,4 @@
{
- "max": 24.0,
- "min": 1.0
+ "min": 0.005472412034869194,
+ "max": 0.9987304735183716
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ofm.json
index fd0c6dc86..1862e8cb2 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ofm.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/Conv2D_004/layer/uint8/record_minmax/ofm.json
@@ -1,4 +1,4 @@
{
- "max": 45.48236846923828,
- "min": 0.0
+ "min": 0.0,
+ "max": 9.557340850830078
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/fake_quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/fake_quantization/ker.json
index 11e91ca42..cd3479781 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/fake_quantization/ker.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/fake_quantization/ker.json
@@ -3,29 +3,29 @@
[
[
[
- 0.9725490212440491,
- 1.9450980424880981,
- 3.0392158031463623,
+ 0.9725494384765625,
+ 1.945098876953125,
+ 3.039216995239258,
4.0117645263671875
- ],
+ ],
[
- -8.996078491210938,
- 9.968626976013184,
- -10.941176414489746,
- 12.035294532775879
+ -8.996077537536621,
+ 9.9686279296875,
+ -10.94117546081543,
+ 12.035295486450195
]
- ],
+ ],
[
[
- 4.984313488006592,
- 5.956862926483154,
- 7.050980567932129,
- 8.023529052734375
- ],
+ 4.98431396484375,
+ 5.9568634033203125,
+ 7.050981521606445,
+ 8.023530960083008
+ ],
[
- 13.007843017578125,
- -13.980392456054688,
- 14.952940940856934,
+ 13.007843017578125,
+ -13.980391502380371,
+ 14.95294189453125,
-16.04705810546875
]
]
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/bias.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/bias.json
index df7cb14c4..e60ff312e 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/bias.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/bias.json
@@ -1,9 +1,9 @@
{
- "scale": 0.007627835447904652,
"weights": [
- 131.0,
- 262.0,
- 393.0,
- 524.0
- ]
+ 2156,
+ 4312,
+ 6468,
+ 8624
+ ],
+ "scale": 0.0004638272181067826
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ifm.json
index 254ce899a..4ec4ef2d7 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ifm.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ifm.json
@@ -1,4 +1,4 @@
{
- "scale": 0.06274509803921569,
+ "scale": 0.0038153529167175293,
"zero_point": 0.0
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ker.json
index 3d14da173..01835fbde 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ker.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ker.json
@@ -1,38 +1,38 @@
{
- "max": 14.952941176470588,
- "scale": 0.12156862745098039,
"weights": [
[
[
[
- 140,
- 148,
- 157,
+ 140,
+ 148,
+ 157,
165
- ],
+ ],
[
- 58,
- 214,
- 42,
+ 58,
+ 214,
+ 42,
231
]
- ],
+ ],
[
[
- 173,
- 181,
- 190,
+ 173,
+ 181,
+ 190,
198
- ],
+ ],
[
- 239,
- 17,
- 255,
+ 239,
+ 17,
+ 255,
0
]
]
]
- ],
- "min": -16.04705882352941,
- "zero_point": 132.0
+ ],
+ "scale": 0.12156862765550613,
+ "zero_point": 132.0,
+ "min": -16.04705810546875,
+ "max": 14.952940940856934
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ofm.json
index 85dd4d9ae..39c64f3ef 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ofm.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/quantization/ofm.json
@@ -1,4 +1,4 @@
{
- "scale": 0.893733185412837,
+ "scale": 0.07362665981054306,
"zero_point": 0.0
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ifm.json
index 9aee7bcb0..bb4292efe 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ifm.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ifm.json
@@ -1,4 +1,4 @@
{
- "max": 16.0,
- "min": 1.0
+ "min": 0.02638142943382263,
+ "max": 0.9729149651527405
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ofm.json
index aa42a6614..1c118e1db 100644
--- a/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ofm.json
+++ b/compiler/pota-quantization-value-test/expected_outputs/DepthwiseConv2D_002/layer/uint8/record_minmax/ofm.json
@@ -1,4 +1,4 @@
{
- "max": 227.90196228027344,
- "min": 0.0
+ "min": 0.0,
+ "max": 18.77479721069336
}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/fake_quantization/weight.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/fake_quantization/weight.json
new file mode 100644
index 000000000..e1da53ab0
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/fake_quantization/weight.json
@@ -0,0 +1,76 @@
+{
+ "weights": [
+ [
+ 1.0039215087890625,
+ 2.007843017578125,
+ -3.0117650032043457,
+ -4.015686511993408,
+ -5.019608497619629,
+ 6.023530006408691,
+ -7.027451515197754,
+ 7.9686279296875,
+ 4.01568603515625,
+ -2.007843494415283,
+ 3.0117645263671875,
+ -1.0039215087890625,
+ -7.9686279296875,
+ -6.023530006408691,
+ 7.027451515197754,
+ 5.019608497619629
+ ],
+ [
+ 1.0039215087890625,
+ 2.007843017578125,
+ -3.0117650032043457,
+ -4.015686511993408,
+ -5.019608497619629,
+ 6.023530006408691,
+ -7.027451515197754,
+ 7.9686279296875,
+ 4.01568603515625,
+ -2.007843494415283,
+ 3.0117645263671875,
+ -1.0039215087890625,
+ -7.9686279296875,
+ -6.023530006408691,
+ 7.027451515197754,
+ 5.019608497619629
+ ],
+ [
+ 1.0039215087890625,
+ 2.007843017578125,
+ -3.0117650032043457,
+ -4.015686511993408,
+ -5.019608497619629,
+ 6.023530006408691,
+ -7.027451515197754,
+ 7.9686279296875,
+ 4.01568603515625,
+ -2.007843494415283,
+ 3.0117645263671875,
+ -1.0039215087890625,
+ -7.9686279296875,
+ -6.023530006408691,
+ 7.027451515197754,
+ 5.019608497619629
+ ],
+ [
+ 1.0039215087890625,
+ 2.007843017578125,
+ -3.0117650032043457,
+ -4.015686511993408,
+ -5.019608497619629,
+ 6.023530006408691,
+ -7.027451515197754,
+ 7.9686279296875,
+ 4.01568603515625,
+ -2.007843494415283,
+ 3.0117645263671875,
+ -1.0039215087890625,
+ -7.9686279296875,
+ -6.023530006408691,
+ 7.027451515197754,
+ 5.019608497619629
+ ]
+ ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/bias.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/bias.json
new file mode 100644
index 000000000..ecb49bb64
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/bias.json
@@ -0,0 +1,9 @@
+{
+ "weights": [
+ 415,
+ -829,
+ -1244,
+ 1658
+ ],
+ "scale": 0.00241205753304663
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/in.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/in.json
new file mode 100644
index 000000000..654824b5d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/in.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.03844216465950012,
+ "zero_point": 126.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/out.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/out.json
new file mode 100644
index 000000000..3baa42155
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/out.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.741962730884552,
+ "zero_point": 156.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/weight.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/weight.json
new file mode 100644
index 000000000..940224049
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/quantization/weight.json
@@ -0,0 +1,80 @@
+{
+ "weights": [
+ [
+ 143,
+ 159,
+ 79,
+ 63,
+ 47,
+ 223,
+ 15,
+ 254,
+ 191,
+ 95,
+ 175,
+ 111,
+ 0,
+ 31,
+ 239,
+ 207
+ ],
+ [
+ 143,
+ 159,
+ 79,
+ 63,
+ 47,
+ 223,
+ 15,
+ 254,
+ 191,
+ 95,
+ 175,
+ 111,
+ 0,
+ 31,
+ 239,
+ 207
+ ],
+ [
+ 143,
+ 159,
+ 79,
+ 63,
+ 47,
+ 223,
+ 15,
+ 254,
+ 191,
+ 95,
+ 175,
+ 111,
+ 0,
+ 31,
+ 239,
+ 207
+ ],
+ [
+ 143,
+ 159,
+ 79,
+ 63,
+ 47,
+ 223,
+ 15,
+ 254,
+ 191,
+ 95,
+ 175,
+ 111,
+ 0,
+ 31,
+ 239,
+ 207
+ ]
+ ],
+ "scale": 0.062745101749897,
+ "zero_point": 127.0,
+ "min": -7.9686279296875,
+ "max": 8.031373023986816
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/record_minmax/in.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/record_minmax/in.json
new file mode 100644
index 000000000..a8ec5b2b6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/record_minmax/in.json
@@ -0,0 +1,4 @@
+{
+ "min": -4.832756385803223,
+ "max": 4.969995346069336
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/record_minmax/out.json b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/record_minmax/out.json
new file mode 100644
index 000000000..de3b41564
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/FullyConnected_003/layer/uint8/record_minmax/out.json
@@ -0,0 +1,4 @@
+{
+ "min": -115.99438369750976,
+ "max": 73.20612327575684
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/fake_quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/fake_quantization/ker.json
new file mode 100644
index 000000000..76a0440a0
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/fake_quantization/ker.json
@@ -0,0 +1,48 @@
+{
+ "weights": [
+ [
+ [
+ [
+ 0.960784912109375,
+ 2.0588245391845703
+ ],
+ [
+ -3.0196075439453125,
+ -3.980391502380371
+ ],
+ [
+ 4.9411773681640625,
+ -6.039215087890625
+ ]
+ ],
+ [
+ [
+ 7.0,
+ 7.960784912109375
+ ],
+ [
+ -9.058823585510254,
+ -10.019607543945312
+ ],
+ [
+ 10.980392456054688,
+ -11.941176414489746
+ ]
+ ],
+ [
+ [
+ 13.039216995239258,
+ 14.000001907348633
+ ],
+ [
+ -14.960784912109375,
+ -16.05882453918457
+ ],
+ [
+ 17.019607543945312,
+ -17.980392456054688
+ ]
+ ]
+ ]
+ ]
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ifm.json
new file mode 100644
index 000000000..dc5ca8dd5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ifm.json
@@ -0,0 +1,4 @@
+{
+ "scale": 0.03869570419192314,
+ "zero_point": 126.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ker.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ker.json
new file mode 100644
index 000000000..bc150bbb0
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ker.json
@@ -0,0 +1,52 @@
+{
+ "weights": [
+ [
+ [
+ [
+ 138,
+ 146
+ ],
+ [
+ 109,
+ 102
+ ],
+ [
+ 167,
+ 87
+ ]
+ ],
+ [
+ [
+ 182,
+ 189
+ ],
+ [
+ 65,
+ 58
+ ],
+ [
+ 211,
+ 44
+ ]
+ ],
+ [
+ [
+ 226,
+ 233
+ ],
+ [
+ 22,
+ 14
+ ],
+ [
+ 255,
+ 0
+ ]
+ ]
+ ]
+ ],
+ "scale": 0.13725490868091583,
+ "zero_point": 131.0,
+ "min": -17.980392456054688,
+ "max": 17.019609451293945
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ofm.json
new file mode 100644
index 000000000..bfd862189
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/quantization/ofm.json
@@ -0,0 +1,4 @@
+{
+ "scale": 1.6333034038543701,
+ "zero_point": 127.0
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/record_minmax/ifm.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/record_minmax/ifm.json
new file mode 100644
index 000000000..2d2af08a6
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/record_minmax/ifm.json
@@ -0,0 +1,4 @@
+{
+ "min": -4.890846576690674,
+ "max": 4.976558513641357
+}
diff --git a/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/record_minmax/ofm.json b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/record_minmax/ofm.json
new file mode 100644
index 000000000..24598f06e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/expected_outputs/TransposeConv_001/layer/uint8/record_minmax/ofm.json
@@ -0,0 +1,4 @@
+{
+ "min": -207.54233032226563,
+ "max": 208.95002136230468
+}
diff --git a/compiler/pota-quantization-value-test/test.lst b/compiler/pota-quantization-value-test/test.lst
index 65613ff8f..9eb348922 100644
--- a/compiler/pota-quantization-value-test/test.lst
+++ b/compiler/pota-quantization-value-test/test.lst
@@ -1,2 +1,4 @@
addTest(Conv2D_004 layer uint8)
addTest(DepthwiseConv2D_002 layer uint8)
+addTest(FullyConnected_003 layer uint8)
+addTest(TransposeConv_001 layer uint8)
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/0.txt
index 8803cb178..0614b5e83 100644
--- a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/0.txt
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/0.txt
@@ -1 +1 @@
-1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24
+0.01090685,0.0581577 ,0.637094 ,0.64067715,0.26264507,0.13692169,0.9649414 ,0.5117181 ,0.18012471,0.07855253,0.6358017 ,0.62257963,0.41469443,0.93169045,0.20763828,0.7634293 ,0.75929826,0.72708374,0.23463063,0.58222896,0.6351517 ,0.68781173,0.5558012 ,0.7652179
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/1.txt
new file mode 100644
index 000000000..b1c39382f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/1.txt
@@ -0,0 +1 @@
+0.57017624,0.08235867,0.03672464,0.40372616,0.7353964 ,0.59611887,0.7675548 ,0.21004233,0.09803218,0.20009473,0.8821493 ,0.17015271,0.14840214,0.99910176,0.37003204,0.22893582,0.43173164,0.3105084 ,0.41997132,0.43714985,0.08115962,0.71896386,0.7810953 ,0.00524598
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/2.txt
new file mode 100644
index 000000000..7e562de75
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/2.txt
@@ -0,0 +1 @@
+0.65292275,0.79842275,0.97853714,0.6711518 ,0.607567 ,0.40971732,0.74838483,0.95853555,0.32158023,0.911524 ,0.66938365,0.8573132 ,0.3047727 ,0.5561248 ,0.914098 ,0.07650814,0.37868017,0.29269257,0.19652605,0.63025194,0.61496884,0.32011527,0.8204132 ,0.21866946
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/3.txt
new file mode 100644
index 000000000..2958a7f54
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/3.txt
@@ -0,0 +1 @@
+0.4548901 ,0.56957537,0.0252368 ,0.4884317 ,0.7516498 ,0.02631272,0.22107519,0.95249426,0.34902394,0.11520014,0.808911 ,0.4148615 ,0.63615656,0.84020686,0.3633697 ,0.23993976,0.54176176,0.86938345,0.81628686,0.6380988 ,0.91891205,0.0406627 ,0.90289026,0.9429013
diff --git a/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/4.txt
new file mode 100644
index 000000000..fc969308e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/Conv2D_004/layer/uint8/4.txt
@@ -0,0 +1 @@
+0.9309136 ,0.02123719,0.64467335,0.6910113 ,0.47402772,0.54622203,0.31527275,0.81530565,0.98981965,0.36102158,0.03114039,0.1902339 ,0.45183742,0.60178596,0.4683102 ,0.59810966,0.40558222,0.5420302 ,0.72699505,0.9575108 ,0.46746576,0.08518691,0.40302262,0.69213694
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/0.txt
index c210774d2..44f0ff107 100644
--- a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/0.txt
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/0.txt
@@ -1 +1 @@
-1, 2, 7, 8, 3, 4, 9, 10, 5, 6, 11, 12, 13, 14, 15, 16
+0.31365377,0.6127105 ,0.7047126 ,0.2511918 ,0.16652136,0.36075932,0.44332707,0.77615815,0.60456425,0.26207635,0.28714025,0.11579613,0.89698446,0.67223394,0.3757766 ,0.11787009
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/1.txt
new file mode 100644
index 000000000..98e81041f
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/1.txt
@@ -0,0 +1 @@
+0.9409595 ,0.3991174 ,0.43546647,0.221152 ,0.7794665 ,0.8619514 ,0.5903087 ,0.24476172,0.5932698 ,0.2727837 ,0.3980262 ,0.13329633,0.4319272 ,0.37872055,0.1721639 ,0.92437047
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/2.txt
new file mode 100644
index 000000000..e9867529b
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/2.txt
@@ -0,0 +1 @@
+0.6484028 ,0.09222967,0.76285905,0.02265582,0.2564394 ,0.11219095,0.22529566,0.09101159,0.15937322,0.3540595 ,0.25971088,0.4681136 ,0.4279646 ,0.5386553 ,0.11397707,0.7413688
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/3.txt
new file mode 100644
index 000000000..9b36fb520
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/3.txt
@@ -0,0 +1 @@
+0.9182678 ,0.8253187 ,0.6572848 ,0.46436486,0.45208713,0.42112917,0.24383743,0.16039051,0.24649048,0.63431305,0.31141657,0.25664324,0.721266 ,0.18996912,0.35422477,0.8826148
diff --git a/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/4.txt
new file mode 100644
index 000000000..6b8957dcc
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/DepthwiseConv2D_002/layer/uint8/4.txt
@@ -0,0 +1 @@
+0.97424644,0.9360494 ,0.6849295 ,0.21313633,0.23943195,0.32497332,0.5091704 ,0.67543274,0.49667478,0.73460567,0.5866559 ,0.5312464 ,0.8252662 ,0.36093768,0.7143621 ,0.7234413
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/0.txt
new file mode 100644
index 000000000..233e5eae3
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/0.txt
@@ -0,0 +1 @@
+ 2.7731526 , 2.451602 , 3.7535272 ,-1.2774152 , 1.5482912 , 1.3402948 , 4.4792123 ,-4.4954367 , 3.354679 ,-3.3615496 ,-4.619757 ,-3.3659618 , 4.7626247 ,-1.3596478 ,-4.835548 , 0.78964525
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/1.txt
new file mode 100644
index 000000000..6a126081d
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/1.txt
@@ -0,0 +1 @@
+ 0.5400839 ,-3.2621996 ,-3.4817135 , 3.8183312 , 0.48498327, 2.9812584 , 4.111276 , 0.11223658, 4.7201405 , 2.4256718 , 1.4895477 , 4.7596602 ,-0.32709372, 1.3507305 ,-0.30043927,-1.8077502
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/2.txt
new file mode 100644
index 000000000..eccd2c625
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/2.txt
@@ -0,0 +1 @@
+ 3.8758078 , 4.978636 ,-0.22925885,-2.6760504 ,-1.9160627 ,-4.609644 ,-0.9515802 , 3.558274 , 2.9096057 , 0.3340422 , 0.38608226,-0.32168412, 4.688853 ,-4.583811 ,-2.5113506 ,-4.6688786
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/3.txt
new file mode 100644
index 000000000..0da05277c
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/3.txt
@@ -0,0 +1 @@
+-2.9868221 , 2.4237797 , 1.0833962 ,-0.9231426 ,-2.1091506 ,-2.6163697 ,-0.23101932,-1.9252896 , 4.7034135 , 3.1088963 ,-2.345823 ,-2.7866168 ,-3.186763 ,-4.431844 , 3.3113294 , 0.9501982
diff --git a/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/4.txt
new file mode 100644
index 000000000..ace24f7c1
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/FullyConnected_003/layer/uint8/4.txt
@@ -0,0 +1 @@
+ 3.9716747 ,-2.254871 , 1.1943274 ,-2.212602 , 3.4311683 , 1.114989 , 4.0739036 , 0.47244295,-3.5793104 ,-3.359908 ,-4.7657595 , 2.0369127 ,-2.5619278 ,-3.4452975 ,-4.5852203 ,-1.137643
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/0.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/0.txt
new file mode 100644
index 000000000..e9db48f9e
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/0.txt
@@ -0,0 +1 @@
+-1.4124781 , 0.42694193, 1.1734594 ,-3.5111153 ,-2.9756174 , 1.3682148 ,-2.318465 , 2.198896 ,-4.5043235 , 3.1775594 ,-0.42802384,-1.4872279 , 1.3821319 ,-4.771963 ,-0.12837897, 4.132799 , 3.697655 , 2.0807178 ,-3.621293 , 2.121878 ,-0.25654107, 0.42100102,-1.4009671 ,-2.9733627 ,-0.7058871 ,-2.831215 , 3.5669627 , 2.1420689 ,-1.8789555 , 0.8104939 ,-2.0503597 , 1.7788508
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/1.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/1.txt
new file mode 100644
index 000000000..479d062f1
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/1.txt
@@ -0,0 +1 @@
+ 3.4726453 , 3.0497985 ,-4.234619 ,-1.0526706 , 1.7278554 ,-3.341614 , 4.54768 , 3.0954597 ,-3.735109 , 2.8810751 ,-2.5381427 ,-3.2360535 ,-1.5378917 , 2.3052745 ,-3.170938 ,-3.327242 , 2.0654576 ,-2.2294598 ,-1.881382 , 0.13216451,-4.2825613 , 0.26616526, 4.6196365 ,-0.88623226, 1.7103885 ,-1.5865034 ,-3.9114466 ,-3.2227128 , 4.909618 , 2.3318915 , 0.84300846, 0.760918
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/2.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/2.txt
new file mode 100644
index 000000000..ae28234bd
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/2.txt
@@ -0,0 +1 @@
+-4.6097918,-4.21991 ,-3.9955974, 3.6492047, 2.9191775, 2.8082933, 1.6189331, 0.2730309,-1.5029653,-1.9471445, 4.8758197, 3.3177438, 3.1338058,-2.1281245,-1.7526287,-2.5518703,-1.7746793, 4.0455256,-0.5839861,-4.408046 ,-4.0034447, 1.5858272,-4.5896654, 4.7211285,-4.677515 ,-2.6027086,-4.7896166,-3.5512326,-1.9068764,-2.9705904,-4.854087 ,-4.892111
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/3.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/3.txt
new file mode 100644
index 000000000..fd40f84f4
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/3.txt
@@ -0,0 +1 @@
+ 2.1514777e-02, 2.6526773e+00,-3.0477784e+00, 1.3287724e+00,-4.1414630e-01,-1.7295350e-01, 7.6649576e-01,-1.8028022e+00,-7.0781744e-01,-2.5262204e-01,-3.0970418e+00,-1.3165286e+00,-4.6649928e+00, 2.0809033e+00,-1.5739973e+00,-4.0531826e-01,-2.1718202e+00, 2.0146034e+00, 2.5044403e+00,-1.1256610e+00, 1.3536702e+00, 1.0283234e-03,-1.8823910e+00, 4.7122188e+00, 9.4781297e-01, 3.2012525e+00,-5.5164534e-01,-2.6158772e+00,-1.8771547e+00,-3.1689723e+00, 4.9054880e+00,-3.4560370e+00
diff --git a/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/4.txt b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/4.txt
new file mode 100644
index 000000000..e81c3b8e5
--- /dev/null
+++ b/compiler/pota-quantization-value-test/test_inputs/TransposeConv_001/layer/uint8/4.txt
@@ -0,0 +1 @@
+-2.0927553 ,-2.107511 ,-1.6963564 , 1.7006218 , 1.4575784 , 0.06095728, 1.2659966 , 4.1905265 , 1.3035946 , 4.9793477 ,-4.3388166 ,-0.23496658, 1.9831208 , 2.6154642 ,-0.2790228 ,-3.1774354 ,-3.178935 ,-1.1564373 ,-0.8199472 ,-2.245698 ,-4.8605046 ,-3.569018 ,-1.4226891 ,-4.1067843 , 2.6078918 ,-3.5830674 , 1.9065963 , 2.435578 ,-3.3216476 , 4.5930347 , 2.9191844 , 1.7885648
diff --git a/compiler/pota-quantization-value-test/test_record_minmax.sh b/compiler/pota-quantization-value-test/test_record_minmax.sh
index eaa462d0c..acb7574c0 100755
--- a/compiler/pota-quantization-value-test/test_record_minmax.sh
+++ b/compiler/pota-quantization-value-test/test_record_minmax.sh
@@ -59,9 +59,9 @@ while [ "$1" != "" ]; do
# Run record-minmax
"${RECORD_MINMAX_PATH}" \
- "${TEST_RESULT_FILE}.fake_quantized.circle" \
- "${TEST_RESULT_FILE}.input.h5" \
- "${TEST_RESULT_FILE}.minmax_recorded.circle"
+ --input_model "${TEST_RESULT_FILE}.fake_quantized.circle" \
+ --input_data "${TESTCASE_FILE}.input.h5" \
+ --output_model "${TEST_RESULT_FILE}.minmax_recorded.circle"
# Dump min/max values (circle-tensordump)
"${CIRCLE_TENSORDUMP_PATH}" \
diff --git a/compiler/record-minmax/CMakeLists.txt b/compiler/record-minmax/CMakeLists.txt
index 862660e06..f8a165bd3 100644
--- a/compiler/record-minmax/CMakeLists.txt
+++ b/compiler/record-minmax/CMakeLists.txt
@@ -19,9 +19,14 @@ target_link_libraries(record-minmax safemain)
target_link_libraries(record-minmax luci_import)
target_link_libraries(record-minmax luci_export)
target_link_libraries(record-minmax luci_interpreter)
+target_link_libraries(record-minmax vconone)
install(TARGETS record-minmax DESTINATION bin)
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
nnas_find_package(GTest REQUIRED)
GTest_AddTest(record_minmax_function_test "${CMAKE_CURRENT_SOURCE_DIR}/tests/RecordFunction.test.cpp")
target_include_directories(record_minmax_function_test PRIVATE include)
diff --git a/compiler/record-minmax/driver/Driver.cpp b/compiler/record-minmax/driver/Driver.cpp
index ae4fcb7c7..8b09498c3 100644
--- a/compiler/record-minmax/driver/Driver.cpp
+++ b/compiler/record-minmax/driver/Driver.cpp
@@ -17,6 +17,13 @@
#include "RecordMinMax.h"
#include <arser/arser.h>
+#include <vconone/vconone.h>
+
+void print_version(void)
+{
+ std::cout << "record-minmax version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
int entry(const int argc, char **argv)
{
@@ -25,6 +32,13 @@ int entry(const int argc, char **argv)
arser::Arser arser(
"Embedding min/max values of activations to the circle model for post-training quantization");
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
arser.add_argument("--input_model")
.nargs(1)
.type(arser::DataType::STR)
@@ -66,7 +80,7 @@ int entry(const int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
auto input_model_path = arser.get<std::string>("--input_model");
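
The two hunks above standardize a driver pattern that recurs throughout this commit (tfl-verify, tflchef, tfldump, tflite2circle): a `--version` flag wired to a callback through `exit_with`, plus a distinct exit code of 255 when argument parsing fails, so the test scripts can tell a usage error from a successful run. A condensed sketch of the pattern, assuming arser's fluent builder behaves as in the hunks above (the tool name and version string here are illustrative):
```
#include <arser/arser.h>

#include <iostream>
#include <stdexcept>
#include <string>

// Illustrative version callback; the real drivers print a vconone string.
void print_version(void)
{
  std::cout << "demo-driver version 0.0.0" << std::endl;
}

int entry(const int argc, char **argv)
{
  arser::Arser arser("demo driver showing the --version / exit-code pattern");

  // --version consumes no operand and exits through the registered callback.
  arser.add_argument("--version")
      .nargs(0)
      .required(false)
      .default_value(false)
      .help("Show version information and exit")
      .exit_with(print_version);

  arser.add_argument("--input_model").nargs(1).type(arser::DataType::STR);

  try
  {
    arser.parse(argc, argv);
  }
  catch (const std::runtime_error &err)
  {
    std::cout << err.what() << std::endl;
    std::cout << arser;
    return 255; // non-zero so callers can distinguish bad usage from success
  }

  auto input_model_path = arser.get<std::string>("--input_model");
  std::cout << "model: " << input_model_path << std::endl;
  return 0;
}
```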
diff --git a/compiler/record-minmax/requires.cmake b/compiler/record-minmax/requires.cmake
index 054503539..f6804cef1 100644
--- a/compiler/record-minmax/requires.cmake
+++ b/compiler/record-minmax/requires.cmake
@@ -1,3 +1,4 @@
require("luci")
require("safemain")
require("arser")
+require("vconone")
diff --git a/compiler/record-minmax/src/HDF5Importer.cpp b/compiler/record-minmax/src/HDF5Importer.cpp
index cf30cd863..a0e65eeb7 100644
--- a/compiler/record-minmax/src/HDF5Importer.cpp
+++ b/compiler/record-minmax/src/HDF5Importer.cpp
@@ -20,6 +20,7 @@
#include <string>
#include <cassert>
+#include <stdexcept>
using Shape = luci_interpreter::Shape;
using DataType = luci_interpreter::DataType;
diff --git a/compiler/record-minmax/src/MinMaxObserver.cpp b/compiler/record-minmax/src/MinMaxObserver.cpp
index 45f0197c8..c22cb4132 100644
--- a/compiler/record-minmax/src/MinMaxObserver.cpp
+++ b/compiler/record-minmax/src/MinMaxObserver.cpp
@@ -38,7 +38,7 @@ void MinMaxObserver::postTensorWrite(const luci::CircleNode *node,
assert(node->opcode() != luci::CircleOpcode::UNPACK);
assert(node->opcode() != luci::CircleOpcode::WHILE);
- if (node->opcode() == luci::CircleOpcode::CONST)
+ if (node->opcode() == luci::CircleOpcode::CIRCLECONST)
{
// node is not activation. Do nothing.
return;
diff --git a/compiler/record-minmax/src/RecordMinMax.cpp b/compiler/record-minmax/src/RecordMinMax.cpp
index d12a0d3ae..17c6aa6ff 100644
--- a/compiler/record-minmax/src/RecordMinMax.cpp
+++ b/compiler/record-minmax/src/RecordMinMax.cpp
@@ -158,7 +158,7 @@ void RecordMinMax::profileData(const std::string &mode, const std::string &input
auto node = iter->first;
auto minmax = iter->second;
- float min, max;
+ float min{0.0f}, max{0.0f};
if (mode == "percentile")
{
min = getNthPercentile(minmax.min_vector, min_percentile);
diff --git a/compiler/record-minmax/tests/RecordFunction.test.cpp b/compiler/record-minmax/tests/RecordFunction.test.cpp
index 13b464db9..e2f135a4e 100644
--- a/compiler/record-minmax/tests/RecordFunction.test.cpp
+++ b/compiler/record-minmax/tests/RecordFunction.test.cpp
@@ -32,6 +32,8 @@ TEST(GetNthPercentileTest, Edge)
EXPECT_FLOAT_NEAR(0, getNthPercentile(input, 0));
EXPECT_FLOAT_NEAR(9, getNthPercentile(input, 100));
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, Simple)
@@ -47,6 +49,8 @@ TEST(GetNthPercentileTest, Simple)
{
EXPECT_FLOAT_NEAR(0.09 * std::floor(i) + 0.045, getNthPercentile(input, i));
}
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, Float)
@@ -61,6 +65,8 @@ TEST(GetNthPercentileTest, Float)
EXPECT_FLOAT_NEAR(2.799942346802177, getNthPercentile(input, 1));
EXPECT_FLOAT_NEAR(7.768503955476342, getNthPercentile(input, 3.14));
EXPECT_FLOAT_NEAR(99.40456084968194, getNthPercentile(input, 99));
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, FloatWithNegative)
@@ -75,6 +81,8 @@ TEST(GetNthPercentileTest, FloatWithNegative)
EXPECT_FLOAT_NEAR(-47.20005765319782, getNthPercentile(input, 1));
EXPECT_FLOAT_NEAR(-42.23149604452366, getNthPercentile(input, 3.14));
EXPECT_FLOAT_NEAR(49.40456084968194, getNthPercentile(input, 99));
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, SingleElement)
@@ -84,6 +92,8 @@ TEST(GetNthPercentileTest, SigleElement)
EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 0));
EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 50));
EXPECT_FLOAT_NEAR(33, getNthPercentile(input, 100));
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, OutOfBoundary_NEG)
@@ -92,6 +102,8 @@ TEST(GetNthPercentileTest, OutOfBoundary_NEG)
EXPECT_THROW(getNthPercentile(input, -1), std::runtime_error);
EXPECT_THROW(getNthPercentile(input, 101), std::runtime_error);
+
+ SUCCEED();
}
TEST(GetNthPercentileTest, EmptyVector_NEG)
@@ -99,6 +111,8 @@ TEST(GetNthPercentileTest, EmptyVector_NEG)
std::vector<float> input;
EXPECT_THROW(getNthPercentile(input, 10), std::runtime_error);
+
+ SUCCEED();
}
} // namespace record_minmax
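
The expectations above pin down `getNthPercentile`: percentile 0 returns the minimum, 100 the maximum, a single-element vector returns its element, and out-of-range percentiles or an empty vector throw `std::runtime_error`. A minimal sketch consistent with those edge cases, assuming plain linear interpolation between sorted samples; the `floor(i)` term in the Simple case suggests the shipped implementation snaps ranks slightly differently, so treat this as an approximation rather than the project's exact code:
```
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <stdexcept>
#include <vector>

// Sketch: nth percentile by linear interpolation over sorted samples.
float getNthPercentile(std::vector<float> &vector, float percentile)
{
  if (percentile < 0 || percentile > 100)
    throw std::runtime_error{"percentile must be in [0, 100]"};
  if (vector.empty())
    throw std::runtime_error{"input vector is empty"};

  std::vector<float> copy(vector);
  std::sort(copy.begin(), copy.end());

  if (copy.size() == 1)
    return copy[0];

  // Fractional rank into the sorted samples, then interpolate.
  const double rank = percentile / 100.0 * (copy.size() - 1);
  const std::size_t lo = static_cast<std::size_t>(std::floor(rank));
  const double frac = rank - lo;
  if (frac == 0.0)
    return copy[lo];
  return static_cast<float>(copy[lo] * (1.0 - frac) + copy[lo + 1] * frac);
}
```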
diff --git a/compiler/tf2circle-value-pbtxt-remote-test/CMakeLists.txt b/compiler/tf2circle-value-pbtxt-remote-test/CMakeLists.txt
index 64dcc28fd..852018e64 100644
--- a/compiler/tf2circle-value-pbtxt-remote-test/CMakeLists.txt
+++ b/compiler/tf2circle-value-pbtxt-remote-test/CMakeLists.txt
@@ -141,7 +141,6 @@ add_custom_command(
COMMAND ${CMAKE_COMMAND} -E echo 'HDF5_EXPORT_ACTION_PATH=\"$<TARGET_FILE:nnkit_HDF5_export_action>\"' >> ${TEST_CONFIG}
COMMAND ${CMAKE_COMMAND} -E echo 'HDF5_IMPORT_ACTION_PATH=\"$<TARGET_FILE:nnkit_HDF5_import_action>\"' >> ${TEST_CONFIG}
COMMAND ${CMAKE_COMMAND} -E echo 'MODEL2NNPKG_PATH=\"${NNAS_PROJECT_SOURCE_DIR}/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh\"' >> ${TEST_CONFIG}
- COMMAND ${CMAKE_COMMAND} -E echo 'NNPKG_TEST_PATH=\"${NNAS_PROJECT_SOURCE_DIR}/tests/scripts/nnpkg_test.sh\"' >> ${TEST_CONFIG}
COMMAND ${CMAKE_COMMAND} -E echo 'RUNTIME_LIBRARY_PATH=\"${NNAS_PROJECT_SOURCE_DIR}/Product/out/\"' >> ${TEST_CONFIG}
DEPENDS
nnkit-run
diff --git a/compiler/tf2circle-value-pbtxt-remote-test/README.md b/compiler/tf2circle-value-pbtxt-remote-test/README.md
index 5546cc879..0d41b0a48 100644
--- a/compiler/tf2circle-value-pbtxt-remote-test/README.md
+++ b/compiler/tf2circle-value-pbtxt-remote-test/README.md
@@ -36,13 +36,13 @@
#--------------- Remote Machine Setting ---------------#
set(REMOTE_IP "xxx.xxx.xxx.xxx")
set(REMOTE_USER "remote_username")
-
+
#--------------------- Tests list ---------------------#
add(UNIT_Add_000)
add(UNIT_Add_001)
...
```
- - If any Tensorflow model is added, or if `REMOTE_IP` and `REMOTE_USER` is not given, `tf2circle-value-pbtxt-remote-test` will not be created.
+ - If no TensorFlow model is added, or if `REMOTE_IP` and `REMOTE_USER` are not given, `tf2circle-value-pbtxt-remote-test` will not be created.
1. (Optional) ssh authentication
- This test uses `ssh` and `scp` commands, and those commands ask for the remote machine's password whenever they are called. This means you would have to enter the password every time `ssh` or `scp` runs.
- This test avoids that by using `ssh-copy-id`, which copies the host machine's public key into the remote machine's `authorized_keys`. As a result, the test asks for the remote machine's password only once, the first time. This is the only user interaction while running this test.
@@ -71,7 +71,7 @@
├ Result_latest -> Result_YYMMDD_hhmmss.csv
├ Result_YYMMDD_hhmmss.csv
├ ...
- |
+ |
├ UNIT_Add_000
| ├ metadata
| | ├ MANIFEST
@@ -91,16 +91,16 @@
|
├ ...
```
-- `nnpkg_test.sh`, runtime products and each nnpackage are sent to `REMOTE_WORKDIR` in remote machine.
+- Runtime products and each nnpackage are sent to `REMOTE_WORKDIR` in remote machine.
- (TBD) Modify the script so it does not remove the obtained h5 file.
```
REMOTE_WORKDIR
- ├ nnpkg_test.sh
|
├ Product
| └ out
| ├ bin
| ├ lib
+ | ├ test
| ├ ...
|
├ UNIT_Add_000
diff --git a/compiler/tf2circle-value-pbtxt-remote-test/testall.sh b/compiler/tf2circle-value-pbtxt-remote-test/testall.sh
index ca6fb49c8..c80b00a14 100755
--- a/compiler/tf2circle-value-pbtxt-remote-test/testall.sh
+++ b/compiler/tf2circle-value-pbtxt-remote-test/testall.sh
@@ -30,7 +30,6 @@ echo "-- Found nnkit-run: ${NNKIT_RUN_PATH}"
echo "-- Found TF backend: ${TF_BACKEND_PATH}"
echo "-- Found TF2CIRCLE: ${TF2CIRCLE_PATH}"
echo "-- Found MODEL2NNPKG: ${MODEL2NNPKG_PATH}"
-echo "-- Found nnpkg_test: ${NNPKG_TEST_PATH}"
echo "-- Found Runtime library: ${RUNTIME_LIBRARY_PATH}"
echo "-- Found randomize action: ${RANDOMIZE_ACTION_PATH}"
echo "-- Found HDF5 export action: ${HDF5_EXPORT_ACTION_PATH}"
@@ -42,11 +41,6 @@ if [ -z ${MODEL2NNPKG_PATH} ] || [ ! -f ${MODEL2NNPKG_PATH} ]; then
exit 3
fi
-if [ -z ${NNPKG_TEST_PATH} ] || [ ! -f ${NNPKG_TEST_PATH} ]; then
- echo "nnpkg_test is not found"
- exit 4
-fi
-
# Register remote machine ssh information
cat /dev/zero | ssh-keygen -q -N ""
ssh-copy-id -o ConnectTimeout=5 "${REMOTE_USER}@${REMOTE_IP}"
@@ -61,9 +55,6 @@ fi
ssh "${REMOTE_USER}@${REMOTE_IP}" "mkdir -p ${REMOTE_WORKDIR}/Product/"
scp -r "${RUNTIME_LIBRARY_PATH}" "${REMOTE_USER}@${REMOTE_IP}:${REMOTE_WORKDIR}/Product/"
-# Send nnpkg_test.sh
-scp "${NNPKG_TEST_PATH}" "${REMOTE_USER}@${REMOTE_IP}:${REMOTE_WORKDIR}/"
-
TESTED=()
PASSED=()
FAILED=()
@@ -120,8 +111,8 @@ while [[ $# -ne 0 ]]; do
# Run test_arm_nnpkg in remote machine
scp -r "${WORKDIR}/${PREFIX}/" "${REMOTE_USER}@${REMOTE_IP}:${REMOTE_WORKDIR}/${PREFIX}/"
- ssh "${REMOTE_USER}@${REMOTE_IP}" "cd ${REMOTE_WORKDIR}; ./nnpkg_test.sh -i . -o ${PREFIX}/metadata/tc ${PREFIX}"
-
+ ssh "${REMOTE_USER}@${REMOTE_IP}" "cd ${REMOTE_WORKDIR}; ./Product/out/test/onert-test nnpkg-test -i . -o ${PREFIX}/metadata/tc ${PREFIX}"
+
if [[ $? -eq 0 ]]; then
touch "${PASSED_TAG}"
fi
diff --git a/compiler/tf2nnpackage-value-remote-test/CMakeLists.txt b/compiler/tf2nnpackage-value-remote-test/CMakeLists.txt
index 4a59e8849..255806ce8 100644
--- a/compiler/tf2nnpackage-value-remote-test/CMakeLists.txt
+++ b/compiler/tf2nnpackage-value-remote-test/CMakeLists.txt
@@ -33,12 +33,12 @@ endforeach()
get_target_property(ARTIFACTS_SRC_PATH testDataGenerator SOURCE_DIR)
-# In this test, only the runtime test is performed because the test from tf to
-# nnpackage is done in common-artifacts, and for this runtime test, generation of
+# In this test, only the runtime test is performed because the test from tf to
+# nnpackage is done in common-artifacts, and for this runtime test, generation of
# test data is required. And, tcgenerate in ${ARTIFACTS_SRC_PATH}/exclude.lst
# means it won't generate test data, which is why the "tcgenerate" macro below excludes
-# specific opearators from runtime test.
-# Also, since circlize and optimize macro included in `exclude.lst` file is only
+# specific operators from the runtime test.
+# Also, since the circlize and optimize macros included in the `exclude.lst` file are only
# needed in common-artifacts, they have no function here.
macro(circlize)
endmacro()
@@ -72,7 +72,6 @@ set(TEST_CONFIG "${CMAKE_CURRENT_BINARY_DIR}/test.config")
add_custom_command(
OUTPUT ${TEST_CONFIG}
COMMAND ${CMAKE_COMMAND} -E remove -f ${TEST_CONFIG}
- COMMAND ${CMAKE_COMMAND} -E echo 'NNPKG_TEST_PATH=\"${NNAS_PROJECT_SOURCE_DIR}/tests/scripts/nnpkg_test.sh\"' >> ${TEST_CONFIG}
COMMAND ${CMAKE_COMMAND} -E echo 'RUNTIME_LIBRARY_PATH=\"${NNAS_PROJECT_SOURCE_DIR}/Product/out/\"' >> ${TEST_CONFIG}
COMMENT "Generate test configuration"
)
diff --git a/compiler/tf2nnpackage-value-remote-test/README.md b/compiler/tf2nnpackage-value-remote-test/README.md
index 36436fc6b..65f307b13 100644
--- a/compiler/tf2nnpackage-value-remote-test/README.md
+++ b/compiler/tf2nnpackage-value-remote-test/README.md
@@ -15,7 +15,7 @@
set(REMOTE_IP "xxx.xxx.xxx.xxx")
set(REMOTE_USER "remote_username")
```
- - If any recipe is added, or if `REMOTE_IP` and `REMOTE_USER` is not given, `tf2nnpackage-value-remote-test` will not be created.
+ - If no recipe is added, or if `REMOTE_IP` and `REMOTE_USER` are not given, `tf2nnpackage-value-remote-test` will not be created.
1. (Optional) ssh authentication
- This test uses `ssh` and `scp` commands, and those commands require a password of remote machine whenever they are called. This means that you should enter the password everytime when `ssh` and `scp` require.
- This test resolves the problem by using `ssh-copy-id`, which copies the public key of host machine to `authorized_keys` of remote machine. Because of that, this test will ask the password of remote machine only once, at the first time. This is the only user interaction while running this test.
@@ -39,7 +39,7 @@
### Generated Files While Running
- All related files (`pb`, `circle`, `h5`, etc.) are taken from the `build/compiler/common-artifacts` folder.
-- `nnpkg_test.sh`, runtime products and each nnpackage are sent to `REMOTE_WORKDIR` in remote machine.
+- Runtime products and each nnpackage are sent to `REMOTE_WORKDIR` in remote machine.
- Each test result is generated in `build/compiler/common-artifacts` with the name `${RECIPE}.log`
### Check Test Result
diff --git a/compiler/tf2nnpackage-value-remote-test/testall.sh b/compiler/tf2nnpackage-value-remote-test/testall.sh
index f1c9789b3..ca672a3eb 100755
--- a/compiler/tf2nnpackage-value-remote-test/testall.sh
+++ b/compiler/tf2nnpackage-value-remote-test/testall.sh
@@ -27,15 +27,9 @@ RESULT_CSV="${BINDIR}/Result_${CURRENT_DATETIME}.csv"
source "${CONFIG_PATH}"
-echo "-- Found nnpkg_test: ${NNPKG_TEST_PATH}"
echo "-- Found Runtime library: ${RUNTIME_LIBRARY_PATH}"
echo "-- Found workdir: ${WORKDIR}"
-if [ -z ${NNPKG_TEST_PATH} ] || [ ! -f ${NNPKG_TEST_PATH} ]; then
- echo "nnpkg_test is not found"
- exit 4
-fi
-
# Register remote machine ssh information
cat /dev/zero | ssh-keygen -q -N ""
ssh-copy-id -o ConnectTimeout=5 "${REMOTE_USER}@${REMOTE_IP}"
@@ -50,9 +44,6 @@ fi
ssh "${REMOTE_USER}@${REMOTE_IP}" "mkdir -p ${REMOTE_WORKDIR}/Product/"
scp -r "${RUNTIME_LIBRARY_PATH}" "${REMOTE_USER}@${REMOTE_IP}:${REMOTE_WORKDIR}/Product/"
-# Send nnpkg_test.sh
-scp "${NNPKG_TEST_PATH}" "${REMOTE_USER}@${REMOTE_IP}:${REMOTE_WORKDIR}/"
-
TESTED=()
PASSED=()
FAILED=()
@@ -84,8 +75,8 @@ while [[ $# -ne 0 ]]; do
PREFIX=${PREFIX}.opt ;
fi
scp -r "${PREFIX}/" "${REMOTE_USER}@${REMOTE_IP}:${REMOTE_WORKDIR}/${PREFIX}/"
- ssh "${REMOTE_USER}@${REMOTE_IP}" "cd ${REMOTE_WORKDIR}; ./nnpkg_test.sh ${PREFIX}"
-
+ ssh "${REMOTE_USER}@${REMOTE_IP}" "cd ${REMOTE_WORKDIR}; ./Product/out/test/onert-test nnpkg-test ${PREFIX}"
+
if [[ $? -eq 0 ]]; then
touch "${BINDIR}/${PASSED_TAG}"
fi
diff --git a/compiler/tf2tfliteV2/README.md b/compiler/tf2tfliteV2/README.md
index 13359aab1..0a90735cb 100644
--- a/compiler/tf2tfliteV2/README.md
+++ b/compiler/tf2tfliteV2/README.md
@@ -47,6 +47,9 @@ python tf2tfliteV2.py \
-h, --help show this help message and exit
--v1 Use TensorFlow Lite Converter 1.x
--v2 Use TensorFlow Lite Converter 2.x
+  --graph_def           Use graph def file (default)
+  --saved_model         Use saved model
+  --keras_model         Use Keras model
-i INPUT_PATH, --input_path INPUT_PATH
Full filepath of the input file.
-o OUTPUT_PATH, --output_path OUTPUT_PATH
@@ -55,7 +58,8 @@ python tf2tfliteV2.py \
Names of the input arrays, comma-separated.
-s INPUT_SHAPES, --input_shapes INPUT_SHAPES
Shapes corresponding to --input_arrays, colon-
- separated.
+                        separated. (e.g. "1,4,4,3:1,20,20,3")
-O OUTPUT_ARRAYS, --output_arrays OUTPUT_ARRAYS
Names of the output arrays, comma-separated.
+
```
diff --git a/compiler/tf2tfliteV2/tf2tfliteV2.py b/compiler/tf2tfliteV2/tf2tfliteV2.py
index 82d6ee232..c51dabde0 100755
--- a/compiler/tf2tfliteV2/tf2tfliteV2.py
+++ b/compiler/tf2tfliteV2/tf2tfliteV2.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
# Copyright (C) 2018 The TensorFlow Authors
@@ -48,6 +48,27 @@ def _get_parser():
converter_version.add_argument(
"--v2", action="store_true", help="Use TensorFlow Lite Converter 2.x")
+ # Input model format
+ model_format_arg = parser.add_mutually_exclusive_group()
+ model_format_arg.add_argument(
+ "--graph_def",
+ action="store_const",
+ dest="model_format",
+ const="graph_def",
+ help="Use graph def file(default)")
+ model_format_arg.add_argument(
+ "--saved_model",
+ action="store_const",
+ dest="model_format",
+ const="saved_model",
+ help="Use saved model")
+ model_format_arg.add_argument(
+ "--keras_model",
+ action="store_const",
+ dest="model_format",
+ const="keras_model",
+ help="Use keras model")
+
# Input and output path.
parser.add_argument(
"-i",
@@ -83,6 +104,8 @@ def _get_parser():
help="Names of the output arrays, comma-separated.",
required=True)
+ # Set default value
+ parser.set_defaults(model_format="graph_def")
return parser
@@ -122,17 +145,26 @@ def _parse_array(arrays, type_fn=str):
def _v1_convert(flags):
- input_shapes = None
- if flags.input_shapes:
- input_arrays = _parse_array(flags.input_arrays)
- input_shapes_list = [
- _parse_array(shape, type_fn=int) for shape in flags.input_shapes.split(":")
- ]
- input_shapes = dict(list(zip(input_arrays, input_shapes_list)))
-
- converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(
- flags.input_path, _parse_array(flags.input_arrays),
- _parse_array(flags.output_arrays), input_shapes)
+ if flags.model_format == "graph_def":
+ input_shapes = None
+ if flags.input_shapes:
+ input_arrays = _parse_array(flags.input_arrays)
+ input_shapes_list = [
+ _parse_array(shape, type_fn=int)
+ for shape in flags.input_shapes.split(":")
+ ]
+ input_shapes = dict(list(zip(input_arrays, input_shapes_list)))
+
+ converter = tf.compat.v1.lite.TFLiteConverter.from_frozen_graph(
+ flags.input_path, _parse_array(flags.input_arrays),
+ _parse_array(flags.output_arrays), input_shapes)
+
+ if flags.model_format == "saved_model":
+ converter = tf.compat.v1.lite.TFLiteConverter.from_saved_model(flags.input_path)
+
+ if flags.model_format == "keras_model":
+ converter = tf.compat.v1.lite.TFLiteConverter.from_keras_model_file(
+ flags.input_path)
converter.allow_custom_ops = True
@@ -141,27 +173,35 @@ def _v1_convert(flags):
def _v2_convert(flags):
- file_content = open(flags.input_path, 'rb').read()
- try:
- graph_def = tf.compat.v1.GraphDef()
- graph_def.ParseFromString(file_content)
- except (_text_format.ParseError, DecodeError):
+ if flags.model_format == "graph_def":
+ file_content = open(flags.input_path, 'rb').read()
try:
- _text_format.Merge(file_content, graph_def)
+ graph_def = tf.compat.v1.GraphDef()
+ graph_def.ParseFromString(file_content)
except (_text_format.ParseError, DecodeError):
- raise IOError("Unable to parse input file '{}'.".format(flags.input_path))
-
- wrap_func = wrap_frozen_graph(
- graph_def,
- inputs=[
- _str + ":0" if len(_str.split(":")) == 1 else _str
- for _str in _parse_array(flags.input_arrays)
- ],
- outputs=[
- _str + ":0" if len(_str.split(":")) == 1 else _str
- for _str in _parse_array(flags.output_arrays)
- ])
- converter = tf.lite.TFLiteConverter.from_concrete_functions([wrap_func])
+ try:
+ _text_format.Merge(file_content, graph_def)
+ except (_text_format.ParseError, DecodeError):
+ raise IOError("Unable to parse input file '{}'.".format(flags.input_path))
+
+ wrap_func = wrap_frozen_graph(
+ graph_def,
+ inputs=[
+ _str + ":0" if len(_str.split(":")) == 1 else _str
+ for _str in _parse_array(flags.input_arrays)
+ ],
+ outputs=[
+ _str + ":0" if len(_str.split(":")) == 1 else _str
+ for _str in _parse_array(flags.output_arrays)
+ ])
+ converter = tf.lite.TFLiteConverter.from_concrete_functions([wrap_func])
+
+ if flags.model_format == "saved_model":
+ converter = tf.lite.TFLiteConverter.from_saved_model(flags.input_path)
+
+ if flags.model_format == "keras_model":
+ keras_model = tf.keras.models.load_model(flags.input_path)
+ converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
converter.allow_custom_ops = True
converter.experimental_new_converter = True
diff --git a/compiler/tfl-verify/CMakeLists.txt b/compiler/tfl-verify/CMakeLists.txt
index d33059fde..4421a4660 100644
--- a/compiler/tfl-verify/CMakeLists.txt
+++ b/compiler/tfl-verify/CMakeLists.txt
@@ -6,6 +6,7 @@ file(GLOB_RECURSE SOURCES "src/*.cpp")
add_executable(tfl-verify ${SOURCES})
target_include_directories(tfl-verify PRIVATE src)
+target_link_libraries(tfl-verify arser)
target_link_libraries(tfl-verify foder)
target_link_libraries(tfl-verify mio_tflite)
target_link_libraries(tfl-verify safemain)
diff --git a/compiler/tfl-verify/requires.cmake b/compiler/tfl-verify/requires.cmake
index ed6b84db5..79503f325 100644
--- a/compiler/tfl-verify/requires.cmake
+++ b/compiler/tfl-verify/requires.cmake
@@ -1,3 +1,4 @@
+require("arser")
require("foder")
require("mio-tflite")
require("safemain")
diff --git a/compiler/tfl-verify/src/Driver.cpp b/compiler/tfl-verify/src/Driver.cpp
index 81f6d5489..6d1897607 100644
--- a/compiler/tfl-verify/src/Driver.cpp
+++ b/compiler/tfl-verify/src/Driver.cpp
@@ -16,22 +16,31 @@
#include "VerifyFlatBuffers.h"
+#include <arser/arser.h>
+
#include <iostream>
#include <memory>
#include <string>
int entry(int argc, char **argv)
{
- if (argc != 2)
+ arser::Arser arser;
+ arser.add_argument("tflite").type(arser::DataType::STR).help("TFLite file path to verify");
+
+ try
{
- std::cerr << "ERROR: Failed to parse arguments" << std::endl;
- std::cerr << std::endl;
- std::cerr << "USAGE: " << argv[0] << " [tflite]" << std::endl;
+ arser.parse(argc, argv);
+ }
+ catch (const std::runtime_error &err)
+ {
+ std::cout << err.what() << std::endl;
+ std::cout << arser;
return 255;
}
+
auto verifier = std::make_unique<VerifyFlatbuffers>();
- std::string model_file = argv[argc - 1];
+ std::string model_file = arser.get<std::string>("tflite");
std::cout << "[ RUN ] Check " << model_file << std::endl;
diff --git a/compiler/tflchef/core/src/ModelChef.cpp b/compiler/tflchef/core/src/ModelChef.cpp
index 932a649c5..692ce48c1 100644
--- a/compiler/tflchef/core/src/ModelChef.cpp
+++ b/compiler/tflchef/core/src/ModelChef.cpp
@@ -413,6 +413,7 @@ template <typename T> void cook_graph(const T &graph, CookParams &cp)
quant_builder.add_min(quant_min);
quant_builder.add_scale(quant_scale);
quant_builder.add_zero_point(quant_zero_point);
+ quant_builder.add_quantized_dimension(quant.quantized_dimension());
// Update QuantizationParameters Index
quant_index = quant_builder.Finish();
diff --git a/compiler/tflchef/core/src/Op/NonMaxSuppressionV4.cpp b/compiler/tflchef/core/src/Op/NonMaxSuppressionV4.cpp
new file mode 100644
index 000000000..eadd62cc6
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/NonMaxSuppressionV4.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NonMaxSuppressionV4.h"
+
+flatbuffers::Offset<void> NonMaxSuppressionV4Chef::value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+ tflite::NonMaxSuppressionV4OptionsBuilder options_builder{fbb};
+
+ return options_builder.Finish().Union();
+}
+
+std::unique_ptr<OpChef>
+NonMaxSuppressionV4ChefFactory::create(const tflchef::Operation *operation) const
+{
+ return std::unique_ptr<OpChef>{new NonMaxSuppressionV4Chef{operation}};
+}
diff --git a/compiler/tflchef/core/src/Op/NonMaxSuppressionV4.h b/compiler/tflchef/core/src/Op/NonMaxSuppressionV4.h
new file mode 100644
index 000000000..a8e783d53
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/NonMaxSuppressionV4.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OP_NON_MAX_SUPPRESSION_V4_H__
+#define __OP_NON_MAX_SUPPRESSION_V4_H__
+
+#include "OpChef.h"
+
+class NonMaxSuppressionV4Chef final : public OpChef
+{
+public:
+ explicit NonMaxSuppressionV4Chef(const tflchef::Operation *operation) : _operation{operation}
+ {
+ // DO NOTHING
+ }
+
+public:
+ tflite::BuiltinOperator code(void) const override
+ {
+ return tflite::BuiltinOperator_NON_MAX_SUPPRESSION_V4;
+ }
+
+ tflite::BuiltinOptions type(void) const override
+ {
+ return tflite::BuiltinOptions_NonMaxSuppressionV4Options;
+ }
+
+ flatbuffers::Offset<void> value(flatbuffers::FlatBufferBuilder &fbb) const override;
+
+private:
+ const tflchef::Operation *_operation;
+};
+
+struct NonMaxSuppressionV4ChefFactory final : public OpChefFactory
+{
+ std::unique_ptr<OpChef> create(const tflchef::Operation *operation) const override;
+};
+
+#endif // __OP_NON_MAX_SUPPRESSION_V4_H__
diff --git a/compiler/tflchef/core/src/Op/PadV2.cpp b/compiler/tflchef/core/src/Op/PadV2.cpp
new file mode 100644
index 000000000..bfa2289e5
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/PadV2.cpp
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PadV2.h"
+
+flatbuffers::Offset<void> PadV2Chef::value(flatbuffers::FlatBufferBuilder &fbb) const
+{
+ tflite::PadV2OptionsBuilder padv2_options_builder{fbb};
+ return padv2_options_builder.Finish().Union();
+}
+
+std::unique_ptr<OpChef> PadV2ChefFactory::create(const tflchef::Operation *operation) const
+{
+ return std::unique_ptr<OpChef>{new PadV2Chef{operation}};
+}
diff --git a/compiler/tflchef/core/src/Op/PadV2.h b/compiler/tflchef/core/src/Op/PadV2.h
new file mode 100644
index 000000000..d15532390
--- /dev/null
+++ b/compiler/tflchef/core/src/Op/PadV2.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __OP_PADV2_H__
+#define __OP_PADV2_H__
+
+#include "OpChef.h"
+
+class PadV2Chef final : public OpChef
+{
+public:
+ explicit PadV2Chef(const tflchef::Operation *operation) : _operation{operation}
+ {
+ // DO NOTHING
+ }
+
+public:
+ tflite::BuiltinOperator code(void) const override { return tflite::BuiltinOperator_PADV2; }
+
+ tflite::BuiltinOptions type(void) const override { return tflite::BuiltinOptions_PadV2Options; }
+
+ flatbuffers::Offset<void> value(flatbuffers::FlatBufferBuilder &fbb) const override;
+
+private:
+ const tflchef::Operation *_operation;
+};
+
+struct PadV2ChefFactory final : public OpChefFactory
+{
+ std::unique_ptr<OpChef> create(const tflchef::Operation *operation) const override;
+};
+
+#endif // __OP_PADV2_H__
diff --git a/compiler/tflchef/core/src/OpChef.def b/compiler/tflchef/core/src/OpChef.def
index 263725a24..244186265 100644
--- a/compiler/tflchef/core/src/OpChef.def
+++ b/compiler/tflchef/core/src/OpChef.def
@@ -55,10 +55,12 @@ OP_CHEF(Minimum, MinimumChefFactory)
OP_CHEF(MirrorPad, MirrorPadChefFactory)
OP_CHEF(Mul, MulChefFactory)
OP_CHEF(Neg, NegChefFactory)
+OP_CHEF(NonMaxSuppressionV4, NonMaxSuppressionV4ChefFactory)
OP_CHEF(NotEqual, NotEqualChefFactory)
OP_CHEF(OneHot, OneHotChefFactory)
OP_CHEF(Pack, PackChefFactory)
OP_CHEF(Pad, PadChefFactory)
+OP_CHEF(PadV2, PadV2ChefFactory)
OP_CHEF(Pow, PowChefFactory)
OP_CHEF(PRelu, PReluChefFactory)
OP_CHEF(Range, RangeChefFactory)
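
`OpChef.def` is an X-macro list: every `OP_CHEF(NAME, FACTORY)` line is re-expanded wherever the file is `#include`d under a suitable `OP_CHEF` definition, which is why registering the new `NonMaxSuppressionV4` and `PadV2` factories takes one line each. A sketch of the idiom with illustrative names (not the project's actual registry code); the same idiom drives the `TFLBuiltinOptions.lst` change further down:
```
#include <map>
#include <memory>
#include <string>

// Illustrative stand-ins for the real OpChefFactory hierarchy.
struct OpChefFactory { virtual ~OpChefFactory() = default; };
struct PadChefFactory : OpChefFactory {};
struct PadV2ChefFactory : OpChefFactory {};

std::map<std::string, std::unique_ptr<OpChefFactory>> make_registry()
{
  std::map<std::string, std::unique_ptr<OpChefFactory>> registry;

// Each inclusion of the .def file re-interprets every OP_CHEF line.
#define OP_CHEF(NAME, FACTORY_CLASS) \
  registry[#NAME] = std::make_unique<FACTORY_CLASS>();
  // In the real code this would be: #include "OpChef.def"
  OP_CHEF(Pad, PadChefFactory)
  OP_CHEF(PadV2, PadV2ChefFactory)
#undef OP_CHEF

  return registry;
}
```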
diff --git a/compiler/tflchef/core/src/OpChefs.h b/compiler/tflchef/core/src/OpChefs.h
index 55c37ebfb..5b2e89bd9 100644
--- a/compiler/tflchef/core/src/OpChefs.h
+++ b/compiler/tflchef/core/src/OpChefs.h
@@ -68,10 +68,12 @@
#include "Op/MirrorPad.h"
#include "Op/Mul.h"
#include "Op/Neg.h"
+#include "Op/NonMaxSuppressionV4.h"
#include "Op/NotEqual.h"
#include "Op/OneHot.h"
#include "Op/Pack.h"
#include "Op/Pad.h"
+#include "Op/PadV2.h"
#include "Op/Pow.h"
#include "Op/PRelu.h"
#include "Op/Range.h"
diff --git a/compiler/tflchef/proto/tflchef.proto b/compiler/tflchef/proto/tflchef.proto
index 792503bc9..70b966ec3 100644
--- a/compiler/tflchef/proto/tflchef.proto
+++ b/compiler/tflchef/proto/tflchef.proto
@@ -35,6 +35,7 @@ message TensorQuantization {
repeated float max = 2;
repeated float scale = 3;
repeated int64 zero_point = 4;
+ optional int32 quantized_dimension = 5 [default = 0];
}
message Operand {
@@ -153,6 +154,10 @@ message PadOptions {
// None
}
+message PadV2Options {
+ // None
+}
+
message MirrorPadOptions {
optional MirrorPadMode mode = 1 [default = REFLECT];
}
@@ -362,6 +367,10 @@ message GatherNdOptions {
// None
}
+message NonMaxSuppressionV4Options {
+ // None
+}
+
message NotEqualOptions {
// None
}
@@ -507,7 +516,7 @@ message Operation {
optional LogSoftmaxOptions log_softmax_options = 168;
// DequantizeOptions 169
optional NegOptions neg_options = 170;
- // PadV2Options 171
+ optional PadV2Options padv2_options = 171;
optional LessEqualOptions lessequal_options = 172;
optional SliceOptions slice_options = 173;
optional TransposeConvOptions transpose_conv_options = 174;
@@ -534,7 +543,7 @@ message Operation {
optional MatrixSetDiagOptions matrix_set_diag_options = 195;
// HardSwishOptions 196
optional DepthToSpaceOptions depth_to_space_options = 197;
- // NonMaxSuppressionV4Options 198
+ optional NonMaxSuppressionV4Options non_max_suppression_v4_options = 198;
// NonMaxSuppressionV5Options 199
optional ScatterNdOptions scatter_nd_options = 200;
optional NotEqualOptions notequal_options = 201;
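
The new `quantized_dimension` field (default 0) records the channel axis used by per-channel quantization, and the `ModelChef.cpp` and `RecipeChef.cpp` hunks in this commit write and read it through the generated protobuf API. A minimal sketch of that round trip, assuming the generated header is named `tflchef.pb.h`:
```
#include <tflchef.pb.h> // assumed name of the header generated from tflchef.proto

#include <cstdint>

// Mirrors the calls made in ModelChef.cpp / RecipeChef.cpp above.
void quantized_dimension_round_trip()
{
  tflchef::Operand operand;

  tflchef::TensorQuantization *quant = operand.mutable_quant();
  quant->set_quantized_dimension(3); // e.g. per-channel along the last axis

  // Reading it back; an unset field reports the declared default of 0.
  int32_t axis = operand.quant().quantized_dimension();
  (void)axis;
}
```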
diff --git a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.cpp b/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.cpp
new file mode 100644
index 000000000..ad9921970
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NonMaxSuppressionV4.h"
+
+#include "Convert.h"
+#include "FillerHelper.h"
+
+namespace tflchef
+{
+
+void TFliteOpNonMaxSuppressionV4::filler(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const
+{
+ const auto &inputs = *op->inputs();
+
+ const tflite::Tensor *max_output_size_tensor = import->tensors()->Get(inputs[2]);
+ assert(max_output_size_tensor->type() == tflite::TensorType::TensorType_INT32);
+
+ const tflite::Tensor *iou_threshold_tensor = import->tensors()->Get(inputs[3]);
+ assert(iou_threshold_tensor->type() == tflite::TensorType::TensorType_FLOAT32);
+
+ const tflite::Tensor *score_threshold_tensor = import->tensors()->Get(inputs[4]);
+ assert(score_threshold_tensor->type() == tflite::TensorType::TensorType_FLOAT32);
+
+ for (int32_t index = 2; index < 5; ++index)
+ {
+ fill_tensor_to_import(index, import);
+ }
+}
+
+tflchef::Operation *TFliteOpNonMaxSuppressionV4::build(const tflite::Operator *op,
+ TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const
+{
+ auto operation = model_recipe->add_operation();
+
+ operation->set_type("NonMaxSuppressionV4");
+
+ return operation;
+}
+
+} // namespace tflchef
diff --git a/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.h b/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.h
new file mode 100644
index 000000000..114a2ad2f
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/NonMaxSuppressionV4.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __TFLITE_OP_NON_MAX_SUPPRESSION_V4_H__
+#define __TFLITE_OP_NON_MAX_SUPPRESSION_V4_H__
+
+#include "TFliteOpChef.h"
+
+namespace tflchef
+{
+
+/**
+ * @brief tflchef operator builder for NON_MAX_SUPPRESSION_V4
+ */
+class TFliteOpNonMaxSuppressionV4 : public TFliteOpChef
+{
+public:
+ void filler(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const override;
+ tflchef::Operation *build(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const override;
+};
+
+} // namespace tflchef
+
+#endif // __TFLITE_OP_NON_MAX_SUPPRESSION_V4_H__
diff --git a/compiler/tflchef/tflite/src/Op/PadV2.cpp b/compiler/tflchef/tflite/src/Op/PadV2.cpp
new file mode 100644
index 000000000..0b1c9f3b2
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/PadV2.cpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PadV2.h"
+
+#include "FillerHelper.h"
+
+namespace tflchef
+{
+
+void TFliteOpPadV2::filler(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const
+{
+ // Filler for paddings and constant_values
+ fill_tensor_to_import(1, import);
+ fill_tensor_to_import(2, import);
+}
+
+tflchef::Operation *TFliteOpPadV2::build(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const
+{
+ auto operation = model_recipe->add_operation();
+
+ operation->set_type("PadV2");
+
+ return operation;
+}
+
+} // namespace tflchef
diff --git a/compiler/tflchef/tflite/src/Op/PadV2.h b/compiler/tflchef/tflite/src/Op/PadV2.h
new file mode 100644
index 000000000..3aa474b92
--- /dev/null
+++ b/compiler/tflchef/tflite/src/Op/PadV2.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __TFLITE_OP_PADV2_H__
+#define __TFLITE_OP_PADV2_H__
+
+#include "TFliteOpChef.h"
+
+namespace tflchef
+{
+
+/**
+ * @brief tflchef operator builder for PADV2
+ */
+class TFliteOpPadV2 : public TFliteOpChef
+{
+public:
+ void filler(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const override;
+ tflchef::Operation *build(const tflite::Operator *op, TFliteImport *import,
+ tflchef::ModelRecipe *model_recipe) const override;
+};
+
+} // namespace tflchef
+
+#endif // __TFLITE_OP_PADV2_H__
diff --git a/compiler/tflchef/tflite/src/Op/TransposeConv.cpp b/compiler/tflchef/tflite/src/Op/TransposeConv.cpp
index 7e772b954..4e7adf6c6 100644
--- a/compiler/tflchef/tflite/src/Op/TransposeConv.cpp
+++ b/compiler/tflchef/tflite/src/Op/TransposeConv.cpp
@@ -35,6 +35,10 @@ void TFliteOpTransposeConv::filler(const tflite::Operator *op, TFliteImport *imp
auto vec = extract_buffer<int32_t>(buffer);
import->set_tensor_filler(inputs[0], vec);
}
+
+ // filter
+  import->set_tensor_filler(inputs[1]);
}
tflchef::Operation *TFliteOpTransposeConv::build(const tflite::Operator *op, TFliteImport *import,
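
For context on the hunk above: when tflchef reverses a `.tflite` file into a recipe, constant inputs must have their contents recorded, and `set_tensor_filler` marks which tensor indices need that treatment (the output-shape tensor, and now the filter as well). A sketch of the bookkeeping this implies, with illustrative names rather than the actual `TFliteImport` internals:
```
#include <cstdint>
#include <map>
#include <vector>

// Illustrative sketch of filler bookkeeping: remember which tensor
// indices must have their constant data emitted into the recipe.
class FillerBook
{
public:
  // Mark a tensor as "needs a filler", optionally with extracted values.
  void set_tensor_filler(uint32_t index) { _fillers[index] = {}; }
  void set_tensor_filler(uint32_t index, const std::vector<int32_t> &values)
  {
    _fillers[index] = values;
  }

  bool has_filler(uint32_t index) const { return _fillers.count(index) != 0; }

private:
  std::map<uint32_t, std::vector<int32_t>> _fillers;
};
```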
diff --git a/compiler/tflchef/tflite/src/RecipeChef.cpp b/compiler/tflchef/tflite/src/RecipeChef.cpp
index db62d0e40..088961c1c 100644
--- a/compiler/tflchef/tflite/src/RecipeChef.cpp
+++ b/compiler/tflchef/tflite/src/RecipeChef.cpp
@@ -184,6 +184,8 @@ std::unique_ptr<ModelRecipe> generate_recipe(const tflite::Model *model)
for (uint32_t idx = 0; idx < quant->zero_point()->size(); ++idx)
chef_quant->add_zero_point(quant->zero_point()->Get(idx));
}
+ tflchef::TensorQuantization *chef_quant = operand->mutable_quant();
+ chef_quant->set_quantized_dimension(quant->quantized_dimension());
}
}
diff --git a/compiler/tflchef/tflite/src/TFliteOpChefs.h b/compiler/tflchef/tflite/src/TFliteOpChefs.h
index ad52af1c2..de14e37d1 100644
--- a/compiler/tflchef/tflite/src/TFliteOpChefs.h
+++ b/compiler/tflchef/tflite/src/TFliteOpChefs.h
@@ -68,10 +68,12 @@
#include "Op/MirrorPad.h"
#include "Op/Mul.h"
#include "Op/Neg.h"
+#include "Op/NonMaxSuppressionV4.h"
#include "Op/NotEqual.h"
#include "Op/OneHot.h"
#include "Op/Pack.h"
#include "Op/Pad.h"
+#include "Op/PadV2.h"
#include "Op/Pow.h"
#include "Op/PRelu.h"
#include "Op/Range.h"
diff --git a/compiler/tflchef/tflite/src/TFliteOpRegistry.h b/compiler/tflchef/tflite/src/TFliteOpRegistry.h
index 0a44b3f06..8d33007be 100644
--- a/compiler/tflchef/tflite/src/TFliteOpRegistry.h
+++ b/compiler/tflchef/tflite/src/TFliteOpRegistry.h
@@ -105,10 +105,12 @@ private:
REG_TFL_OP(MIRROR_PAD, TFliteOpMirrorPad);
REG_TFL_OP(MUL, TFliteOpMul);
REG_TFL_OP(NEG, TFliteOpNeg);
+ REG_TFL_OP(NON_MAX_SUPPRESSION_V4, TFliteOpNonMaxSuppressionV4);
REG_TFL_OP(NOT_EQUAL, TFliteOpNotEqual);
REG_TFL_OP(ONE_HOT, TFliteOpOneHot);
REG_TFL_OP(PACK, TFliteOpPack);
REG_TFL_OP(PAD, TFliteOpPad);
+ REG_TFL_OP(PADV2, TFliteOpPadV2);
REG_TFL_OP(POW, TFliteOpPow);
REG_TFL_OP(PRELU, TFliteOpPRelu);
REG_TFL_OP(RANGE, TFliteOpRange);
diff --git a/compiler/tflchef/tools/file/Driver.cpp b/compiler/tflchef/tools/file/Driver.cpp
index cecfeeb3e..46e5b5583 100644
--- a/compiler/tflchef/tools/file/Driver.cpp
+++ b/compiler/tflchef/tools/file/Driver.cpp
@@ -41,7 +41,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
int32_t model_version = 1;
diff --git a/compiler/tflchef/tools/reverse/Driver.cpp b/compiler/tflchef/tools/reverse/Driver.cpp
index 1116dec34..4d795a3d0 100644
--- a/compiler/tflchef/tools/reverse/Driver.cpp
+++ b/compiler/tflchef/tools/reverse/Driver.cpp
@@ -38,7 +38,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
std::string tflite_path = arser.get<std::string>("tflite");
diff --git a/compiler/tfldump/driver/Driver.cpp b/compiler/tfldump/driver/Driver.cpp
index 3961d2f17..38c9c062f 100644
--- a/compiler/tfldump/driver/Driver.cpp
+++ b/compiler/tfldump/driver/Driver.cpp
@@ -33,7 +33,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << '\n';
std::cout << arser;
- return 0;
+ return 255;
}
std::string tflite_path = arser.get<std::string>("tflite");
diff --git a/compiler/tfldump/src/OpPrinter.cpp b/compiler/tfldump/src/OpPrinter.cpp
index 9fc1a6456..df027c3e3 100644
--- a/compiler/tfldump/src/OpPrinter.cpp
+++ b/compiler/tfldump/src/OpPrinter.cpp
@@ -676,6 +676,7 @@ OpPrinterRegistry::OpPrinterRegistry()
_op_map[tflite::BuiltinOperator_MAX_POOL_2D] = make_unique<Pool2DPrinter>();
_op_map[tflite::BuiltinOperator_MIRROR_PAD] = make_unique<MirrorPadPrinter>();
_op_map[tflite::BuiltinOperator_MUL] = make_unique<MulPrinter>();
+ // There is no Option for NON_MAX_SUPPRESSION_V4
_op_map[tflite::BuiltinOperator_ONE_HOT] = make_unique<OneHotPrinter>();
_op_map[tflite::BuiltinOperator_PACK] = make_unique<PackPrinter>();
// There is no Option for PAD
diff --git a/compiler/tflite2circle/CMakeLists.txt b/compiler/tflite2circle/CMakeLists.txt
index a0a2e026b..b1d1f6149 100644
--- a/compiler/tflite2circle/CMakeLists.txt
+++ b/compiler/tflite2circle/CMakeLists.txt
@@ -14,5 +14,6 @@ target_link_libraries(tflite2circle arser)
target_link_libraries(tflite2circle safemain)
target_link_libraries(tflite2circle mio_tflite)
target_link_libraries(tflite2circle mio_circle)
+target_link_libraries(tflite2circle vconone)
install(TARGETS tflite2circle DESTINATION bin)
diff --git a/compiler/tflite2circle/driver/Driver.cpp b/compiler/tflite2circle/driver/Driver.cpp
index 67b8e33bc..2f11e0a13 100644
--- a/compiler/tflite2circle/driver/Driver.cpp
+++ b/compiler/tflite2circle/driver/Driver.cpp
@@ -24,10 +24,25 @@
#include "CircleModel.h"
#include "TFLModel.h"
+#include <vconone/vconone.h>
+
+void print_version(void)
+{
+ std::cout << "tflite2circle version " << vconone::get_string() << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+}
+
int entry(int argc, char **argv)
{
arser::Arser arser{"tflite2circle is a Tensorflow lite to circle model converter"};
+ arser.add_argument("--version")
+ .nargs(0)
+ .required(false)
+ .default_value(false)
+ .help("Show version information and exit")
+ .exit_with(print_version);
+
arser.add_argument("tflite")
.nargs(1)
.type(arser::DataType::STR)
@@ -42,7 +57,7 @@ int entry(int argc, char **argv)
{
std::cout << err.what() << std::endl;
std::cout << arser;
- return 0;
+ return 255;
}
std::string tfl_path = arser.get<std::string>("tflite");
diff --git a/compiler/tflite2circle/requires.cmake b/compiler/tflite2circle/requires.cmake
index ff19b7491..837c287b6 100644
--- a/compiler/tflite2circle/requires.cmake
+++ b/compiler/tflite2circle/requires.cmake
@@ -2,3 +2,4 @@ require("arser")
require("mio-tflite")
require("mio-circle")
require("safemain")
+require("vconone")
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions.h b/compiler/tflite2circle/src/BuildBuiltinOptions.h
index 159a8af97..00b3de943 100644
--- a/compiler/tflite2circle/src/BuildBuiltinOptions.h
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions.h
@@ -62,10 +62,12 @@
#include "BuildBuiltinOptions/MirrorPadOptions.h"
#include "BuildBuiltinOptions/MulOptions.h"
#include "BuildBuiltinOptions/NegOptions.h"
+#include "BuildBuiltinOptions/NonMaxSuppressionV4Options.h"
#include "BuildBuiltinOptions/NotEqualOptions.h"
#include "BuildBuiltinOptions/OneHotOptions.h"
#include "BuildBuiltinOptions/PackOptions.h"
#include "BuildBuiltinOptions/PadOptions.h"
+#include "BuildBuiltinOptions/PadV2Options.h"
#include "BuildBuiltinOptions/RangeOptions.h"
#include "BuildBuiltinOptions/Pool2DOptions.h"
#include "BuildBuiltinOptions/PowOptions.h"
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/NonMaxSuppressionV4Options.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/NonMaxSuppressionV4Options.cpp
new file mode 100644
index 000000000..1a39f503b
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/NonMaxSuppressionV4Options.cpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "NonMaxSuppressionV4Options.h"
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::NonMaxSuppressionV4Options>
+build_circle_NonMaxSuppressionV4Options(flatbuffers::FlatBufferBuilder &fb,
+ const tflite::Operator *)
+{
+ circle::NonMaxSuppressionV4OptionsBuilder builtin_options_builder{fb};
+ return builtin_options_builder.Finish();
+}
+
+} // namespace tflite2circle
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/NonMaxSuppressionV4Options.h b/compiler/tflite2circle/src/BuildBuiltinOptions/NonMaxSuppressionV4Options.h
new file mode 100644
index 000000000..6073142a8
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/NonMaxSuppressionV4Options.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BBO_NON_MAX_SUPPRESSION_V4_OPTIONS_H__
+#define __BBO_NON_MAX_SUPPRESSION_V4_OPTIONS_H__
+
+#include <mio/tflite/schema_generated.h>
+#include <mio/circle/schema_generated.h>
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::NonMaxSuppressionV4Options>
+build_circle_NonMaxSuppressionV4Options(flatbuffers::FlatBufferBuilder &fb,
+ const tflite::Operator *op);
+
+} // namespace tflite2circle
+
+#endif // __BBO_NON_MAX_SUPPRESSION_V4_OPTIONS_H__
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/PadV2Options.cpp b/compiler/tflite2circle/src/BuildBuiltinOptions/PadV2Options.cpp
new file mode 100644
index 000000000..6636634a3
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/PadV2Options.cpp
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "PadV2Options.h"
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::PadV2Options>
+build_circle_PadV2Options(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *)
+{
+ circle::PadV2OptionsBuilder builtin_options_builder{fb};
+ return builtin_options_builder.Finish();
+}
+
+} // namespace tflite2circle
diff --git a/compiler/tflite2circle/src/BuildBuiltinOptions/PadV2Options.h b/compiler/tflite2circle/src/BuildBuiltinOptions/PadV2Options.h
new file mode 100644
index 000000000..36a2c82e8
--- /dev/null
+++ b/compiler/tflite2circle/src/BuildBuiltinOptions/PadV2Options.h
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __BBO_PADV2_OPTIONS_H__
+#define __BBO_PADV2_OPTIONS_H__
+
+#include <mio/tflite/schema_generated.h>
+#include <mio/circle/schema_generated.h>
+
+namespace tflite2circle
+{
+
+flatbuffers::Offset<circle::PadV2Options>
+build_circle_PadV2Options(flatbuffers::FlatBufferBuilder &fb, const tflite::Operator *op);
+
+} // namespace tflite2circle
+
+#endif // __BBO_PADV2_OPTIONS_H__
diff --git a/compiler/tflite2circle/src/TFLBuiltinOptions.lst b/compiler/tflite2circle/src/TFLBuiltinOptions.lst
index 3ef9f1575..a2a14538e 100644
--- a/compiler/tflite2circle/src/TFLBuiltinOptions.lst
+++ b/compiler/tflite2circle/src/TFLBuiltinOptions.lst
@@ -26,6 +26,7 @@ TFL_BUILTIN_OPTIONS(SpaceToDepthOptions)
//TFL_BUILTIN_OPTIONS(EmbeddingLookupSparseOptions)
TFL_BUILTIN_OPTIONS(MulOptions)
TFL_BUILTIN_OPTIONS(PadOptions)
+TFL_BUILTIN_OPTIONS(PadV2Options)
TFL_BUILTIN_OPTIONS(GatherOptions)
TFL_BUILTIN_OPTIONS(BatchToSpaceNDOptions)
TFL_BUILTIN_OPTIONS(SpaceToBatchNDOptions)
@@ -99,7 +100,7 @@ TFL_BUILTIN_OPTIONS(MatrixSetDiagOptions)
TFL_BUILTIN_OPTIONS(IfOptions)
TFL_BUILTIN_OPTIONS(WhileOptions)
TFL_BUILTIN_OPTIONS(DepthToSpaceOptions)
-//TFL_BUILTIN_OPTIONS(NonMaxSuppressionV4Options)
+TFL_BUILTIN_OPTIONS(NonMaxSuppressionV4Options)
//TFL_BUILTIN_OPTIONS(NonMaxSuppressionV5Options)
TFL_BUILTIN_OPTIONS(RankOptions)
TFL_BUILTIN_OPTIONS(ScatterNdOptions)
diff --git a/compiler/vconone/CMakeLists.txt b/compiler/vconone/CMakeLists.txt
new file mode 100644
index 000000000..b8cb79331
--- /dev/null
+++ b/compiler/vconone/CMakeLists.txt
@@ -0,0 +1,31 @@
+if (NOT VCONONE_VERSION)
+ set(VCONONE_VERSION 0x0000000000080001)
+ # NOTE order is [build patch minor major]
+  # if VCONONE_VERSION is set with the -D option, it will be cached;
+  # you may have to remove the cache file if you later remove the -D option
+endif()
+
+configure_file(version_cfg.h.in version_cfg.h @ONLY)
+
+set(DRIVER "driver/driver.cpp")
+
+file(GLOB_RECURSE SOURCES "src/*.cpp")
+file(GLOB_RECURSE TESTS "src/*.test.cpp")
+list(REMOVE_ITEM SOURCES ${TESTS})
+
+add_library(vconone STATIC ${SOURCES})
+target_include_directories(vconone PUBLIC include)
+target_include_directories(vconone PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+
+add_executable(one-version ${DRIVER})
+target_link_libraries(one-version vconone)
+install(TARGETS one-version DESTINATION bin)
+
+if(NOT ENABLE_TEST)
+ return()
+endif(NOT ENABLE_TEST)
+
+nnas_find_package(GTest REQUIRED)
+
+GTest_AddTest(vconone_test ${TESTS})
+target_link_libraries(vconone_test vconone)
diff --git a/compiler/vconone/README.md b/compiler/vconone/README.md
new file mode 100644
index 000000000..c08dd63d3
--- /dev/null
+++ b/compiler/vconone/README.md
@@ -0,0 +1,32 @@
+# vconone
+
+_vconone_ provides the version number and version strings for one-* commands
+and command line tools.
+
+# Revise version number
+
+To revise the version number, update `VCONONE_VERSION` in `CMakeLists.txt`,
+or pass `-DVCONONE_VERSION=0x0000000100080001` at the cmake configure step.
+
+The value packs four 16-bit integers in the order `build`, `patch`, `minor`
+and `major` (most significant first). `build` is not used for now.
+
+For example, `0x0000000100080001` is interpreted as version `1.8.1`.
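+
+As a sketch, the packed value can be decoded through the `version` union in
+`include/vconone/vconone.h` (assuming the `vconone` library is linked and the
+host is little-endian):
+
+```cpp
+#include <vconone/vconone.h>
+#include <iostream>
+
+int main()
+{
+  auto v = vconone::get_number(); // reads the configured VCONONE_VERSION
+  // four 16-bit fields, packed as [build patch minor major]
+  std::cout << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "."
+            << unsigned(v.f.patch) << std::endl;
+  return 0;
+}
+```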
diff --git a/compiler/vconone/driver/driver.cpp b/compiler/vconone/driver/driver.cpp
new file mode 100644
index 000000000..12bd0eef2
--- /dev/null
+++ b/compiler/vconone/driver/driver.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vconone/vconone.h>
+
+#include <string>
+#include <iostream>
+
+int main(int argc, char *argv[])
+{
+ auto str = vconone::get_string();
+ if (argc >= 2)
+ {
+ for (int c = 1; c < argc; ++c)
+ std::cout << argv[c] << " ";
+ std::cout << "version " << str << std::endl;
+ std::cout << vconone::get_copyright() << std::endl;
+ }
+ else
+ std::cout << str;
+
+ return 0;
+}
diff --git a/compiler/vconone/include/vconone/vconone.h b/compiler/vconone/include/vconone/vconone.h
new file mode 100644
index 000000000..a6a1998a5
--- /dev/null
+++ b/compiler/vconone/include/vconone/vconone.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __VCON_ONE_H__
+#define __VCON_ONE_H__
+
+#include <cstdint>
+#include <string>
+
+namespace vconone
+{
+
+struct four
+{
+ uint16_t major;
+ uint16_t minor;
+ uint16_t patch;
+ uint16_t build; // build is not used for now
+};
+
+union version {
+ uint64_t v;
+ four f;
+};
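+// NOTE the mapping above assumes a little-endian host: with VCONONE_VERSION
+// 0x0000000100080001, f.major is 1, f.minor is 8 and f.patch is 1 (i.e. 1.8.1)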
+
+/**
+ * @brief get_number returns the version as a version union
+ */
+version get_number(void);
+
+/**
+ * @brief get_string returns the version string as major.minor.patch (without build)
+ */
+std::string get_string(void);
+
+/**
+ * @brief get_string4 returns the version string as major.minor.patch.build
+ */
+std::string get_string4(void);
+
+/**
+ * @brief get_copyright returns the copyright string
+ */
+std::string get_copyright(void);
+
+} // namespace vconone
+
+#endif // __VCON_ONE_H__
diff --git a/compiler/vconone/src/version.cpp b/compiler/vconone/src/version.cpp
new file mode 100644
index 000000000..9b693c621
--- /dev/null
+++ b/compiler/vconone/src/version.cpp
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "vconone/vconone.h"
+
+#include "version_cfg.h"
+
+#include <sstream>
+
+namespace vconone
+{
+
+version get_number(void)
+{
+ version v;
+ v.v = VCONONE_VERSION;
+ return v;
+}
+
+std::string get_string4(void)
+{
+ std::ostringstream ss;
+
+ auto v = get_number();
+ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch) << "."
+ << unsigned(v.f.build);
+
+ return ss.str();
+}
+
+std::string get_string(void)
+{
+ std::ostringstream ss;
+
+ auto v = get_number();
+ ss << unsigned(v.f.major) << "." << unsigned(v.f.minor) << "." << unsigned(v.f.patch);
+
+ return ss.str();
+}
+
+std::string get_copyright(void)
+{
+ std::string str;
+ str = "Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved\r\n";
+ str += "Licensed under the Apache License, Version 2.0\r\n";
+ str += "https://github.com/Samsung/ONE";
+ return str;
+}
+
+} // namespace vconone
diff --git a/compiler/vconone/src/version.test.cpp b/compiler/vconone/src/version.test.cpp
new file mode 100644
index 000000000..35a0647c1
--- /dev/null
+++ b/compiler/vconone/src/version.test.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <vconone/vconone.h>
+
+#include <gtest/gtest.h>
+
+TEST(vconone, version_number)
+{
+ auto v = vconone::get_number();
+
+ ASSERT_NE(0x0000000000000000ULL, v.v);
+}
+
+TEST(vconone, version_string)
+{
+ auto str = vconone::get_string();
+
+ ASSERT_NE("..", str);
+ ASSERT_NE("", str);
+}
+
+TEST(vconone, version_string4)
+{
+ auto str = vconone::get_string4();
+
+ ASSERT_NE("...", str);
+ ASSERT_NE("", str);
+}
+
+TEST(vconone, copyright)
+{
+ auto str = vconone::get_copyright();
+
+ ASSERT_NE("", str);
+}
diff --git a/compiler/vconone/version_cfg.h.in b/compiler/vconone/version_cfg.h.in
new file mode 100644
index 000000000..aa3ad9e70
--- /dev/null
+++ b/compiler/vconone/version_cfg.h.in
@@ -0,0 +1,22 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __VCON_ONE_VERSION_CFG_H__
+#define __VCON_ONE_VERSION_CFG_H__
+
+#define VCONONE_VERSION @VCONONE_VERSION@ULL
+
+#endif // __VCON_ONE_VERSION_CFG_H__
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
deleted file mode 100644
index 9699b5c00..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLArgOperationKernel.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLArgOperationKernel.h
- * @brief This file defines CLArgOperationKernel
- * @ingroup COM_AI_RUNTIME
- */
-
-#ifndef __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
-#define __ARM_COMPUTE_CLARGOPERATIONKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to define interface for the argop kernel.
- */
-class CLArgOperationKernel : public ICLKernel
-{
-public:
- /**
- * @brief Default constructor.
- */
- CLArgOperationKernel();
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers).
- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied
- */
- CLArgOperationKernel(const CLArgOperationKernel &) = delete;
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers).
- * @param [in] copiedInstance Const reference of CLArgOperationKernel to be copied
- * @return Reference of this instance
- */
- CLArgOperationKernel &operator=(const CLArgOperationKernel &) = delete;
- /**
- * @brief Allow instances of this class to be moved
- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved
- */
- CLArgOperationKernel(CLArgOperationKernel &&) = default;
- /**
- * @brief Allow instances of this class to be moved
- * @param [in] movedInstance Rvalue reference of CLArgOperationKernel to be moved
- * @return Reference of this instance
- */
- CLArgOperationKernel &operator=(CLArgOperationKernel &&) = default;
- /**
- * @brief Initialise the kernel's input, output and border mode.
- * @param[in] input An input tensor. Data types supported: U8/QASYMM8/S32/F32.
- * @param[out] output The output tensor, Data types supported: S32.
- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
- * @param[in] op Arg operation to perform.
- * return N/A
- */
- void configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis, ArgOperation op);
- /**
- * @brief Static function to check if given info will lead to a valid configuration of @ref
- * CLArgOperationKernel
- * @param[in] input An input tensor info. Data types supported: U8/QASYMM8/S32/F32.
- * @param[in] output The output tensor info, Data types supported: S32.
- * @param[in] axis Axis along which to reduce. It must be sorted and no duplicates.
- * @param[in] op Arg operation to perform.
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
- ArgOperation op);
-
- /*
- * @brief Run CLArgOperationKernel op
- * @param[in] window Window to be used for in_slice
- * @param[in] queue cl::CommandQueue
- * @return N/A
- */
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- uint32_t _axis;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLARGOPERATIONKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
deleted file mode 100644
index b0357fe99..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLCastKernel.h
+++ /dev/null
@@ -1,121 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLCastKernel.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file defines CLCastKernel class
- */
-
-#ifndef __ARM_COMPUTE_CLCASTKERNEL_H__
-#define __ARM_COMPUTE_CLCASTKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to define OpenCL kernel for cast operation
- */
-class CLCastKernel : public ICLKernel
-{
-public:
- /**
- * @brief Construct CLCastKernel object
- */
- CLCastKernel();
-
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers)
- */
- CLCastKernel(const CLCastKernel &) = delete;
-
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers)
- */
- CLCastKernel &operator=(const CLCastKernel &) = delete;
-
- /**
- * @brief Construct CLCastKernel object using default move constructor
- * @param[in] CLCastKernel object to move
- */
- CLCastKernel(CLCastKernel &&) = default;
-
- /**
- * @brief Allow instances of this class to be moved
- * @param[in] CLCastKernel object to move
- */
- CLCastKernel &operator=(CLCastKernel &&) = default;
-
- /**
- * @brief Destruct this CLCastKernel object
- */
- ~CLCastKernel() = default;
-
- /**
- * @brief Initialise the kernel's input and output.
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] input_subtype Sub data type of input.
- * @return N/A
- */
- void configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype);
-
- /**
- * @brief Enqueue the OpenCL kernel to process the given window on the passed OpenCL command
- * queue.
- * @note The queue is *not* flushed by this method, and therefore the kernel will not have
- * been executed by the time this method returns.
- * @param[in] window Region on which to execute the kernel. (Must be a valid region of
- * the window returned by window()).
- * @param[in,out] queue Command queue on which to enqueue the kernel.@return N/A
- * @return N/A
- */
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLCASTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
deleted file mode 100644
index 8615cf120..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
-#define __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform depthTospace operation */
-class CLDepthToSpaceKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLDepthToSpaceKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthToSpaceKernel(const CLDepthToSpaceKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLDepthToSpaceKernel &operator=(const CLDepthToSpaceKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLDepthToSpaceKernel(CLDepthToSpaceKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLDepthToSpaceKernel &operator=(CLDepthToSpaceKernel &&) = default;
- /** Default destructor */
- ~CLDepthToSpaceKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACEKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h
deleted file mode 100644
index 9321c3677..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__
-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to multiply matrices
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * This kernel performs the following computation:
- *
- * -# Convert a values from int8 to int32
- * -# Convert b values from int8 to int32
- * -# Compute the int32 matrix product of the resulting a * b and store the result as int32
- *
- */
-class CLGEMMLowpMatrixMultiplyKernelEx : public ICLKernel
-{
-public:
- /** Default Constructor */
- CLGEMMLowpMatrixMultiplyKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernelEx(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyKernelEx &operator=(const CLGEMMLowpMatrixMultiplyKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernelEx(CLGEMMLowpMatrixMultiplyKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- CLGEMMLowpMatrixMultiplyKernelEx &operator=(CLGEMMLowpMatrixMultiplyKernelEx &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note This kernel should be used ONLY for Midgard architectures
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p
- * input0
- * @param[out] output Output tensor to store the result of matrix multiplication. Data type
- * supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of
- * the input matrices
- */
- void configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output,
- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLGEMMLowpMatrixMultiplyKernelEx
- *
- * @param[in] input0 Input tensor containing the LHS matrix. Data type supported: QASYMM8
- * @param[in] input1 Input tensor containing the RHS matrix. Data type supported: same as @p
- * input0
- * @param[in] output Output tensor to store the result of matrix multiplication. Data type
- * supported: S32
- * @param[in] gemm_info (Optional) GEMM information used to retrieve the original dimensions of
- * the input matrices
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input0, const ITensorInfo *input1,
- const ITensorInfo *output,
- const GEMMReshapeInfo &gemm_info = GEMMReshapeInfo());
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input0;
- const ICLTensor *_input1;
- ICLTensor *_output;
- bool _slide_matrix_b;
- bool _reinterpret_input_as_3d;
- bool _reinterpret_output_as_3d;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYKERNELEX_H__*/
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
deleted file mode 100644
index dd2dbf6a4..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLPReLUKernel.h
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLPRELU_KERNEL_H__
-#define __ARM_COMPUTE_CLPRELU_KERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to calculate PReLU*/
-class CLPReLUKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLPReLUKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLPReLUKernel(const CLPReLUKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers). */
- CLPReLUKernel &operator=(const CLPReLUKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLPReLUKernel(CLPReLUKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLPReLUKernel &operator=(CLPReLUKernel &&) = default;
- /** Initialize the kernel's input, output.
- *
- * @param[in] input Source tensor1.
- * @param[in] alpha Source tensor2.
- * @param[out] output Output tensor.
- */
- void configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
- BorderSize border_size() const override;
-
-private:
- const ICLTensor *_input;
- const ICLTensor *_alpha;
- ICLTensor *_output;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLPRELU_KERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
deleted file mode 100644
index 4c0a82ce1..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
-#define __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** OpenCL kernel to perform spaceTodepth operation */
-class CLSpaceToDepthKernel : public ICLKernel
-{
-public:
- /** Default constructor */
- CLSpaceToDepthKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSpaceToDepthKernel(const CLSpaceToDepthKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLSpaceToDepthKernel &operator=(const CLSpaceToDepthKernel &) = delete;
- /** Allow instances of this class to be moved */
- CLSpaceToDepthKernel(CLSpaceToDepthKernel &&) = default;
- /** Allow instances of this class to be moved */
- CLSpaceToDepthKernel &operator=(CLSpaceToDepthKernel &&) = default;
- /** Default destructor */
- ~CLSpaceToDepthKernel() = default;
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const int32_t block_size);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input; /**< Source tensor */
- ICLTensor *_output; /**< Destination tensor */
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLSPACETODEPTHKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h b/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
deleted file mode 100644
index 9d174deb5..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__
-
-#include "arm_compute/core/CL/ICLKernel.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Interface for the Upsampling layer kernel for transpose convolution on OpenCL.
- */
-class CLTransposeConvLayerUpsampleKernel : public ICLKernel
-{
-public:
- /** Constructor */
- CLTransposeConvLayerUpsampleKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayerUpsampleKernel(const CLTransposeConvLayerUpsampleKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayerUpsampleKernel &
- operator=(const CLTransposeConvLayerUpsampleKernel &) = delete;
- /** Default Move Constructor. */
- CLTransposeConvLayerUpsampleKernel(CLTransposeConvLayerUpsampleKernel &&) = default;
- /** Default move assignment operator */
- CLTransposeConvLayerUpsampleKernel &operator=(CLTransposeConvLayerUpsampleKernel &&) = default;
- /** Default destructor */
- ~CLTransposeConvLayerUpsampleKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8/F16/F32.
- * @param[out] output Destination tensor. Data types supported: same as @p input. All but
- * the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only
- * performed within the XY-plane.
- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be
- * filled with zero.
- * @param[in] info Contains padding and stride information described in @ref
- * PadStrideInfo.
- */
- void configure(const ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
- const PadStrideInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLTransposeConvLayerUpsample
- *
- * @param[in] input Source tensor info. Data types supported: QASYMM8/F16/F32.
- * @param[in] output Destination tensor info. Data types supported: same as @p input. All
- * but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is
- * only performed within the XY-plane.
- * @param[in] inner_border Top and right inner border sizes. These rows and columns will be filled
- * with zero.
- * @param[in] info Contains padding and stride information described in @ref
- * PadStrideInfo.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const BorderSize &inner_border, const PadStrideInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
- const ICLTensor *_input;
- ICLTensor *_output;
- BorderSize _inner_border;
- PadStrideInfo _info;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLEKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h b/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h
deleted file mode 100644
index d4c9c610a..000000000
--- a/compute/ARMComputeEx/arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__
-#define __ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__
-
-#include "arm_compute/core/CPP/ICPPKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** CPP kernel to perform tensor upsample.
- *
- */
-class CPPUpsampleKernelEx : public ICPPKernel
-{
-public:
- const char *name() const override { return "CPPUpsampleKernelEx"; }
- /** Default constructor */
- CPPUpsampleKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPUpsampleKernelEx(const CPPUpsampleKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CPPUpsampleKernelEx &operator=(const CPPUpsampleKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- CPPUpsampleKernelEx(CPPUpsampleKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- CPPUpsampleKernelEx &operator=(CPPUpsampleKernelEx &&) = default;
- /** Default destructor */
- ~CPPUpsampleKernelEx() = default;
-
- /** Set the input and output of the kernel.
- *
- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] info Padding info.
- */
- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
- bool is_parallelisable() const override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- PadStrideInfo _info;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CPPUPSAMPLEKERNEL_EX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
deleted file mode 100644
index 4e9f097c2..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NECastKernel.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NECASTKERNEL_H__
-#define __ARM_COMPUTE_NECASTKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the cast layer kernel. */
-class NECastKernel : public INEKernel
-{
-public:
- const char *name() const override { return "NECastKernel"; }
- /** Default constructor */
- NECastKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECastKernel(const NECastKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NECastKernel &operator=(const NECastKernel &) = delete;
- /** Default Move Constructor. */
- NECastKernel(NECastKernel &&) = default;
- /** Default move assignment operator */
- NECastKernel &operator=(NECastKernel &&) = default;
- /** Default destructor */
- ~NECastKernel() = default;
- /** Set input, output tensors.
- *
- * @param[in] input Source tensor. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[out] output Destination tensor with the same dimensions of input. Data type supported:
- * U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] input_subtype Sub data type of input.
- */
- void configure(const ITensor *input, ITensor *output, SubDataType input_subtype);
- /** Static function to check if given info will lead to a valid configuration of @ref NECastKernel
- *
- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] output Output tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] input_subtype Sub data type of input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- ITensor *_output;
- SubDataType _input_subtype;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NECASTKERNEL_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h
deleted file mode 100644
index b62897e68..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__
-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the depth to space kernel */
-class NEDepthToSpaceLayerKernelEx : public INEKernel
-{
-public:
- const char *name() const override { return "NEDepthToSpaceLayerKernelEx"; }
- /** Default constructor */
- NEDepthToSpaceLayerKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthToSpaceLayerKernelEx(const NEDepthToSpaceLayerKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEDepthToSpaceLayerKernelEx &operator=(const NEDepthToSpaceLayerKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- NEDepthToSpaceLayerKernelEx(NEDepthToSpaceLayerKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- NEDepthToSpaceLayerKernelEx &operator=(NEDepthToSpaceLayerKernelEx &&) = default;
- /** Default destructor */
- ~NEDepthToSpaceLayerKernelEx() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape x value.
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEDepthToSpaceLayerKernelEx.
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYERKERNELEX_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h
deleted file mode 100644
index 57de78dd8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
-#define __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for an element-wise unary operation kernel
- *
- * Element-wise operation is computed by:
- * @f[ output(x) = OP(input(x))@f]
- *
- */
-class NEElementwiseUnaryKernelEx : public INEKernel
-{
-public:
- const char *name() const override { return "NEElementwiseUnaryKernelEx"; }
- /** Default constructor */
- NEElementwiseUnaryKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEElementwiseUnaryKernelEx(const NEElementwiseUnaryKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEElementwiseUnaryKernelEx &operator=(const NEElementwiseUnaryKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- NEElementwiseUnaryKernelEx(NEElementwiseUnaryKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- NEElementwiseUnaryKernelEx &operator=(NEElementwiseUnaryKernelEx &&) = default;
- /** Default destructor */
- ~NEElementwiseUnaryKernelEx() = default;
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEElementwiseUnaryKernelEx
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input First tensor input. Data types supported: F16/F32/S32.
- * @param[in] output Output tensor. Data types supported: Same as @p input.
- */
- void configure(ElementWiseUnaryEx op, const ITensor *input, ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEElementwiseUnaryKernelEx
- *
- * @param[in] op Arithmetic operation to be executed.
- * @param[in] input First tensor input info. Data types supported: F16/F32/S32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a Status
- */
- static Status validate(ElementWiseUnaryEx op, const ITensorInfo *input,
- const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Common signature for all the specialised arithmetic functions
- *
- * @param[in] input An input tensor. Data types supported: F16/F32/S32.
- * @param[out] output The output tensor. Data types supported: Same as @p input.
- * @param[in] window Region on which to execute the kernel.
- */
- using ElementwiseUnaryFunction = void(const ITensor *input, ITensor *output,
- const Window &window);
-
-protected:
- // Inherited methods overridden:
- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &output);
-
- /** Function to use for the particular tensor types passed to configure() */
- std::function<void(const ITensor *input, ITensor *output, const Window &window)> _function;
-
- const ITensor *_input;
- ITensor *_output;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYKERNELEX_H__ */
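
This kernel follows a function-dispatch pattern: configure() selects a specialised loop and stores it in the _function member, and run() simply invokes it over the window. A minimal stand-alone sketch of that pattern, using plain arrays in place of ITensor/Window (names and the two example ops are illustrative):

    #include <cmath>
    #include <cstddef>
    #include <functional>

    enum class UnaryOp { NEG, ABS };

    struct UnaryRunner
    {
      // configure() picks the specialised loop once...
      void configure(UnaryOp op)
      {
        switch (op)
        {
          case UnaryOp::NEG:
            _fn = [](const float *in, float *out, size_t n) {
              for (size_t i = 0; i < n; ++i) out[i] = -in[i];
            };
            break;
          case UnaryOp::ABS:
            _fn = [](const float *in, float *out, size_t n) {
              for (size_t i = 0; i < n; ++i) out[i] = std::fabs(in[i]);
            };
            break;
        }
      }
      // ...and run() just dispatches to it.
      void run(const float *in, float *out, size_t n) { _fn(in, out, n); }

      std::function<void(const float *, float *, size_t)> _fn;
    };
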
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
deleted file mode 100644
index 722efd3d0..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NEPReLUKernel.h
+++ /dev/null
@@ -1,100 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEPRELUKERNEL_H__
-#define __ARM_COMPUTE_NEPRELUKERNEL_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the kernel to perform Parametric Rectified Linear Unit
- *
- * Result is computed by:
- * @f[ output(x) = \begin{cases} \alpha \cdot x, & x < 0 \\ x, & x \geq 0 \end{cases} @f]
- */
-class NEPReLUKernel : public INEKernel
-{
-public:
- const char *name() const override { return "NEPReLUKernel"; }
- /** Default constructor */
- NEPReLUKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPReLUKernel(const NEPReLUKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEPReLUKernel &operator=(const NEPReLUKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEPReLUKernel(NEPReLUKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEPReLUKernel &operator=(NEPReLUKernel &&) = default;
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data type supported: QASYMM8/F32
- * @param[in] alpha Alpha tensor. Data types supported: Same as @p input
- * @param[out] output Output tensor. Data types supported: Same as @p input
- */
- void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEPReLUKernel.
- *
- * @param[in] input Input tensor input info. Data types supported: QASYMM8/F32.
- * @param[in] alpha Alpha tensor input info. Data types supported: Same as @p input.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a Status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *alpha,
- const ITensorInfo *output);
- static Status validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
- const ITensorInfo &output);
-
-private:
- const ITensor *_input; /**< Source tensor */
- const ITensor *_alpha; /**< Alpha tensor */
- ITensor *_output; /**< Destination tensor */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEPRELUKERNEL_H__ */
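
For reference, the PReLU formula documented above applied element-wise — a minimal float sketch that assumes input and alpha have the same length; the kernel additionally handles QASYMM8 and broadcasting of the alpha tensor:

    #include <cstddef>

    // output(x) = alpha * x for x < 0, output(x) = x for x >= 0
    void prelu(const float *input, const float *alpha, float *output, size_t n)
    {
      for (size_t i = 0; i < n; ++i)
        output[i] = input[i] >= 0.f ? input[i] : alpha[i] * input[i];
    }
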
diff --git a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h b/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h
deleted file mode 100644
index 0ffcf6be8..000000000
--- a/compute/ARMComputeEx/arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__
-#define __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the space to depth kernel */
-class NESpaceToDepthLayerKernelEx : public INEKernel
-{
-public:
- const char *name() const override { return "NESpaceToDepthLayerKernelEx"; }
- /** Default constructor */
- NESpaceToDepthLayerKernelEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToDepthLayerKernelEx(const NESpaceToDepthLayerKernelEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToDepthLayerKernelEx &operator=(const NESpaceToDepthLayerKernelEx &) = delete;
- /** Allow instances of this class to be moved */
- NESpaceToDepthLayerKernelEx(NESpaceToDepthLayerKernelEx &&) = default;
- /** Allow instances of this class to be moved */
- NESpaceToDepthLayerKernelEx &operator=(NESpaceToDepthLayerKernelEx &&) = default;
- /** Default destructor */
- ~NESpaceToDepthLayerKernelEx() = default;
- /** Initialise the kernel's inputs and output.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NESpaceToDepthLayerKernelEx
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input; /**< Source tensor */
- ITensor *_output; /**< Destination tensor */
- int32_t _block_shape; /**< Block shape */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYERKERNELEX_H__ */
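
The configure()/validate()/run() pattern above is common to these NEON kernels. A hedged usage sketch follows; the function name, tensor shapes, and scheduler call are illustrative of the usual arm_compute setup, not taken from this repository:

    #include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
    #include "arm_compute/runtime/NEON/NEScheduler.h"
    #include "arm_compute/runtime/Tensor.h"

    void space_to_depth_example()
    {
      using namespace arm_compute;
      Tensor input, output;
      // Shapes are W x H x C x N: block_shape 2 turns 4x4x16 into 2x2x64.
      input.allocator()->init(TensorInfo(TensorShape(4U, 4U, 16U, 1U), 1, DataType::F32));
      output.allocator()->init(TensorInfo(TensorShape(2U, 2U, 64U, 1U), 1, DataType::F32));
      input.allocator()->allocate();
      output.allocator()->allocate();

      NESpaceToDepthLayerKernelEx kernel;
      // Check the configuration first, then configure and schedule the kernel.
      ARM_COMPUTE_ERROR_THROW_ON(
          NESpaceToDepthLayerKernelEx::validate(input.info(), output.info(), 2));
      kernel.configure(&input, &output, 2);
      NEScheduler::get().schedule(&kernel, Window::DimY);
    }
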
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
index 97bc4cea5..cfbd13436 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/CLFunctionsEx.h
@@ -16,25 +16,14 @@
#ifndef __ARM_COMPUTE_CLFUNCTIONSEX_H__
#define __ARM_COMPUTE_CLFUNCTIONSEX_H__
-#include <arm_compute/runtime/CL/functions/CLArgOperation.h>
-#include <arm_compute/runtime/CL/functions/CLBatchToSpaceND.h>
#include <arm_compute/runtime/CL/functions/CLBinaryLogicalOp.h>
-#include <arm_compute/runtime/CL/functions/CLCast.h>
-#include <arm_compute/runtime/CL/functions/CLDepthToSpace.h>
#include <arm_compute/runtime/CL/functions/CLEmbeddingLookup.h>
#include <arm_compute/runtime/CL/functions/CLFullyConnectedReshapingLayer.h>
#include <arm_compute/runtime/CL/functions/CLGatherEx.h>
#include <arm_compute/runtime/CL/functions/CLHashtableLookup.h>
#include <arm_compute/runtime/CL/functions/CLInstanceNormalizationLayerEx.h>
-#include <arm_compute/runtime/CL/functions/CLLogicalNot.h>
#include <arm_compute/runtime/CL/functions/CLNeg.h>
-#include <arm_compute/runtime/CL/functions/CLPixelWiseDivision.h>
-#include <arm_compute/runtime/CL/functions/CLPReLU.h>
#include <arm_compute/runtime/CL/functions/CLReduceOperation.h>
-#include <arm_compute/runtime/CL/functions/CLRNNLayerEx.h>
-#include <arm_compute/runtime/CL/functions/CLSpaceToDepth.h>
-#include <arm_compute/runtime/CL/functions/CLSplit.h>
-#include <arm_compute/runtime/CL/functions/CLStridedSliceEx.h>
#include <arm_compute/runtime/CL/functions/CLTopKV2.h>
#include <arm_compute/runtime/CL/functions/CLTransposeConvLayer.h>
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h
deleted file mode 100644
index c37096f7c..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLArgOperation.h
+++ /dev/null
@@ -1,129 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLArgOperation.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLArgOperation class
- */
-
-#ifndef __ARM_COMPUTE_CLARGOPERATION_H__
-#define __ARM_COMPUTE_CLARGOPERATION_H__
-
-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to execute CLArgOperation operation
- */
-class CLArgOperation : public IFunction
-{
-public:
- /**
- * @brief Construct a new CLArgOperation object
- */
- CLArgOperation();
-
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers)
- */
- CLArgOperation(const CLArgOperation &) = delete;
-
- /**
- * @brief Prevent instances of this class from being copied (As this class contains pointers)
- */
- CLArgOperation &operator=(const CLArgOperation &) = delete;
-
- /**
- * @brief Construct a new CLArgOperation object by using move constructor
- * @param[in] CLArgOperation object to move
- */
- CLArgOperation(CLArgOperation &&) = default;
-
- /**
- * @brief Assign a CLArgOperation object.
- * @param[in] CLArgOperation object to assign. This object will be moved.
- */
- CLArgOperation &operator=(CLArgOperation &&) = default;
-
- /**
- * @brief Initialise the kernel's inputs and outputs.
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
- * @param[out] output The result of arg operation. Data types supported: S32.
- * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
- * @param[in] op Arg operation to perform.
- * @return N/A
- */
- void configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis, ArgOperation op);
-
- /**
- * @brief Static function to check if given info will lead to a valid configuration
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S32/F32.
- * @param[in] axis Axis along which to reduce. It must be sorted and contain no duplicates.
- * @param[out] output The result of arg operation. Data types supported: S32.
- * @param[in] op Arg operation to perform.
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
- const ITensorInfo *output, ArgOperation op);
- /**
- * @brief Run the OpenCL kernel for this operation
- * @return N/A
- */
- void run() override;
-
-private:
- ICLTensor *_input{nullptr};
- ICLTensor *_output{nullptr};
- std::vector<uint32_t> _axis{};
- ArgOperation _arg_op{ArgOperation::MAX};
-
- std::unique_ptr<CLTensor[]> _interm_tensors{nullptr};
- std::unique_ptr<CLArgOperationKernel[]> _argop_kernels{nullptr};
- size_t _num_of_kernels{0};
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLARGOPERATION_H__ */
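
This function chains one kernel per reduced axis, writing each partial result into an intermediate tensor (_interm_tensors). The per-axis computation is an index-of-extremum reduction; a minimal sketch for a single axis of a 2D row-major input (an illustration of the semantics, not the OpenCL kernel):

    #include <cstdint>
    #include <vector>

    // argmax along the innermost axis of an M x N row-major matrix
    std::vector<int32_t> argmax_rows(const std::vector<float> &in, int M, int N)
    {
      std::vector<int32_t> out(M);
      for (int m = 0; m < M; ++m)
      {
        int32_t best = 0;
        for (int n = 1; n < N; ++n)
          if (in[m * N + n] > in[m * N + best]) best = n;
        out[m] = best; // S32 output, as documented above
      }
      return out;
    }
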
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
deleted file mode 100644
index eed5cb8a4..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLBatchToSpaceND.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
-#define __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLBatchToSpaceNDKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the data type of the output tensor.
- */
-class CLBatchToSpaceND : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] block_size A pointer to an array of integer values specifying block sizes
- * for the spatial dimensions.
- */
- void configure(ICLTensor *input, ICLTensor *output, const int32_t *block_size);
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLBATCH_TO_SPACE_ND_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
deleted file mode 100644
index ebe0d8a1c..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLCast.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLCast.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLCast class
- */
-
-#ifndef __ARM_COMPUTE_CLCAST_H__
-#define __ARM_COMPUTE_CLCAST_H__
-
-#include "arm_compute/core/TypesEx.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to run @ref CLCastKernel.
- * This converts the input tensor to the data type of the output tensor.
- */
-class CLCast : public ICLSimpleFunction
-{
-public:
- /**
- * @brief Initialise the kernel's input and output
- * @param[in, out] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * The input tensor is [in, out] because its TensorInfo might be
- * modified inside the kernel.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] input_subtype Sub data type of input.
- */
- void configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLCAST_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
deleted file mode 100644
index d52a538df..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDepthToSpace.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLDEPTHTOSPACE_H__
-#define __ARM_COMPUTE_CLDEPTHTOSPACE_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLDepthToSpaceKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the data type of the output tensor.
- */
-class CLDepthToSpace : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] block_size Block size (integer only)
- */
- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
-};
-} // namespace arm_compute
-
-#endif /* __ARM_COMPUTE_CLDEPTHTOSPACE_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
new file mode 100644
index 000000000..409eaf593
--- /dev/null
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
+#define __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__
+
+#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
+#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
+#include "arm_compute/runtime/CL/functions/CLTranspose.h"
+
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+class ICLTensor;
+/** Function to run the deconvolution layer.
+ *
+ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
+ * convolution pass. The input stride defines how many zeroes are inserted between each element
+ * of the input, and the pad is the amount of padding.
+ *
+ * The relation between input to output is as follows:
+ * \f[
+ * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
+ * \f]
+ * \f[
+ * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
+ * \f]
+ *
+ * where:
+ * width_input is the size of the first input dimension.
+ * height_input is the size of the second input dimension.
+ * width_output is the size of the first output dimension.
+ * height_output is the size of the second output dimension.
+ * kernel_x and kernel_y are the convolution sizes in x and y.
+ * stride_x and stride_y are the input strides of the first and second dimension.
+ *
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using @ref CLReverse.
+ *
+ * This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLDeconvolutionLayerUpsample
+ * -# @ref CLConvolutionLayer
+ *
+ * And the following CPP kernels:
+ * -# @ref CLReverse
+ *
+ */
+class CLDirectTransposeConvLayer : public IFunction
+{
+public:
+ /** Constructor */
+ CLDirectTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDirectTransposeConvLayer(const CLDirectTransposeConvLayer &) = delete;
+ /** Default move constructor */
+ CLDirectTransposeConvLayer(CLDirectTransposeConvLayer &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CLDirectTransposeConvLayer &operator=(const CLDirectTransposeConvLayer &) = delete;
+ /** Default move assignment operator */
+ CLDirectTransposeConvLayer &operator=(CLDirectTransposeConvLayer &&) = default;
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for
+ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[out] output Output tensor. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this
+ * is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+ */
+ void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
+ * an optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for
+ * input of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[out] output Output tensor. The output has the same number of dimensions as
+ * the @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution,
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref
+ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
+ * CLWeightsReshapeKernel.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+ /** Static function to check if given info will lead to a valid configuration of @ref
+ * CLDirectTransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension.
+ * Data type supported: Should match @p input data type, except for input
+ * of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] info Contains padding and policies to be used in the deconvolution, this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info = WeightsInfo());
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ MemoryGroup _memory_group;
+ CLDeconvolutionLayerUpsample _scale_f;
+ CLConvolutionLayer _conv_f;
+ CLReverse _flip_weights;
+
+ CLTensor _scaled_output;
+ ICLTensor *_original_weights;
+ CLTensor _weights_flipped;
+ CLTensor _flip_axis;
+
+ bool _is_prepared;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CLDIRECTTRANSPOSECONVLAYER_H__ */
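
As a worked example of the output-size relation documented above: with width_input = 4, stride_x = 2, padding_x = 0 and kernel_x = 3, width_output = (4 - 1) * 2 - 2 * 0 + 3 = 9. The upsample stage inserts the zeroes between input elements, and the subsequent convolution pass over the flipped weights produces the final values.
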
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
index 1a0284a3e..f3266f688 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLFullyConnectedHybridLayer.h
@@ -50,7 +50,7 @@
#include "arm_compute/core/CL/kernels/CLTransposeKernel.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
namespace arm_compute
{
@@ -168,7 +168,7 @@ private:
CLFullyConnectedHybridLayerReshapeWeights _reshape_weights_kernel;
CLScaleFactorSymm8Kernel _scale_factor_kernel;
CLQuantizationSymmetricKernel _quant_input_kernel;
- CLGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp;
+ CLGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
CLMultiplyScaleFactorKernel _multiply_scale_kernel;
CLGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel; // TODO(COMPMID-1889): Use CLGEMM to
// add bias in
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h
deleted file mode 100644
index 68aba74ab..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__
-#define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__
-
-#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
-#include "arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h"
-#include "arm_compute/runtime/CL/CLTensor.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-class IMemoryManager;
-class ICLTensor;
-
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. This function calls the
- * following OpenCL kernels:
- *
- * -# @ref CLGEMMLowpMatrixMultiplyKernel (if the parameter "reshape_b_only_on_first_run" of
- * GEMMInfo is FALSE)
- * -# @ref CLGEMMLowpMatrixAReductionKernel (if the offset of matrix B is not 0)
- * -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
- *
-*/
-class CLGEMMLowpMatrixMultiplyCoreEx : public IFunction
-{
-public:
- /** Constructor */
- CLGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyCoreEx(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete;
- /** Default move constructor */
- CLGEMMLowpMatrixMultiplyCoreEx(CLGEMMLowpMatrixMultiplyCoreEx &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLGEMMLowpMatrixMultiplyCoreEx &operator=(const CLGEMMLowpMatrixMultiplyCoreEx &) = delete;
- /** Default move assignment operator */
- CLGEMMLowpMatrixMultiplyCoreEx &operator=(CLGEMMLowpMatrixMultiplyCoreEx &&) = default;
- /** Initialise the kernel's inputs, output
- *
- * @note GEMMLowp: low precision GEMM kernel. [A * B + C]
- * This kernel performs the following computations:
- *
- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
- * -# Compute the matrix product of the resulting a * b in int32.
- * -# Quantize to uint8 if gemm_info.gemmlowp_output_stage != NONE
- *
- * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8.
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported:
- * S32
- * @param[out] output Output tensor. Data type supported: S32 or QASYMM8 if
- * gemm_info.gemmlowp_output_stage != NONE
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
- * and
- * if the reshape of matrix B should be executed only for the first run
- */
- void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output,
- const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLGEMMLowpMatrixMultiplyCoreEx
- *
- * @param[in] a First input tensor info (Matrix A). Data type supported: QASYMM8.
- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type
- * supported: S32
- * @param[in] output Output tensor info. Data type supported: S32 or QASYMM8 if
- * gemm_info.gemmlowp_output_stage != NONE
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
- * and
- * if the reshape of matrix B should be executed only for the first run
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
-
- // Kernels used
- CLGEMMLowpMatrixMultiplyKernelEx _mm_midgard_kernel;
- CLGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
- CLGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
-
- // Temporary tensors
- CLTensor _vector_sum_col;
- CLTensor _vector_sum_row;
-
- int32_t _a_offset;
- int32_t _b_offset;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */
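
The computation described in the configure() notes above — offset-corrected int32 accumulation of QASYMM8 inputs — boils down to the following scalar reference loop (a sketch of the math only, ignoring the reshaping and reduction kernels the function actually uses):

    #include <cstdint>
    #include <vector>

    // C[m,n] = sum_k (int32(A[m,k]) + a_offset) * (int32(B[k,n]) + b_offset)
    std::vector<int32_t> gemmlowp(const std::vector<uint8_t> &A,
                                  const std::vector<uint8_t> &B,
                                  int M, int N, int K,
                                  int32_t a_offset, int32_t b_offset)
    {
      std::vector<int32_t> C(M * N, 0);
      for (int m = 0; m < M; ++m)
        for (int n = 0; n < N; ++n)
        {
          int32_t acc = 0;
          for (int k = 0; k < K; ++k)
            acc += (int32_t(A[m * K + k]) + a_offset) * (int32_t(B[k * N + n]) + b_offset);
          C[m * N + n] = acc; // S32 output when gemmlowp_output_stage == NONE
        }
      return C;
    }
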
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h
deleted file mode 100644
index 51216715f..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLLogicalNot.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLLOGICALNOT_H__
-#define __ARM_COMPUTE_CLLOGICALNOT_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-class CLLogicalNot : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source and destination.
- *
- * @param[in] input Source tensor. Data types supported: QASYMM8.
- * @param[out] output Output tensor. Data types supported: QASYMM8.
- */
- void configure(ICLTensor *input, ICLTensor *output);
-};
-
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLLOGICALNOT_H__ */
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
deleted file mode 100644
index 7fbe558ff..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPReLU.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLPRELU_H__
-#define __ARM_COMPUTE_CLPRELU_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-class CLPReLU : public ICLSimpleFunction
-{
-public:
- /** Initialise the function's source and destination.
- *
- * @param[in] input Input tensor. Data types supported:
- * QASYMM8/F16/F32.
- * @param[in] alpha Alpha tensor. Data types supported:
- * QASYMM8/F16/F32.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- */
- void configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output);
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_CLPRELU_H__*/
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
deleted file mode 100644
index e83fb01cd..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLPixelWiseDivision.h
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLPixelWiseDivision.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains arm_compute::CLPixelWiseDivision class
- */
-#ifndef __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
-#define __ARM_COMPUTE_CLPIXELWISEDIVISION_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to run @ref CLPixelWiseDivisionKernel.
- */
-class CLPixelWiseDivision : public ICLSimpleFunction
-{
-public:
- /**
- * @brief Initialise the kernel's inputs, output and conversion policy.
- * @param[in, out] input1 An input tensor. Data types supported: U8/S16/F16/F32
- * The input tensor is [in, out] because its TensorInfo might be
- * modified inside the kernel in case of broadcasting of dimension 0.
- * @param[in, out] input2 An input tensor. Data types supported: same as @p input1.
- * The input tensor is [in, out] because its TensorInfo might be
- * modified inside the kernel in case of broadcasting of dimension 0.
- * @param[out] output The output tensor, Data types supported: same as @p input1.
- * Note: U8 requires both inputs to be U8.
- * @param[in] scale Scale to apply after the division.
- * Scale must be positive and its value must be either 1/255 or
- * 1/2^n where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest
- * even.
- * @return N/A
- */
- void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale = 1.f,
- ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
-
- /**
- * @brief Static function to check if given info will lead to a valid configuration of @ref
- * CLPixelWiseDivision
- * @param[in] input1 An input tensor info. Data types supported: U8/S16/F16/F32
- * @param[in] input2 An input tensor info. Data types supported: same as @p input1.
- * @param[in] output The output tensor info. Data types supported: same as @p input1.
- * Note: U8 requires both inputs to be U8.
- * @param[in] scale Scale to apply after the division.
- * Scale must be positive and its value must be either 1/255 or 1/2^n
- * where n is between 0 and 15.
- * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
- * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
- * @return a status
- */
- static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
- const ITensorInfo *output, float scale = 1.f,
- ConvertPolicy overflow_policy = ConvertPolicy::WRAP,
- RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO);
-};
-}
-#endif /*__ARM_COMPUTE_CLPIXELWISEDIVISION_H__ */
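
For readers migrating call sites off the removed header, here is a minimal usage sketch of CLPixelWiseDivision. Only the configure() signature comes from the deleted declaration above; the CLScheduler/CLTensor setup, shapes, data type and policy choices are illustrative assumptions.

// Sketch only: drives the removed CLPixelWiseDivision; shapes/policies assumed.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLPixelWiseDivision.h"

using namespace arm_compute;

int main()
{
  CLScheduler::get().default_init(); // create the OpenCL context and queue

  CLTensor x, y, out;
  const TensorInfo info(TensorShape(16U, 16U), 1, DataType::F32);
  x.allocator()->init(info);
  y.allocator()->init(info);
  out.allocator()->init(info);

  CLPixelWiseDivision div;
  // scale = 1.f satisfies the documented 1/2^n constraint (n = 0)
  div.configure(&x, &y, &out, 1.f, ConvertPolicy::SATURATE,
                RoundingPolicy::TO_NEAREST_EVEN);

  x.allocator()->allocate();
  y.allocator()->allocate();
  out.allocator()->allocate();

  div.run();                 // enqueue the division kernel
  CLScheduler::get().sync(); // block until the result is ready
  return 0;
}
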
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
deleted file mode 100644
index b49cbd873..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLRNNLayerEx.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLRNN_LAYER_EX_H__
-#define __ARM_COMPUTE_CLRNN_LAYER_EX_H__
-
-#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
-#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-#include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"
-#include "arm_compute/runtime/CL/functions/CLGEMM.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLRNNLayerEx */
-class CLRNNLayerEx : public IFunction
-{
-public:
- /** Default constructor */
- CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Initialize the function
- *
- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
- * types supported: F16/F32
- * @param[in] weights Weights tensor of shape [input_size, num_units] that
- * multiplies the input. Data types supported: Same as @p input
- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
- * the current 'state'. Data types supported: Same as @p input
- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same
- * as @p input
- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] info Activation layer parameter.
- */
- void configure(const ICLTensor *input, const ICLTensor *weights,
- const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state,
- ICLTensor *output, ActivationLayerInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLRNNLayerEx
- *
- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
- * types supported: F16/F32
- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies
- * the input. Data types supported: Same as @p input
- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
- * current 'state'. Data types supported: Same as @p input
- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p
- * input
- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] info Activation layer parameter.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
- const ITensorInfo *hidden_state, const ITensorInfo *output,
- const ActivationLayerInfo &info);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- CLGEMM _gemm_state_f;
- CLSaturatedArithmeticOperationKernel _add_kernel;
- CLActivationLayerKernel _activation_kernel;
- CLFullyConnectedLayer _fully_connected_kernel;
- CLCopyKernel _copy_kernel;
- CLTensor _fully_connected_out;
- CLTensor _gemm_output;
- CLTensor _add_output;
- bool _is_prepared;
-};
-}
-#endif /* __ARM_COMPUTE_CLRNN_LAYER_EX_H__ */
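
A compact sketch of one recurrent step with the removed CLRNNLayerEx. Tensor creation and allocation are elided (same pattern as the CLPixelWiseDivision sketch above); the TANH activation is an assumed choice, not taken from this patch.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"

using namespace arm_compute;

// Shapes per the doc above: input [input_size, batch_size], weights
// [input_size, num_units], recurrent_weights [num_units, num_units],
// bias [num_units], hidden_state/output [num_units, batch_size].
void rnn_step(CLTensor &input, CLTensor &weights, CLTensor &recurrent_weights,
              CLTensor &bias, CLTensor &hidden_state, CLTensor &output)
{
  ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH); // assumed
  CLRNNLayerEx rnn;
  rnn.configure(&input, &weights, &recurrent_weights, &bias, &hidden_state, &output, act);
  rnn.run(); // hidden_state is updated in place for the next step
}
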
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
deleted file mode 100644
index 2090b46fa..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLSpaceToDepth.h
+++ /dev/null
@@ -1,68 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLSPACETODEPTH_H__
-#define __ARM_COMPUTE_CLSPACETODEPTH_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLSpaceToDepthKernel
- *
- * @note The tensor data type for the inputs must be U8/QASYMM8/S16/S32/F16/F32.
- * @note The function converts the input tensor to the output tensor's data type.
- */
-class CLSpaceToDepth : public ICLSimpleFunction
-{
-public:
- /** Initialise the kernel's input and output.
- *
- * @param[in] input Input tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[out] output Output tensor. Data types supported: U8/QASYMM8/S16/S32/F16/F32.
- * @param[in] block_size Block size; integer values only.
- */
- void configure(ICLTensor *input, ICLTensor *output, const int32_t block_size);
-};
-
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_CLSPACETODEPTH_H__ */
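
The removed CLSpaceToDepth reduced to its two calls; the block size of 2 is illustrative, and the tensors are assumed to be initialised and allocated beforehand.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"

using namespace arm_compute;

// Sketch: rearranges 2x2 spatial blocks of the input into the depth dimension.
void space_to_depth(CLTensor &input, CLTensor &output)
{
  CLSpaceToDepth s2d;
  s2d.configure(&input, &output, 2); // block_size = 2, illustrative
  s2d.run();
}
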
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
deleted file mode 100644
index 03edd15e6..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLStridedSliceEx.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/**
- * @file CLStridedSliceEx.h
- * @ingroup COM_AI_RUNTIME
- * @brief This file contains the arm_compute::CLStridedSliceEx class
- */
-
-#ifndef __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
-#define __ARM_COMPUTE_CLSTRIDEDSLICEEX_H__
-
-#include "arm_compute/runtime/CL/ICLSimpleFunction.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/**
- * @brief Class to run @ref CLStridedSliceKernel
- */
-class CLStridedSliceEx : public ICLSimpleFunction
-{
-public:
- /**
- * @brief Initialise the kernel's inputs and outputs
- * @param[in] input Tensor input. Data type supported:
- * U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32
- * @param[out] output Output tensor. Data type supported: Same as @p input
- * @param[in] beginData 'begin' vector of strided slice operation
- * @param[in] endData 'end' vector of strided slice operation
- * @param[in] stridesData 'strides' vector of strided slice operation
- * @param[in] beginMask If the ith bit is set, begin[i] is ignored
- * @param[in] endMask If the ith bit is set, end[i] is ignored
- * @param[in] shrinkAxisMask If the ith bit is set, the ith specification shrinks the
- * dimensionality by 1, taking on the value at index begin[i]
- * @return N/A
- */
- void configure(const ICLTensor *input, ICLTensor *output, ICLTensor *beginData,
- ICLTensor *endData, ICLTensor *stridesData, int32_t beginMask, int32_t endMask,
- int32_t shrinkAxisMask);
-};
-}
-#endif /*__ARM_COMPUTE_CLSTRIDEDSLICEEX_H__ */
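
A sketch of the removed CLStridedSliceEx. Note that begin/end/strides are passed as tensors rather than host vectors; treating them as 1-D I32 tensors with one entry per input dimension is an assumption, as are the zero masks.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLStridedSliceEx.h"

using namespace arm_compute;

// Sketch: begin/end/strides are assumed 1-D I32 tensors filled by the caller;
// masks of 0 mean no begin/end entry is ignored and no axis is shrunk.
void strided_slice(CLTensor &input, CLTensor &output,
                   CLTensor &begin, CLTensor &end, CLTensor &strides)
{
  CLStridedSliceEx slice;
  slice.configure(&input, &output, &begin, &end, &strides,
                  0 /* beginMask */, 0 /* endMask */, 0 /* shrinkAxisMask */);
  slice.run();
}
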
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
index 54a697e69..5fb102e47 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayer.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,16 +37,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
#define __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__
-#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
-
-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
-
-#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
+#include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -54,119 +49,102 @@
namespace arm_compute
{
-class ICLTensor;
-/** Function to run the transpose convolution layer.
- *
- * @note This layer was copied in order to fix a bug that computed wrong output dimensions.
- *
- * TransposeConv Layer is the backward pass of Convolution Layer. First we transform the input
- * depending on the stride and pad info and then perform a 1x1
- * convolution pass. Input stride defines how many zeroes we should put between each element of the
- * input, pad is the amount of padding and finally a is a user
- * specified value where a < stride - 1, that increases the padding top and right of the input
- * image.
- *
- * The relation between input to output is as follows:
- * \f[
- * width\_output = (width\_input - 1) \cdot stride\_x - 2 \cdot padding\_x + kernel\_x
- * \f]
- * \f[
- * height\_output = (height\_input - 1) \cdot stride\_y - 2 \cdot padding\_y + kernel\_y
- * \f]
- *
- * where:
- * width_input is the size of the first input dimension.
- * height_input is the size of the second input dimension.
- * width_output is the size of the first output dimension.
- * height_output is the size of the second output dimension.
- * kernel_x and kernel_y are the convolution sizes in x and y.
- * stride_x and stride_y are the input strides of the first and second dimension.
- *
- * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
- * Therefore, it will be necessary to use the weights in the
- * reverse order to perform an actual convolution. This is achieved by using the @ref
- * CPPFlipWeightsKernel.
- *
- * This function calls the following OpenCL kernels/functions:
- *
- * -# @ref CLTransposeConvLayerUpsample
- * -# @ref CLConvolutionLayer
+/** Basic function to compute the deconvolution layer. This function calls the following OpenCL
+ * kernels/functions:
*
+ * -# @ref CLGEMMDeconvolutionLayer
+ * -# @ref CLDirectTransposeConvLayer
*/
class CLTransposeConvLayer : public IFunction
{
public:
- /** Constructor */
+ /** Default constructor */
CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayer(const CLTransposeConvLayer &) = delete;
- /** Default move constructor */
- CLTransposeConvLayer(CLTransposeConvLayer &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayer &operator=(const CLTransposeConvLayer &) = delete;
- /** Default move assignment operator */
- CLTransposeConvLayer &operator=(CLTransposeConvLayer &&) = default;
+
/** Set the input, weights, biases and output tensors.
*
- * @param[in,out] input Input tensor. 3 lower dimensions represent a single input,
- * and an optional 4th dimension for batch of inputs.
- * Data types supported: QASYMM8/F16/F32.
- * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM].
- * Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported:
- * Same as @p input.
- * @param[out] output Output tensor. The output has the same number of dimensions
- * as the @p input.
- * @param[in] info Contains padding and policies to be used in the
- * transpose convolution, this is decribed in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref
- * CLConvolutionLayer, specifies if the weights tensor has been
- * reshaped with @ref CLWeightsReshapeKernel.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
+ * supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same
+ * as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution; this
+ * is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
+ *
*/
void configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
- const PadStrideInfo &info, unsigned int invalid_right, unsigned int invalid_bottom,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info = WeightsInfo());
+ /** Set the input, weights, biases and output tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and
+ * an optional 4th dimension for batch of inputs. Data types supported:
+ * QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported:
+ * Same as @p input.
+ * @param[out] output Output tensor. The output has the same number of dimensions as
+ * the @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution;
+ * this is described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref
+ * CLConvolutionLayer, specifies if the weights tensor has been reshaped with @ref
+ * CLWeightsReshapeKernel.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
const WeightsInfo &weights_info = WeightsInfo());
/** Static function to check if given info will lead to a valid configuration of @ref
- * CLTransposeConvLayer
+ * CLTransposeConvLayer
+ *
+ * @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
+ * optional 4th dimension for batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data
+ * type supported: Same as @p input.
+ * @param[in] bias (Optional) The biases have one dimension. Data type supported: Same as
+ * @p input.
+ * @param[in] output Output tensor info. The output has the same number of dimensions as the
+ * @p input.
+ * @param[in] deconv_info Contains padding and policies to be used in the deconvolution; this is
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to bottom edge of the output.
+ * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
+ * specifies if the weights tensor has been reshaped with @ref CLWeightsReshapeKernel.
*
- * @param[in] input Input tensor info. 3 lower dimensions represent a single input,
- * and an optional 4th dimension for batch of inputs.
- * Data types supported: QASYMM8/F16/F32.
- * @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM].
- * Data type supported: Same as @p input.
- * @param[in] bias (Optional) The biases have one dimension. Data type supported:
- * Same as @p input.
- * @param[in] output Output tensor info. The output has the same number of dimensions
- * as the @p input.
- * @param[in] info Contains padding and policies to be used in the
- * transpose convolution, this is decribed in @ref PadStrideInfo.
- * @param[in] innvalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
- * @param[in] weights_info (Optional) Weights information needed for @ref CLConvolutionLayer,
- * specifies if the weights tensor has been reshaped with @ref
- * CLWeightsReshapeKernel.
* @return a status
*/
static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
- unsigned int innvalid_right, unsigned int invalid_bottom,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
const WeightsInfo &weights_info = WeightsInfo());
+ static DeconvolutionMethod
+ get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info);
// Inherited methods overridden:
void run() override;
void prepare() override;
private:
- MemoryGroup _memory_group;
- CLTransposeConvLayerUpsample _scale_f;
- CLConvolutionLayer _conv_f;
- CPPFlipWeightsKernel _flip_weights;
- CLTensor _scaled_output;
- ICLTensor *_original_weights;
- CLTensor _weights_flipped;
- bool _is_prepared;
+ std::shared_ptr<IMemoryManager> _memory_manager;
+ std::unique_ptr<IFunction> _function;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYER_H__ */
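
After this change CLTransposeConvLayer is a thin dispatcher that selects CLDirectTransposeConvLayer or CLGEMMDeconvolutionLayer internally (via get_deconvolution_method), so call sites only see configure/run. A minimal sketch against the new signature; the PadStrideInfo values and the zero invalid_right/invalid_bottom are illustrative assumptions.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"

using namespace arm_compute;

// Sketch: tensors assumed initialised/allocated; weights laid out [width, height, IFM, OFM].
void transpose_conv(CLTensor &input, CLTensor &weights, CLTensor &bias, CLTensor &output)
{
  const PadStrideInfo deconv_info(2, 2, 0, 0); // stride 2, no padding (illustrative)
  CLTransposeConvLayer deconv;
  deconv.configure(&input, &weights, &bias, &output, deconv_info,
                   0 /* invalid_right */, 0 /* invalid_bottom */);
  deconv.prepare(); // optional: one-time weight preparation before the first run
  deconv.run();
}
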
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h b/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
deleted file mode 100644
index 7570fe76d..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
-#define __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-
-namespace arm_compute
-{
-class ICLTensor;
-
-/** Basic function to run @ref CLTransposeConvLayerUpsampleKernel */
-class CLTransposeConvLayerUpsample : public IFunction
-{
-public:
- /** Default constructor */
- CLTransposeConvLayerUpsample();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayerUpsample(const CLTransposeConvLayerUpsample &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- CLTransposeConvLayerUpsample &operator=(const CLTransposeConvLayerUpsample &) = delete;
- /** Allow instances of this class to be moved */
- CLTransposeConvLayerUpsample(CLTransposeConvLayerUpsample &&) = default;
- /** Allow instances of this class to be moved */
- CLTransposeConvLayerUpsample &operator=(CLTransposeConvLayerUpsample &&) = default;
- /** Default destructor */
- virtual ~CLTransposeConvLayerUpsample() = default;
-
- /** Initialize the function's source, destination, interpolation type and border_mode.
- *
- * @param[in, out] input Source tensor. Data type supported: QASYMM8/F16/F32.
- * @param[out] output Destination tensor. Data type supported: same as @p input.
- * @param[in] inner_border The number of zeros added to right and top edges of the input.
- * @param[in] info Contains padding and policies to be used in the deconvolution.
- */
- void configure(ICLTensor *input, ICLTensor *output, const BorderSize &inner_border,
- const PadStrideInfo &info);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * CLTransposeConvLayerUpsample
- *
- * @param[in] input Source tensor info. Data type supported: QASYMM8/F16/F32.
- * @param[in] output Destination tensor info. Data type supported: same as @p input.
- * @param[in] inner_border The number of zeros added to right and top edges of the input.
- * @param[in] info Contains padding and policies to be used in the deconvolution.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- const BorderSize &inner_border, const PadStrideInfo &info);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- CLTransposeConvLayerUpsampleKernel _upsample;
- ICLTensor *_output;
-};
-}
-#endif /* __ARM_COMPUTE_CLTRANSPOSECONVLAYERUPSAMPLE_H__ */
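
For the removed upsample stage, a sketch of how it was configured; the BorderSize and stride values are assumptions, and the comment paraphrases the documentation above.

#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"

using namespace arm_compute;

// Sketch: inserts zeros between input elements according to the stride info.
void upsample(CLTensor &input, CLTensor &output)
{
  const BorderSize inner_border(0);       // no extra zeros on the right/top edges
  const PadStrideInfo info(2, 2, 0, 0);   // illustrative stride/padding
  CLTransposeConvLayerUpsample up;
  up.configure(&input, &output, inner_border, info);
  up.run();
}
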
diff --git a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h b/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h
deleted file mode 100644
index 666afef4b..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/CPP/functions/CPPUpsampleEx.h
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_CPPUPSAMPLE_EX_H__
-#define __ARM_COMPUTE_CPPUPSAMPLE_EX_H__
-
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref CPPUpsample */
-class CPPUpsampleEx : public ICPPSimpleFunction
-{
-public:
- /** Configure the upsample CPP kernel
- *
- * @param[in] input The input tensor to upsample. Data types supported: F32/F16/QASYMM8
- * @param[out] output The output tensor. Data types supported: Same as @p input
- * @param[in] info Padding information
- */
- void configure(const ITensor *input, ITensor *output, const PadStrideInfo &info);
-};
-}
-#endif /* __ARM_COMPUTE_CPPUPSAMPLE_EX_H__ */
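
The CPU counterpart, removed in the same commit; a sketch with illustrative stride values.

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Sketch: CPU-side upsample; input/output tensors assumed initialised and allocated.
void upsample_cpu(Tensor &input, Tensor &output)
{
  const PadStrideInfo info(2, 2, 0, 0); // illustrative stride/padding
  CPPUpsampleEx up;
  up.configure(&input, &output, info);
  up.run();
}
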
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
index 49504fde3..3fad230f1 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/NEFunctionsEx.h
@@ -18,20 +18,13 @@
#include <arm_compute/runtime/NEON/functions/NEActivationLayerEx.h>
#include <arm_compute/runtime/NEON/functions/NEBinaryLogicalOperation.h>
-#include <arm_compute/runtime/NEON/functions/NECast.h>
-#include <arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h>
#include <arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h>
#include <arm_compute/runtime/NEON/functions/NEFullyConnectedReshapingLayer.h>
#include <arm_compute/runtime/NEON/functions/NEGatherEx.h>
#include <arm_compute/runtime/NEON/functions/NEHashtableLookup.h>
#include <arm_compute/runtime/NEON/functions/NEInstanceNormalizationLayerEx.h>
-#include <arm_compute/runtime/NEON/functions/NEPReLU.h>
-#include <arm_compute/runtime/NEON/functions/NEReduceMeanEx.h>
#include <arm_compute/runtime/NEON/functions/NEReduceSum.h>
-#include <arm_compute/runtime/NEON/functions/NERNNLayerEx.h>
#include <arm_compute/runtime/NEON/functions/NEReduceOperation.h>
-#include <arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h>
-#include <arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h>
#include <arm_compute/runtime/NEON/functions/NETransposeConvLayer.h>
#endif // __ARM_COMPUTE_NEFUNCTIONSEX_H__
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
deleted file mode 100644
index f0f0d8114..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NECast.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NECAST_H__
-#define __ARM_COMPUTE_NECAST_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/TypesEx.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to run @ref NECastKernel that converts an input tensor to the other types */
-class NECast : public INESimpleFunctionNoBorder
-{
-public:
- /** Configure the kernel.
- *
- * @param[in] input Source tensor. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[out] output Destination tensor with the same dimensions as the input. Data type supported:
- * U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] input_subtype Sub data type of input.
- */
- void configure(const ITensor *input, ITensor *output,
- SubDataType input_subtype = SubDataType::NONE);
- /** Static function to check if given info will lead to a valid configuration of @ref NECast
- *
- * @param[in] input Input tensor info. Data types supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] output Output tensor info. Data type supported: U8/S8/QASYMM8/U32/S32/F32.
- * @param[in] input_subtype Sub data type of input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype = SubDataType::NONE);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NECAST_H__ */
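
A sketch of the removed NECast showing the usual validate-then-configure pattern; the concrete data types and the error-handling style are assumptions on top of the declaration above.

#include "arm_compute/runtime/NEON/functions/NECast.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Sketch: cast between the types listed above, e.g. U8 -> F32.
Status cast_tensor(Tensor &input, Tensor &output)
{
  const Status status = NECast::validate(input.info(), output.info(), SubDataType::NONE);
  if (status.error_code() != ErrorCode::OK)
    return status; // shapes or data types rejected

  NECast cast;
  cast.configure(&input, &output, SubDataType::NONE);
  cast.run();
  return status;
}
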
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
deleted file mode 100644
index 005d85add..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
-#define __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEDepthToSpaceLayerKernelEx. */
-class NEDepthToSpaceLayerEx : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEDepthToSpaceLayerEx.
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEDEPTHTOSPACELAYEREX_H__ */
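
The removed NEDepthToSpaceLayerEx in sketch form; block_shape = 2 is illustrative.

#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Sketch: rank-4 input assumed, rearranging depth into 2x2 spatial blocks.
void depth_to_space(Tensor &input, Tensor &output)
{
  NEDepthToSpaceLayerEx d2s;
  d2s.configure(&input, &output, 2); // block_shape = 2, illustrative
  d2s.run();
}
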
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
deleted file mode 100644
index 27a38e982..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
-#define __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to compute the elementwise negative of an input tensor. */
-class NENegLayer : public INESimpleFunction
-{
-public:
- /** Initialize the function
- *
- * @param[in] input Input tensor. Data types supported: F16/F32/S32.
- * @param[out] output Output tensor. Data types supported: same as @p input.
- */
- void configure(const ITensor *input, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NENegLayer
- *
- * @param[in] input Input tensor info. Data types supported: F16/F32/S32.
- * @param[in] output Output tensor info. Data types supported: Same as @p input.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEELEMENTWISEUNARYLAYEREX_H__ */
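
NENegLayer, the only class in the removed header, reduced to its two calls; tensors are assumed initialised and allocated.

#include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayerEx.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Sketch: elementwise negation, output[i] = -input[i].
void negate(Tensor &input, Tensor &output)
{
  NENegLayer neg;
  neg.configure(&input, &output);
  neg.run();
}
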
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
index 39c57eb70..56548a479 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEFullyConnectedHybridLayer.h
@@ -46,7 +46,7 @@
#include "arm_compute/core/NEON/kernels/NEMuliplyScaleFactorKernel.h"
#include "arm_compute/core/NEON/kernels/NETransposeKernel.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
#include "arm_compute/runtime/Tensor.h"
@@ -164,7 +164,7 @@ private:
MemoryGroup _memory_group;
NEFullyConnectedHybridLayerReshapeWeights _reshape_weights_function;
NEQuantizationSymmetricKernel _quant_input_kernel;
- NEGEMMLowpMatrixMultiplyCoreEx _mm_gemmlowp;
+ NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
NEMultiplyScaleFactorKernel _multiply_scale_kernel;
NEGEMMMatrixAccumulateBiasesKernel _accumulate_biases_kernel;
Tensor _reshape_weights_output;
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
deleted file mode 100644
index d844513c9..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h
+++ /dev/null
@@ -1,170 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
-#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__
-
-#include "arm_compute/core/NEON/INEKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-// #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to execute GEMMLowpMatrixMultiplyCore on NEON. This function calls the following
- * NEON kernels if the DOT product instruction is not available:
- *
- * -# @ref NEGEMMInterleave4x4Kernel
- * -# @ref NEGEMMTranspose1xWKernel
- * -# @ref NEGEMMLowpMatrixMultiplyKernel
- * -# @ref NEGEMMLowpOffsetContributionKernel
- * -# @ref NEActivationLayer
- *
- * otherwise if the DOT product instruction is available:
- *
- * -# @ref NEGEMMLowpOffsetContributionKernel
- *
-*/
-class NEGEMMLowpMatrixMultiplyCoreEx : public IFunction
-{
-public:
- /** Constructor */
- NEGEMMLowpMatrixMultiplyCoreEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpMatrixMultiplyCoreEx(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
- /** Default move constructor */
- NEGEMMLowpMatrixMultiplyCoreEx(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMLowpMatrixMultiplyCoreEx &operator=(const NEGEMMLowpMatrixMultiplyCoreEx &) = delete;
- /** Default move assignment operator */
- NEGEMMLowpMatrixMultiplyCoreEx &operator=(NEGEMMLowpMatrixMultiplyCoreEx &&) = default;
- /** Initialise the kernel's inputs, output
- *
- * @note GEMM_LOWP: low precision GEMM kernel
- * This kernel performs the following computations:
- *
- * -# Convert a values from QASYMM8 to int32 and add a_offset to each of them.
- * -# Convert b values from QASYMM8 to int32 and add b_offset to each of them.
- * -# Compute the matrix product of the resulting a * b in int32.
- *
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
- * QASYMM8/QASYMM8_SIGNED otherwise
- *
- * @param[in] a First input tensor (Matrix A). Data type supported:
- * QASYMM8/QASYMM8_SIGNED.
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported:
- * S32
- * @param[out] output Output tensor. Data type supported:
- * S32/QASYMM8/QASYMM8_SIGNED
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
- * and if the reshape of matrix B should be executed only for the first run
- */
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output,
- const GEMMInfo &gemm_info = GEMMInfo());
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEGEMMLowpMatrixMultiplyCoreEx
- *
- * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is
- * QASYMM8/QASYMM8_SIGNED otherwise
- *
- * @param[in] a First input tensor info (Matrix A). Data type supported:
- * QASYMM8/QASYMM8_SIGNED.
- * @param[in] b Second input tensor info (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor info (Matrix C). It can be a nullptr. Data type
- * supported: S32
- * @param[in] output Output tensor info. Data type supported:
- * S32/QASYMM8/QASYMM8_SIGNED
- * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped
- * and if the reshape of matrix B should be executed only for the first run
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
- const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
-
- // Inherited methods overridden
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- NEGEMMAssemblyDispatch _asm_glue;
- std::unique_ptr<INEKernel> _mm_kernel;
- std::unique_ptr<INEKernel> _mtx_a_reshape_kernel;
- std::unique_ptr<INEKernel> _mtx_b_reshape_kernel;
- NEGEMMLowpMatrixAReductionKernel _mtx_a_reduction_kernel;
- NEGEMMLowpMatrixBReductionKernel _mtx_b_reduction_kernel;
- NEGEMMLowpOffsetContributionKernel _offset_contribution_kernel;
- NEGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
-
- Tensor _vector_sum_col;
- Tensor _vector_sum_row;
- Tensor _tmp_a;
- Tensor _tmp_b;
- Tensor _mm_result_s32;
- Tensor _signed_a;
- Tensor _signed_output;
- const ITensor *_original_b;
- int32_t _a_offset;
- int32_t _b_offset;
-
- bool _run_vector_matrix_multiplication;
- bool _assembly_path;
- bool _fused_assembly_path;
- bool _reshape_b_only_on_first_run;
- bool _is_prepared;
- bool _fuse_output_stage;
- bool _flip_signedness;
-};
-} // namespace arm_compute
-#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCOREEX_H__ */
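
A sketch of the removed NEGEMMLowpMatrixMultiplyCoreEx following its own documentation: QASYMM8 inputs, S32 output under the default GEMMInfo (output stage NONE), and a nullptr Matrix C. Matrix shapes and allocation are assumed to be handled by the caller.

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Sketch: a and b are QASYMM8 matrices; with the default GEMMInfo the
// documented output type is S32. Matrix C may be nullptr, per the doc above.
void gemmlowp(Tensor &a, Tensor &b, Tensor &output)
{
  NEGEMMLowpMatrixMultiplyCoreEx mm;
  mm.configure(&a, &b, nullptr, &output, GEMMInfo());
  mm.prepare(); // reshapes matrix B once if the GEMMInfo requests it
  mm.run();
}
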
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
deleted file mode 100644
index ca8413352..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEPReLU.h
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEPRELU_H__
-#define __ARM_COMPUTE_NEPRELU_H__
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to run @ref NEPReLUKernel */
-class NEPReLU : public INESimpleFunctionNoBorder
-{
-public:
- /** Initialise the kernel's inputs and output
- *
- * @param[in] input Input tensor. Data types supported: QASYMM8/F32.
- * @param[in] alpha Alpha (slope) tensor. Data types supported: Same as @p input.
- * @param[out] output Output tensor. Data types supported: Same as @p input.
- */
- void configure(const ITensor *input, const ITensor *alpha, ITensor *output);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEPRELU_H__ */
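
Finally, the removed NEPReLU in sketch form; the semantics comment states the standard PReLU definition, which is an assumption beyond the declaration above.

#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

// Sketch: out = input where input >= 0, alpha * input otherwise (standard PReLU).
void prelu(Tensor &input, Tensor &alpha, Tensor &output)
{
  NEPReLU op;
  op.configure(&input, &alpha, &output);
  op.run();
}
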
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
deleted file mode 100644
index 8a7b17946..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NERNNLayerEx.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NERNNLAYER_EX_H__
-#define __ARM_COMPUTE_NERNNLAYER_EX_H__
-
-#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NECopyKernel.h"
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-
-/** Basic function to run @ref NERNNLayerEx */
-class NERNNLayerEx : public IFunction
-{
-public:
- /** Default constructor */
- NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERNNLayerEx(const NERNNLayerEx &) = delete;
- /** Default move constructor */
- NERNNLayerEx(NERNNLayerEx &&) = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NERNNLayerEx &operator=(const NERNNLayerEx &) = delete;
- /** Default move assignment operator */
- NERNNLayerEx &operator=(NERNNLayerEx &&) = default;
- /** Initialize the function
- *
- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
- * types supported: F16/F32
- * @param[in] weights Weights tensor of shape [input_size, num_units] that
- * multiplies the input. Data types supported: Same as @p input
- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies
- * the current 'state'. Data types supported: Same as @p input
- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same
- * as @p input
- * @param[out] output Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in,out] hidden_state Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] info Activation layer parameter.
- */
- void configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights,
- const ITensor *bias, ITensor *hidden_state, ITensor *output,
- ActivationLayerInfo &info);
- /** Initialize the function
- *
- * @param[in] input Input is a 2-D tensor of shape [input_size, batch_size]. Data
- * types supported: F16/F32
- * @param[in] weights Weights tensor of shape [input_size, num_units] that multiplies
- * the input. Data types supported: Same as @p input
- * @param[in] recurrent_weights Weights tensor of shape [num_units, num_units] that multiplies the
- * current 'state'. Data types supported: Same as @p input
- * @param[in] bias Bias vector of shape [num_units]. Data types supported: Same as @p
- * input
- * @param[in] output Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] hidden_state Output tensor of shape [num_units, batch_size]. Data types
- * supported: Same as @p input
- * @param[in] info Activation layer parameter.
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
- const ITensorInfo *hidden_state, const ITensorInfo *output,
- const ActivationLayerInfo &info);
-
- // Inherited methods overridden:
- void run() override;
- void prepare() override;
-
-private:
- MemoryGroup _memory_group;
- NEGEMM _gemm_state_f;
- NEArithmeticAdditionKernel _add_kernel;
- NEActivationLayerKernel _activation_kernel;
- NEFullyConnectedLayer _fully_connected_kernel;
- NECopyKernel _copy_kernel;
- Tensor _fully_connected_out;
- Tensor _gemm_output;
- Tensor _add_output;
- bool _is_prepared;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NERNNLAYER_EX_H__ */
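
The removed NERNNLayerEx implemented a single vanilla RNN step: hidden_state = act(input * weights + hidden_state * recurrent_weights + bias). A hedged sketch against the configure() declaration above, assuming tensors of the documented shapes; TANH is chosen only for illustration:

    #include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_rnn_step(Tensor &input, Tensor &weights, Tensor &recurrent_weights,
                      Tensor &bias, Tensor &hidden_state, Tensor &output)
    {
        ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
        NERNNLayerEx rnn;
        rnn.configure(&input, &weights, &recurrent_weights, &bias,
                      &hidden_state, &output, act);
        rnn.run(); // hidden_state is updated in place; output holds the new state too
    }
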
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
deleted file mode 100644
index 03ac45798..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NEReduceMeanEx.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
-#define __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to perform reduce operation */
-class NEReduceMeanEx : public IFunction
-{
-public:
- /** Constructor */
- NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Configure kernel
- *
- * @note Supported tensor rank: up to 4
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
- * @param[in] reduction_axis Reduction axis vector.
- * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
- * @param[out] output Destination tensor. Data type supported: Same as @p input
- */
- void configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
- ITensor *output);
-
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NEReduceMeanEx
- *
- * @param[in] input Source tensor. Data type supported: QASYMM8/F16/F32
- * @param[in] reduction_axis Reduction axis vector.
- * @param[in] keep_dims If positive, retains reduced dimensions with length 1.
- * @param[in] output Destination tensor. Data type supported: Same as @p input
- *
- * @return A status
- */
- static Status validate(const ITensorInfo *input, const Coordinates &reduction_axis,
- bool keep_dims, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- std::unique_ptr<NEReductionOperation[]> _reduction_kernels{nullptr};
- std::unique_ptr<Tensor[]> _reduced_outs{nullptr};
- NEReshapeLayer _reshape;
- unsigned int _reduction_ops;
- bool _keep_dims;
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NEON_REDUCE_MEAN_EX_H__ */
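
The removed NEReduceMeanEx averaged a tensor over a set of axes, optionally keeping the reduced dimensions at length 1. A minimal sketch, assuming a rank-4 input reduced over axes 1 and 2 (e.g. H and W of an NHWC tensor); the names are illustrative:

    #include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_reduce_mean(Tensor &src, Tensor &dst)
    {
        const Coordinates reduction_axes(1, 2); // axes to average over
        NEReduceMeanEx reduce_mean;
        reduce_mean.configure(&src, reduction_axes, /*keep_dims=*/true, &dst);
        reduce_mean.run();
    }
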
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
deleted file mode 100644
index 3b695fbc0..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
-#define __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__
-
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/core/NEON/kernels/NEMemsetKernel.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
-#include "arm_compute/core/Types.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Basic function to spatial divide a tensor. This function calls the following NEON
- * kernels/functions:
- *
- * -# @ref NEMemsetKernel
- * -# @ref NESpaceToBatchLayerKernel
- */
-class NESpaceToBatchLayerEx : public IFunction
-{
-public:
- /** Default constructor */
- NESpaceToBatchLayerEx();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToBatchLayerEx(const NESpaceToBatchLayerEx &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NESpaceToBatchLayerEx &operator=(const NESpaceToBatchLayerEx &) = delete;
- /** Allow instances of this class to be moved */
- NESpaceToBatchLayerEx(NESpaceToBatchLayerEx &&) = default;
- /** Allow instances of this class to be moved */
- NESpaceToBatchLayerEx &operator=(NESpaceToBatchLayerEx &&) = default;
- /** Default destructor */
- virtual ~NESpaceToBatchLayerEx() = default;
- /** Set the input and output tensors.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] block_shape 1-D tensor with shape [M]. Data types supported: S32
- * @param[in] paddings 2-D tensor with shape [2, M]. Data types supported: S32
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings,
- ITensor *output);
- /** Set the input and output tensors. (Static block shape and paddings)
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[out] output Tensor output. Data types supported: same as @p input
- */
- void configure(const ITensor *input, const int block_shape_x, const int block_shape_y,
- const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NESpaceToBatchLayerEx
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] block_shape block shape tensor info with shape [M]. Data types supported: S32
- * @param[in] paddings paddings tensor info with shape [2, M]. Data types supported: S32
- * @param[in] output Tensor output info. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape,
- const ITensorInfo *paddings, const ITensorInfo *output);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NESpaceToBatchLayerEx (Static block shape and paddings)
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] block_shape_x Block shape x value.
- * @param[in] block_shape_y Block shape y value.
- * @param[in] padding_left The left padding of the output tensor.
- * @param[in] padding_right The right padding of the output tensor.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y,
- const Size2D &padding_left, const Size2D &padding_right,
- const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- NESpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
- NEMemsetKernel _memset_kernel; /**< Memset kernel to run */
- bool _has_padding; /**< Flag to check if the output has padding */
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NESPACETOBATCHLAYEREX_H__ */
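
The removed NESpaceToBatchLayerEx moved spatial blocks into the batch dimension: with block shape (bx, by) and no padding, an [N, H, W, C] input becomes [N*bx*by, H/by, W/bx, C]. A sketch of the static block-shape overload, assuming pre-allocated tensors and illustrative padding values:

    #include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_space_to_batch(Tensor &src, Tensor &dst)
    {
        NESpaceToBatchLayerEx s2b;
        // 2x2 blocks and no extra padding: the batch dimension grows by a factor of 4.
        s2b.configure(&src, /*block_shape_x=*/2, /*block_shape_y=*/2,
                      Size2D(0, 0), Size2D(0, 0), &dst);
        s2b.run();
    }
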
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
deleted file mode 100644
index 9f32616f3..000000000
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
-#define __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__
-
-#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** This function calls the following NEON kernels/functions:
- *
- * -# @ref NESpaceToDepthLayerKernelEx
- */
-class NESpaceToDepthLayerEx : public INESimpleFunctionNoBorder
-{
-public:
- /** Set the input and output tensors.
- *
- * @param[in] input Tensor input. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[out] output Tensor output. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- */
- void configure(const ITensor *input, ITensor *output, int32_t block_shape);
- /** Static function to check if given info will lead to a valid configuration of @ref
- * NESpaceToDepthLayerEx (Static block shape and paddings)
- *
- * @param[in] input Tensor input info. Supported tensor rank: 4. Data types supported:
- * U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32.
- * @param[in] output Tensor output info. Data types supported: same as @p input
- * @param[in] block_shape Block shape value
- *
- * @return a status
- */
- static Status validate(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape);
-};
-} // namespace arm_compute
-#endif /* __ARM_COMPUTE_NESPACETODEPTHLAYEREX_H__ */
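
The removed NESpaceToDepthLayerEx rearranged spatial blocks into channels: with block_shape = 2, an [N, H, W, C] input becomes [N, H/2, W/2, C*4]. A minimal sketch under that assumption:

    #include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_space_to_depth(Tensor &src, Tensor &dst)
    {
        NESpaceToDepthLayerEx s2d;
        s2d.configure(&src, &dst, /*block_shape=*/2);
        s2d.run();
    }
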
diff --git a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
index 408d150d0..24ff5dac9 100644
--- a/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
+++ b/compute/ARMComputeEx/arm_compute/runtime/NEON/functions/NETransposeConvLayer.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,16 +37,14 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
#define __ARM_COMPUTE_NETRANSPOSECONVLAYER_H__
-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
+#include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/NEON/functions/NEReverse.h"
-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
@@ -59,8 +57,8 @@ namespace arm_compute
{
/** Function to run the deconvolution layer.
*
- * Transpose convolution Layer is the backward pass of Convolution Layer. First we transform the
- * input depending on the stride and pad info and then perfrom a 1x1
+ * Deconvolution Layer is the backward pass of Convolution Layer. First we transform the input
+ * depending on the stride and pad info and then perform a 1x1
* convolution pass. Input stride defines how many zeroes we should put between each element of the
 * input, pad is the amount of padding and finally a is a user
* specified value where a < stride - 1 that increases the padding top and right of the input image.
@@ -81,21 +79,22 @@ namespace arm_compute
* kernel_x and kernel_y are the convolution sizes in x and y.
* stride_x and stride_y is the input stride of the first and second dimension.
*
- * The weights used by Transpose convolution are supposed to be the same as the ones used for
- * Convolution. Therefore, it will be necessary to use the weights in the
- * reverse order to perform an actual convolution. This is achieved by using the @ref
- * CPPFlipWeightsKernel.
+ * The weights used by Deconvolution are supposed to be the same as the ones used for Convolution.
+ * Therefore, it will be necessary to use the weights in the
+ * reverse order to perform an actual convolution. This is achieved by using @ref NEReverse.
*
* This function calls the following NEON kernels/functions:
*
 * -# @ref CPPUpsample
* -# @ref NEConvolutionLayer
+ * -# @ref NEReverse
*
*/
class NETransposeConvLayer : public IFunction
{
public:
- /** Default constructor */
+ /** Constructor */
NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
/** Prevent instances of this class from being copied (As this class contains pointers) */
@@ -112,37 +111,38 @@ public:
/** Set the input, weights, biases and output tensors.
*
* @param[in,out] input Input tensor. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
* @param[in] weights The 4d weights with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
+ * supported: Same as @p input.
* @param[in] bias Optional, ignored if NULL. The biases have one dimension. Data type
- * supported: Data types supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16
+ * for F16 input.
* @param[out] output Output tensor. The output has the same number of dimensions as the @p
- * input.
+ * input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is
- * decribed in @ref PadStrideInfo.
- * @param[in] invalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
*
*/
void configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output,
const PadStrideInfo &info, unsigned int invalid_right,
unsigned int invalid_bottom);
/** Static function to check if given info will lead to a valid configuration of @ref
- * NETransposeConvLayer
+ * NETransposeConvLayer
*
* @param[in] input Input tensor info. 3 lower dimensions represent a single input, and an
- * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8.
+ * optional 4th dimension for batch of inputs. Data types supported: F32/F16/QASYMM8/QASYMM8_SIGNED.
* @param[in] weights The 4d weights info with dimensions [width, height, IFM, OFM]. Data type
- * supported: Same as @p input.
+ * supported: Same as @p input.
 * @param[in] bias (Optional) The biases have one dimension. Data types
- * supported: S32 for QASYMM8 input, F32 for F32 input, F16 for F16 input.
+ * supported: S32 for QASYMM8 and QASYMM8_SIGNED input, F32 for F32 input, F16 for F16 input.
* @param[in] output Output tensor info. The output has the same number of dimensions as the @p
- * input.
+ * input.
* @param[in] info Contains padding and policies to be used in the deconvolution, this is
- * decribed in @ref PadStrideInfo.
- * @param[in] innvalid_right The number of zeros added to right edge of the output.
- * @param[in] invalid_bottom The number of zeros added to top edge of the output.
+ * described in @ref PadStrideInfo.
+ * @param[in] invalid_right The number of zeros added to the right edge of the output.
+ * @param[in] invalid_bottom The number of zeros added to the bottom edge of the output.
*
* @return a status
*/
@@ -158,17 +158,11 @@ public:
private:
MemoryGroup _memory_group;
NEConvolutionLayer _conv_f;
- CPPUpsampleEx _upsample_f;
- CPPFlipWeightsKernel _flip_weights;
- NEPermute _permute_input;
- NEPermute _permute_weights;
- NEPermute _permute_output;
+ CPPUpsample _upsample_f;
+ NEReverse _flip_weights;
Tensor _scaled_output;
Tensor _weights_flipped;
- Tensor _permuted_input;
- Tensor _permuted_weights;
- Tensor _permuted_output;
- bool _is_nchw;
+ Tensor _flip_axis;
const ITensor *_original_weights;
ITensor *_input;
PadStrideInfo _info;
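
After this change NETransposeConvLayer flips the weights with NEReverse and upsamples with the stock CPPUpsample instead of the removed CPPUpsampleEx/NEPermute path. A hedged usage sketch against the configure() documented above; the stride and padding values are illustrative:

    #include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void run_transpose_conv(Tensor &src, Tensor &weights, Tensor &bias, Tensor &dst)
    {
        const PadStrideInfo info(2, 2, 0, 0); // stride 2, no explicit padding
        NETransposeConvLayer deconv;
        deconv.configure(&src, &weights, &bias, &dst, info,
                         /*invalid_right=*/0, /*invalid_bottom=*/0);
        deconv.run();
    }
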
diff --git a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
index 7b6b9742b..ba42a2456 100644
--- a/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
+++ b/compute/ARMComputeEx/src/core/CL/CLKernelLibrary.cpp
@@ -55,16 +55,7 @@ using namespace arm_compute;
const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map = {
// ARMComputeEx kernels
- {"arg_op", "arg_operation.cl"},
- {"arithmetic_add_qasymm8", "arithmetic_op_quantized.cl"},
{"binary_logical_op", "binary_logical_op.cl"},
- {"cast", "cast.cl"},
- {"cast_qasymm_in", "cast.cl"},
- {"cast_qasymm_out", "cast.cl"},
- {"comparison_op", "comparison_op.cl"},
- {"comparison_op_qasymm8", "comparison_op_quantized.cl"},
- {"depth_to_space_nchw", "depth_to_space.cl"},
- {"depth_to_space_nhwc", "depth_to_space.cl"},
{"embedding_lookup", "embedding_lookup.cl"},
{"gather_ex", "gather_ex.cl"},
{"gather_ex_1d", "gather_ex.cl"},
@@ -74,10 +65,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"instance_normalization_ex", "instance_normalization_ex.cl"},
{"multiply_scale_factor", "multiply_scale_factor.cl"},
{"neg_tensor", "neg_tensor.cl"},
- {"permute_generic", "permute_ex.cl"},
- {"pixelwise_mul_qasymm8", "pixelwise_mul_quantized.cl"},
- {"prelu", "prelu.cl"},
- {"prelu_qasymm8", "prelu_quantized.cl"},
{"quantization_symm8", "quantization_symm8.cl"},
{"reduce_min_max", "reduce_operation.cl"},
{"reduce_sum_mean", "reduce_operation.cl"},
@@ -91,29 +78,15 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_kernel_program_map
{"radixsort_reorder", "topkv2_radixsort.cl"},
{"topkv2_quicksort", "topkv2_quicksort.cl"},
{"scale_factor_symm8", "scale_factor.cl"},
- {"space_to_depth_nchw", "space_to_depth.cl"},
- {"space_to_depth_nhwc", "space_to_depth.cl"},
};
const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map = {
#ifdef EMBEDDED_KERNELS
{
- "arg_operation.cl",
-#include "./cl_kernels/arg_operation.clembed"
- },
- {
- "cast.cl",
-#include "./cl_kernels/cast.clembed"
- },
- {
"embedding_lookup.cl",
#include "./cl_kernels/embedding_lookup.clembed"
},
{
- "depth_to_space.cl",
-#include "./cl_kernels/depth_to_space.clembed"
- },
- {
"gather_ex.cl",
#include "./cl_kernels/gather_ex.clembed"
},
@@ -150,14 +123,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/neg_tensor.clembed"
},
{
- "prelu.cl",
-#include "./cl_kernels/prelu.clembed"
- },
- {
- "prelu_quantized.cl",
-#include "./cl_kernels/prelu_quantized.clembed"
- },
- {
"quantization_symm8.cl",
#include "./cl_kernels/quantization_symm8.clembed"
},
@@ -170,10 +135,6 @@ const std::map<std::string, std::string> CLKernelLibraryEx::_program_source_map
#include "./cl_kernels/scale_factor.clembed"
},
{
- "space_to_depth.cl",
-#include "./cl_kernels/space_to_depth.clembed"
- },
- {
"topkv2.cl",
#include "./cl_kernels/topkv2.clembed"
},
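
The two maps above resolve a kernel name to its .cl program file and then to the embedded program source. A simplified sketch of that two-step lookup, illustrative only; the real resolution logic lives inside CLKernelLibraryEx:

    #include <map>
    #include <stdexcept>
    #include <string>

    std::string lookup_program_source(const std::map<std::string, std::string> &kernel_to_program,
                                      const std::map<std::string, std::string> &program_to_source,
                                      const std::string &kernel_name)
    {
        // First map: kernel name -> program file, e.g. "gather_ex" -> "gather_ex.cl".
        const auto program = kernel_to_program.find(kernel_name);
        if (program == kernel_to_program.end())
            throw std::runtime_error("Unknown kernel: " + kernel_name);
        // Second map: program file -> embedded source (the .clembed includes above).
        const auto source = program_to_source.find(program->second);
        if (source == program_to_source.end())
            throw std::runtime_error("Unknown program: " + program->second);
        return source->second;
    }
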
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
deleted file mode 100644
index 03717cfe9..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arg_operation.cl
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
-/** Perform arg_max/arg_min
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type.
- * e.g. -DDATA_TYPE=short
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention Operation type(code) specifying which operation to perform should be passed as
- * preprocessor argument using -DOP_CODE = number. e.g. -DOP_CODE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types:
- * U8/QASYMM8/S8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension
- * (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension
- * (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element
- * in the source image
- * @param[in] input_stride_w Stride of the source tensor in W dimension
- * (in bytes)
- * @param[in] input_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[out] output_ptr Pointer to the destination image.
- * Supported data types: U32
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the source tensor in W dimension
- * (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- * @param[in] axis Axis through which reduction occurs
- * @param[in] dim Dimension across the axis to be reduced.
- */
-
-__kernel void arg_op(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output), const int axis,
- const int dim)
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH_OUT);
-
- int indices[4] = {
- get_global_id(0), get_global_id(1), get_global_id(2) % DEPTH_OUT,
- get_global_id(2) / DEPTH_OUT,
- };
-
- DATA_TYPE value =
- *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1], indices[2], indices[3]));
- DATA_TYPE tval = value;
- int idx = 0;
- for (int i = 1; i < dim; ++i)
- {
- indices[axis] = i;
-
-#if OP_CODE == 1 // ArgMax
- value = max(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
- indices[2], indices[3])));
-#elif OP_CODE == 2 // ArgMin
- value = min(value, *((__global DATA_TYPE *)tensor4D_offset(&in, indices[0], indices[1],
- indices[2], indices[3])));
-#else
- return;
-
-#endif
-
- if (tval != value)
- {
- idx = indices[axis];
- tval = value;
- }
- }
-
- *((__global uint *)out.ptr) = idx;
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(OP_CODE)
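
The removed arg_op kernel scanned the reduction axis once, tracking the running maximum (or minimum) and the index where it last changed. A scalar C++ equivalent for one output position, assuming a non-empty vector holding the elements along the reduced axis:

    #include <cstddef>
    #include <vector>

    // ArgMax (OP_CODE == 1): index of the first occurrence of the maximum.
    std::size_t arg_max(const std::vector<float> &values)
    {
        std::size_t idx = 0;
        float best = values[0];
        for (std::size_t i = 1; i < values.size(); ++i)
        {
            if (values[i] > best) // use '<' here for ArgMin (OP_CODE == 2)
            {
                best = values[i];
                idx = i;
            }
        }
        return idx;
    }
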
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
deleted file mode 100644
index f74c1c103..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/arithmetic_op_quantized.cl
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers_asymm.h"
-
-#ifdef SATURATE
-#define ADD(x, y) add_sat((x), (y))
-#define SUB(x, y) sub_sat((x), (y))
-#else /* SATURATE */
-#define ADD(x, y) (x) + (y)
-#define SUB(x, y) (x) - (y)
-#endif /* SATURATE */
-
-/** Performs a pixelwise addition used to quantize down the int32 accumulator values of GEMMLowp to
- * QASYMM8
- *
- * The following computations will be performed:
- *
- * -# Add offset terms to inputs
- -# Get scaled value of two inputs
- * -# Add inputs
- * -# Add offset terms to final result
- * -# Multiply each entry of result by result_mult_int
- * -# Shift the int32 accumulator by result_shift
- * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
- *
- * @attention The inputs and output data types need to be passed at compile time using
- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The number of bits to shift left of input tensors must be passed at compile time using
- * -DLEFT_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of input tensors
- * must be passed at compile time using -DIN1_OFFSET, -RIN1_MULT_INT, -DIN1_SHIFT,
- -DIN2_OFFSET,
- * -RIN2_MULT_INT and -DIN2_SHIFT
- * @attention The offset, scalar scale factor and number of bits to shift right of output tensor
- * must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and
- -DRESULT_SHIFT
- *
- * @attention The input and output data_types need to be passed at compile time using
- * -DDATA_TYPE_IN1, -DDATA_TYPE_IN2 and -DDATA_TYPE_OUT:
- * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=uchar -DDATA_TYPE_OUT=uchar
- * @attention The inputs and output scale information of qasymm8 need to be passed at compile time
- * using -DSCALE_IN1, -DSCALE_IN2 and -DSCALE_OUT:
- * e.g. -DSCALE_IN1=1.f -DSCALE_IN2=1.f -DSCALE_OUT=2.f
- * @attention The inputs and output scale offset need to be passed at compile time using
- * -DOFFSET_IN1, -DOFFSET_IN2 and -DOFFSET_OUT:
- * e.g. -DOFFSET_IN1=0 -DOFFSET_IN2=0 -DOFFSET_OUT=0
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @attention To perform saturating operation -DSATURATE has to be passed to the compiler otherwise
- * wrapping policy will be used.
- *
- * @param[in] in1_ptr Pointer to the source tensor.
- * Supported data types: QASYMM8
- * @param[in] in1_stride_x Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in] in1_step_x in1_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] in1_stride_y Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in] in1_step_y in1_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] in1_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] in1_step_z in1_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- * @param[in] in2_ptr Pointer to the source tensor. Supported data types:
- * QASYMM8
- * @param[in] in2_stride_x Stride of the source tensor in X dimension
- * (in bytes)
- * @param[in] in2_step_x in2_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] in2_stride_y Stride of the source tensor in Y dimension
- * (in bytes)
- * @param[in] in2_step_y in2_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] in2_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] in2_step_z in2_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source
- * tensor
- * @param[out] out_ptr Pointer to the destination tensor.
- * Supported data types: QASYMM8
- * @param[in] out_stride_x Stride of the destination tensor in X dimension
- * (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed
- * per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination tensor in Y dimension
- * (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed
- * per workitem(in bytes)
- * @param[in] out_stride_z Stride of the source tensor in Z dimension
- * (in bytes)
- * @param[in] out_step_z out_stride_z * number of elements along Z processed
- * per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination
- * tensor
- */
-__kernel void arithmetic_add_qasymm8(TENSOR3D_DECLARATION(in1), TENSOR3D_DECLARATION(in2),
- TENSOR3D_DECLARATION(out))
-{
- // Get pixels pointer
- Tensor3D in1 = CONVERT_TO_TENSOR3D_STRUCT(in1);
- Tensor3D in2 = CONVERT_TO_TENSOR3D_STRUCT(in2);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
- // Load data
- VEC_DATA_TYPE(int, 16)
- in1_data = CONVERT(vload16(0, (__global DATA_TYPE_IN1 *)in1.ptr), VEC_DATA_TYPE(int, 16));
- VEC_DATA_TYPE(int, 16)
- in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(int, 16));
-
- // Get scaled value of two inputs
- VEC_DATA_TYPE(int, 16) in1_val = in1_data + (VEC_DATA_TYPE(int, 16))(IN1_OFFSET);
- VEC_DATA_TYPE(int, 16) in2_val = in2_data + (VEC_DATA_TYPE(int, 16))(IN2_OFFSET);
-
- VEC_DATA_TYPE(int, 16)
- left_shift = (VEC_DATA_TYPE(int, 16))1 << (VEC_DATA_TYPE(int, 16))(LEFT_SHIFT);
- VEC_DATA_TYPE(int, 16) shifted_in1_val = in1_val * left_shift;
- VEC_DATA_TYPE(int, 16) shifted_in2_val = in2_val * left_shift;
-
- VEC_DATA_TYPE(int, 16)
- scaled_in1_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in1_val, IN1_MULT_INT, IN1_SHIFT, 16);
- VEC_DATA_TYPE(int, 16)
- scaled_in2_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(shifted_in2_val, IN2_MULT_INT, IN2_SHIFT, 16);
-
- // Add inputs and multiply with a multiplier smaller than 1
- VEC_DATA_TYPE(int, 16) sum_val = scaled_in1_val + scaled_in2_val;
- VEC_DATA_TYPE(int, 16)
- out_val =
- ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(sum_val, RESULT_MULT_INT, RESULT_SHIFT, 16);
- out_val += (VEC_DATA_TYPE(int, 16))(RESULT_OFFSET);
-
- VEC_DATA_TYPE(uchar, 16) res = CONVERT(out_val, VEC_DATA_TYPE(uchar, 16));
-
- // TODO: Apply min-max BOUND to support fuse with relu.
- /*
- #if defined(MIN_BOUND)
- res = max(res, (uchar16)MIN_BOUND);
- #endif // defined(MIN_BOUND)
- #if defined(MAX_BOUND)
- res = min(res, (uchar16)MAX_BOUND);
- #endif // defined(MAX_BOUND)
- */
-
- // Store result
- VSTORE(16)(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr);
-}
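
The removed arithmetic_add_qasymm8 kernel applied the input offsets, rescaled both operands to a common scale, summed them, and requantized into the output scale and offset. A scalar sketch of the same arithmetic using plain floats in place of the fixed-point ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE helpers:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // a and b are QASYMM8 values; the offsets and scales mirror the kernel's
    // compile-time IN1_*/IN2_*/RESULT_* parameters, collapsed into floats.
    uint8_t qasymm8_add(uint8_t a, uint8_t b,
                        int in1_offset, float in1_scale,
                        int in2_offset, float in2_scale,
                        int out_offset, float out_scale)
    {
        // Apply the input offsets and bring both operands to the real domain.
        const float real_a = (static_cast<int>(a) + in1_offset) * in1_scale;
        const float real_b = (static_cast<int>(b) + in2_offset) * in2_scale;
        // Add, requantize with the output scale, then apply the output offset.
        const int out = static_cast<int>(std::lround((real_a + real_b) / out_scale)) + out_offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, out)));
    }
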
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
deleted file mode 100644
index 4147a0017..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/cast.cl
+++ /dev/null
@@ -1,233 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef SCALE
-#define SCALE 1.0f
-#endif
-#ifndef OFFSET
-#define OFFSET 0
-#endif
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
-/** Perform a cast operation on an input tensor.
- *
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @attention -DBOOL_INPUT : Whether type of input is bool.
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)),
- 0, (__global DATA_TYPE_OUT *)output.ptr);
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)
- res = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr),
- VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-#if defined(BOOL_INPUT)
- VEC_DATA_TYPE(char, VEC_SIZE) tmp = CONVERT(res, VEC_DATA_TYPE(char, VEC_SIZE));
- VEC_DATA_TYPE(char, VEC_SIZE) mask = (VEC_DATA_TYPE(char, VEC_SIZE))(1);
- res = CONVERT(tmp & mask, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE));
-#endif // defined(BOOL_INPUT)
-
- VSTORE(VEC_SIZE)(res, 0, (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on an QASYMM8 input tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of input should be given as a preprocessor argument using
- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast_qasymm_in(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
- VEC_DATA_TYPE(int, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(int, VEC_SIZE)) - offset;
- VEC_DATA_TYPE(float, VEC_SIZE) out_data = CONVERT(tmp, VEC_DATA_TYPE(float, VEC_SIZE)) * scale;
-
- VSTORE(VEC_SIZE)
- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
- (__global DATA_TYPE_OUT *)output.ptr);
-}
-
-/** Perform a cast operation on an QASYMM8 output tensor.
- * @attention Data types of both input and output can be passed using the -DDATA_TYPE_IN and
- * -DDATA_TYPE_OUT compile flag, e.g. -DDATA_TYPE_IN=float, -DDATA_TYPE_OUT=int
- * @attention Offset and Scale of output should be given as a preprocessor argument using
- * -DOFFSET=int, -DSCALE=float. e.g. -DOFFSET=1, -DSCALE=0.5
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: U8
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void cast_qasymm_out(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
- in_data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
- VEC_DATA_TYPE(int, VEC_SIZE) offset = (VEC_DATA_TYPE(int, VEC_SIZE))(OFFSET);
- VEC_DATA_TYPE(float, VEC_SIZE) scale = (VEC_DATA_TYPE(float, VEC_SIZE))(SCALE);
-
- VEC_DATA_TYPE(float, VEC_SIZE) tmp = CONVERT(in_data, VEC_DATA_TYPE(float, VEC_SIZE)) / scale;
- VEC_DATA_TYPE(float, VEC_SIZE) out_data = tmp + CONVERT(offset, VEC_DATA_TYPE(float, VEC_SIZE));
-
- VSTORE(VEC_SIZE)
- (CONVERT(out_data, VEC_DATA_TYPE(DATA_TYPE_OUT, VEC_SIZE)), 0,
- (__global DATA_TYPE_OUT *)output.ptr);
-}
-#endif // defined(DATA_TYPE_IN) && defined(DATA_TYPE_OUT)
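
The removed QASYMM8 casts follow the usual affine scheme: cast_qasymm_in dequantizes as (q - OFFSET) * SCALE, and cast_qasymm_out requantizes as x / SCALE + OFFSET. A scalar sketch; the clamp in the quantize direction is an addition for safety, since the kernel relied on the plain vector convert:

    #include <cstdint>

    // cast_qasymm_in: dequantize a QASYMM8 value to float.
    float dequantize_qasymm8(uint8_t q, int offset, float scale)
    {
        return (static_cast<int>(q) - offset) * scale;
    }

    // cast_qasymm_out: quantize a float back to QASYMM8, clamped to [0, 255].
    uint8_t quantize_qasymm8(float x, int offset, float scale)
    {
        const float q = x / scale + static_cast<float>(offset);
        return static_cast<uint8_t>(q < 0.f ? 0.f : (q > 255.f ? 255.f : q));
    }
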
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
deleted file mode 100644
index 0285c955b..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/depth_to_space.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-/** Perform depth to space rearrangement of tensor (NCHW)
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
- * using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension
- * (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void depth_to_space_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- out_index[0] = get_global_id(0); // W
- out_index[1] = get_global_id(1); // H
- out_index[2] = get_global_id(2) % Z_OUT; // C
- out_index[3] = get_global_id(2) / Z_OUT; // B
-
- in_index[0] = out_index[0] / BLOCK_SIZE;
- in_index[1] = out_index[1] / BLOCK_SIZE;
- in_index[2] = out_index[2] +
- ((out_index[1] % BLOCK_SIZE) * BLOCK_SIZE + out_index[0] % BLOCK_SIZE) * DEPTH_OUT;
- in_index[3] = out_index[3];
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
- &in, in_index[0], in_index[1], in_index[2], in_index[3]));
-}
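-/* Worked index example (illustrative, assuming -DBLOCK_SIZE=2 and -DDEPTH_OUT=4):
- * the output element at (W, H, C, B) = (3, 5, 1, 0) is read from
- *   in_index[0] = 3 / 2 = 1
- *   in_index[1] = 5 / 2 = 2
- *   in_index[2] = 1 + ((5 % 2) * 2 + 3 % 2) * 4 = 13
- * i.e. input (1, 2, 13, 0) -> output (3, 5, 1, 0). */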
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-
-#if defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
-/** Perform depth to space rearrangement of tensor (NHWC)
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Output tensor depth should be given as a preprocessor argument using -DDEPTH_OUT=size.
- * e.g. -DDEPTH_OUT=16
- * @attention The value of the z-axis of output tensor should be given as a preprocessor argument
- * using -DZ_OUT=size. e.g. -DZ_OUT=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension
- * (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void depth_to_space_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(output, Z_OUT);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- out_index[0] = get_global_id(0); // C
- out_index[1] = get_global_id(1); // W
- out_index[2] = get_global_id(2) % Z_OUT; // H
- out_index[3] = get_global_id(2) / Z_OUT; // B
-
- in_index[0] = out_index[0] +
- ((out_index[2] % BLOCK_SIZE) * BLOCK_SIZE + out_index[1] % BLOCK_SIZE) * DEPTH_OUT;
- in_index[1] = out_index[1] / BLOCK_SIZE;
- in_index[2] = out_index[2] / BLOCK_SIZE;
- in_index[3] = out_index[3];
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(
- &in, in_index[0], in_index[1], in_index[2], in_index[3]));
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_OUT) && defined(BLOCK_SIZE) && defined(Z_OUT)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
index 2d0b6a299..e07a25ec9 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,7 +37,6 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef ARM_COMPUTE_HELPER_H
#define ARM_COMPUTE_HELPER_H
@@ -59,16 +58,219 @@
#pragma OPENCL EXTENSION cl_arm_printf : enable
#endif // defined(ARM_COMPUTE_DEBUG_ENABLED) && defined(cl_arm_printf)
+#define GPU_ARCH_MIDGARD 0x100
+#define GPU_ARCH_BIFROST 0x200
+
+/** Concatenate two inputs.
+ *
+ * @param[in] a The first input to be concatenated
+ * @param[in] b The second input to be concatenated
+ *
+ * @return The concatenated output
+ */
+#define CONCAT(a, b) a##b
+
+/** Expand the given vector
+ *
+ * @param[in] x The vector to be expanded
+ *
+ * @return The expanded output
+ */
#define EXPAND(x) x
+/** Clamp the given value between an upper and lower bound.
+ *
+ * @param[in] x The value to be clamped
+ * @param[in] min_val The lower bound
+ * @param[in] max_val The upper bound
+ *
+ * @return The clamped value.
+ */
#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+/** REVn reverses the given vector whose size is n.
+ * @name REVn
+ *
+ * @param[in] x The vector to be reversed
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REV1(x) ((x))
+#define REV2(x) ((x).s10)
+#define REV3(x) ((x).s210)
+#define REV4(x) ((x).s3210)
+#define REV8(x) ((x).s76543210)
+#define REV16(x) ((x).sFEDCBA9876543210)
+/** @} */ // end of group REVn
+
+/** Reverse the given vector.
+ * @name REVERSE
+ *
+ * @param[in] x The vector to be reversed
+ * @param[in] s The size of the vector
+ *
+ * @return The reversed vector
+ * @{
+ */
+#define REVERSE_STR(x, s) REV##s((x))
+#define REVERSE(x, s) REVERSE_STR(x, s)
+/** @} */ // end of group REVERSE
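+/* Illustrative expansion: REVERSE((int4)(1, 2, 3, 4), 4) resolves to
+ * REV4((int4)(1, 2, 3, 4)), i.e. (int4)(4, 3, 2, 1). */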
+
+/** Circular-right-shift (rotate-right) a vector of size s by n positions.
+ * @name ROTs_n
+ *
+ * @param[in] x The vector to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROT1_0(x) ((x))
+
+#define ROT2_0(x) ((x))
+#define ROT2_1(x) ((x).s10)
+
+#define ROT3_0(x) ((x))
+#define ROT3_1(x) ((x).s201)
+#define ROT3_2(x) ((x).s120)
+
+#define ROT4_0(x) ((x))
+#define ROT4_1(x) ((x).s3012)
+#define ROT4_2(x) ((x).s2301)
+#define ROT4_3(x) ((x).s1230)
+
+#define ROT8_0(x) ((x))
+#define ROT8_1(x) ((x).s70123456)
+#define ROT8_2(x) ((x).s67012345)
+#define ROT8_3(x) ((x).s56701234)
+#define ROT8_4(x) ((x).s45670123)
+#define ROT8_5(x) ((x).s34567012)
+#define ROT8_6(x) ((x).s23456701)
+#define ROT8_7(x) ((x).s12345670)
+
+#define ROT16_0(x) ((x))
+#define ROT16_1(x) ((x).sF0123456789ABCDE)
+#define ROT16_2(x) ((x).sEF0123456789ABCD)
+#define ROT16_3(x) ((x).sDEF0123456789ABC)
+#define ROT16_4(x) ((x).sCDEF0123456789AB)
+#define ROT16_5(x) ((x).sBCDEF0123456789A)
+#define ROT16_6(x) ((x).sABCDEF0123456789)
+#define ROT16_7(x) ((x).s9ABCDEF012345678)
+#define ROT16_8(x) ((x).s89ABCDEF01234567)
+#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_10(x) ((x).s6789ABCDEF012345)
+#define ROT16_11(x) ((x).s56789ABCDEF01234)
+#define ROT16_12(x) ((x).s456789ABCDEF0123)
+#define ROT16_13(x) ((x).s3456789ABCDEF012)
+#define ROT16_14(x) ((x).s23456789ABCDEF01)
+#define ROT16_15(x) ((x).s123456789ABCDEF0)
+/** @} */ // end of group ROTs_n
+
+/** Circular-right-shift (rotate-right) the given vector by the given amount.
+ * @name ROTATE
+ *
+ * @param[in] x The vector to be shifted
+ * @param[in] s The size of the vector
+ * @param[in] n The amount to be shifted
+ *
+ * @return The shifted vector
+ * @{
+ */
+#define ROTATE_STR(x, s, n) ROT##s##_##n(x)
+#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+/** @} */ // end of group ROTATE
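+/* Illustrative expansion: ROTATE((int4)(1, 2, 3, 4), 4, 1) resolves to
+ * ROT4_1((int4)(1, 2, 3, 4)), i.e. (int4)(4, 1, 2, 3) -- a rotate-right by
+ * one element. */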
+
+/** Creates a vector of size n filled with offset values corresponding to the location of each
+ * element.
+ * @name V_OFFSn
+ *
+ * @param[in] dt The data type of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define V_OFFS1(dt) (dt)(0)
+#define V_OFFS2(dt) (dt)(0, 1)
+#define V_OFFS3(dt) (dt)(0, 1, 2)
+#define V_OFFS4(dt) (dt)(0, 1, 2, 3)
+#define V_OFFS8(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS16(dt) (dt)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
+/** @} */ // end of group V_OFFSn
+
+/** Create a vector filled with offset values corresponding to the location of each element.
+ * @name VEC_OFFS
+ *
+ * @param[in] dt The data type of the output vector
+ * @param[in] s The size of the output vector
+ *
+ * @return The vector filled with offset values
+ * @{
+ */
+#define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
+#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+/** @} */ // end of group VEC_OFFS
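+/* Illustrative expansion: VEC_OFFS(VEC_DATA_TYPE(int, 4), 4) resolves to
+ * (int4)(0, 1, 2, 3). */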
+
#define VLOAD_STR(size) vload##size
#define VLOAD(size) VLOAD_STR(size)
#define VSTORE_STR(size) vstore##size
#define VSTORE(size) VSTORE_STR(size)
+#define float1 float
+#define half1 half
+#define char1 char
+#define uchar1 uchar
+#define short1 short
+#define ushort1 ushort
+#define int1 int
+#define uint1 uint
+#define long1 long
+#define ulong1 ulong
+#define double1 double
+
+#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
+
+// The convert_* built-in functions with the _sat modifier are not supported for floating-point
+// types, so we define _sat aliases without saturation to work around this.
+#define convert_float_sat convert_float
+#define convert_float1_sat convert_float
+#define convert_float2_sat convert_float2
+#define convert_float3_sat convert_float3
+#define convert_float4_sat convert_float4
+#define convert_float8_sat convert_float8
+#define convert_float16_sat convert_float16
+#define convert_half_sat convert_half
+#define convert_half1_sat convert_half
+#define convert_half2_sat convert_half2
+#define convert_half3_sat convert_half3
+#define convert_half4_sat convert_half4
+#define convert_half8_sat convert_half8
+#define convert_half16_sat convert_half16
+
+#define convert_float1 convert_float
+#define convert_half1 convert_half
+#define convert_char1 convert_char
+#define convert_uchar1 convert_uchar
+#define convert_short1 convert_short
+#define convert_ushort1 convert_ushort
+#define convert_int1 convert_int
+#define convert_uint1 convert_uint
+#define convert_long1 convert_long
+#define convert_ulong1 convert_ulong
+#define convert_double1 convert_double
+
+#define convert_char1_sat convert_char_sat
+#define convert_uchar1_sat convert_uchar_sat
+#define convert_short1_sat convert_short_sat
+#define convert_ushort1_sat convert_ushort_sat
+#define convert_int1_sat convert_int_sat
+#define convert_uint1_sat convert_uint_sat
+#define convert_long1_sat convert_long_sat
+#define convert_ulong1_sat convert_ulong_sat
+#define convert_double1_sat convert_double_sat
+
#define VEC_DATA_TYPE_STR(type, size) type##size
#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
index a83b1a8a5..5f1b3f902 100644
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/compute/ARMComputeEx/src/core/CL/cl_kernels/helpers_asymm.h
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,29 +37,112 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#ifndef ARM_COMPUTE_HELPERS_ASYMM_H
#define ARM_COMPUTE_HELPERS_ASYMM_H
#include "helpers.h"
+/** Convert the given vector using the round-to-nearest-even rounding mode
+ *
+ * @param[in] x The target to be converted
+ * @param[in] type The target type
+ *
+ * @return The converted vector
+ */
+#define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
+#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+
+/** Quantize a floating-point scalar value to 8-bit asymmetric
+ *
+ * @param[in] input Input value to quantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return quantized value
+ */
+inline uchar quantize_qasymm8(float input, float offset, float scale)
+{
+ float out_f32 = input / scale + offset;
+ uchar res_u8 = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, int), uchar);
+ return res_u8;
+}
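+/* Illustrative example (assumed values): quantize_qasymm8(0.5f, 128.0f, 0.0078125f)
+ * computes 0.5f / 0.0078125f + 128.0f = 192.0f and returns (uchar)192. */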
+
+/** Dequantize a scalar value from 8-bit asymmetric to floating-point
+ *
+ * @param[in] input Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8(uchar input, float offset, float scale)
+{
+ return ((float)input - offset) * scale;
+}
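+/* Illustrative example (assumed values): dequantize_qasymm8(192, 128.0f, 0.0078125f)
+ * computes (192 - 128) * 0.0078125f = 0.5f, the round trip of the example above. */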
+
+/** Dequantize a scalar value from signed 8-bit asymmetric to floating-point
+ *
+ * @param[in] input Input value to dequantize
+ * @param[in] offset Quantization offset
+ * @param[in] scale Quantization scale
+ *
+ * @return dequantized value
+ */
+inline float dequantize_qasymm8_signed(char input, float offset, float scale)
+{
+ return ((float)input - offset) * scale;
+}
+
+/** Quantize a vector of values from floating-point
+ *
+ * @param[in] type Output data type.
+ * @param[in] size Size of vector.
+ *
+ * @return quantized values
+ */
+#define QUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(type, size) \
+ quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
+ { \
+ VEC_DATA_TYPE(float, size) \
+ out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset); \
+ VEC_DATA_TYPE(type, size) \
+ res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), \
+ VEC_DATA_TYPE(type, size)); \
+ return res; \
+ }
+
+/** Dequantize a vector of values to floating-point
+ *
+ * @param[in] type Input data type.
+ * @param[in] size Size of vector.
+ *
+ * @return dequantized values in floating point
+ */
+#define DEQUANTIZE_IMPL(type, size) \
+ inline VEC_DATA_TYPE(float, size) \
+ dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+ { \
+ return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale; \
+ }
+
/** Correctly-rounded-to-nearest division by a power-of-two.
*
* @param[in] size Size of vector.
*
* @return Correctly-rounded-to-nearest division by a power-of-two.
*/
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
- inline VEC_DATA_TYPE(int, size) \
- asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, int exponent) \
- { \
- VEC_DATA_TYPE(int, size) \
- mask = (1 << exponent) - 1; \
- const VEC_DATA_TYPE(int, size) zero = 0; \
- const VEC_DATA_TYPE(int, size) one = 1; \
- VEC_DATA_TYPE(int, size) \
- threshold = (mask >> 1) + select(zero, one, x < 0); \
- return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size( \
+ VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
+ { \
+ const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0; \
+ const VEC_DATA_TYPE(int, size) one = (VEC_DATA_TYPE(int, size))1; \
+ VEC_DATA_TYPE(int, size) \
+ mask = (one << exponent) - one; \
+ VEC_DATA_TYPE(int, size) \
+ threshold = (mask >> 1) + select(zero, one, x < 0); \
+ return (x >> exponent) + select(zero, one, (x & mask) > threshold); \
}
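+/* Illustrative example for size 1 (assumed values): with x = 19 and exponent = 2,
+ * mask = 3, threshold = 1, (19 >> 2) = 4 and (19 & 3) = 3 > 1, so the result is 5,
+ * i.e. 19 / 4 = 4.75 correctly rounded to nearest. */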
/** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -81,9 +164,19 @@
b_64 = convert_long##size(b); \
VEC_DATA_TYPE(long, size) \
ab_64 = a_64 * b_64; \
- /* COMPMID-907 */ \
+ /* Revert COMPMID-907 */ \
+ VEC_DATA_TYPE(long, size) \
+ mask1 = 1 << 30; \
+ VEC_DATA_TYPE(long, size) \
+ mask2 = 1 - (1 << 30); \
+ VEC_DATA_TYPE(long, size) \
+ is_positive_or_zero = ab_64 >= 0; \
+ VEC_DATA_TYPE(long, size) \
+ nudge = select(mask2, mask1, is_positive_or_zero); \
+ VEC_DATA_TYPE(long, size) \
+ mask = 1ll << 31; \
VEC_DATA_TYPE(int, size) \
- ab_x2_high32 = convert_int##size(((ab_64 + (1 << 30)) >> 31)); \
+ ab_x2_high32 = convert_int##size((ab_64 + nudge) / mask); \
return select(ab_x2_high32, INT_MAX, overflow); \
}
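+/* Illustrative note: asymm_mult implements a saturating rounding doubling
+ * high multiply, approximately (2 * a * b) >> 32 with round-to-nearest.
+ * E.g. for Q0.31 inputs a = b = (1 << 30) (both 0.5), ab_64 = 2^60 and the
+ * result is (2^60 + 2^30) / 2^31 = 2^29, i.e. 0.25 in Q0.31. */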
@@ -335,9 +428,18 @@
return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size); \
}
+#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE_STR(input, offset, scale, type, size) \
+ dequantize_##type##size(input, offset, scale)
+#define DEQUANTIZE(input, offset, scale, type, size) \
+ DEQUANTIZE_STR(input, offset, scale, type, size)
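+/* Illustrative expansion: QUANTIZE(in_f32, 128.0f, 0.0078125f, uchar, 4), with
+ * in_f32 assumed to be a float4, resolves to
+ * quantize_uchar4(in_f32, 128.0f, 0.0078125f). */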
+
#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) \
asymm_rounding_divide_by_POW2_##size(x, exponent)
#define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+ ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
@@ -360,11 +462,53 @@
#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size) \
+ inline VEC_DATA_TYPE(int, size) \
+ multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
+ { \
+ const int left_shift = shift > 0 ? shift : 0; \
+ const int right_shift = shift > 0 ? 0 : -shift; \
+ return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), \
+ right_shift, size); \
+ }
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+ multiply_by_quantized_multiplier##size(input, qmul, shift)
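+/* Illustrative usage: MULTIPLY_BY_QUANTIZED_MULTIPLIER(acc, qmul, shift, 4)
+ * requantizes an int4 accumulator: a positive shift scales the input by
+ * 2^shift before the fixed-point multiply, while a negative shift rounds and
+ * shifts the product right by -shift. */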
+
+QUANTIZE_IMPL(uchar, 1)
+QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
+QUANTIZE_IMPL(uchar, 4)
+QUANTIZE_IMPL(ushort, 4)
+QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
+
+DEQUANTIZE_IMPL(uchar, 1)
+DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
+DEQUANTIZE_IMPL(uchar, 4)
+DEQUANTIZE_IMPL(ushort, 4)
+DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
+
+ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(4)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(8)
ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(16)
+ASYMM_MULT_IMPL(1)
ASYMM_MULT_IMPL(2)
ASYMM_MULT_IMPL(4)
ASYMM_MULT_IMPL(8)
@@ -375,16 +519,19 @@ ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(4)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(8)
ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(16)
+ASYMM_SELECT_USING_MASK_IMPL(1)
ASYMM_SELECT_USING_MASK_IMPL(2)
ASYMM_SELECT_USING_MASK_IMPL(4)
ASYMM_SELECT_USING_MASK_IMPL(8)
ASYMM_SELECT_USING_MASK_IMPL(16)
+ASYMM_MASK_IF_ZERO_IMPL(1)
ASYMM_MASK_IF_ZERO_IMPL(2)
ASYMM_MASK_IF_ZERO_IMPL(4)
ASYMM_MASK_IF_ZERO_IMPL(8)
ASYMM_MASK_IF_ZERO_IMPL(16)
+ASYMM_MASK_IF_NON_ZERO_IMPL(1)
ASYMM_MASK_IF_NON_ZERO_IMPL(2)
ASYMM_MASK_IF_NON_ZERO_IMPL(4)
ASYMM_MASK_IF_NON_ZERO_IMPL(8)
@@ -400,6 +547,7 @@ ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(4)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(8)
ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(16)
+ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(1)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(2)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(4)
ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(8)
@@ -415,9 +563,16 @@ ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(4)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(8)
ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_IMPL(16)
+ASYMM_RESCALE_IMPL(1)
ASYMM_RESCALE_IMPL(2)
ASYMM_RESCALE_IMPL(4)
ASYMM_RESCALE_IMPL(8)
ASYMM_RESCALE_IMPL(16)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(1)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(2)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(4)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(8)
+MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(16)
+
#endif // ARM_COMPUTE_HELPERS_ASYMM_H
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
deleted file mode 100644
index 12c8eeb79..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu.cl
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#ifndef VEC_SIZE
-#define VEC_SIZE 1
-#endif
-
-#if defined(DATA_TYPE)
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @note Can only take floating point data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported Data
- * types : F16/F32
- * @param[in] input1_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[in] alpha_ptr Pointer to the source image. Supported Data
- * types : F16/F32
- * @param[in] alpha_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] alpha_step_x alpha_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
- * image
- *
- * @param[out] output_ptr Pointer to the destination image. Supported
- * data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void prelu(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VSTORE(VEC_SIZE)
- (VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) < 0
- ? VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr) *
- VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)alpha.ptr)
- : VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input.ptr),
- 0, (__global DATA_TYPE *)output.ptr);
-}
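-/* Illustrative example (assumed values): for input = -2.0f and alpha = 0.5f the
- * kernel stores -2.0f * 0.5f = -1.0f; for input = 3.0f it stores 3.0f unchanged. */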
-#endif // defined(DATA_TYPE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
deleted file mode 100644
index a66e107d1..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/prelu_quantized.cl
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-#define SUB(x, y) ((x) - (y))
-
-#if defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) && \
- defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
-
-#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
-#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
-#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE)
-#define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
-#define SELECT_TYPE VEC_INT
-
-/** Returns result of prelu function implemented as below:
- * f(input) = alpha * input for input < 0, f(input) = input for input >= 0.
- *
- * @attention Data type can be passed using the -DDATA_TYPE_IN compile flag, e.g.
- * -DDATA_TYPE_IN=uchar
- * @attention Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g.
- * -DVEC_SIZE=16
- * @note Can only take uchar data types.
- *
- * @param[in] input1_ptr Pointer to the source image. Supported Data
- * types : QASYMM8
- * @param[in] input1_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input1_step_x input1_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input1_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input1_step_y input1_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input1_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input1_step_z input1_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input1_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[in] alpha_ptr Pointer to the source image. Supported Data
- * types : QASYMM8
- * @param[in] alpha_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] alpha_step_x alpha_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] alpha_step_y alpha_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] alpha_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] alpha_step_z alpha_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] alpha_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported
- * data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void prelu_qasymm8(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(alpha),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D alpha = CONVERT_TO_TENSOR3D_STRUCT(alpha);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_INT in_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_INT);
- VEC_INT alpha_vec = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)alpha.ptr), VEC_INT);
-
- in_vec = SUB(in_vec, (VEC_INT)((int)OFF_IN));
- alpha_vec = SUB(alpha_vec, (VEC_INT)((int)OFF_ALPHA));
-
- const VEC_FLOAT inf32 = CONVERT(in_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_IN);
- const VEC_FLOAT alphaf32 = CONVERT(alpha_vec, VEC_FLOAT) * (VEC_FLOAT)((float)SCALE_ALPHA);
- const VEC_FLOAT outf32 =
- select(inf32, inf32 * alphaf32, CONVERT(inf32 < (VEC_FLOAT)0, SELECT_TYPE));
- const VEC_FLOAT qresf32 = outf32 / ((VEC_FLOAT)(float)SCALE_OUT) + ((VEC_FLOAT)((float)OFF_OUT));
- const VEC_UCHAR res = CONVERT_SAT(CONVERT_DOWN(qresf32, VEC_INT), VEC_UCHAR);
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global uchar *)output.ptr);
-}
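-/* Illustrative example (assumed build options -DOFF_IN=128, -DSCALE_IN=0.1f,
- * -DOFF_ALPHA=0, -DSCALE_ALPHA=0.1f, -DOFF_OUT=128, -DSCALE_OUT=0.1f): an input
- * byte of 108 dequantizes to (108 - 128) * 0.1 = -2.0; with an alpha byte of 5
- * (0.5 after dequantization) the prelu result is -1.0, which requantizes to
- * -1.0 / 0.1 + 128 = 118. */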
-
-#endif // defined(OFF_IN) && defined(OFF_ALPHA) && defined(OFF_OUT) && defined(SCALE_IN) &&
- // defined(SCALE_ALPHA) && defined(SCALE_OUT) && defined(VEC_SIZE)
diff --git a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl b/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
deleted file mode 100644
index eb612f834..000000000
--- a/compute/ARMComputeEx/src/core/CL/cl_kernels/space_to_depth.cl
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016, 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-/** Perform space to depth rearrangement of tensor (NCHW)
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
- * e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of the input tensor should be given as a preprocessor
- * argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension
- * (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void space_to_depth_nchw(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- in_index[0] = get_global_id(0); // W
- in_index[1] = get_global_id(1); // H
- in_index[2] = get_global_id(2) % Z_IN; // C
- in_index[3] = get_global_id(2) / Z_IN; // B
-
- out_index[0] = in_index[0] / BLOCK_SIZE;
- out_index[1] = in_index[1] / BLOCK_SIZE;
- out_index[2] =
- in_index[2] + ((in_index[1] % BLOCK_SIZE) * BLOCK_SIZE + in_index[0] % BLOCK_SIZE) * DEPTH_IN;
- out_index[3] = in_index[3];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
- out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
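-/* Worked index example (illustrative, assuming -DBLOCK_SIZE=2 and -DDEPTH_IN=3):
- * the input element at (W, H, C, B) = (3, 5, 2, 0) is written to
- *   out_index[0] = 3 / 2 = 1
- *   out_index[1] = 5 / 2 = 2
- *   out_index[2] = 2 + ((5 % 2) * 2 + 3 % 2) * 3 = 11
- * i.e. input (3, 5, 2, 0) -> output (1, 2, 11, 0). */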
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-
-#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
-/** Perform space to depth rearrangement of tensor (NHWC)
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size.
- * e.g. -DDEPTH_IN=16
- * @attention The value of the z-axis of the input tensor should be given as a preprocessor
- * argument using -DZ_IN=size. e.g. -DZ_IN=16
- * @attention block size should be given as a preprocessor argument using -DBLOCK_SIZE=size. e.g.
- * -DBLOCK_SIZE=1
- *
- * @param[in] input_ptr Pointer to the source image. Supported data
- * types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in
- * bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in
- * bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in
- * bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z
- * processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source
- * image
- * @param[out] output_ptr Pointer to the destination image. Supported data
- * types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension
- * (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X
- * processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension
- * (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y
- * processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension
- * (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z
- * processed per workitem (in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension
- * (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W
- * processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the
- * destination image
- */
-__kernel void space_to_depth_nhwc(TENSOR4D_DECLARATION(input), TENSOR4D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT(input, Z_IN);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- int out_index[4] = {0};
- int in_index[4] = {0};
-
- in_index[0] = get_global_id(0); // C
- in_index[1] = get_global_id(1); // W
- in_index[2] = get_global_id(2) % Z_IN; // H
- in_index[3] = get_global_id(2) / Z_IN; // B
-
- out_index[0] =
- in_index[0] + ((in_index[2] % BLOCK_SIZE) * BLOCK_SIZE + in_index[1] % BLOCK_SIZE) * DEPTH_IN;
- out_index[1] = in_index[1] / BLOCK_SIZE;
- out_index[2] = in_index[2] / BLOCK_SIZE;
- out_index[3] = in_index[3];
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2],
- out_index[3])) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(BLOCK_SIZE) && defined(Z_IN)
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
deleted file mode 100644
index 06eeb5b98..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLArgOperationKernel.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-const TensorShape inferOutputShape(const TensorShape &input_shape, const uint32_t axis)
-{
- TensorShape out_shape{input_shape};
-
- out_shape.set(axis, 1);
-
- return out_shape;
-}
-} // namespace
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const uint32_t axis,
- ArgOperation /*op*/)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::S32, DataType::F32, DataType::U8,
- DataType::QASYMM8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::S32);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape().num_dimensions() - 1) !=
- output->tensor_shape().num_dimensions(),
- "Input's rank is not same with output");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() == 0,
- "Inputs are not broadcast compatible");
-
- const TensorShape output_shape = inferOutputShape(input->tensor_shape(), axis);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_shape.total_size() != output->tensor_shape().total_size(),
- "output shape's size does not match axis");
-
- const auto num_dimensions = input->tensor_shape().num_dimensions();
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= num_dimensions, "axis must be less than the input's rank.");
- return Status{};
-}
-
-} // namespace
-
-CLArgOperationKernel::CLArgOperationKernel() : _input(nullptr), _output(nullptr), _axis() {}
-
-void CLArgOperationKernel::configure(const ICLTensor *input, ICLTensor *output, const uint32_t axis,
- ArgOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
-
- _input = input;
- _output = output;
- _axis = axis;
-
- std::unique_ptr<ITensorInfo> output_info = output->info()->clone();
- output_info->set_tensor_shape(inferOutputShape(input->info()->tensor_shape(), axis));
-
- // Construct kernel and set op_code based on type of ArgOperation as specified by object op
- std::string kernel_name = "arg_op";
- int op_code = 0;
- if (op == ArgOperation::MAX)
- {
- op_code = 1;
- }
- else if (op == ArgOperation::MIN)
- {
- op_code = 2;
- }
- else
- throw std::runtime_error("Operation not supported yet");
-
- // Set kernel build options
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(output_info->dimension(2)));
- build_opts.emplace("-DOP_CODE=" + support::cpp11::to_string(op_code));
-
- // Create kernel
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*output_info, Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output_info->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output_info->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-Status CLArgOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- const uint32_t axis, ArgOperation op)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
-
- return Status{};
-}
-
-void CLArgOperationKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &shape_in = _input->info()->tensor_shape();
-
- unsigned int idx = 2 * num_arguments_per_4D_tensor(); // Skip the input and output parameters
-
- _kernel.setArg<cl_int>(idx++, _axis);
- _kernel.setArg<cl_int>(idx++, shape_in[_axis]);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- // Copy output's shape in order to use for recovering at end of this method
- const TensorShape shape_out = _output->info()->tensor_shape();
- _output->info()->set_tensor_shape(inferOutputShape(shape_in, _axis));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-
- // Recover output's shape of output tensor
- _output->info()->set_tensor_shape(shape_out);
-}
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
index bb5556888..fbc76f5e1 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLBinaryLogicalOpKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
deleted file mode 100644
index 01ea655b4..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLCastKernel.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-CLCastKernel::CLCastKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLCastKernel::configure(const ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
-
- _input = input;
- _output = output;
-
- constexpr unsigned int num_elems_processed_per_iteration = 16;
-
- // Set kernel build options
- CLBuildOptions build_opts;
- build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.add_option("-DDATA_TYPE_OUT=" +
- get_cl_type_from_data_type(output->info()->data_type()));
- build_opts.add_option(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- // Create kernel
- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
- const float scale_in = qinfo.scale;
- const int offset_in = qinfo.offset;
- build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_in));
- build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_in));
-
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast_qasymm_in", build_opts.options()));
- }
- else if (is_data_type_quantized_asymmetric(output->info()->data_type()))
- {
- UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
-    const float scale_out = qinfo.scale;
-    const int offset_out = qinfo.offset;
-
-    build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_out));
-    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_out));
-
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast_qasymm_out", build_opts.options()));
- }
- else
- {
- build_opts.add_option_if(input_subtype == SubDataType::BOOL, "-DBOOL_INPUT");
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel("cast", build_opts.options()));
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- update_window_and_padding(win, input_access, output_access);
- output_access.set_valid_region(win, input->info()->valid_region());
-
- ICLKernel::configure_internal(win);
-}
-
-void CLCastKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
- Window slice = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice);
- add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice, lws_hint());
- } while (collapsed.slide_window_slice_3D(slice));
-}
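The deleted cast kernel above selects one of three OpenCL kernels: cast_qasymm_in when the input is quantized, cast_qasymm_out when the output is, and a plain cast otherwise, with the -DSCALE and -DOFFSET build options carrying the affine quantization parameters. For orientation, a minimal host-side sketch of that affine mapping, assuming the standard QASYMM8 convention; dequantize_qasymm8 is an illustrative helper, not an ARMComputeEx API.

#include <cstdint>
#include <iostream>

// QASYMM8 -> float, the mapping behind the "cast_qasymm_in" path:
// real = SCALE * (quantized - OFFSET)
static float dequantize_qasymm8(uint8_t q, float scale, int offset)
{
  return scale * (static_cast<int>(q) - offset);
}

int main()
{
  // With scale = 0.5 and offset = 128, the stored byte 130 represents 1.0
  std::cout << dequantize_qasymm8(130, 0.5f, 128) << std::endl;
  return 0;
}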
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
deleted file mode 100644
index 389136817..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLDepthToSpaceKernel.cpp
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-// TODO Use this validation function
-#if 0
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const int32_t block_size)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
- "Block size should be greater than or equal to 1.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0) * block_size,
- "Output width should be equal to (Input width * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(1) != input->dimension(1) * block_size,
- "Output height should be equal to (Input height * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) % (block_size * block_size) != 0,
- "Input depth should be divisible by (block size * block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- output->dimension(2) != input->dimension(2) / (block_size * block_size),
- "Output depth should be equal to (Input depth / (block size * block size))");
-
- return Status{};
-}
-#endif
-} // namespace
-
-CLDepthToSpaceKernel::CLDepthToSpaceKernel() : _input(nullptr), _output(nullptr)
-{
- // DO NOTHING
-}
-
-void CLDepthToSpaceKernel::configure(const ICLTensor *input, ICLTensor *output,
- const int32_t block_size)
-{
- // TODO Add validation of data_layout
- _input = input;
- _output = output;
-
- // Set kernel build options
- auto layout_out = output->info()->data_layout();
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto depth = output->info()->dimension(index_depth);
- build_opts.emplace("-DDEPTH_OUT=" + support::cpp11::to_string(depth));
- build_opts.emplace("-DZ_OUT=" + support::cpp11::to_string(output->info()->tensor_shape().z()));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
- "depth_to_space_" + lower_string(string_from_data_layout(layout_out)), build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLDepthToSpaceKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_out = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup input slice
- Window slice_in(slice_out);
- slice_in.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_in.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
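The disabled validate_arguments() in the deleted file spells out the depth-to-space shape contract: output width and height are the input's multiplied by the block size, and output depth is the input depth divided by block size squared. A self-contained sketch of that contract, under the assumption that this is all the validation checks; the Shape struct and helper are illustrative only.

#include <cassert>
#include <cstdint>
#include <iostream>

struct Shape { int32_t w, h, d; };

// Shape contract enforced by the disabled validate_arguments() above
static Shape depth_to_space_output_shape(const Shape &in, int32_t block_size)
{
  assert(block_size >= 1);
  assert(in.d % (block_size * block_size) == 0); // depth divisible by block^2
  return Shape{in.w * block_size, in.h * block_size, in.d / (block_size * block_size)};
}

int main()
{
  const Shape out = depth_to_space_output_shape(Shape{4, 4, 8}, 2);
  std::cout << out.w << "x" << out.h << "x" << out.d << std::endl; // prints 8x8x2
  return 0;
}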
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
index 79f5ce065..67aaf2db6 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLEmbeddingLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
deleted file mode 100644
index 235e8975d..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.cpp
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernelEx.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/AccessWindowTranspose.h"
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "support/ToolchainSupport.h"
-
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-class Coordinates;
-} // namespace arm_compute
-
-namespace
-{
-using ElementsProcessed = Steps;
-
-Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1,
- const ITensorInfo *output, const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input0->num_dimensions() > 4,
- "The number of dimensions for the matrix A must be <= 4");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 3,
- "The number of dimensions for the matrix B must be <= 3");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input1->num_dimensions() > 2 &&
- gemm_info.reinterpret_input_as_3d(),
- "The input1 tensor cannot have more than 2 dimensions if input0 "
- "has to be reinterpreted as 3D");
-
- const int m = gemm_info.m();
- const int n = gemm_info.n();
- const int k = gemm_info.k();
-
- ARM_COMPUTE_UNUSED(m);
- ARM_COMPUTE_UNUSED(n);
- ARM_COMPUTE_UNUSED(k);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(0) != static_cast<unsigned int>(k));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != static_cast<unsigned int>(n));
- ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(1) != static_cast<unsigned int>(k));
- if (gemm_info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) * input0->dimension(2) !=
- static_cast<unsigned int>(m));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != static_cast<unsigned int>(m));
- }
-
- if (output->total_size() != 0)
- {
- const TensorInfo tensor_info_output =
- output->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1,
- ITensorInfo *output,
- const GEMMReshapeInfo &gemm_info,
- ElementsProcessed &num_elements_processed)
-{
- unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
- unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- Window win{};
- Window win_out{};
- bool window_changed = false;
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if (reinterpret_input_as_3d == reinterpret_output_as_3d)
- {
- reinterpret_input_as_3d = false;
- reinterpret_output_as_3d = false;
- }
-
-  // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output,
- input0->clone()
- ->set_tensor_shape(compute_mm_shape(*input0, *input1, false, gemm_info))
- .set_data_type(DataType::S32));
-
- TensorInfo tmp_info(*output);
-
- if (reinterpret_output_as_3d)
- {
-    // Since the output tensor has to be reinterpreted as 3D and the execute window
-    // is based on a 2D GEMM, the window needs to be constructed on the 2D collapsed
-    // version of the tensor
- TensorShape tmp_shape(output->tensor_shape());
- tmp_shape.collapse(2U, 1U);
- tmp_info.set_tensor_shape(tmp_shape);
- }
-
-  // Special case for 1xN, 2xN, 3xN and 4xN input0 tensors: num_elems_processed_per_iteration_x is fixed at 4
-  // Note: if the dot product instruction is available, the 8x2 tile has to be used
- num_elems_processed_per_iteration_x = 4;
- num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 4);
-
-  // Note: bottom paddings are calculated manually as the output can be reinterpreted as a 3D tensor
-  // The only way to set the paddings properly is to set them explicitly through the
-  // AccessWindowStatic
- const int m = reinterpret_input_as_3d ? input0->tensor_shape()[1] * input0->tensor_shape()[2]
- : input0->tensor_shape()[1];
- const int bottom_pad =
- (num_elems_processed_per_iteration_y - (m % num_elems_processed_per_iteration_y)) %
- num_elems_processed_per_iteration_y;
-
- // Configure window
- win = calculate_max_window(
- tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
- win_out = calculate_max_window(
- *output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-
- AccessWindowStatic input0_access(input0, 0, 0, input0->dimension(0),
- input0->dimension(1) + bottom_pad);
- AccessWindowStatic input1_access(
- input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
- input1->dimension(1));
- AccessWindowStatic output_access(
- output, 0, 0, ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration_x),
- output->dimension(1) + bottom_pad);
-
- window_changed =
- update_window_and_padding(win, input0_access,
- input1_access) || // window used by the execute_window_loop
- update_window_and_padding(
- win_out,
- output_access); // window used to update the padding requirements of output tensor
-
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output_access.set_valid_region(win_out, ValidRegion(coord, output->tensor_shape()));
-
- // Collapse along the Z direction
- // This collapse needs to be here in order to tune the Z dimension of LWS
- Window collapsed = win;
- const unsigned int dimension_to_collapse =
- std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
- collapsed = win.collapse(win, dimension_to_collapse);
-
- Status err = (window_changed)
- ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!")
- : Status{};
- return std::make_pair(err, collapsed);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyKernelEx::CLGEMMLowpMatrixMultiplyKernelEx()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _slide_matrix_b(true),
- _reinterpret_input_as_3d(false), _reinterpret_output_as_3d(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyKernelEx::configure(const ICLTensor *input0, const ICLTensor *input1,
- ICLTensor *output,
- const GEMMReshapeInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);
-
- ARM_COMPUTE_ERROR_THROW_ON(
- validate_arguments(input0->info(), input1->info(), output->info(), gemm_info));
-
- _input0 = input0;
- _input1 = input1;
- _output = output;
- _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
-
- // In case both input and output have to be reinterpreted as 3D tensors,
- // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
- if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
- {
- _reinterpret_input_as_3d = false;
- _reinterpret_output_as_3d = false;
- }
-
- // Check if we need to slide the matrix B
- const unsigned int num_dimensions_input0 = _reinterpret_input_as_3d
- ? _input0->info()->num_dimensions() - 1
- : _input0->info()->num_dimensions();
- _slide_matrix_b = (_input1->info()->num_dimensions() >= num_dimensions_input0);
-
- ElementsProcessed num_elements_processed{};
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(),
- gemm_info, num_elements_processed);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- ICLKernel::configure_internal(win_config.second);
-
- // Create build options
-  std::string kernel_name;
- CLBuildOptions build_opts;
- build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
- "-DHEIGHT_GEMM3D=" +
- support::cpp11::to_string(output->info()->dimension(1)));
- build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
- "-DDEPTH_GEMM3D=" +
- support::cpp11::to_string(output->info()->dimension(2)));
- build_opts.add_option_if(!_slide_matrix_b,
- "-DMATRIX_B_DEPTH=" +
- support::cpp11::to_string(input1->info()->dimension(2)));
- build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" +
- support::cpp11::to_string(num_elements_processed.x()));
- build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" +
- support::cpp11::to_string(num_elements_processed.y()));
-
- kernel_name = "gemmlowp_mm_midgard_ex";
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts.options()));
-
- // Set config_id for enabling LWS tuning
- _config_id = kernel_name;
- _config_id += "_";
- _config_id += (_reinterpret_input_as_3d ? "3di_" : "");
- _config_id += (_reinterpret_output_as_3d ? "3do_" : "");
- _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(1));
- _config_id += "_";
- _config_id += support::cpp11::to_string(output->info()->dimension(0));
-}
-
-Status CLGEMMLowpMatrixMultiplyKernelEx::validate(const ITensorInfo *input0,
- const ITensorInfo *input1,
- const ITensorInfo *output,
- const GEMMReshapeInfo &gemm_info)
-{
- ElementsProcessed num_elements_processed{};
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, gemm_info));
- ARM_COMPUTE_RETURN_ON_ERROR(
- validate_and_configure_window(input0->clone().get(), input1->clone().get(),
- output->clone().get(), gemm_info, num_elements_processed)
- .first);
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyKernelEx::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- if (_input1->info()->num_dimensions() < 3)
- {
- // The stride_z for matrix B must be zero if we do not slice
- ARM_COMPUTE_ERROR_ON(_input1->info()->strides_in_bytes()[3] != 0);
- }
-
- Window slice = window.first_slice_window_3D();
- Window slice_matrix_b = slice;
-
- slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
- slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
-
- if (_reinterpret_input_as_3d)
- {
- // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
- const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3;
- const unsigned int total_cross_plane_pad =
- _input0->info()->padding().top + _input0->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- if (_reinterpret_output_as_3d)
- {
- // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
- const unsigned int idx0 =
- 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
- const unsigned int total_cross_plane_pad =
- _output->info()->padding().top + _output->info()->padding().bottom;
- _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
- }
-
- do
- {
- Window slice_b = slice;
-    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions
-    // and matrix A has more than 2.
-    // This scenario can happen when the matrix multiplication is used to perform
-    // a convolution operation
- if (!_slide_matrix_b)
- {
- slice_b = slice_matrix_b;
- }
-
- unsigned int idx = 0;
- add_2D_tensor_argument(idx, _input0, slice);
- add_2D_tensor_argument(idx, _input1, slice_b);
- add_2D_tensor_argument(idx, _output, slice);
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_input0->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_input1->info()->strides_in_bytes()[2]));
- _kernel.setArg<cl_uint>(idx++,
- static_cast<unsigned int>(_output->info()->strides_in_bytes()[2]));
- enqueue(queue, *this, slice, lws_hint());
- } while (window.slide_window_slice_3D(slice));
-}
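The bottom_pad expression in the deleted validate_and_configure_window() above rounds the number of rows m up to the next multiple of the per-iteration step y, with the outer modulo keeping the pad at zero when m already divides evenly. A worked sketch of just that arithmetic; the function name is illustrative.

#include <iostream>

// bottom_pad = (y - (m % y)) % y, as computed in validate_and_configure_window() above
static unsigned int bottom_pad(unsigned int m, unsigned int y)
{
  return (y - (m % y)) % y;
}

int main()
{
  std::cout << bottom_pad(10, 4) << std::endl; // 2: 10 rows padded up to 12
  std::cout << bottom_pad(12, 4) << std::endl; // 0: already a multiple of 4
  return 0;
}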
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
index 3a25987d0..3bfe3e407 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLGatherExKernel.cpp
@@ -45,6 +45,7 @@
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/core/UtilsEx.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
index 7fbdcdaa7..930e7c944 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLHashtableLookupKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
@@ -110,7 +111,7 @@ void CLHashtableLookupKernel::configure(const ICLTensor *lookups, const ICLTenso
_hits = hits;
// Make _lookup_indices tensor
- _lookup_indices = arm_compute::support::cpp14::make_unique<CLTensor>();
+ _lookup_indices = support::cpp14::make_unique<CLTensor>();
_lookup_indices->allocator()->init(
TensorInfo(lookups->info()->tensor_shape(), lookups->info()->num_channels(), DataType::S32));
_lookup_indices->allocator()->allocate();
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
index b45f6bb24..61c14d271 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLInstanceNormalizationLayerKernelEx.cpp
@@ -48,7 +48,7 @@
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Window.h"
-
+#include "support/StringSupport.h"
#include "support/ToolchainSupport.h"
namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
index d305896ea..6b27c9917 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLMultiplyScaleFactorKernel.cpp
@@ -49,6 +49,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
index 74f7b4158..643c8b110 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLNegKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
deleted file mode 100644
index 8910a7b80..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLPReLUKernel.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-Status validate_info(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
-{
- const TensorShape &out_shape =
- TensorShape::broadcast_shape(input->tensor_shape(), alpha->tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(alpha, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
- // Validate in case of configured output
- if (output->total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32,
- DataType::QASYMM8);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output->tensor_shape(), 0),
- "Wrong shape for output");
- }
- return Status{};
-}
-} // namespace
-
-CLPReLUKernel::CLPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-void CLPReLUKernel::configure(const ICLTensor *input, const ICLTensor *alpha, ICLTensor *output)
-{
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, alpha);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_info(input->info(), alpha->info(), output->info()));
-
- _input = input;
- _alpha = alpha;
- _output = output;
-
- // Create kernel
- std::string kernel_name = "prelu";
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(
- ("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-
- if (is_data_type_quantized_asymmetric(input->info()->data_type()))
- {
- build_opts.emplace("-DOFF_IN=" + support::cpp11::to_string(
- input->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DOFF_ALPHA=" + support::cpp11::to_string(
- alpha->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DOFF_OUT=" + support::cpp11::to_string(
- output->info()->quantization_info().uniform().offset));
- build_opts.emplace("-DSCALE_IN=" + support::cpp11::to_string(
- input->info()->quantization_info().uniform().scale));
- build_opts.emplace("-DSCALE_ALPHA=" + support::cpp11::to_string(
- alpha->info()->quantization_info().uniform().scale));
- build_opts.emplace("-DSCALE_OUT=" + support::cpp11::to_string(
- output->info()->quantization_info().uniform().scale));
- kernel_name += "_qasymm8";
- }
- _kernel =
- static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(kernel_name, build_opts));
-
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
-
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- {
- set_shape_if_empty(*output->info(), out_shape);
-
- if (input->info()->data_type() == DataType::F16 && alpha->info()->data_type() == DataType::F16)
- {
- set_format_if_unknown(*output->info(), Format::F16);
- }
- else if (input->info()->data_type() == DataType::F32 ||
- alpha->info()->data_type() == DataType::F32)
- {
- set_format_if_unknown(*output->info(), Format::F32);
- }
- }
-
- Window win = calculate_max_window(valid_region, Steps(num_elems_processed_per_iteration));
- Window win_input1 = win.broadcast_if_dimension_le_one(*input->info());
- Window win_input2 = win.broadcast_if_dimension_le_one(*alpha->info());
-
- AccessWindowHorizontal input1_access(input->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal input2_access(alpha->info(), 0, num_elems_processed_per_iteration);
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-
- update_window_and_padding(win_input1, input1_access) ||
- update_window_and_padding(win_input2, input2_access) ||
- update_window_and_padding(win, output_access);
-
- output_access.set_valid_region(win, valid_region);
-
- ICLKernel::configure_internal(win);
-}
-
-void CLPReLUKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const TensorShape &in_shape1 = _input->info()->tensor_shape();
- const TensorShape &in_shape2 = _alpha->info()->tensor_shape();
- const TensorShape &out_shape = _output->info()->tensor_shape();
-
- bool can_collapse = true;
- if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
- {
- can_collapse =
- (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
- for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
- {
- can_collapse = (in_shape1[d] == in_shape2[d]);
- }
- }
-
- bool has_collapsed = false;
- Window collapsed =
- can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed)
- : window;
-
- const TensorShape &in_shape1_collapsed =
- has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
- const TensorShape &in_shape2_collapsed =
- has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
-
- Window slice = collapsed.first_slice_window_3D();
- Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
- Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_input1);
- add_3D_tensor_argument(idx, _alpha, slice_input2);
- add_3D_tensor_argument(idx, _output, slice);
-
- enqueue(queue, *this, slice);
-
- collapsed.slide_window_slice_3D(slice_input1);
- collapsed.slide_window_slice_3D(slice_input2);
- } while (collapsed.slide_window_slice_3D(slice));
-}
-
-BorderSize CLPReLUKernel::border_size() const
-{
- const unsigned int replicateSize =
- _output->info()->dimension(0) -
- std::min(_input->info()->dimension(0), _alpha->info()->dimension(0));
- const unsigned int border =
- std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
- return BorderSize(0, border, 0, 0);
-}
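The deleted "prelu" OpenCL kernel computes the parametric ReLU with the alpha tensor broadcast against the input, plus a QASYMM8 variant that folds in the scales and offsets shown in configure(). A scalar reference for the float path only, assuming the usual PReLU definition; this sketch deliberately ignores broadcasting and quantization.

#include <iostream>

// out = x for x >= 0, alpha * x otherwise (float path of the deleted kernel)
static float prelu(float x, float alpha)
{
  return x >= 0.0f ? x : alpha * x;
}

int main()
{
  std::cout << prelu(3.0f, 0.25f) << std::endl;  // 3
  std::cout << prelu(-4.0f, 0.25f) << std::endl; // -1
  return 0;
}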
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
index 2d551f654..1a7a18cfa 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLQuantizationSymmetricKernel.cpp
@@ -49,6 +49,7 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
+#include "support/StringSupport.h"
namespace arm_compute
{
@@ -69,7 +70,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *scale_fac
// Output must always be initialized
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
return Status{};
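The hunk above tightens the output type check from plain S8 to QASYMM8_SIGNED. For orientation, a minimal sketch of the symmetric 8-bit quantization this kernel performs, assuming the common convention of a per-tensor scale, no zero point, and clamping to [-127, 127]; quantize_symm8 is an illustrative helper, not an ARMComputeEx API.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

// q = clamp(round(x / scale), -127, 127), with no zero-point offset
static int8_t quantize_symm8(float x, float scale)
{
  const int q = static_cast<int>(std::lround(x / scale));
  return static_cast<int8_t>(std::max(-127, std::min(127, q)));
}

int main()
{
  std::cout << static_cast<int>(quantize_symm8(0.5f, 0.01f)) << std::endl; // 50
  return 0;
}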
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
index a98318323..06c2579f2 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLReduceOperationKernel.cpp
@@ -43,6 +43,7 @@
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibraryEx.h"
#include "arm_compute/core/CL/ICLTensor.h"
+#include "support/StringSupport.h"
using namespace arm_compute;
namespace
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
index ff1904abd..8d8853c81 100644
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
+++ b/compute/ARMComputeEx/src/core/CL/kernels/CLScaleFactorSymm8Kernel.cpp
@@ -48,6 +48,7 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "support/StringSupport.h"
#include <climits>
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
deleted file mode 100644
index 64fc0384e..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLSpaceToDepthKernel.cpp
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibraryEx.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- const int32_t block_size)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QASYMM8,
- DataType::S16, DataType::S32, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(block_size < 1,
- "Block size should be greater than or equal to 1.");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(3) != output->dimension(3),
- "Input batch should be equal to Output batch");
-
- auto layout_out = input->data_layout();
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto index_height = get_data_layout_dimension_index(layout_out, DataLayoutDimension::HEIGHT);
- auto index_width = get_data_layout_dimension_index(layout_out, DataLayoutDimension::WIDTH);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- input->dimension(index_depth) * block_size * block_size != output->dimension(index_depth),
- "Output depth should be equal to (input depth * block size *block size)");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->dimension(index_width) % block_size) ||
- (input->dimension(index_height) % block_size),
- "Input height and width should be divisible by block size");
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (output->dimension(index_width) != (input->dimension(index_width) / block_size)) ||
- (output->dimension(index_height) != (input->dimension(index_height) / block_size)),
- "Output height and width should be equal to "
- "input_height/blocksize and input_width/blocksize respectively");
-
- return Status{};
-}
-
-} // namespace
-
-CLSpaceToDepthKernel::CLSpaceToDepthKernel() : _input(nullptr), _output(nullptr) {}
-
-void CLSpaceToDepthKernel::configure(const ICLTensor *input, ICLTensor *output,
- const int32_t block_size)
-{
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_size));
-
- _input = input;
- _output = output;
-
- // Set kernel build options
- auto layout_out = input->info()->data_layout();
- std::set<std::string> build_opts;
- build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
- build_opts.emplace("-DBLOCK_SIZE=" + support::cpp11::to_string(block_size));
- auto index_depth = get_data_layout_dimension_index(layout_out, DataLayoutDimension::CHANNEL);
- auto depth = input->info()->dimension(index_depth);
- build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(depth));
- build_opts.emplace("-DZ_IN=" + support::cpp11::to_string(input->info()->tensor_shape().z()));
-
- // Create kernel
- _kernel = static_cast<cl::Kernel>(CLKernelLibraryEx::get().create_kernel(
- "space_to_depth_" + lower_string(string_from_data_layout(layout_out)), build_opts));
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLSpaceToDepthKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
-
- Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
-
- // Setup output slice
- Window slice_out(slice_in);
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
- slice_out.set(3, Window::Dimension(0, 0, 0));
-
- do
- {
- unsigned int idx = 0;
- add_4D_tensor_argument(idx, _input, slice_in);
- add_4D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_in);
- } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
-}
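The validate_arguments() in this deleted file is the inverse of the depth-to-space contract earlier: output width and height are the input's divided by the block size, and output depth is the input depth multiplied by block size squared. A sketch of that relation using the same illustrative Shape struct as before.

#include <cassert>
#include <cstdint>
#include <iostream>

struct Shape { int32_t w, h, d; };

// Shape contract enforced by validate_arguments() above
static Shape space_to_depth_output_shape(const Shape &in, int32_t block_size)
{
  assert(block_size >= 1);
  assert(in.w % block_size == 0 && in.h % block_size == 0); // width/height divisible
  return Shape{in.w / block_size, in.h / block_size, in.d * block_size * block_size};
}

int main()
{
  const Shape out = space_to_depth_output_shape(Shape{8, 8, 2}, 2);
  std::cout << out.w << "x" << out.h << "x" << out.d << std::endl; // prints 4x4x8
  return 0;
}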
diff --git a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp b/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
deleted file mode 100644
index 61999cbd4..000000000
--- a/compute/ARMComputeEx/src/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.cpp
+++ /dev/null
@@ -1,188 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CL/kernels/CLTransposeConvLayerUpsampleKernel.h"
-
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-#include "arm_compute/core/CL/CLValidate.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-using namespace arm_compute;
-
-CLTransposeConvLayerUpsampleKernel::CLTransposeConvLayerUpsampleKernel()
- : _input(nullptr), _output(nullptr), _inner_border(), _info()
-{
-}
-
-Status CLTransposeConvLayerUpsampleKernel::validate(const ITensorInfo *input,
- const ITensorInfo *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-
- const DataLayout data_layout = input->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_w) == 0);
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
-
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
- for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
- }
-
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.right > info.stride().first - 1,
-                                  "inner_border_right must be smaller than stride_x");
-  ARM_COMPUTE_RETURN_ERROR_ON_MSG(inner_border.top > info.stride().second - 1,
-                                  "inner_border_top must be smaller than stride_y");
-
- return Status{};
-}
-
-void CLTransposeConvLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _input = input;
- _output = output;
- _inner_border = inner_border;
- _info = info;
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayerUpsampleKernel::validate(
- input->info(), output->info(), inner_border, info));
-
- // Create kernel
- CLBuildOptions build_opts;
- build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- _kernel = static_cast<cl::Kernel>(
- CLKernelLibrary::get().create_kernel("deconvolution_upsample", build_opts.options()));
-
- constexpr unsigned int num_elems_processed_per_iteration = 1;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));
-
- ICLKernel::configure_internal(win);
-}
-
-void CLTransposeConvLayerUpsampleKernel::run(const Window &window, cl::CommandQueue &queue)
-{
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
- const int out_start_x = _info.pad_left();
- const int out_end_x = _output->info()->dimension(idx_w) - _inner_border.right -
- _info.pad_right() + _info.stride().first - 1;
- const int out_step_x = _info.stride().first;
-
- const int out_start_y = _inner_border.top + _info.pad_top();
- const int out_end_y =
- _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
- const int out_step_y = _info.stride().second;
-
- switch (data_layout)
- {
- case DataLayout::NCHW:
- {
- Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
-
- Window slice_out = collapsed.first_slice_window_3D();
- slice_out.set(Window::DimX, Window::Dimension(out_start_x, out_end_x, out_step_x));
- slice_out.set(Window::DimY, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
- Window slice_in = collapsed.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (collapsed.slide_window_slice_3D(slice_in) &&
- collapsed.slide_window_slice_3D(slice_out));
- break;
- }
- case DataLayout::NHWC:
- {
- // NOTE: not collapsing in NHWC
- Window slice_out = window.first_slice_window_3D();
- slice_out.set(Window::DimY, Window::Dimension(out_start_x, out_end_x, out_step_x));
- slice_out.set(Window::DimZ, Window::Dimension(out_start_y, out_end_y, out_step_y));
-
- Window slice_in = window.first_slice_window_3D();
-
- do
- {
- unsigned int idx = 0;
- add_3D_tensor_argument(idx, _input, slice_in);
- add_3D_tensor_argument(idx, _output, slice_out);
- enqueue(queue, *this, slice_out);
- } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data layout");
- }
-}
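The output windows set up in the deleted run() above start at the left/top padding and step by the stride, which implies a simple index mapping: input column x is written to output column pad_left + x * stride_x, and rows analogously. A sketch of that mapping under those assumptions; map_upsample_x is a hypothetical helper, and the end-of-range terms involving inner_border are left out.

#include <iostream>

// Output column for input column x, per the window Dimension(out_start_x, ..., out_step_x)
static int map_upsample_x(int x, int pad_left, int stride_x)
{
  return pad_left + x * stride_x;
}

int main()
{
  // With pad_left = 1 and stride_x = 2, inputs 0, 1, 2 land at outputs 1, 3, 5
  for (int x = 0; x < 3; ++x)
    std::cout << map_upsample_x(x, 1, 2) << " ";
  std::cout << std::endl;
  return 0;
}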
diff --git a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp b/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
deleted file mode 100644
index 648afb304..000000000
--- a/compute/ARMComputeEx/src/core/CPP/kernels/CPPUpsampleKernelEx.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
-#include <cstddef>
-#include <cstdint>
-
-namespace arm_compute
-{
-CPPUpsampleKernelEx::CPPUpsampleKernelEx() : _input(nullptr), _output(nullptr), _info() {}
-
-bool CPPUpsampleKernelEx::is_parallelisable() const { return false; }
-
-void CPPUpsampleKernelEx::configure(const ITensor *input, ITensor *output,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _input = input;
- _output = output;
- _info = info;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
-
- // The CPPUpsampleKernelEx doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->info()->num_dimensions());
- output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
- ICPPKernel::configure(win);
-}
-
-void CPPUpsampleKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- // Initialize _scaled_output buffer
- const int width_scaled = _output->info()->dimension(0);
- const int height_scaled = _output->info()->dimension(1);
- const int stride_x = _info.stride().first;
- const int stride_y = _info.stride().second;
- const int start_x = _info.pad_left();
- const int start_y = _info.pad_top();
- const int end_y = height_scaled - _info.pad_bottom();
-  const int end_x = width_scaled - _info.pad_right();
- const size_t element_size = _input->info()->element_size();
-
- // The fill value is normally 0, but for QASYMM8 the '0' corresponds to the offset
- const uint8_t fill_value =
- _output->info()->data_type() == DataType::QASYMM8
- ? utility::clamp<uint8_t>(_output->info()->quantization_info().uniform().offset)
- : 0;
-  // Filling a value different from 0 works only for the QASYMM8 data type, since we
-  // are filling 1-byte values in a buffer of uint8_ts
- std::fill_n(_output->buffer(), _output->info()->total_size(), fill_value);
-
- // Create window
- Window window_out(window);
- window_out.set(Window::DimX, Window::Dimension(start_x, end_x, stride_x));
- window_out.set(Window::DimY, Window::Dimension(start_y, end_y, stride_y));
-
- // Create iterators
- Iterator in(_input, window);
- Iterator out(_output, window_out);
-
- execute_window_loop(
- window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
-}
-} // namespace arm_compute
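The deleted CPU upsample kernel follows a fill-then-scatter pattern: the whole output buffer is pre-filled (with the quantization offset for QASYMM8, 0 otherwise), and input elements are then copied out at stride intervals through the strided output window. A self-contained 1-D simplification of that pattern, assuming integer elements; the helper name is illustrative.

#include <iostream>
#include <vector>

// Pre-fill the output, then scatter inputs at stride intervals starting at pad
static std::vector<int> upsample_1d(const std::vector<int> &in, int stride, int pad, int fill)
{
  std::vector<int> out(pad + in.size() * stride, fill);
  for (size_t i = 0; i < in.size(); ++i)
    out[pad + i * stride] = in[i];
  return out;
}

int main()
{
  for (int v : upsample_1d({7, 8, 9}, 2, 1, 0))
    std::cout << v << " "; // prints: 0 7 0 8 0 9 0
  std::cout << std::endl;
  return 0;
}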
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
deleted file mode 100644
index fbb9dbca9..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NECastKernel.cpp
+++ /dev/null
@@ -1,671 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NECastKernel.h"
-
-#include "arm_compute/core/AccessWindowStatic.h"
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8,
- DataType::QASYMM8, DataType::U32,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(input_subtype == SubDataType::BOOL &&
- input->data_type() != DataType::U8);
-
- if (output->tensor_shape().total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8,
- DataType::QASYMM8, DataType::U32,
- DataType::S32, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
- }
-
- return Status{};
-}
-
-std::tuple<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
-{
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps());
-
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->tensor_shape(), 1, DataType::F32);
-
- // NECastKernel doesn't need padding so update_window_and_padding() can be skipped
- Coordinates coord;
- coord.set_num_dimensions(output->num_dimensions());
- output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
-
- return std::make_tuple(Status{}, win);
-}
-
-typedef struct bool8x16
-{
- uint8x16_t val;
-} bool8x16_t;
-
-static inline uint8x16_t vreinterpretq_u8_b8(bool8x16_t __a) { return (uint8x16_t)__a.val; }
-
-template <typename ToV, typename FromV> inline ToV vcast(const FromV &v) { return v; }
-template <> inline uint8x16_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
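-  // Any non-zero byte compares to all-ones (0xFF); shifting right by 7 normalizes it to 1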
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- return vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-}
-
-template <> inline uint32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
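-  // Widen the 16 normalized bytes to four uint32x4 vectors (u8 -> u16 -> u32)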
- const uint32x4x4_t ret = {{
- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb)))),
- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb)))),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
- const int32x4x4_t ret = {{
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const bool8x16_t &v)
-{
- const uint8x16_t vu8 = vreinterpretq_u8_b8(v);
- const uint8x16_t zero_uint8x16 = vdupq_n_u8(0);
- uint8x16_t mask = vcgtq_u8(vu8, zero_uint8x16);
- uint8x16_t vb = vshrq_n_u8(mask, 7); // true -> 1, false -> 0
-
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(vb))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(vb))))),
- }};
-
- return ret;
-}
-
-template <> inline uint32x4x4_t vcast(const uint8x16_t &v)
-{
- const uint32x4x4_t ret = {{
- vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v)))),
- vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v)))),
- vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v)))),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const uint8x16_t &v)
-{
- const int32x4x4_t ret = {{
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
- vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const uint8x16_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(v))))),
- vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(v))))),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const int32x4x4_t &v)
-{
- // Saturate cast
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[0]), vqmovun_s32(v.val[1]))),
- vqmovn_u16(vcombine_u16(vqmovun_s32(v.val[2]), vqmovun_s32(v.val[3]))));
-}
-
-template <> inline uint32x4x4_t vcast(const int32x4x4_t &v)
-{
- // Saturate cast
- const uint32x4x4_t ret = {{
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[0]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[0])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[1]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[1])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[2]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[2])))),
- vcombine_u32(vqmovun_s64(vmovl_s32(vget_low_s32(v.val[3]))),
- vqmovun_s64(vmovl_s32(vget_high_s32(v.val[3])))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const int32x4x4_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_s32(v.val[0]), vcvtq_f32_s32(v.val[1]), vcvtq_f32_s32(v.val[2]),
- vcvtq_f32_s32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const uint32x4x4_t &v)
-{
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[0]), vqmovn_u32(v.val[1]))),
- vqmovn_u16(vcombine_u16(vqmovn_u32(v.val[2]), vqmovn_u32(v.val[3]))));
-}
-
-template <> inline int32x4x4_t vcast(const uint32x4x4_t &v)
-{
- const int32x4x4_t ret = {{
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[0])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[0]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[1])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[1]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[2])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[2]))))),
- vcombine_s32(vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_low_u32(v.val[3])))),
- vmovn_s64(vreinterpretq_s64_u64(vmovl_u32(vget_high_u32(v.val[3]))))),
- }};
-
- return ret;
-}
-
-template <> inline float32x4x4_t vcast(const uint32x4x4_t &v)
-{
- const float32x4x4_t ret = {{
- vcvtq_f32_u32(v.val[0]), vcvtq_f32_u32(v.val[1]), vcvtq_f32_u32(v.val[2]),
- vcvtq_f32_u32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline uint8x16_t vcast(const float32x4x4_t &v)
-{
- // Saturate cast
- return vcombine_u8(vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[0])),
- vqmovun_s32(vcvtq_s32_f32(v.val[1])))),
- vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(v.val[2])),
- vqmovun_s32(vcvtq_s32_f32(v.val[3])))));
-}
-
-template <> inline uint32x4x4_t vcast(const float32x4x4_t &v)
-{
- const uint32x4x4_t ret = {{
- vcvtq_u32_f32(v.val[0]), vcvtq_u32_f32(v.val[1]), vcvtq_u32_f32(v.val[2]),
- vcvtq_u32_f32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <> inline int32x4x4_t vcast(const float32x4x4_t &v)
-{
- const int32x4x4_t ret = {{
- vcvtq_s32_f32(v.val[0]), vcvtq_s32_f32(v.val[1]), vcvtq_s32_f32(v.val[2]),
- vcvtq_s32_f32(v.val[3]),
- }};
-
- return ret;
-}
-
-template <typename T> struct cast_vector;
-template <> struct cast_vector<bool>
-{
- using type = bool8x16_t;
-};
-template <> struct cast_vector<uint8_t>
-{
- using type = uint8x16_t;
-};
-template <> struct cast_vector<uint32_t>
-{
- using type = uint32x4x4_t;
-};
-template <> struct cast_vector<int32_t>
-{
- using type = int32x4x4_t;
-};
-template <> struct cast_vector<float>
-{
- using type = float32x4x4_t;
-};
-
-template <typename T> inline void store_result(T *ptr, const typename cast_vector<T>::type &v)
-{
- wrapper::vstore(ptr, v.val[0]);
- wrapper::vstore(ptr + 4, v.val[1]);
- wrapper::vstore(ptr + 8, v.val[2]);
- wrapper::vstore(ptr + 12, v.val[3]);
-}
-
-template <> inline void store_result<uint8_t>(uint8_t *ptr, const uint8x16_t &v)
-{
- wrapper::vstore(ptr, v);
-}
-
-inline bool8x16_t vloadq(const bool *ptr)
-{
- bool8x16_t ret;
- ret.val = wrapper::vloadq(reinterpret_cast<const uint8_t *>(ptr));
- return ret;
-}
-
-template <typename T> inline typename cast_vector<T>::type load_input(const T *ptr)
-{
- return wrapper::vloadq(ptr);
-}
-
-template <> inline typename cast_vector<bool>::type load_input(const bool *ptr)
-{
- return vloadq(ptr);
-}
-
-template <> inline typename cast_vector<uint32_t>::type load_input(const uint32_t *ptr)
-{
- return vld4q_u32(ptr);
-}
-
-template <> inline typename cast_vector<int32_t>::type load_input(const int32_t *ptr)
-{
- return vld4q_s32(ptr);
-}
-
-template <> inline typename cast_vector<float>::type load_input(const float *ptr)
-{
- return vld4q_f32(ptr);
-}
-
-template <typename T> inline T get_value(const T *ptr) { return *ptr; }
-
-template <> inline bool get_value(const bool *ptr)
-{
- bool ret = (*ptr != 0);
- return ret;
-}
-
-template <typename FromT> void run_cast(const ITensor *input, ITensor *output, const Window &window)
-{
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
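-  // AArch64 NEON supports round-to-nearest float-to-int conversion; ARMv7 truncates toward zero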
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto in_ptr = reinterpret_cast<const FromT *>(in.ptr());
-
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- using from_vector = typename cast_vector<FromT>::type;
- const from_vector vin = load_input(in_ptr + x);
-
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- using to_vector = typename cast_vector<uint8_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::QASYMM8:
- {
- using to_vector = typename cast_vector<float>::type;
- const UniformQuantizationInfo &qinfo_out =
- output->info()->quantization_info().uniform();
- const auto vf = vcast<to_vector, from_vector>(vin);
- const auto vout = vquantize(vf, qinfo_out);
- store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::U32:
- {
- using to_vector = typename cast_vector<uint32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::S32:
- {
- using to_vector = typename cast_vector<int32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::F32:
- {
- using to_vector = typename cast_vector<float>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- FromT val = get_value(in_ptr + x);
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
- break;
- }
- case DataType::QASYMM8:
- {
- const QuantizationInfo &qinfo_out = output->info()->quantization_info();
- const auto qval =
- quantize_qasymm8(static_cast<float>(val), qinfo_out, rounding_policy);
- *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval;
- break;
- }
- case DataType::U32:
- {
- *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
- break;
- }
- case DataType::S32:
- {
- *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
- break;
- }
- case DataType::F32:
- {
- *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
- },
- in, out);
-}
-
-void run_cast_qasymm8(const ITensor *input, ITensor *output, const Window &window)
-{
- const int window_step_x = 16;
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- // Collapse window and reset first dimension to handle tail calculations manually
- Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
- win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- // Create iterators
- Iterator in(input, win_collapsed);
- Iterator out(output, win_collapsed);
-
-#ifdef __aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else //__aarch64__
- constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
- const auto &qinfo_in = input->info()->quantization_info().uniform();
- const auto &qinfo_out = output->info()->quantization_info().uniform();
-
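-  // Cast via float: dequantize the QASYMM8 input, then convert to the destination type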
- execute_window_loop(
- win_collapsed,
- [&](const Coordinates &) {
- const auto in_ptr = reinterpret_cast<const qasymm8_t *>(in.ptr());
-
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- using from_vector = typename cast_vector<float>::type;
- const auto vf = wrapper::vloadq(in_ptr + x);
- const auto vin = vdequantize(vf, qinfo_in);
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- using to_vector = typename cast_vector<uint8_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint8_t>(reinterpret_cast<uint8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::QASYMM8:
- {
- using to_vector = typename cast_vector<float>::type;
- const auto vf = vcast<to_vector, from_vector>(vin);
- const auto vout = vquantize(vf, qinfo_out);
- store_result<qasymm8_t>(reinterpret_cast<qasymm8_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::U32:
- {
- using to_vector = typename cast_vector<uint32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<uint32_t>(reinterpret_cast<uint32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::S32:
- {
- using to_vector = typename cast_vector<int32_t>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<int32_t>(reinterpret_cast<int32_t *>(out.ptr()) + x, vout);
- break;
- }
- case DataType::F32:
- {
- using to_vector = typename cast_vector<float>::type;
- const to_vector vout = vcast<to_vector, from_vector>(vin);
- store_result<float>(reinterpret_cast<float *>(out.ptr()) + x, vout);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
-
- // Compute left-over elements
- for (; x < window_end_x; ++x)
- {
- qasymm8_t qval_in = *(in_ptr + x);
- const auto val = dequantize_qasymm8(qval_in, qinfo_in);
-
- switch (output->info()->data_type())
- {
- case DataType::U8:
- {
- *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = static_cast<uint8_t>(val);
- break;
- }
- case DataType::QASYMM8:
- {
- const auto qval_out = quantize_qasymm8(val, qinfo_out, rounding_policy);
- *(reinterpret_cast<qasymm8_t *>(out.ptr()) + x) = qval_out;
- break;
- }
- case DataType::U32:
- {
- *(reinterpret_cast<uint32_t *>(out.ptr()) + x) = static_cast<uint32_t>(val);
- break;
- }
- case DataType::S32:
- {
- *(reinterpret_cast<int32_t *>(out.ptr()) + x) = static_cast<int32_t>(val);
- break;
- }
- case DataType::F32:
- {
- *(reinterpret_cast<float *>(out.ptr()) + x) = static_cast<float>(val);
- break;
- }
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
- }
- },
- in, out);
-}
-} // namespace
-
-NECastKernel::NECastKernel() : _input(nullptr), _output(nullptr), _input_subtype(SubDataType::NONE)
-{
-}
-
-void NECastKernel::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), input_subtype));
-
- _input = input;
- _output = output;
- _input_subtype = input_subtype;
-
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), output->info());
-
- ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-
- INEKernel::configure(std::get<1>(win_config));
-}
-
-Status NECastKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, input_subtype));
- ARM_COMPUTE_RETURN_ON_ERROR(
- std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
- return Status{};
-}
-
-void NECastKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- switch (_input->info()->data_type())
- {
- case DataType::U8:
- if (_input_subtype == SubDataType::BOOL)
- {
- run_cast<bool>(_input, _output, window);
- }
- else
- {
- run_cast<uint8_t>(_input, _output, window);
- }
- break;
- case DataType::QASYMM8:
- run_cast_qasymm8(_input, _output, window);
- break;
- case DataType::U32:
- run_cast<uint32_t>(_input, _output, window);
- break;
- case DataType::S32:
- run_cast<int32_t>(_input, _output, window);
- break;
- case DataType::F32:
- run_cast<float>(_input, _output, window);
- break;
- default:
- ARM_COMPUTE_ERROR("Unsupported data type.");
- }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
deleted file mode 100644
index 95e269dee..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 2);
-
- const DataLayout data_layout = input->data_layout();
- const int idx_channel =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
- 0);
- // Validate output if initialized
- if (output->total_size() != 0)
- {
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
- (block_shape * input->tensor_shape()[idx_width]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
- (block_shape * input->tensor_shape()[idx_height]));
- ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-NEDepthToSpaceLayerKernelEx::NEDepthToSpaceLayerKernelEx()
- : _input(nullptr), _output(nullptr), _block_shape()
-{
-}
-
-void NEDepthToSpaceLayerKernelEx::configure(const ITensor *input, ITensor *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
- TensorShape output_shape = compute_depth_to_space_shape_ex(input->info(), block_shape);
- // Output auto initialization if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
-
- _input = input;
- _output = output;
- _block_shape = block_shape;
-
- // Configure kernel window
- Window win = calculate_max_window(*input->info(), Steps());
- ICPPKernel::configure(win);
-}
-
-Status NEDepthToSpaceLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
- return Status{};
-}
-
-void NEDepthToSpaceLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- const int idx_channel =
- get_data_layout_dimension_index(_input->info()->data_layout(), DataLayoutDimension::CHANNEL);
- const int depth_size = _input->info()->dimension(idx_channel);
- const int r = (depth_size / (_block_shape * _block_shape));
- const int element_size = _input->info()->element_size();
-
- Window slice_out = window.first_slice_window_3D();
-
- // The slice_out window does not move; output coordinates are computed explicitly per element
- slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
- slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
-
- // Main loop for NCHW and NHWC
- if (_input->info()->data_layout() == DataLayout::NCHW)
- {
- Window slice_in = window.first_slice_window_2D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in,
- [&](const Coordinates &id) {
- const int x = id.x();
- const int y = id.y();
-
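-                       // The channel index encodes both the output channel (id.z() % r) and the
-                       // position inside the block_shape x block_shape output tile (id.z() / r)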
- const int z = id.z() % r;
- const int out_x = x * _block_shape + (id.z() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.z() / r) / _block_shape;
- Coordinates output_coords{out_x, out_y, z, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_2D(slice_in));
- }
- else
- {
- Window slice_in = window.first_slice_window_3D();
- do
- {
- Iterator in(_input, slice_in);
- execute_window_loop(slice_in,
- [&](const Coordinates &id) {
- const int x = id.y();
- const int y = id.z();
-
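-                       // In NHWC the channel is dimension 0, so the block offsets come from id.x()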
- const int z = id.x() % r;
- const int out_x = x * _block_shape + (id.x() / r) % _block_shape;
- const int out_y = y * _block_shape + (id.x() / r) / _block_shape;
- Coordinates output_coords{z, out_x, out_y, id[3]};
- memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
- },
- in);
- } while (window.slide_window_slice_3D(slice_in));
- }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
deleted file mode 100644
index 200fc4f87..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEElementwiseUnaryKernelEx.cpp
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEElementwiseUnaryKernelEx.h"
-
-#include "arm_compute/core/CPP/Validate.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/IAccessWindow.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEFixedPoint.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
-
-#include <algorithm>
-#include <arm_neon.h>
-#include <cstdint>
-#include <map>
-#include <string>
-
-namespace arm_compute
-{
-class Coordinates;
-
-namespace
-{
-template <ElementWiseUnaryEx op, typename ScalarType>
-inline ScalarType elementwise_op_scalar(const ScalarType &a)
-{
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- return -a;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <ElementWiseUnaryEx op, typename VectorType>
-inline VectorType elementwise_op(const VectorType &a)
-{
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- return wrapper::vneg(a);
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-template <ElementWiseUnaryEx op, typename ScalarType>
-void elementwise_op(const ITensor *in, ITensor *out, const Window &window)
-{
- const int window_step_x = 16 / sizeof(ScalarType);
- const auto window_start_x = static_cast<int>(window.x().start());
- const auto window_end_x = static_cast<int>(window.x().end());
-
- Window win = window;
- win.set(Window::DimX, Window::Dimension(0, 1, 1));
-
- Iterator input(in, win);
- Iterator output(out, win);
-
- execute_window_loop(win,
- [&](const Coordinates &) {
- auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
- const auto input_ptr = reinterpret_cast<const ScalarType *>(input.ptr());
-
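-                        // Process full vectors first; the scalar loop below handles the tail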
- int x = window_start_x;
- for (; x <= window_end_x - window_step_x; x += window_step_x)
- {
- wrapper::vstore(output_ptr + x,
- elementwise_op<op>(wrapper::vloadq(input_ptr + x)));
- }
- for (; x < window_end_x; ++x)
- {
- *(output_ptr + x) = elementwise_op_scalar<op>(*(input_ptr + x));
- }
- },
- input, output);
-}
-
-template <ElementWiseUnaryEx op>
-std::function<void(const ITensor *input, ITensor *output, const Window &window)>
-configure_func(const ITensor *input, ITensor *output)
-{
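-  // Build a lookup key such as "op_F32_F32" from the input and output data types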
- std::string function_to_call("op_");
- function_to_call += string_from_data_type(input->info()->data_type()) + "_";
- function_to_call += string_from_data_type(output->info()->data_type());
-
- static std::map<std::string, NEElementwiseUnaryKernelEx::ElementwiseUnaryFunction *>
- map_function = {
- {"op_F32_F32", &elementwise_op<op, float>}, {"op_S32_S32", &elementwise_op<op, int32_t>},
- };
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- map_function["op_F16_F16"] = &elementwise_op<op, float16_t>;
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
- auto it = map_function.find(function_to_call);
-
- if (it != map_function.end())
- {
- auto func = it->second;
- return [func](const ITensor *input, ITensor *output, const Window &window) {
- func(input, output, window);
- };
- }
- return nullptr;
-}
-} // namespace
-
-NEElementwiseUnaryKernelEx::NEElementwiseUnaryKernelEx()
- : _function(nullptr), _input(nullptr), _output(nullptr)
-{
-}
-
-void NEElementwiseUnaryKernelEx::configure(ElementWiseUnaryEx op, const ITensor *input,
- ITensor *output)
-{
-  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *output->info()));
-
- // Configure kernel window
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info());
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
-
- Window win = calculate_max_window(valid_region);
-
- _input = input;
- _output = output;
-
- INEKernel::configure(win);
-
- switch (op)
- {
- case ElementWiseUnaryEx::NEG:
- _function = configure_func<ElementWiseUnaryEx::NEG>(input, output);
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
-}
-
-Status NEElementwiseUnaryKernelEx::validate_arguments(const ITensorInfo &input,
- const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::F16, DataType::F32,
- DataType::S32);
-
- // Validate in case of configured output
- if (output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
- }
-
- return Status{};
-}
-
-Status NEElementwiseUnaryKernelEx::validate(ElementWiseUnaryEx op, const ITensorInfo *input,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_UNUSED(op);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *output));
- return Status{};
-}
-
-void NEElementwiseUnaryKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
- ARM_COMPUTE_ERROR_ON(_function == nullptr);
- _function(_input, _output, window);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
deleted file mode 100644
index 641641b5a..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEPReLUKernel.cpp
+++ /dev/null
@@ -1,291 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
-
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/NEAsymm.h"
-#include "arm_compute/core/NEON/NEElementwiseOperationFuncs.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Window.h"
-
-#include <arm_neon.h>
-
-using namespace arm_compute;
-namespace
-{
-
-/** Conditional element-wise operations */
-enum class ConditionalOperation
-{
- PRELU, /**< (x * y) for x < 0, x for x >= 0 */
-};
-
-template <ConditionalOperation op, typename ScalarType>
-inline ScalarType elementwise_conditional_op_scalar(const ScalarType &a, const ScalarType &b)
-{
- auto res = ScalarType(0);
-
- switch (op)
- {
- case ConditionalOperation::PRELU:
- res = a < 0 ? a * b : a;
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ConditionalOperation op>
-inline uint8_t elementwise_conditional_op_quantized_scalar(const float &a, const float &b,
- QuantizationInfo qinfo)
-{
- return quantize_qasymm8(elementwise_conditional_op_scalar<op>(a, b), qinfo,
- RoundingPolicy::TO_NEAREST_UP);
-}
-
-template <ConditionalOperation op, typename VectorType>
-inline VectorType elementwise_conditional_op(const VectorType &a, const VectorType &b)
-{
- VectorType res = {0, 0, 0, 0};
- VectorType const_0 = {0, 0, 0, 0};
-
- switch (op)
- {
- case ConditionalOperation::PRELU:
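-      // Select a where a > 0, otherwise a * b (the alpha slope)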
-      res = wrapper::vbsl(wrapper::vcgt(a, const_0), a, wrapper::vmul(a, b));
- break;
- default:
- ARM_COMPUTE_ERROR("NOT_SUPPORTED!");
- }
- return res;
-}
-
-template <ConditionalOperation op>
-inline float32x4x4_t elementwise_conditional_op(const float32x4x4_t &a, const float32x4x4_t &b)
-{
- float32x4x4_t out = {{
- elementwise_conditional_op<op>(a.val[0], b.val[0]),
- elementwise_conditional_op<op>(a.val[1], b.val[1]),
- elementwise_conditional_op<op>(a.val[2], b.val[2]),
- elementwise_conditional_op<op>(a.val[3], b.val[3]),
- }};
- return out;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline VectorType elementwise_conditional_op_broadcast(const VectorType &a,
- const ScalarType &broadcast_value,
- const bool reorder)
-{
- VectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
- return elementwise_conditional_op<op>(reorder ? broadcast_vector : a,
- reorder ? a : broadcast_vector);
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_conditional_op_loop(int window_start_x, int window_end_x, int window_step_x,
- const ScalarType *input1_ptr,
- const ScalarType *input2_ptr, ScalarType *output_ptr)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq(input1_ptr + x);
- const auto b = wrapper::vloadq(input2_ptr + x);
- wrapper::vstore(output_ptr + x, elementwise_conditional_op<op>(a, b));
- }
- return x;
-}
-
-template <ConditionalOperation op>
-inline int elementwise_conditional_op_quantized_loop(int window_start_x, int window_end_x,
- int window_step_x, const uint8_t *input1_ptr,
- const uint8_t *input2_ptr, uint8_t *output_ptr,
- int32x4_t voffset1, int32x4_t voffset2,
- float32x4_t vscale1, float32x4_t vscale2,
- float32x4_t voffseto, float32x4_t invvscaleo)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- // Get inputs and compute output
- const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
- const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
- const float32x4x4_t rf = elementwise_conditional_op<op>(af, bf);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_conditional_op_broadcast_loop(int window_start_x, int window_end_x,
- int window_step_x,
- const ScalarType *non_broadcast_input_ptr,
- const ScalarType &broadcast_value,
- ScalarType *output_ptr, const bool reorder)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
- wrapper::vstore(output_ptr + x,
- elementwise_conditional_op_broadcast<op>(a, broadcast_value, reorder));
- }
- return x;
-}
-
-template <ConditionalOperation op>
-inline int elementwise_conditional_op_quantized_broadcast_loop(
- int window_start_x, int window_end_x, int window_step_x, const uint8_t *non_broadcast_input_ptr,
- float32x4x4_t broadcast_vector, uint8_t *output_ptr, int32x4_t voffset_non_broadcast,
- float32x4_t vscale_non_broadcast, float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
-{
- int x = window_start_x;
- for (; x <= (window_end_x - window_step_x); x += window_step_x)
- {
- const float32x4x4_t af =
- load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
- const float32x4x4_t rf = elementwise_conditional_op<op>(reorder ? broadcast_vector : af,
- reorder ? af : broadcast_vector);
- store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
- }
- return x;
-}
-
-template <ConditionalOperation op, typename ScalarType, typename VectorType>
-void elementwise_conditional_op(const ITensor *in1, const ITensor *in2, ITensor *out,
- const Window &window)
-{
- elementwise_op(in1, in2, out, window, &elementwise_conditional_op_scalar<op, ScalarType>,
- &elementwise_conditional_op_broadcast_loop<op, ScalarType, VectorType>,
- &elementwise_conditional_op_loop<op, ScalarType, VectorType>);
-}
-
-template <ConditionalOperation op>
-void elementwise_conditional_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out,
- const Window &window)
-{
- elementwise_op_quantized(in1, in2, out, window, &elementwise_conditional_op_quantized_scalar<op>,
- &elementwise_conditional_op_quantized_broadcast_loop<op>,
- &elementwise_conditional_op_quantized_loop<op>);
-}
-} // namespace
-
-NEPReLUKernel::NEPReLUKernel() : _input(nullptr), _alpha(nullptr), _output(nullptr) {}
-
-void NEPReLUKernel::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, alpha, output);
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input->info(), *alpha->info(), *output->info()));
-
- // Configure kernel window
- const std::pair<TensorShape, ValidRegion> broadcast_pair =
- ITensorInfo::broadcast_shape_and_valid_region(*input->info(), *alpha->info());
- const TensorShape &out_shape = broadcast_pair.first;
- const ValidRegion &valid_region = broadcast_pair.second;
-
- // Auto initialize output if not initialized
- auto_init_if_empty(*output->info(), out_shape, 1, input->info()->data_type());
-
- Window win = calculate_max_window(valid_region);
-
- _input = input;
- _alpha = alpha;
- _output = output;
- INEKernel::configure(win);
-}
-
-void NEPReLUKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
-
- if (_input->info()->data_type() == DataType::F32)
- {
- elementwise_conditional_op<ConditionalOperation::PRELU, float, float32x4_t>(_input, _alpha,
- _output, window);
- }
- else if (_input->info()->data_type() == DataType::QASYMM8)
- {
- elementwise_conditional_op_quantized<ConditionalOperation::PRELU>(_input, _alpha, _output,
- window);
- }
- else
- {
- ARM_COMPUTE_ERROR("Wrong Type");
- }
-}
-
-Status NEPReLUKernel::validate_arguments(const ITensorInfo &input, const ITensorInfo &alpha,
- const ITensorInfo &output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &alpha, &output);
-
- const TensorShape out_shape =
- TensorShape::broadcast_shape(input.tensor_shape(), alpha.tensor_shape());
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0,
- "Inputs are not broadcast compatible");
-
- // Checks performed when output is configured
- if (output.total_size() > 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
- "Wrong shape for output");
- }
-
- return Status{};
-}
-
-Status NEPReLUKernel::validate(const ITensorInfo *input, const ITensorInfo *alpha,
- const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, alpha, output);
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input, *alpha, *output));
-
- return Status{};
-}
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
index 6ba0f1fd4..5841f1d69 100644
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
+++ b/compute/ARMComputeEx/src/core/NEON/kernels/NEQuantizationSymmetricKernel.cpp
@@ -64,7 +64,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scale_factor, 1, DataType::F16,
DataType::F32);
diff --git a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp b/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
deleted file mode 100644
index 44feb200f..000000000
--- a/compute/ARMComputeEx/src/core/NEON/kernels/NESpaceToDepthLayerKernelEx.cpp
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/wrapper/wrapper.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include <arm_neon.h>
-#include <cstdint>
-
-using namespace arm_compute::misc::shape_calculator;
-
-namespace arm_compute
-{
-namespace
-{
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
-
- ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
-
- // Validate output if initialized
- if (output->total_size() != 0)
- {
- const DataLayout data_layout = input->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_batch =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] % block_shape != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] % block_shape != 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] !=
- output->tensor_shape()[idx_batch]);
- ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_channel] % (block_shape * block_shape) !=
- 0);
- ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().total_size() !=
- output->tensor_shape().total_size());
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- }
-
- return Status{};
-}
-} // namespace
-
-NESpaceToDepthLayerKernelEx::NESpaceToDepthLayerKernelEx()
- : _input(nullptr), _output(nullptr), _block_shape()
-{
-}
-
-void NESpaceToDepthLayerKernelEx::configure(const ITensor *input, ITensor *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- TensorShape output_shape = compute_space_to_depth_shape_ex(input->info(), block_shape);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
-
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
-
- _input = input;
- _block_shape = block_shape;
- _output = output;
-
- // Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps());
- INEKernel::configure(win);
-}
-
-Status NESpaceToDepthLayerKernelEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, block_shape));
- return Status{};
-}
-
-void NESpaceToDepthLayerKernelEx::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
-
- const DataLayout data_layout = _input->info()->data_layout();
- const int channel_idx =
- get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int element_size = _input->info()->element_size();
-
- const size_t channel_size = _input->info()->dimension(channel_idx);
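-  // Output channel c gathers from input channel (c % channel_size) at block offset (c / channel_size)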
-
- Window slice_out = window.first_slice_window_3D();
-
- int batch_id = 0;
-
- // Main loop for NCHW and NHWC
- if (_output->info()->data_layout() == DataLayout::NCHW)
- {
- do
- {
- Iterator out(_output, slice_out);
- execute_window_loop(slice_out,
- [&](const Coordinates &id) {
- const size_t channel_id = id.z();
- const size_t in_x =
- id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y =
- id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{in_x, in_y, z, batch_id};
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
- ++batch_id;
- } while (window.slide_window_slice_3D(slice_out));
- }
- else
- {
- do
- {
- Iterator out(_output, slice_out);
- execute_window_loop(slice_out,
- [&](const Coordinates &id) {
- const size_t channel_id = id.x();
- const size_t in_x =
- id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
- const size_t in_y =
- id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
- const int z = channel_id % channel_size;
- Coordinates input_coords{z, in_x, in_y, batch_id};
- memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
- },
- out);
- ++batch_id;
- } while (window.slide_window_slice_3D(slice_out));
- }
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
deleted file mode 100644
index 2d379cf36..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLArgOperation.cpp
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLArgOperation.h"
-
-#include "arm_compute/core/CL/kernels/CLArgOperationKernel.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-
-namespace arm_compute
-{
-
-CLArgOperation::CLArgOperation()
-{
- // DO NOTHING
-}
-
-void CLArgOperation::configure(ICLTensor *input, ICLTensor *output, std::vector<uint32_t> axis,
- ArgOperation op)
-{
- ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), axis, output->info(), op));
- _input = input;
- _output = output;
- _axis = axis;
- _arg_op = op;
- // NOTE The axis list must not contain duplicate entries.
- _num_of_kernels = axis.size();
- const size_t num_of_interm_tensors = _num_of_kernels - 1;
-
- _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
- _argop_kernels =
- arm_compute::support::cpp14::make_unique<CLArgOperationKernel[]>(_num_of_kernels);
-
- TensorShape shape{input->info()->tensor_shape()};
- for (size_t i = 0; i < num_of_interm_tensors; i++)
- {
- shape.set(_axis[i], 1);
- _interm_tensors[i].allocator()->init(
- TensorInfo(shape, input->info()->num_channels(), input->info()->data_type())
- .set_data_layout(input->info()->data_layout()));
- _interm_tensors[i].allocator()->allocate();
- }
-
- // Build a vector of the ICLTensors in execution order: input, intermediates, output.
- std::vector<ICLTensor *> tensors;
- tensors.emplace_back(input);
- for (size_t i = 0; i < num_of_interm_tensors; i++)
- {
- tensors.emplace_back(_interm_tensors.get() + i);
- }
- tensors.emplace_back(output);
-
- // Apply ArgMinMax on all kernels
- for (size_t i = 0; i < _num_of_kernels; i++)
- {
- _argop_kernels[i].configure(tensors[i], tensors[i + 1], _axis[i], op);
- }
-}
-
-Status CLArgOperation::validate(const ITensorInfo *input, const std::vector<uint32_t> &axis,
- const ITensorInfo *output, ArgOperation op)
-{
- const size_t num_of_kernels = axis.size();
- const size_t num_of_interm_tensors = num_of_kernels - 1;
-
- // Create temporary tensor infos
- auto interm_tensors =
- arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
-
- // Create intermediate tensor info
- TensorShape shape{input->tensor_shape()};
-
- for (size_t i = 0; i < num_of_interm_tensors; i++)
- {
- shape.set(axis[i], 1);
- interm_tensors[i].set_data_type(input->data_type());
- interm_tensors[i].set_tensor_shape(shape);
- interm_tensors[i].set_num_channels(input->num_channels());
- }
-
- // Build a vector of the ITensorInfos in execution order: input, intermediates, output.
- std::vector<const ITensorInfo *> tensors;
- tensors.emplace_back(input);
- for (size_t i = 0; i < num_of_interm_tensors; i++)
- {
- tensors.emplace_back(interm_tensors.get() + i);
- }
- tensors.emplace_back(output);
-
- // Validate the argminmax operation for every kernel in the chain
- for (size_t i = 0; i < num_of_kernels; i++)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(
- CLArgOperationKernel::validate(tensors[i], tensors[i + 1], axis[i], op));
- }
-
- return Status{};
-}
-
-void CLArgOperation::run()
-{
- for (size_t i = 0; i < _num_of_kernels; ++i)
- {
- CLScheduler::get().enqueue(_argop_kernels[i]);
- }
-}
-
-} // namespace arm_compute
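
The removed CLArgOperation chained one CLArgOperationKernel per reduction axis; each intermediate tensor kept the input rank but collapsed the just-reduced axis to 1. A host-side sketch of that shape bookkeeping (standalone, no ACL types):

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

// Shapes of the num_of_interm_tensors intermediates; the last kernel writes
// directly to the output, so it produces no intermediate.
std::vector<std::vector<std::size_t>>
intermediate_shapes(std::vector<std::size_t> shape, const std::vector<std::uint32_t> &axes)
{
  std::vector<std::vector<std::size_t>> shapes;
  for (std::size_t i = 0; i + 1 < axes.size(); ++i)
  {
    shape[axes[i]] = 1; // reduced axis collapses to 1, rank is preserved
    shapes.push_back(shape);
  }
  return shapes;
}

int main()
{
  for (const auto &s : intermediate_shapes({4, 3, 2}, {0, 2}))
  {
    for (std::size_t d : s)
      std::cout << d << ' ';
    std::cout << '\n'; // prints "1 3 2" for the single intermediate tensor
  }
}
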
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
index 92ee69a36..e5122ab8f 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLBinaryLogicalOp.cpp
@@ -48,7 +48,7 @@ using namespace arm_compute;
void CLBinaryLogicalOp::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output,
BinaryLogicalOperation op)
{
- auto k = arm_compute::support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
+ auto k = support::cpp14::make_unique<CLBinaryLogicalOpKernel>();
k->configure(input1, input2, output, op);
_kernel = std::move(k);
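
This and the similar one-line hunks below (CLEmbeddingLookup, CLGatherEx, CLHashtableLookup, CLInstanceNormalizationLayerEx) only drop the redundant arm_compute:: qualifier now that support::cpp14::make_unique is supplied via support/MemorySupport.h; the underlying pattern is unchanged: configure() builds a single kernel and moves ownership into the simple-function base class. A minimal sketch of that pattern, using illustrative stand-in types (IKernel and SimpleFunction are not ACL's names):

#include <memory>
#include <utility>

struct IKernel
{
  virtual ~IKernel() = default;
  virtual void run() = 0;
};

struct SimpleFunction
{
  void run() { _kernel->run(); } // base class runs whatever kernel it owns

protected:
  std::unique_ptr<IKernel> _kernel;
};

struct NegateKernel : IKernel
{
  void configure(const int *in, int *out) { _in = in; _out = out; }
  void run() override { *_out = -*_in; }
  const int *_in = nullptr;
  int *_out = nullptr;
};

struct NegateFunction : SimpleFunction
{
  void configure(const int *in, int *out)
  {
    auto k = std::make_unique<NegateKernel>(); // stands in for support::cpp14::make_unique
    k->configure(in, out);
    _kernel = std::move(k); // hand ownership to the base class
  }
};
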
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
deleted file mode 100644
index b3118f39e..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLCast.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLCast.h"
-
-#include "arm_compute/core/CL/kernels/CLCastKernel.h"
-
-using namespace arm_compute;
-
-void CLCast::configure(ICLTensor *input, ICLTensor *output, SubDataType input_subtype)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLCastKernel>();
- k->configure(input, output, input_subtype);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
deleted file mode 100644
index db662505a..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLDepthToSpace.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLDepthToSpace.h"
-
-#include "arm_compute/core/CL/kernels/CLDepthToSpaceKernel.h"
-
-using namespace arm_compute;
-
-void CLDepthToSpace::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLDepthToSpaceKernel>();
- k->configure(input, output, block_size);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
new file mode 100644
index 000000000..3dede0562
--- /dev/null
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLDirectTransposeConvLayer.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Copyright (c) 2019-2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLDirectTransposeConvLayer.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/UtilsEx.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <memory>
+#include <tuple>
+
+namespace arm_compute
+{
+using namespace arm_compute::misc::shape_calculator;
+
+CLDirectTransposeConvLayer::CLDirectTransposeConvLayer(
+ std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
+ : _memory_group(std::move(memory_manager)),
+ _scale_f(),
+ _conv_f(),
+ _flip_weights(),
+ _scaled_output(),
+ _original_weights(nullptr),
+ _weights_flipped(),
+ _flip_axis(),
+ _is_prepared(false)
+{
+}
+
+Status CLDirectTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
+ const ITensorInfo *bias, ITensorInfo *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+ input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+ const DataLayout data_layout = input->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
+
+ auto out_dims = transposeconv_output_dimensions(
+ input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
+ weights->dimension(idx_h), info, invalid_right, invalid_bottom);
+
+ const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
+
+ if (bias != nullptr)
+ {
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
+ "Output's depth is invalid.");
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
+ pad_bottom);
+ TensorInfo scale_out_info(input->clone()
+ ->set_is_resizable(true)
+ .reset_padding()
+ .set_tensor_shape(scale_out_shape)
+ .set_data_layout(data_layout));
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
+ ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
+ conv_info, weights_info));
+
+ return Status{};
+}
+
+void CLDirectTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info,
+ invalid_right, invalid_bottom, weights_info);
+}
+
+void CLDirectTransposeConvLayer::configure(const CLCompileContext &compile_context,
+ ICLTensor *input, ICLTensor *weights,
+ const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &info, unsigned int invalid_right,
+ unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
+ const DataLayout data_layout = input->info()->data_layout();
+
+ const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+ _original_weights = weights;
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
+
+ auto out_dims = transposeconv_output_dimensions(
+ input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+ weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
+ invalid_bottom);
+
+ const TensorShape output_shape =
+ compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(
+ *output->info(),
+ input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(CLDirectTransposeConvLayer::validate(
+ input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
+
+ _is_prepared = weights_info.retain_internal_weights();
+
+ _memory_group.manage(&_scaled_output);
+
+ // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
+ // to match output shape
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ // Configure the upsample (scale) function
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+ _scale_f.configure(input, &_scaled_output, upsample_info);
+
+ // Set up the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+ _conv_f.configure(compile_context, &_scaled_output, &_weights_flipped, bias, output, conv_info,
+ weights_info);
+ _scaled_output.allocator()->allocate();
+
+ // Set up the flip axis data
+ _flip_axis.allocator()->allocate();
+ _flip_axis.map(true);
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ if (weights->info()->data_layout() == DataLayout::NHWC)
+ {
+ axis_data[0] = 1;
+ axis_data[1] = 2;
+ }
+ else
+ {
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+ }
+ _flip_axis.unmap();
+}
+
+void CLDirectTransposeConvLayer::run()
+{
+ prepare();
+
+ MemoryGroupResourceScope scope_mg(_memory_group);
+
+ _scale_f.run();
+ _conv_f.run();
+}
+
+void CLDirectTransposeConvLayer::prepare()
+{
+ if (!_is_prepared)
+ {
+ ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+
+ // Run weights flipping and mark original weights tensor as unused
+ _weights_flipped.allocator()->allocate();
+ _flip_weights.run();
+ _original_weights->mark_as_unused();
+
+ // Prepare convolution
+ _conv_f.prepare();
+
+ // Free flipped weights
+ if (!_weights_flipped.is_used())
+ {
+ _weights_flipped.allocator()->free();
+ }
+
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
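
The new layer realizes a transposed convolution as upsample-then-convolve: _scale_f inserts (stride - 1) zeros between input samples plus edge padding, and _conv_f runs an ordinary stride-1 convolution with the flipped weights. A 1-D numeric sketch of that pipeline, using the usual transposed-convolution size relation out = stride * (in - 1) + kernel; the padding math here is the textbook form, not copied from ShapeCalculatorEx:

#include <cassert>
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> transpose_conv_1d(const std::vector<float> &x,
                                     const std::vector<float> &w, int stride)
{
  const int k = static_cast<int>(w.size());
  const int pad = k - 1; // so a VALID stride-1 conv yields stride*(n-1)+k outputs

  // Upsample: (stride - 1) zeros between samples, plus zero padding at the edges.
  std::vector<float> up((x.size() - 1) * stride + 1 + 2 * pad, 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i)
    up[pad + i * stride] = x[i];

  // Ordinary stride-1 convolution with the flipped kernel (cf. _flip_weights).
  std::vector<float> y(up.size() - k + 1, 0.0f);
  for (std::size_t o = 0; o < y.size(); ++o)
    for (int j = 0; j < k; ++j)
      y[o] += up[o + j] * w[k - 1 - j];
  return y;
}

int main()
{
  auto y = transpose_conv_1d({1, 2}, {1, 1, 1}, 2);
  assert(y.size() == 2 * (2 - 1) + 3); // stride*(in-1)+kernel = 5
  for (float v : y)
    std::cout << v << ' '; // prints: 1 1 3 2 2
}
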
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
index 3d9a28a48..ae9d8afc6 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLEmbeddingLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
void CLEmbeddingLookup::configure(const ICLTensor *input, ICLTensor *output,
const ICLTensor *lookups)
{
- auto k = arm_compute::support::cpp14::make_unique<CLEmbeddingLookupKernel>();
+ auto k = support::cpp14::make_unique<CLEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
index f098832b0..01989461e 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedHybridLayer.cpp
@@ -45,7 +45,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <algorithm>
@@ -60,7 +60,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
ARM_COMPUTE_UNUSED(weights);
ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_RETURN_ON_ERROR(
- CLGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
+ CLGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
return Status{};
}
@@ -68,7 +68,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedHybridLayerReshapeWeights::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = support::cpp14::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -172,7 +172,8 @@ void CLFullyConnectedHybridLayer::configure(const ICLTensor *input, const ICLTen
// Quantize input
_quantized_input.allocator()->init(
- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
_memory_group.manage(&_quantized_input);
_quant_input_kernel.configure(input, &_scale_factor, &_quantized_input);
@@ -199,7 +200,7 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
@@ -256,8 +257,9 @@ Status CLFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
ARM_COMPUTE_RETURN_ON_ERROR(CLScaleFactorSymm8Kernel::validate(input, &scale_factor));
// Validate quantization symm8 kernel
- const ITensorInfo &quantized_input = TensorInfo(
- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ const ITensorInfo &quantized_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
ARM_COMPUTE_RETURN_ON_ERROR(
CLQuantizationSymmetricKernel::validate(input, &scale_factor, &quantized_input));
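
The S8 -> QASYMM8_SIGNED changes above retag the dynamically quantized input of the hybrid path: float activations are symmetrically quantized per tensor (cf. CLScaleFactorSymm8Kernel and CLQuantizationSymmetricKernel), multiplied against the signed 8-bit weights, and the integer accumulator is rescaled back to float. A scalar sketch of that scheme, illustrative only:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

float hybrid_dot(const std::vector<float> &input, const std::vector<int8_t> &weights,
                 float weight_scale)
{
  // Symmetric per-tensor scale factor for the input.
  float max_abs = 0.0f;
  for (float v : input)
    max_abs = std::max(max_abs, std::fabs(v));
  if (max_abs == 0.0f)
    return 0.0f;
  const float in_scale = max_abs / 127.0f;

  // Quantize on the fly and accumulate in int32.
  int32_t acc = 0;
  for (std::size_t i = 0; i < input.size(); ++i)
  {
    const auto q = static_cast<int8_t>(std::lround(input[i] / in_scale));
    acc += static_cast<int32_t>(q) * weights[i];
  }
  // Dequantize the accumulator back to float.
  return static_cast<float>(acc) * in_scale * weight_scale;
}
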
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
index 63e291b36..2ff4b9659 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedLayerEx.cpp
@@ -46,7 +46,7 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <algorithm>
@@ -141,7 +141,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void CLFullyConnectedLayerReshapeWeightsEx::configure(const ICLTensor *input, ICLTensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<CLTransposeKernel>();
+ auto k = support::cpp14::make_unique<CLTransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
index 9aebc473e..157b4d977 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLFullyConnectedReshapingLayer.cpp
@@ -53,18 +53,21 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
fc->configure(input_to_use, _weights, _biases, _output);
return std::unique_ptr<arm_compute::IFunction>(fc);
}
- else
+ else if (kernel_type == KernelType::PREPROCESSED_WEIGHTS)
{
- assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
-
bool is_hybrid = (input->info()->data_type() == DataType::F32 ||
input->info()->data_type() == DataType::F16) &&
- weights->info()->data_type() == DataType::S8;
+ (weights->info()->data_type() == DataType::S8 ||
+ weights->info()->data_type() == DataType::QASYMM8_SIGNED);
if (is_hybrid)
{
auto fc = new arm_compute::CLFullyConnectedHybridLayer{_memory_manager};
+ ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+      const auto original_weights_data_type = weights_info->data_type();
+ weights_info->set_data_type(DataType::QASYMM8_SIGNED);
fc->configure(input_to_use, _weights, _biases, _output);
+      weights_info->set_data_type(original_weights_data_type);
return std::unique_ptr<arm_compute::IFunction>(fc);
}
else
@@ -74,6 +77,11 @@ void CLFullyConnectedReshapingLayer::configure(const arm_compute::ICLTensor *inp
return std::unique_ptr<arm_compute::IFunction>(fc);
}
}
+ else
+ {
+ throw std::runtime_error("CLFullyConnectedReshapingLayer: Unsupported kernel type");
+ }
+
}();
if (_needs_reshape)
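
The hybrid branch above temporarily retags the S8 weights as QASYMM8_SIGNED for the duration of configure() and then restores the original type. One way to make that save/restore pattern exception-safe is a small RAII guard; this is an editorial sketch (Info stands for an ITensorInfo-like type), not code from the patch:

template <typename Info, typename Type> class DataTypeOverride
{
public:
  DataTypeOverride(Info &info, Type temporary) : _info(info), _saved(info.data_type())
  {
    _info.set_data_type(temporary); // apply the temporary type
  }
  ~DataTypeOverride() { _info.set_data_type(_saved); } // restore on scope exit

private:
  Info &_info;
  Type _saved;
};
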
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
deleted file mode 100644
index ca5499dfc..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCoreEx.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-
-namespace arm_compute
-{
-using namespace arm_compute::misc::shape_calculator;
-using namespace arm_compute::cl_gemm;
-
-namespace
-{
-inline bool is_gemm_reshaped(bool reshape_b_only_on_first_run, GPUTarget gpu_target)
-{
- return (get_arch_from_target(gpu_target) != GPUTarget::MIDGARD) && (reshape_b_only_on_first_run);
-}
-} // namespace
-
-CLGEMMLowpMatrixMultiplyCoreEx::CLGEMMLowpMatrixMultiplyCoreEx(
- std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _mm_midgard_kernel(), _mtx_a_reduction_kernel(),
- _mtx_b_reduction_kernel(), _vector_sum_col(), _vector_sum_row(), _a_offset(0), _b_offset(0),
- _reshape_b_only_on_first_run(false), _is_prepared(false)
-{
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::configure(const ICLTensor *a, const ICLTensor *b,
- const ICLTensor *c, ICLTensor *output,
- const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(c);
- ARM_COMPUTE_ERROR_THROW_ON(CLGEMMLowpMatrixMultiplyCoreEx::validate(
- a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
- _is_prepared = false;
- _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
- _a_offset = a->info()->quantization_info().uniform().offset;
- _b_offset = b->info()->quantization_info().uniform().offset;
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- // Set the target for the kernels
- _mm_midgard_kernel.set_target(gpu_target);
-
- // GEMMRHSMatrixInfo rhs_info;
- // GEMMLHSMatrixInfo lhs_info;
-
- // Arguments used by GEMMReshapeInfo
- // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m,
- // n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo
- // in order to know how the matrices have been reshaped
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m = reinterpret_input_as_3d
- ? (a->info()->dimension(1) * a->info()->dimension(2))
- : a->info()->dimension(1);
- const unsigned int n = b->info()->dimension(0);
- const unsigned int k = a->info()->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- const ICLTensor *matrix_b = b;
- // Configure matrix multiply kernel
- _mm_midgard_kernel.configure(
- a, matrix_b, output,
- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
-}
-
-Status CLGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output,
- const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_UNUSED(c);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
- "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
- "Matrix B already reshaped is not supported");
-
- const ITensorInfo *matrix_a_info = a;
-
- // Get the GPU target
- const GPUTarget gpu_target = CLScheduler::get().target();
-
- bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
- const unsigned int m =
- reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
- const unsigned int n = b->dimension(0);
- const unsigned int k = a->dimension(0);
- const int depth_output_gemm3d = gemm_info.depth_output_gemm3d();
-
- bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), gpu_target);
-
- const GEMMReshapeInfo reshape_info =
- GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
-
- TensorInfo weights_info(*b);
- const ITensorInfo *matrix_b_info = &weights_info;
- if (reshape_matrix_b)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(false,
- "CLGEMMLowpMatrixMultiplyCoreEx does not support reshape_b");
- }
-
- // Validate matrix multiply
- ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernelEx::validate(
- matrix_a_info, matrix_b_info, output, reshape_info));
-
- return Status{};
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Run matrix multiply
- CLScheduler::get().enqueue(_mm_midgard_kernel, false);
-}
-
-void CLGEMMLowpMatrixMultiplyCoreEx::prepare()
-{
- if (!_is_prepared)
- {
- _is_prepared = true;
- }
-}
-} // namespace arm_compute
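
The removed core derived its GEMM dimensions from ACL's dim-0-fastest tensor shapes: k from LHS dimension 0, n from RHS dimension 0, and m from LHS dimension 1, or dimensions 1 * 2 when a 3-D input is flattened (reinterpret_input_as_3d). A standalone restatement of that bookkeeping:

#include <array>
#include <cassert>

struct GemmSizes
{
  unsigned m, n, k;
};

GemmSizes gemm_sizes(const std::array<unsigned, 3> &a_shape, // {dim0, dim1, dim2}
                     const std::array<unsigned, 3> &b_shape,
                     bool reinterpret_input_as_3d)
{
  // m collapses dims 1 and 2 when the 3-D input is treated as a 2-D LHS.
  const unsigned m = reinterpret_input_as_3d ? a_shape[1] * a_shape[2] : a_shape[1];
  return {m, b_shape[0], a_shape[0]};
}

int main()
{
  const auto s = gemm_sizes({8, 4, 3}, {16, 8, 1}, true);
  assert(s.m == 12 && s.n == 16 && s.k == 8);
}
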
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
index f594d7a2e..e0b833b04 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLGatherEx.cpp
@@ -48,7 +48,7 @@ using namespace arm_compute;
void CLGatherEx::configure(const ICLTensor *input, const ICLTensor *indices, ICLTensor *output,
int axis)
{
- auto k = arm_compute::support::cpp14::make_unique<CLGatherExKernel>();
+ auto k = support::cpp14::make_unique<CLGatherExKernel>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
index 27ed8e828..65b89a389 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLHashtableLookup.cpp
@@ -47,7 +47,7 @@ using namespace arm_compute;
void CLHashtableLookup::configure(const ICLTensor *lookups, const ICLTensor *keys,
const ICLTensor *input, ICLTensor *output, ICLTensor *hits)
{
- auto k = arm_compute::support::cpp14::make_unique<CLHashtableLookupKernel>();
+ auto k = support::cpp14::make_unique<CLHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
index 80393e8d1..5a7e40839 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLInstanceNormalizationLayerEx.cpp
@@ -50,7 +50,7 @@ CLInstanceNormalizationLayerEx::CLInstanceNormalizationLayerEx() {}
void CLInstanceNormalizationLayerEx::configure(ICLTensor *input, ICLTensor *output,
ICLTensor *gamma, ICLTensor *beta, float epsilon)
{
- auto k = arm_compute::support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
+ auto k = support::cpp14::make_unique<CLInstanceNormalizationLayerKernelEx>();
k->configure(input, output, gamma, beta, epsilon);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
deleted file mode 100644
index fbb15ab1d..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLPReLU.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLPReLU.h"
-
-#include "arm_compute/core/CL/kernels/CLPReLUKernel.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-using namespace arm_compute;
-
-void CLPReLU::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLPReLUKernel>();
- k->configure(input, alpha, output);
- _kernel = std::move(k);
-
- if (output->info()->dimension(0) > 1)
- {
- ICLTensor *broadcasted_info = (input->info()->dimension(0) == 1) ? input : alpha;
-
- if (broadcasted_info->info()->dimension(0) == 1)
- {
- _border_handler.configure(broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
- }
- }
-}
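
Element-wise, the removed CLPReLU computes out = x > 0 ? x : alpha * x, with the border-handling branch above covering the case where one operand broadcasts along dimension 0. A scalar-broadcast sketch of the semantics (per-channel alpha omitted):

#include <cstddef>
#include <vector>

std::vector<float> prelu(const std::vector<float> &x, float alpha)
{
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    out[i] = x[i] > 0.0f ? x[i] : alpha * x[i]; // identity for positives, scaled otherwise
  return out;
}
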
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
deleted file mode 100644
index 6049b7e70..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLRNNLayerEx.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLRNNLayerEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-CLRNNLayerEx::CLRNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
- _gemm_output(), _add_output(), _is_prepared(false)
-{
-}
-
-Status CLRNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
- const ITensorInfo *hidden_state, const ITensorInfo *output,
- const ActivationLayerInfo &info)
-{
- const int idx_width = 0;
- const int idx_height = 1;
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
- output);
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
- recurrent_weights->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
- recurrent_weights->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
- hidden_state->tensor_shape());
-
- auto shape_info =
- TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
- input->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
- ARM_COMPUTE_RETURN_ON_ERROR(
- CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
- ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(
- ArithmeticOperation::ADD, &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
- ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayerKernel::validate(&shape_info, &shape_info, info));
-
- return Status{};
-}
-
-void CLRNNLayerEx::configure(const ICLTensor *input, const ICLTensor *weights,
- const ICLTensor *recurrent_weights, const ICLTensor *bias,
- ICLTensor *hidden_state, ICLTensor *output, ActivationLayerInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayerEx::validate(input->info(), weights->info(),
- recurrent_weights->info(), bias->info(),
- hidden_state->info(), output->info(), info));
-
- const int idx_height = 1;
- TensorShape shape =
- compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
-
- _is_prepared = false;
-
- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
-
- // Manage intermediate buffers and configure
- _memory_group.manage(&_fully_connected_out);
- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
-
- _memory_group.manage(&_gemm_output);
- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
-
- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
- _memory_group.manage(&_add_output);
-
- _add_kernel.configure(ArithmeticOperation::ADD, &_fully_connected_out, &_gemm_output,
- &_add_output, ConvertPolicy::SATURATE);
-
- _fully_connected_out.allocator()->allocate();
- _gemm_output.allocator()->allocate();
-
- _activation_kernel.configure(&_add_output, hidden_state, info);
- _add_output.allocator()->allocate();
-
- _copy_kernel.configure(hidden_state, output);
-}
-
-void CLRNNLayerEx::run()
-{
- prepare();
-
- _memory_group.acquire();
-
- _fully_connected_kernel.run();
- _gemm_state_f.run();
- CLScheduler::get().enqueue(_add_kernel);
- CLScheduler::get().enqueue(_activation_kernel);
-
- // copy hidden out to output
- CLScheduler::get().enqueue(_copy_kernel);
-
- _memory_group.release();
-}
-
-void CLRNNLayerEx::prepare()
-{
- if (!_is_prepared)
- {
- _fully_connected_kernel.prepare();
- _gemm_state_f.prepare();
-
- _is_prepared = true;
- }
-}
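
The removed CLRNNLayerEx wired up one basic RNN step: a fully connected layer on the input, a GEMM on the previous hidden state, a saturating add, an activation, and a copy of the new hidden state to the output. A plain reference computation of that step, h' = act(W*x + b + R*h); a sketch only, with tanh standing in for the configured ActivationLayerInfo and no shape checking:

#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> rnn_step(const std::vector<std::vector<float>> &W, // [out][in]
                            const std::vector<std::vector<float>> &R, // [out][out]
                            const std::vector<float> &b,
                            const std::vector<float> &x,
                            const std::vector<float> &h)
{
  std::vector<float> h_next(b); // start from the bias
  for (std::size_t o = 0; o < h_next.size(); ++o)
  {
    for (std::size_t i = 0; i < x.size(); ++i)
      h_next[o] += W[o][i] * x[i]; // fully connected part
    for (std::size_t j = 0; j < h.size(); ++j)
      h_next[o] += R[o][j] * h[j]; // recurrent GEMM part
    h_next[o] = std::tanh(h_next[o]); // activation kernel
  }
  return h_next; // copied to both hidden state and output in the original
}
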
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
index 8ce2d746c..a41e6db60 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLReduceOperation.cpp
@@ -60,8 +60,7 @@ Status CLReduceOperation::validate(const ITensorInfo *input, const ITensorInfo *
const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
// Create temporary tensor infos
- auto interm_tensors =
- arm_compute::support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
+ auto interm_tensors = support::cpp14::make_unique<TensorInfo[]>(num_of_interm_tensors);
// Create intermediate tensor info
TensorShape shape{input->tensor_shape()};
@@ -119,9 +118,8 @@ void CLReduceOperation::configure(ICLTensor *input, ICLTensor *output,
const size_t num_of_kernels = axis.size();
const size_t num_of_interm_tensors = num_of_kernels - (keep_dims ? 1 : 0);
- _interm_tensors = arm_compute::support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
- _reduce_kernels =
- arm_compute::support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
+ _interm_tensors = support::cpp14::make_unique<CLTensor[]>(num_of_interm_tensors);
+ _reduce_kernels = support::cpp14::make_unique<CLReduceOperationKernel[]>(num_of_kernels);
 // Build a vector of the ICLTensors in execution order: input, intermediates, output.
std::vector<ICLTensor *> tensors;
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
deleted file mode 100644
index 7d7b2264b..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLSpaceToDepth.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2016-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLSpaceToDepth.h"
-
-#include "arm_compute/core/CL/kernels/CLSpaceToDepthKernel.h"
-
-using namespace arm_compute;
-
-void CLSpaceToDepth::configure(ICLTensor *input, ICLTensor *output, const int32_t block_size)
-{
- auto k = arm_compute::support::cpp14::make_unique<CLSpaceToDepthKernel>();
- k->configure(input, output, block_size);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
index e61746ef2..3215d01a7 100644
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayer.cpp
@@ -15,7 +15,7 @@
*/
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,218 +37,124 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/runtime/CL/functions/CLTransposeConvLayer.h"
-#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
-#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/UtilsEx.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
+#include <cmath>
#include <memory>
#include <tuple>
using namespace arm_compute;
using namespace arm_compute::misc::shape_calculator;
-CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
- : _memory_group(std::move(memory_manager)),
- _scale_f(),
- _conv_f(),
- _flip_weights(),
- _scaled_output(),
- _original_weights(nullptr),
- _weights_flipped(),
- _is_prepared(false)
+CLTransposeConvLayer::CLTransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager)
+ : _memory_manager(std::move(memory_manager)), _function()
+{
+}
+
+void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
+ ICLTensor *output, const PadStrideInfo &deconv_info,
+ unsigned int invalid_right, unsigned int invalid_bottom,
+ const WeightsInfo &weights_info)
{
+ configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info,
+ invalid_right, invalid_bottom, weights_info);
+}
+
+void CLTransposeConvLayer::configure(const CLCompileContext &compile_context, ICLTensor *input,
+ ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+
+ switch (CLTransposeConvLayer::get_deconvolution_method(input->info(), weights->info(), nullptr,
+ output->info(), deconv_info, invalid_right,
+ invalid_bottom, weights_info))
+ {
+ case DeconvolutionMethod::DIRECT:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLDirectTransposeConvLayer>();
+ f->configure(compile_context, input, weights, bias, output, deconv_info, invalid_right,
+ invalid_bottom, weights_info);
+ _function = std::move(f);
+ break;
+ }
+ case DeconvolutionMethod::GEMM:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<CLGEMMDeconvolutionLayer>(_memory_manager);
+ f->configure(compile_context, input, weights, bias, output, deconv_info);
+ _function = std::move(f);
+ break;
+ }
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
+ }
}
Status CLTransposeConvLayer::validate(const ITensorInfo *input, const ITensorInfo *weights,
const ITensorInfo *bias, ITensorInfo *output,
- const PadStrideInfo &info, unsigned int invalid_right,
+ const PadStrideInfo &deconv_info, unsigned int invalid_right,
unsigned int invalid_bottom, const WeightsInfo &weights_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16,
- DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
-
- const DataLayout data_layout = input->data_layout();
-
- const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) != weights->dimension(idx_h));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
-
- const unsigned int kernel_x = weights->dimension(idx_w);
- const unsigned int kernel_y = weights->dimension(idx_h);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_right > kernel_x - 1,
- "invalid_right must be smaller than kernel_x");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(invalid_bottom > kernel_y - 1,
-                                  "invalid_bottom must be smaller than kernel_y");
-
- // NOTE invalid_right and invalid_bottom were added relative to the existing CLDeconvolutionLayer.
- auto out_dims = transposeconv_output_dimensions(
- input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w),
- weights->dimension(idx_h), info, invalid_right, invalid_bottom);
-
- const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
-
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, weights);
-
- if (bias != nullptr)
+ switch (CLTransposeConvLayer::get_deconvolution_method(
+ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info))
{
- if (is_data_type_quantized_asymmetric(input->data_type()))
+ case DeconvolutionMethod::DIRECT:
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+      // Validate the direct transpose convolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(CLDirectTransposeConvLayer::validate(
+ input, weights, bias, output, deconv_info, invalid_right, invalid_bottom, weights_info));
+ break;
}
- else
+ case DeconvolutionMethod::GEMM:
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+      // Validate the GEMM-based deconvolution layer
+ ARM_COMPUTE_RETURN_ON_ERROR(
+ CLGEMMDeconvolutionLayer::validate(input, weights, bias, output, deconv_info));
+ break;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, bias);
+ default:
+ ARM_COMPUTE_ERROR("Not supported.");
+ break;
}
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_w) != output_shape[idx_w],
- "Output's width is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_h) != output_shape[idx_h],
- "Output's height is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(idx_c) != output_shape[idx_c],
- "Output's depth is invalid.");
-
- unsigned int pad_left = 0;
- unsigned int pad_right = 0;
- unsigned int pad_top = 0;
- unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input, *weights, info, out_dims, invalid_right, invalid_bottom, pad_left, pad_right, pad_top,
- pad_bottom);
- TensorInfo scale_out_info(input->clone()
- ->set_is_resizable(true)
- .reset_padding()
- .set_tensor_shape(scale_out_shape)
- .set_data_layout(data_layout));
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
- ARM_COMPUTE_RETURN_ON_ERROR(
- CLTransposeConvLayerUpsample::validate(input, &scale_out_info, BorderSize(0, 0), info));
- ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output,
- conv_info, weights_info));
-
return Status{};
}
-void CLTransposeConvLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias,
- ICLTensor *output, const PadStrideInfo &info,
- unsigned int invalid_right, unsigned int invalid_bottom,
- const WeightsInfo &weights_info)
+DeconvolutionMethod CLTransposeConvLayer::get_deconvolution_method(
+ const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias,
+ ITensorInfo *output, const PadStrideInfo &deconv_info, unsigned int invalid_right,
+ unsigned int invalid_bottom, const WeightsInfo &weights_info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
+ ARM_COMPUTE_UNUSED(output, bias, weights_info);
- const DataLayout data_layout = input->info()->data_layout();
+ const DataLayout data_layout = input->data_layout();
const size_t idx_w = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- _original_weights = weights;
- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(weights, &_weights_flipped);
-
-  // NOTE invalid_right and invalid_bottom were added relative to the existing
-  // CLDeconvolutionLayer.
- auto out_dims = transposeconv_output_dimensions(
- input->info()->dimension(idx_w), input->info()->dimension(idx_h),
- weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info, invalid_right,
- invalid_bottom);
-
- const TensorShape output_shape =
- compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
-
- // Output auto initialization if not yet initialized
- auto_init_if_empty(
- *output->info(),
- input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(CLTransposeConvLayer::validate(
- input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
- info, invalid_right, invalid_bottom));
-
- _is_prepared = weights_info.retain_internal_weights();
-
- _memory_group.manage(&_scaled_output);
-
- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order
- // to match output shape
- unsigned int pad_left = 0;
- unsigned int pad_right = 0;
- unsigned int pad_top = 0;
- unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
- pad_right, pad_top, pad_bottom);
-
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
- input->info()->quantization_info());
- scale_out_info.set_data_layout(data_layout);
- _scaled_output.allocator()->init(scale_out_info);
-
- // configure scale function
- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
- DimensionRoundingType::FLOOR);
- _scale_f.configure(input, &_scaled_output, BorderSize(0, 0), upsample_info);
-
- // setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
- _scaled_output.allocator()->allocate();
+ if (weights->dimension(idx_w) != deconv_info.stride().first ||
+ weights->dimension(idx_h) != deconv_info.stride().second || invalid_right != 0 ||
+ invalid_bottom != 0)
+ {
+ return DeconvolutionMethod::DIRECT;
+ }
+
+ return DeconvolutionMethod::GEMM;
}
void CLTransposeConvLayer::run()
{
prepare();
-
- _memory_group.acquire();
-
- _scale_f.run();
- _conv_f.run();
-
- _memory_group.release();
+ _function->run();
}
-void CLTransposeConvLayer::prepare()
-{
- if (!_is_prepared)
- {
- ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
-
- // Run weights flipping and mark original weights tensor as unused
- _weights_flipped.allocator()->allocate();
- _weights_flipped.map(true);
- _original_weights->map(CLScheduler::get().queue(), true);
- CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
- _weights_flipped.unmap();
- _original_weights->unmap(CLScheduler::get().queue());
- _original_weights->mark_as_unused();
-
- // Prepare convolution
- _conv_f.prepare();
-
- if (!_weights_flipped.is_used())
- {
- _weights_flipped.allocator()->free();
- }
-
- _is_prepared = true;
- }
-}
+void CLTransposeConvLayer::prepare() { _function->prepare(); }
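The rewrite above turns CLTransposeConvLayer into a thin dispatcher: validate() and configure() forward to either CLDirectTransposeConvLayer or CLGEMMDeconvolutionLayer, and run()/prepare() simply delegate to the selected _function. The selection rule in get_deconvolution_method() can be restated standalone; a minimal sketch, assuming (as the code above encodes) that the GEMM path only covers the case where each kernel dimension equals its stride and no invalid border remains:

    #include <utility>

    enum class DeconvolutionMethod { DIRECT, GEMM };

    // Standalone restatement of get_deconvolution_method(): GEMM-based
    // deconvolution is only picked when the kernel size matches the stride in
    // both dimensions and there is no trimmed (invalid) border to compensate.
    DeconvolutionMethod select_method(unsigned int kernel_w, unsigned int kernel_h,
                                      const std::pair<unsigned int, unsigned int> &stride,
                                      unsigned int invalid_right, unsigned int invalid_bottom)
    {
      if (kernel_w != stride.first || kernel_h != stride.second || invalid_right != 0 ||
          invalid_bottom != 0)
      {
        return DeconvolutionMethod::DIRECT;
      }
      return DeconvolutionMethod::GEMM;
    }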
diff --git a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp b/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
deleted file mode 100644
index 07feb5a64..000000000
--- a/compute/ARMComputeEx/src/runtime/CL/functions/CLTransposeConvLayerUpsample.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2018 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CL/functions/CLTransposeConvLayerUpsample.h"
-
-#include "arm_compute/core/CL/OpenCL.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/core/CL/ICLTensor.h"
-
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
-CLTransposeConvLayerUpsample::CLTransposeConvLayerUpsample() // NOLINT
- : _upsample(),
- _output(nullptr)
-{
-}
-
-Status CLTransposeConvLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- return CLTransposeConvLayerUpsampleKernel::validate(input, output, inner_border, info);
-}
-
-void CLTransposeConvLayerUpsample::configure(ICLTensor *input, ICLTensor *output,
- const BorderSize &inner_border,
- const PadStrideInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- _output = output;
- _upsample.configure(input, _output, inner_border, info);
-}
-
-void CLTransposeConvLayerUpsample::run()
-{
- _output->map(CLScheduler::get().queue(), true);
- if (is_data_type_quantized_asymmetric(_output->info()->data_type()))
- {
- const uint8_t quantized_zero = _output->info()->quantization_info().uniform().offset;
- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
- }
- else
- {
- memset(_output->buffer(), 0, _output->info()->total_size());
- }
- _output->unmap(CLScheduler::get().queue());
-
- CLScheduler::get().enqueue(_upsample, false);
-}
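The deleted run() above is worth a note: before enqueueing the upsample kernel it pre-filled the output with the tensor's encoded zero, which for asymmetrically quantized data is the zero-point offset rather than the byte 0x00. A hedged restatement as a free function (hypothetical helper, not a library API):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Fill a raw buffer with the representation of "zero": for QASYMM8 tensors
    // the encoded zero is the quantization zero-point, not a zero byte.
    void fill_with_encoded_zero(uint8_t *buffer, std::size_t total_size,
                                bool is_quantized_asymmetric, uint8_t zero_point)
    {
      if (is_quantized_asymmetric)
        std::fill_n(buffer, total_size, zero_point);
      else
        std::memset(buffer, 0, total_size);
    }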
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
index 114e1a72d..768c15b41 100644
--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPOneHotEx.cpp
@@ -41,14 +41,14 @@
#include "arm_compute/runtime/CPP/functions/CPPOneHotEx.h"
#include "arm_compute/core/CPP/kernels/CPPOneHotKernelEx.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void CPPOneHotEx::configure(const ITensor *indices, const ITensor *depth, const ITensor *on_value,
const ITensor *off_value, ITensor *output, const int axis)
{
- auto k = arm_compute::support::cpp14::make_unique<CPPOneHotKernelEx>();
+ auto k = support::cpp14::make_unique<CPPOneHotKernelEx>();
k->configure(indices, depth, on_value, off_value, output, axis);
_kernel = std::move(k);
}
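This hunk (and the matching ones below for NEActivationLayerEx, NEBinaryLogicalOperation, NEEmbeddingLookup, NEGatherEx and NEHashtableLookup) only swaps the support header and drops the redundant arm_compute:: qualifier; behavior is unchanged. The shim these files rely on is presumably the usual pre-C++14 make_unique polyfill, along the lines of:

    #include <memory>
    #include <utility>

    namespace support
    {
    namespace cpp14
    {
    // Sketch of a C++11-compatible make_unique as such support headers
    // typically provide it (the exact contents of MemorySupport.h are assumed).
    template <typename T, typename... Args> std::unique_ptr<T> make_unique(Args &&... args)
    {
      return std::unique_ptr<T>(new T(std::forward<Args>(args)...));
    }
    } // namespace cpp14
    } // namespace support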
diff --git a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp b/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
deleted file mode 100644
index 6c90ef3b4..000000000
--- a/compute/ARMComputeEx/src/runtime/CPP/functions/CPPUpsampleEx.cpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/CPP/functions/CPPUpsampleEx.h"
-
-#include "arm_compute/core/CPP/kernels/CPPUpsampleKernelEx.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-
-void CPPUpsampleEx::configure(const ITensor *input, ITensor *output, const PadStrideInfo &info)
-{
- auto k = arm_compute::support::cpp14::make_unique<CPPUpsampleKernelEx>();
- k->configure(input, output, info);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
index ff81ff854..2752eb6aa 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEActivationLayerEx.cpp
@@ -42,7 +42,7 @@
#include "arm_compute/core/NEON/kernels/NEActivationLayerKernelEx.h"
#include "arm_compute/runtime/IRuntimeContext.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
namespace arm_compute
{
@@ -53,7 +53,7 @@ NEActivationLayerEx::NEActivationLayerEx(IRuntimeContext *ctx) // NOLINT
void NEActivationLayerEx::configure(ITensor *input, ITensor *output,
ActivationLayerInfo activation_info)
{
- auto k = arm_compute::support::cpp14::make_unique<NEActivationLayerKernelEx>();
+ auto k = support::cpp14::make_unique<NEActivationLayerKernelEx>();
k->configure(input, output, activation_info);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
index e42c453cf..2fc94b267 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEBinaryLogicalOperation.cpp
@@ -42,7 +42,7 @@
#include <arm_compute/core/NEON/kernels/NEBinaryLogicalOperationKernel.h>
#include "arm_compute/core/ITensor.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -53,7 +53,7 @@ template <BinaryLogicalOperation COP>
void NEBinaryLogicalOperationStatic<COP>::configure(ITensor *input1, ITensor *input2,
ITensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
k->configure(COP, input1, input2, output);
_kernel = std::move(k);
}
@@ -69,7 +69,7 @@ Status NEBinaryLogicalOperationStatic<COP>::validate(const ITensorInfo *input1,
void NEBinaryLogicalOperation::configure(ITensor *input1, ITensor *input2, ITensor *output,
BinaryLogicalOperation op)
{
- auto k = arm_compute::support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
+ auto k = support::cpp14::make_unique<NEBinaryLogicalOperationKernel>();
k->configure(op, input1, input2, output);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
deleted file mode 100644
index dc5c62061..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NECast.cpp
+++ /dev/null
@@ -1,60 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NECast.h"
-
-#include "arm_compute/core/NEON/kernels/NECastKernel.h"
-#include "support/ToolchainSupport.h"
-
-namespace arm_compute
-{
-void NECast::configure(const ITensor *input, ITensor *output, SubDataType input_subtype)
-{
- auto k = arm_compute::support::cpp14::make_unique<NECastKernel>();
- k->configure(input, output, input_subtype);
- _kernel = std::move(k);
-}
-
-Status NECast::validate(const ITensorInfo *input, const ITensorInfo *output,
- SubDataType input_subtype)
-{
- return NECastKernel::validate(input, output, input_subtype);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
deleted file mode 100644
index 5ec0b8677..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEDepthToSpaceLayerEx.cpp
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEDepthToSpaceLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NEDepthToSpaceLayerKernelEx.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-void NEDepthToSpaceLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
-{
- auto k = arm_compute::support::cpp14::make_unique<NEDepthToSpaceLayerKernelEx>();
- k->configure(input, output, block_shape);
- _kernel = std::move(k);
-}
-
-Status NEDepthToSpaceLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- return NEDepthToSpaceLayerKernelEx::validate(input, output, block_shape);
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
index 53fb15081..e0ab3e025 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEEmbeddingLookup.cpp
@@ -41,13 +41,13 @@
#include "arm_compute/runtime/NEON/functions/NEEmbeddingLookup.h"
#include "arm_compute/core/NEON/kernels/NEEmbeddingLookupKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void NEEmbeddingLookup::configure(const ITensor *input, ITensor *output, const ITensor *lookups)
{
- auto k = arm_compute::support::cpp14::make_unique<NEEmbeddingLookupKernel>();
+ auto k = support::cpp14::make_unique<NEEmbeddingLookupKernel>();
k->configure(input, output, lookups);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
index f45773251..a123439d9 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedHybridLayer.cpp
@@ -58,7 +58,7 @@ namespace
Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const ITensorInfo &output)
{
ARM_COMPUTE_RETURN_ON_ERROR(
- NEGEMMLowpMatrixMultiplyCoreEx::validate(&input, &weights, nullptr, &output));
+ NEGEMMLowpMatrixMultiplyCore::validate(&input, &weights, nullptr, &output));
return Status{};
}
@@ -66,7 +66,7 @@ Status validate_mm(const ITensorInfo &input, const ITensorInfo &weights, const I
void NEFullyConnectedHybridLayerReshapeWeights::configure(const ITensor *input, ITensor *output)
{
- auto k = arm_compute::support::cpp14::make_unique<NETransposeKernel>();
+ auto k = support::cpp14::make_unique<NETransposeKernel>();
k->configure(input, output);
_kernel = std::move(k);
}
@@ -158,7 +158,8 @@ void NEFullyConnectedHybridLayer::configure(const ITensor *input, const ITensor
// Quantize input
_quantized_input.allocator()->init(
- input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
_scale_factor.allocator()->init(
TensorInfo(TensorShape{output->info()->dimension(1)}, 1, DataType::F32));
_quant_input_kernel.configure(input, &_quantized_input, &_scale_factor);
@@ -186,7 +187,7 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::S8);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
@@ -224,8 +225,9 @@ Status NEFullyConnectedHybridLayer::validate(const ITensorInfo *input, const ITe
ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_to_use->dimension(1));
// Validate quantization kernel
- const ITensorInfo &quantized_input = TensorInfo(
- input->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S8));
+ const ITensorInfo &quantized_input =
+ TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_data_type(
+ DataType::QASYMM8_SIGNED));
const ITensorInfo &scale_factor = TensorInfo(TensorShape{output->dimension(1)}, 1, DataType::F32);
ARM_COMPUTE_RETURN_ON_ERROR(
NEQuantizationSymmetricKernel::validate(input, &quantized_input, &scale_factor));
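The hunks above retag the hybrid path's on-the-fly quantized input and its weights from the generic S8 to QASYMM8_SIGNED, and route validation through the stock NEGEMMLowpMatrixMultiplyCore. "Hybrid" here means: F32 activations are symmetrically quantized to signed 8-bit at run time (one scale factor per row, zero offset, computed by NEQuantizationSymmetricKernel), multiplied against pre-quantized weights, and the integer accumulators are rescaled back to float. A back-of-the-envelope sketch of that per-row quantization, with hypothetical names:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Symmetric per-row quantization (sketch): scale = max|x| / 127 maps values
    // into [-127, 127] with a zero offset; the returned scale is later folded
    // into the dequantization of the S32 accumulators.
    float quantize_row_symmetric(const float *row, std::size_t len, std::vector<int8_t> &out)
    {
      float max_abs = 0.f;
      for (std::size_t i = 0; i < len; ++i)
        max_abs = std::max(max_abs, std::fabs(row[i]));
      const float scale = (max_abs > 0.f) ? max_abs / 127.f : 1.f;
      out.resize(len);
      for (std::size_t i = 0; i < len; ++i)
        out[i] = static_cast<int8_t>(std::lround(row[i] / scale));
      return scale;
    }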
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
index fcac3c7ae..dc6c78478 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEFullyConnectedReshapingLayer.cpp
@@ -56,12 +56,17 @@ void NEFullyConnectedReshapingLayer::configure(const arm_compute::ITensor *input
assert(kernel_type == KernelType::PREPROCESSED_WEIGHTS);
bool is_hybrid = input->info()->data_type() == DataType::F32 &&
- weights->info()->data_type() == DataType::S8;
+ (weights->info()->data_type() == DataType::S8 ||
+ weights->info()->data_type() == DataType::QASYMM8_SIGNED);
if (is_hybrid)
{
auto fc = new arm_compute::NEFullyConnectedHybridLayer{_memory_manager};
+ ITensorInfo *weights_info = const_cast<ITensorInfo *>(_weights->info());
+ const auto origin_weights_data_type = weights_info->data_type();
+ weights_info->set_data_type(DataType::QASYMM8_SIGNED);
fc->configure(input_to_use, _weights, _biases, _output);
+ weights_info->set_data_type(origin_weights_data_type);
return std::unique_ptr<arm_compute::IFunction>(fc);
}
else
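The added lines in this hunk work around a data-type check inside NEFullyConnectedHybridLayer::configure(): the weights may still arrive tagged as S8, so their reported type is temporarily flipped to QASYMM8_SIGNED for the duration of the call and restored afterwards. If this pattern recurs, a small scope guard would make the restore exception- and early-return-safe; a sketch (hypothetical helper, not part of the library):

    #include "arm_compute/core/ITensorInfo.h"

    // Hypothetical RAII guard: override an ITensorInfo's data type for the
    // lifetime of the guard, restoring the original type on scope exit.
    class DataTypeOverride
    {
    public:
      DataTypeOverride(arm_compute::ITensorInfo &info, arm_compute::DataType temporary)
          : _info(info), _original(info.data_type())
      {
        _info.set_data_type(temporary);
      }
      ~DataTypeOverride() { _info.set_data_type(_original); }

    private:
      arm_compute::ITensorInfo &_info;
      arm_compute::DataType _original;
    };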
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
deleted file mode 100644
index 1290cfd39..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.cpp
+++ /dev/null
@@ -1,513 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCoreEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "support/ToolchainSupport.h"
-
-using namespace arm_compute;
-using namespace arm_compute::misc::shape_calculator;
-
-NEGEMMLowpMatrixMultiplyCoreEx::NEGEMMLowpMatrixMultiplyCoreEx(
- std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr),
- _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _mtx_a_reduction_kernel(),
- _mtx_b_reduction_kernel(), _offset_contribution_kernel(),
- _offset_contribution_output_stage_kernel(), _vector_sum_col(), _vector_sum_row(), _tmp_a(),
- _tmp_b(), _mm_result_s32(), _signed_a(), _signed_output(), _original_b(nullptr), _a_offset(0),
- _b_offset(0), _run_vector_matrix_multiplication(false), _assembly_path(false),
- _fused_assembly_path(false), _reshape_b_only_on_first_run(false), _is_prepared(false),
- _fuse_output_stage(false), _flip_signedness(false)
-{
-}
-
-void NEGEMMLowpMatrixMultiplyCoreEx::configure(const ITensor *a, const ITensor *b, const ITensor *c,
- ITensor *output, const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
- ARM_COMPUTE_UNUSED(c);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpMatrixMultiplyCoreEx::validate(
- a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info));
-
- const ITensor *matrix_a = a;
- const ITensor *matrix_b = b;
- GEMMInfo info = gemm_info;
-
- // Clear state
- _mtx_a_reshape_kernel = nullptr;
- _mtx_b_reshape_kernel = nullptr;
-
- // Set internal variables
- _a_offset = a->info()->quantization_info().uniform().offset;
- _b_offset = b->info()->quantization_info().uniform().offset;
- _run_vector_matrix_multiplication = a->info()->dimension(1) < 2;
- _reshape_b_only_on_first_run = info.reshape_b_only_on_first_run();
- _is_prepared = false;
- _fused_assembly_path = false;
- _original_b = b;
-
- const ITensor *a_to_use = a;
-
- // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
- if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
- {
- _fuse_output_stage = true;
- _memory_group.manage(&_mm_result_s32);
- TensorInfo info_mm_result_s32(output->info()->tensor_shape(), 1, DataType::S32);
- _mm_result_s32.allocator()->init(info_mm_result_s32);
- }
-
-#ifdef __aarch64__
- switch (a->info()->data_type())
- {
- case DataType::QASYMM8:
- case DataType::QASYMM8_SIGNED:
- case DataType::U8:
- case DataType::S8:
- {
- if (a_to_use->info()->data_type() == DataType::QASYMM8 &&
- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- _asm_glue.configure(a_to_use, b, c, output, gemm_info);
- _fused_assembly_path = _asm_glue.is_configured();
- }
- else
- {
- _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output,
- gemm_info);
- }
- _assembly_path = _asm_glue.is_configured();
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Datatype not supported");
- break;
- }
- }
-#endif /* __aarch64__ */
- if (!(_assembly_path || _run_vector_matrix_multiplication))
- {
- matrix_a = &_tmp_a;
- matrix_b = &_tmp_b;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
- // 4.0f) ]
- TensorInfo a_info(compute_interleaved_shape(*a_to_use->info()), 1,
- a_to_use->info()->data_type(), a_to_use->info()->quantization_info());
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width /
- // 16.0f) ]
- TensorInfo b_info(compute_transpose1xW_shape(*b->info()), 1, b->info()->data_type(),
- b->info()->quantization_info());
- _tmp_a.allocator()->init(a_info);
- _tmp_b.allocator()->init(b_info);
- _memory_group.manage(&_tmp_a);
- if (!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_tmp_b);
- }
-
- // Configure interleave kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a_to_use, &_tmp_a);
- _mtx_a_reshape_kernel = std::move(k);
- }
-
- // Configure transpose kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
- k->configure(b, &_tmp_b);
- _mtx_b_reshape_kernel = std::move(k);
- }
- }
-
- if (!_fused_assembly_path)
- {
- // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
- if (_a_offset != 0)
- {
- TensorInfo info_vector_sum_col(compute_reductionA_shape(*b->info()), 1, DataType::S32);
-
- _vector_sum_col.allocator()->init(info_vector_sum_col);
- if (!_reshape_b_only_on_first_run)
- {
- _memory_group.manage(&_vector_sum_col);
- }
-
- // Configure Matrix B reduction kernel
- _mtx_b_reduction_kernel.configure(b, &_vector_sum_col, a_to_use->info()->dimension(0), false);
- }
-
- // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
- if (_b_offset != 0)
- {
- TensorInfo info_vector_sum_row(compute_reductionB_shape(*a_to_use->info()), 1, DataType::S32);
-
- _vector_sum_row.allocator()->init(info_vector_sum_row);
- _memory_group.manage(&_vector_sum_row);
-
- // Configure matrix A reduction kernel
- _mtx_a_reduction_kernel.configure(a_to_use, &_vector_sum_row, a_to_use->info()->dimension(0),
- false);
- }
-
- if (_fuse_output_stage)
- {
- // Configure matrix multiply kernel
- if (!_assembly_path)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(matrix_a, matrix_b, &_mm_result_s32);
- _mm_kernel = std::move(k);
- }
-
- _offset_contribution_output_stage_kernel.configure(
- &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row, c,
- _flip_signedness ? &_signed_output : output, a->info()->dimension(0), _a_offset,
- _b_offset, info.gemmlowp_output_stage());
- }
- else
- {
- // Configure matrix multiply kernel
- if (!_assembly_path)
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(matrix_a, matrix_b, output);
- _mm_kernel = std::move(k);
- }
- // Configure offset contribution kernel
- _offset_contribution_kernel.configure(output, _a_offset == 0 ? nullptr : &_vector_sum_col,
- _b_offset == 0 ? nullptr : &_vector_sum_row,
- a_to_use->info()->dimension(0), _a_offset, _b_offset);
- }
- }
-
- // Allocate tensors
- if (!_assembly_path && !_run_vector_matrix_multiplication)
- {
- _tmp_a.allocator()->allocate();
- if (!_reshape_b_only_on_first_run)
- {
- _tmp_b.allocator()->allocate();
- }
- }
-
- if (!_fused_assembly_path)
- {
- if (_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- _vector_sum_col.allocator()->allocate();
- }
-
- if (_b_offset != 0)
- {
- _vector_sum_row.allocator()->allocate();
- }
- }
-
- if (_fuse_output_stage)
- {
- _mm_result_s32.allocator()->allocate();
- }
-}
-
-Status NEGEMMLowpMatrixMultiplyCoreEx::validate(const ITensorInfo *a, const ITensorInfo *b,
- const ITensorInfo *c, const ITensorInfo *output,
- const GEMMInfo &gemm_info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::S8);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
- "Bias addition not supported in NEGEMMLowpMatrixMultiplyCoreEx for output S32");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
- "The product AB is defined only if the number of columns in A is "
- "equal to the number of rows in B");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(),
- "Matrix A already reshaped is not supported");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(),
- "Matrix B already reshaped is not supported");
-
- GEMMInfo info = gemm_info;
- const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
-
- const ITensorInfo *a_to_use = a;
-
- TensorInfo tmp_a_info{};
- TensorInfo tmp_b_info{};
- TensorInfo mm_result_s32_info{};
-
- int32_t a_offset = a->quantization_info().uniform().offset;
- int32_t b_offset = b->quantization_info().uniform().offset;
-
- bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
- if (fuse_output_stage)
- {
- auto_init_if_empty(
- mm_result_s32_info,
- a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
- }
-
- // Check if we need to run the optimized assembly kernel
- bool run_optimised = false;
- bool run_optimised_requantized = false;
- if (a_to_use->data_type() == DataType::QASYMM8 &&
- info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
- {
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
- run_optimised_requantized = run_optimised;
- }
- else
- {
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(
- a_to_use, b, c, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
- }
-
- if (run_optimised)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
- if (info.depth_output_gemm3d() != 0)
- {
- if (info.reinterpret_input_as_3d())
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1) * output->dimension(2));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
- }
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
- "NEGEMM cannot reinterpret the input tensor as 3D");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
- "NEGEMM cannot reinterpret the output tensor as 3D");
-
- const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
- if (!run_vector_matrix_multiplication)
- {
- matrix_a_info = &tmp_a_info;
- matrix_b_info = &tmp_b_info;
-
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width /
- // 4.0f) ]
- TensorShape shape_tmp_a = a->tensor_shape();
- shape_tmp_a.set(0, a->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width
- // / 16.0f) ]
- TensorShape shape_tmp_b = b->tensor_shape();
- shape_tmp_b.set(0, b->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->dimension(0) / 16.f));
-
- // Validate interleave kernel
- auto_init_if_empty(tmp_a_info, a_to_use->clone()->set_tensor_shape(shape_tmp_a));
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(shape_tmp_b));
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMInterleave4x4Kernel::validate(a_to_use, &tmp_a_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMTranspose1xWKernel::validate(b, &tmp_b_info));
- }
- }
-
- if (!run_optimised_requantized)
- {
- TensorInfo info_vector_sum_col{};
- TensorInfo info_vector_sum_row{};
-
- // Validate matrix B reduction kernel only if _a_offset is not equal to 0
- if (a_offset != 0)
- {
- info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
-
- // Configure Matrix B reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixBReductionKernel::validate(
- b, &info_vector_sum_col, a->dimension(0), false));
- }
-
- // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
- if (b_offset != 0)
- {
- info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
-
- // Configure matrix A reduction kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(
- a_to_use, &info_vector_sum_row, a->dimension(0), false));
- }
-
- if (fuse_output_stage)
- {
- if (!run_optimised)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyKernel::validate(
- matrix_a_info, matrix_b_info, &mm_result_s32_info));
- }
-
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionOutputStageKernel::validate(
- &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset,
- info.gemmlowp_output_stage()));
- }
- else
- {
- if (!run_optimised)
- {
- ARM_COMPUTE_RETURN_ON_ERROR(
- NEGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
- }
- // Validate offset contribution kernel
- ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOffsetContributionKernel::validate(
- output, a_offset == 0 ? nullptr : &info_vector_sum_col,
- b_offset == 0 ? nullptr : &info_vector_sum_row, a_offset, b_offset));
- }
- }
- return Status{};
-}
-
-void NEGEMMLowpMatrixMultiplyCoreEx::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Reshape inputs
- if (_mtx_a_reshape_kernel)
- {
- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
- }
- if (_mtx_b_reshape_kernel && !_reshape_b_only_on_first_run)
- {
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- }
-
- // Run GEMM
- if (_asm_glue.is_configured())
- {
- _asm_glue.run();
- }
- else
- {
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
- }
-
- if (!_fused_assembly_path)
- {
- // Run matrix A reduction kernel only if _b_offset is not equal to 0
- if (_b_offset != 0)
- {
- NEScheduler::get().schedule(&_mtx_a_reduction_kernel, Window::DimX);
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if (_a_offset != 0 && !_reshape_b_only_on_first_run)
- {
- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
- }
-
- if (_fuse_output_stage)
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_output_stage_kernel, Window::DimY);
- }
- else
- {
- // Run offset contribution kernel
- NEScheduler::get().schedule(&_offset_contribution_kernel, Window::DimY);
- }
- }
-}
-
-void NEGEMMLowpMatrixMultiplyCoreEx::prepare()
-{
- if (!_is_prepared)
- {
- // Run assembly reshape
- if (_asm_glue.is_configured() && _reshape_b_only_on_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-
- _asm_glue.prepare();
- _original_b->mark_as_unused();
- }
- // Run non-assembly reshape
- else if (_mtx_b_reshape_kernel && _reshape_b_only_on_first_run)
- {
- ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
-
- // Run reshape kernel and mark original weights tensor as unused
- _tmp_b.allocator()->allocate();
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- _original_b->mark_as_unused();
- }
-
- // Run matrix B reduction kernel only if _a_offset is not equal to 0
- if (_a_offset != 0 && _reshape_b_only_on_first_run)
- {
- _vector_sum_col.allocator()->allocate();
- NEScheduler::get().schedule(&_mtx_b_reduction_kernel, Window::DimX);
- }
-
- _is_prepared = true;
- }
-}
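The deleted NEGEMMLowpMatrixMultiplyCoreEx largely duplicated upstream's low-precision GEMM, including the offset-contribution step that upstream now covers (see the switch to NEGEMMLowpMatrixMultiplyCore in NEFullyConnectedHybridLayer above). The math behind the reduction kernels it configured: for quantized operands, C[i][j] = sum_k (A[i][k] - a_off) * (B[k][j] - b_off) expands to the raw integer product minus a_off times the column sums of B, minus b_off times the row sums of A, plus K * a_off * b_off — hence the B reduction was only needed when a_offset != 0 and the A reduction only when b_offset != 0. A scalar reference of the correction:

    #include <cstdint>

    // Offset contribution for one output element of a gemmlowp-style product,
    // where raw_acc = sum_k A[i][k] * B[k][j] over the raw 8-bit values.
    int32_t apply_offset_contribution(int32_t raw_acc, int32_t a_offset, int32_t b_offset,
                                      int32_t row_sum_a, int32_t col_sum_b, int32_t depth_k)
    {
      return raw_acc - a_offset * col_sum_b - b_offset * row_sum_a +
             depth_k * a_offset * b_offset;
    }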
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
index c8bb88aea..433c35d58 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEGatherEx.cpp
@@ -41,7 +41,7 @@
#include "arm_compute/runtime/NEON/functions/NEGatherEx.h"
#include "arm_compute/core/NEON/kernels/NEGatherKernelEx.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
#include <utility>
@@ -49,7 +49,7 @@ namespace arm_compute
{
void NEGatherEx::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
{
- auto k = arm_compute::support::cpp14::make_unique<NEGatherKernelEx>();
+ auto k = support::cpp14::make_unique<NEGatherKernelEx>();
k->configure(input, indices, output, axis);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
index 078019f4e..52d58accf 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NEHashtableLookup.cpp
@@ -41,14 +41,14 @@
#include "arm_compute/runtime/NEON/functions/NEHashtableLookup.h"
#include "arm_compute/core/NEON/kernels/NEHashtableLookupKernel.h"
-#include "support/ToolchainSupport.h"
+#include "support/MemorySupport.h"
using namespace arm_compute;
void NEHashtableLookup::configure(const ITensor *lookups, const ITensor *keys, const ITensor *input,
ITensor *output, ITensor *hits)
{
- auto k = arm_compute::support::cpp14::make_unique<NEHashtableLookupKernel>();
+ auto k = support::cpp14::make_unique<NEHashtableLookupKernel>();
k->configure(lookups, keys, input, output, hits);
_kernel = std::move(k);
}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
deleted file mode 100644
index dac3b849d..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEPReLU.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEPReLU.h"
-
-#include "arm_compute/core/NEON/kernels/NEPReLUKernel.h"
-#include "support/ToolchainSupport.h"
-
-#include <utility>
-
-using namespace arm_compute;
-
-void NEPReLU::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
-{
- auto k = arm_compute::support::cpp14::make_unique<NEPReLUKernel>();
- k->configure(input, alpha, output);
- _kernel = std::move(k);
-}
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
deleted file mode 100644
index 0e9a5e969..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NERNNLayerEx.cpp
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NERNNLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-NERNNLayerEx::NERNNLayerEx(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(),
- _activation_kernel(), _fully_connected_kernel(), _copy_kernel(), _fully_connected_out(),
- _gemm_output(), _add_output(), _is_prepared(false)
-{
-}
-
-Status NERNNLayerEx::validate(const ITensorInfo *input, const ITensorInfo *weights,
- const ITensorInfo *recurrent_weights, const ITensorInfo *bias,
- const ITensorInfo *hidden_state, const ITensorInfo *output,
- const ActivationLayerInfo &info)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state,
- output);
-
- const int idx_width = 0;
- const int idx_height = 1;
- ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_width) != weights->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_height) !=
- recurrent_weights->dimension(idx_width));
- ARM_COMPUTE_RETURN_ERROR_ON(recurrent_weights->dimension(idx_width) !=
- recurrent_weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() != 1);
- ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(idx_width) != weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_width) != weights->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
- hidden_state->tensor_shape());
-
- auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(
- recurrent_weights, hidden_state->dimension(idx_height)),
- 1, input->data_type());
-
- ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
- ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAdditionKernel::validate(
- &shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
- ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayerKernel::validate(&shape_info, &shape_info, info));
-
- return Status{};
-}
-
-void NERNNLayerEx::configure(const ITensor *input, const ITensor *weights,
- const ITensor *recurrent_weights, const ITensor *bias,
- ITensor *hidden_state, ITensor *output, ActivationLayerInfo &info)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
- ARM_COMPUTE_ERROR_THROW_ON(NERNNLayerEx::validate(input->info(), weights->info(),
- recurrent_weights->info(), bias->info(),
- hidden_state->info(), output->info(), info));
-
- const int idx_height = 1;
- TensorShape shape = misc::shape_calculator::compute_rnn_shape(
- recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
-
- _is_prepared = false;
-
- // Manage intermediate buffers and configure
- _fully_connected_out.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
- _gemm_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
-
- // Manage intermediate buffers and configure
- _memory_group.manage(&_fully_connected_out);
- _fully_connected_kernel.configure(input, weights, bias, &_fully_connected_out);
-
- _memory_group.manage(&_gemm_output);
- _gemm_state_f.configure(hidden_state, recurrent_weights, nullptr, &_gemm_output, 1.f, 0.f);
-
- _add_output.allocator()->init(TensorInfo(shape, 1, input->info()->data_type()));
- _memory_group.manage(&_add_output);
-
- _add_kernel.configure(&_fully_connected_out, &_gemm_output, &_add_output,
- ConvertPolicy::SATURATE);
-
- _fully_connected_out.allocator()->allocate();
- _gemm_output.allocator()->allocate();
-
- _activation_kernel.configure(&_add_output, hidden_state, info);
- _add_output.allocator()->allocate();
-
- _copy_kernel.configure(hidden_state, output);
-}
-
-void NERNNLayerEx::run()
-{
- prepare();
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- _fully_connected_kernel.run();
-
- _gemm_state_f.run();
-
- NEScheduler::get().schedule(&_add_kernel, Window::DimY);
- NEScheduler::get().schedule(&_activation_kernel, Window::DimY);
-
- // copy hidden out to output
- NEScheduler::get().schedule(&_copy_kernel, Window::DimY);
-}
-
-void NERNNLayerEx::prepare()
-{
- if (!_is_prepared)
- {
- _fully_connected_kernel.prepare();
- _gemm_state_f.prepare();
-
- _is_prepared = true;
- }
-}
-} // namespace arm_compute
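For reference, the deleted NERNNLayerEx composed four stages — a fully-connected layer (W·x + b), a GEMM against the recurrent weights (R·h), an elementwise saturating add, and an activation — then copied the new hidden state to the output; that is, it implemented h' = act(W·x + b + R·h). A compact sketch of the combine step, with tanh standing in for the configurable ActivationLayerInfo:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // One step of the simple RNN cell: h_new[i] = act((W x + b)[i] + (R h)[i]).
    std::vector<float> rnn_step(const std::vector<float> &fc_out,   // W x + b
                                const std::vector<float> &gemm_out) // R h
    {
      std::vector<float> h_new(fc_out.size());
      for (std::size_t i = 0; i < fc_out.size(); ++i)
        h_new[i] = std::tanh(fc_out[i] + gemm_out[i]);
      return h_new;
    }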
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
deleted file mode 100644
index 116bba3c0..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NEReduceMeanEx.cpp
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2018-2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NEReduceMeanEx.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NEReduceMeanEx::NEReduceMeanEx(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(),
- _reduction_ops(), _keep_dims()
-{
-}
-
-Status NEReduceMeanEx::validate(const ITensorInfo *input, const Coordinates &reduction_axis,
- bool keep_dims, const ITensorInfo *output)
-{
- ARM_COMPUTE_UNUSED(keep_dims);
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
- ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
-
- TensorShape out_shape = input->tensor_shape();
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
- const int input_dims = input->num_dimensions();
- Coordinates axis_local = reduction_axis;
-
- // Convert negative axis
- for (unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
-
- std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
- for (unsigned int i = 0; i < reduction_ops; ++i)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
- ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) >
- input->num_dimensions() - 1);
- if (output->total_size() > 0 && keep_dims)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
- }
- if (keep_dims)
- {
- out_shape.set(axis_local[i], 1);
- }
- else
- {
- out_shape.remove_dimension(axis_local[i] - i);
- }
- }
- const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-
- return Status{};
-}
-
-void NEReduceMeanEx::configure(ITensor *input, const Coordinates &reduction_axis, bool keep_dims,
- ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-
- _reduction_ops = reduction_axis.num_dimensions();
- _reduction_kernels =
- arm_compute::support::cpp14::make_unique<NEReductionOperation[]>(_reduction_ops);
- _reduced_outs =
- arm_compute::support::cpp14::make_unique<Tensor[]>(_reduction_ops - (keep_dims ? 1 : 0));
- _keep_dims = keep_dims;
-
- Coordinates axis_local = reduction_axis;
- const int input_dims = input->info()->num_dimensions();
- const unsigned int reduction_ops = reduction_axis.num_dimensions();
-
- // Convert negative axis
- for (unsigned int i = 0; i < reduction_ops; ++i)
- {
- axis_local[i] = wrap_around(axis_local[i], input_dims);
- }
-
- // Perform reduction for every axis
- for (unsigned int i = 0; i < _reduction_ops; ++i)
- {
- TensorShape out_shape = i == 0 ? input->info()->tensor_shape()
- : (_reduced_outs.get() + i - 1)->info()->tensor_shape();
- out_shape.set(axis_local[i], 1);
- auto in = (i == 0) ? input : (_reduced_outs.get() + i - 1);
-
- if (i == _reduction_ops - 1 && keep_dims)
- {
- _reduction_kernels[i].configure(in, output, axis_local[i], ReductionOperation::MEAN_SUM);
- }
- else
- {
- _reduced_outs[i].allocator()->init(TensorInfo(out_shape, input->info()->num_channels(),
- input->info()->data_type(),
- input->info()->quantization_info())
- .set_data_layout(output->info()->data_layout()));
- _memory_group.manage(_reduced_outs.get() + i);
- _reduction_kernels[i].configure(in, _reduced_outs.get() + i, axis_local[i],
- ReductionOperation::MEAN_SUM);
- }
- }
-
- // Allocate intermediate tensors
- for (unsigned int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
- {
- _reduced_outs[i].allocator()->allocate();
- }
-
- // Configure reshape layer if we want to drop the dimensions
- if (!keep_dims)
- {
- TensorShape out_shape = input->info()->tensor_shape();
-
- // We have to sort the reduction axis vectors in order for remove_dimension
- // to work properly
- std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
- for (unsigned int i = 0; i < _reduction_ops; ++i)
- {
- out_shape.remove_dimension(axis_local[i] - i);
- }
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(out_shape));
- _reshape.configure(_reduced_outs.get() + _reduction_ops - 1, output);
- }
-}
-
-void NEReduceMeanEx::run()
-{
- _memory_group.acquire();
-
- for (unsigned int i = 0; i < _reduction_ops; ++i)
- {
- _reduction_kernels[i].run();
- }
-
- if (!_keep_dims)
- {
- _reshape.run();
- }
- _memory_group.release();
-}
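
Note on the deleted implementation: NEReduceMeanEx chained one NEReductionOperation per axis and, when keep_dims was false, dropped the reduced dimensions via remove_dimension(axis_local[i] - i). The "- i" term compensates for dimensions removed in earlier iterations, which is only correct once the axes are sorted ascending (hence the std::sort). A minimal sketch of that bookkeeping, with std::vector standing in for arm_compute::TensorShape (an assumption for illustration only):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Drop reduced axes from a shape the way the deleted
    // NEReduceMeanEx did. Axes must be sorted ascending so that the
    // "- i" correction accounts for dimensions already removed.
    std::vector<int> remove_reduced_dims(std::vector<int> shape, std::vector<int> axes)
    {
      std::sort(axes.begin(), axes.end());
      for (int i = 0; i < static_cast<int>(axes.size()); ++i)
      {
        // After removing i earlier (smaller) axes, axis "axes[i]" now
        // sits at index "axes[i] - i".
        shape.erase(shape.begin() + (axes[i] - i));
      }
      return shape;
    }

    int main()
    {
      // Reducing axes {1, 3} of a [2, 3, 4, 5] shape leaves [2, 4].
      auto out = remove_reduced_dims({2, 3, 4, 5}, {3, 1});
      assert(out == (std::vector<int>{2, 4}));
    }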
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
deleted file mode 100644
index 198bb7672..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToBatchLayerEx.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NESpaceToBatchLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-namespace arm_compute
-{
-NESpaceToBatchLayerEx::NESpaceToBatchLayerEx()
- : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
-{
-}
-
-void NESpaceToBatchLayerEx::configure(const ITensor *input, const ITensor *block_shape,
- const ITensor *paddings, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
-
- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
- {
- _has_padding = true;
- _memset_kernel.configure(
- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
- }
- _space_to_batch_kernel.configure(input, block_shape, paddings, output);
-}
-
-void NESpaceToBatchLayerEx::configure(const ITensor *input, const int block_shape_x,
- const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
- {
- _has_padding = true;
- _memset_kernel.configure(
- output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
- }
- _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right,
- output);
-}
-
-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const ITensorInfo *block_shape,
- const ITensorInfo *paddings, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(
- NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
-
- return Status{};
-}
-
-Status NESpaceToBatchLayerEx::validate(const ITensorInfo *input, const int block_shape_x,
- const int block_shape_y, const Size2D &padding_left,
- const Size2D &padding_right, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(
- input, block_shape_x, block_shape_y, padding_left, padding_right, output));
-
- return Status{};
-}
-
-void NESpaceToBatchLayerEx::run()
-{
- // Zero out output only if we have paddings
- if (_has_padding)
- {
- NEScheduler::get().schedule(&_memset_kernel, Window::DimY);
- }
- NEScheduler::get().schedule(&_space_to_batch_kernel, Window::DimY);
-}
-} // namespace arm_compute
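
Note on the deleted implementation: the layer decided whether to pre-zero the output by comparing element counts. With non-trivial paddings the output holds more elements than the input, and positions the scatter kernel never writes must already contain the (quantized) zero value. A rough restatement of that guard with simplified types (illustrative only, not the ACL API):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // With SpaceToBatch, padded positions in the output are never written
    // by the scatter kernel, so when the output has more elements than the
    // input they must be pre-filled with the zero point first.
    void prefill_if_padded(const std::vector<uint8_t> &input, std::vector<uint8_t> &output,
                           uint8_t zero_point)
    {
      if (input.size() != output.size()) // same check as the deleted _has_padding flag
      {
        std::fill(output.begin(), output.end(), zero_point);
      }
    }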
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
deleted file mode 100644
index 97697e3ea..000000000
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NESpaceToDepthLayerEx.cpp
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2019 ARM Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "arm_compute/runtime/NEON/functions/NESpaceToDepthLayerEx.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/NEON/kernels/NESpaceToDepthLayerKernelEx.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-
-namespace arm_compute
-{
-void NESpaceToDepthLayerEx::configure(const ITensor *input, ITensor *output, int32_t block_shape)
-{
- auto k = arm_compute::support::cpp14::make_unique<NESpaceToDepthLayerKernelEx>();
- k->configure(input, output, block_shape);
- _kernel = std::move(k);
-}
-
-Status NESpaceToDepthLayerEx::validate(const ITensorInfo *input, const ITensorInfo *output,
- int32_t block_shape)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToDepthLayerKernelEx::validate(input, output, block_shape));
- return Status{};
-}
-} // namespace arm_compute
diff --git a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
index df0689273..09f178005 100644
--- a/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
+++ b/compute/ARMComputeEx/src/runtime/NEON/functions/NETransposeConvLayer.cpp
@@ -1,21 +1,5 @@
/*
- * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,14 +21,11 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-
#include "arm_compute/runtime/NEON/functions/NETransposeConvLayer.h"
#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Utils.h"
#include "arm_compute/core/UtilsEx.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/misc/ShapeCalculatorEx.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
@@ -52,20 +33,15 @@ using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
{
+
NETransposeConvLayer::NETransposeConvLayer(std::shared_ptr<IMemoryManager> memory_manager) // NOLINT
: _memory_group(std::move(memory_manager)),
_conv_f(),
_upsample_f(),
_flip_weights(),
- _permute_input(),
- _permute_weights(),
- _permute_output(),
_scaled_output(),
_weights_flipped(),
- _permuted_input(),
- _permuted_weights(),
- _permuted_output(),
- _is_nchw(false),
+ _flip_axis(),
_original_weights(nullptr),
_input(nullptr),
_info(),
@@ -80,7 +56,7 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16,
- DataType::QASYMM8);
+ DataType::QASYMM8, DataType::QASYMM8_SIGNED);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, input);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
const unsigned int width_idx =
@@ -95,13 +71,16 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
weights->dimension(height_idx), info, invalid_right, invalid_bottom);
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- if (is_data_type_quantized_asymmetric(input->data_type()) && bias)
+ if (bias != nullptr)
{
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
- }
- else if (bias)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ if (is_data_type_quantized_asymmetric(input->data_type()))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+ }
}
if (output->tensor_shape().total_size() > 0)
@@ -110,12 +89,12 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
const TensorShape output_shape = compute_transposeconv_output_shape(out_dims, *input, *weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) < output_shape.x(),
- "Output's dim 0 is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) < output_shape.y(),
- "Output's dim 1 is invalid.");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) < output_shape.z(),
- "Output's dim 2 is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
+ "Output's width is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
+ "Output's height is invalid.");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
+ "Output's depth is invalid.");
}
unsigned int pad_left = 0;
@@ -127,7 +106,6 @@ Status NETransposeConvLayer::validate(const ITensorInfo *input, const ITensorInf
pad_bottom);
TensorInfo scale_out_info(
input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
- scale_out_info.set_data_layout(input->data_layout());
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
const unsigned int batches_idx =
@@ -149,19 +127,13 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
ITensor *output, const PadStrideInfo &info,
unsigned int invalid_right, unsigned int invalid_bottom)
{
+ // Perform validation step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
+ input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(),
+ info, invalid_right, invalid_bottom));
const DataLayout data_layout = input->info()->data_layout();
-
- _input = input;
- _original_weights = weights;
- _info = info;
- _is_prepared = false;
- _is_nchw = data_layout == DataLayout::NCHW;
-
- const unsigned int stride_x = info.stride().first;
- const unsigned int stride_y = info.stride().second;
-
const unsigned int width_idx =
get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx =
@@ -173,101 +145,54 @@ void NETransposeConvLayer::configure(ITensor *input, const ITensor *weights, con
const TensorShape output_shape =
compute_transposeconv_output_shape(out_dims, *input->info(), *weights->info());
+
+ _input = input;
+ _original_weights = weights;
+ _info = info;
+ _is_prepared = false;
+
+ unsigned int pad_left = 0;
+ unsigned int pad_right = 0;
+ unsigned int pad_top = 0;
+ unsigned int pad_bottom = 0;
+ const unsigned int stride_x = info.stride().first;
+ const unsigned int stride_y = info.stride().second;
+
// Output auto initialization if not yet initialized
auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
input->info()->quantization_info());
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(NETransposeConvLayer::validate(
- input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(),
- info, invalid_right, invalid_bottom));
-
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
_memory_group.manage(&_scaled_output);
- if (!_is_nchw)
- {
- _memory_group.manage(&_permuted_input);
- _memory_group.manage(&_permuted_weights);
- _memory_group.manage(&_permuted_output);
-
- // Configure the function to transform the input tensor from NHWC -> NCHW
- _permuted_input.info()->set_quantization_info(input->info()->quantization_info());
- _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
- _permuted_input.info()->set_data_layout(DataLayout::NCHW);
-
- // Configure the function to transform the weights tensor from NHWC -> NCHW
- _permuted_weights.info()->set_quantization_info(weights->info()->quantization_info());
- _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
- _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
-
- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
- // order to match output shape
-
- unsigned int pad_left = 0;
- unsigned int pad_right = 0;
- unsigned int pad_top = 0;
- unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *_permuted_input.info(), *_permuted_weights.info(), info, out_dims, invalid_right,
- invalid_bottom, pad_left, pad_right, pad_top, pad_bottom);
-
- TensorInfo scale_out_info(scale_out_shape, 1, _permuted_input.info()->data_type(),
- _permuted_input.info()->quantization_info());
- scale_out_info.set_data_layout(DataLayout::NCHW);
- _scaled_output.allocator()->init(scale_out_info);
-
- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
- DimensionRoundingType::CEIL);
- _upsample_f.configure(&_permuted_input, &_scaled_output, upsample_info);
-
- _weights_flipped.allocator()->init(*_permuted_weights.info()->clone());
- _weights_flipped.info()->set_quantization_info(weights->info()->quantization_info());
- _flip_weights.configure(&_permuted_weights, &_weights_flipped);
-
- // setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-
- const auto out_shape = output->info()->tensor_shape();
- TensorShape permuted_out_shape{out_shape[1], out_shape[2], out_shape[0], out_shape[3]};
- TensorInfo permuted_out_info(permuted_out_shape, 1, output->info()->data_type(),
- output->info()->quantization_info());
- _permuted_output.allocator()->init(permuted_out_info);
- _permuted_output.info()->set_data_layout(DataLayout::NCHW);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, &_permuted_output, conv_info);
-
- // Configure the function to transform the convoluted output to NHWC
- _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
-
- _permuted_input.allocator()->allocate();
- _permuted_weights.allocator()->allocate();
- _permuted_output.allocator()->allocate();
- }
- else
- {
- // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in
- // order to match output shape
- unsigned int pad_left = 0;
- unsigned int pad_right = 0;
- unsigned int pad_top = 0;
- unsigned int pad_bottom = 0;
- const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
- *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
- pad_right, pad_top, pad_bottom);
-
- TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
- input->info()->quantization_info());
- _scaled_output.allocator()->init(scale_out_info);
- const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
- DimensionRoundingType::FLOOR);
- _upsample_f.configure(input, &_scaled_output, upsample_info);
-
- _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(weights, &_weights_flipped);
-
- // setup the function to convolve the upscaled output
- const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
- _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
- }
+ _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
+ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
+
+ // setup the function to convolve the upscaled output
+ const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
+
+ const TensorShape scale_out_shape = compute_transposeconv_upsampled_shape(
+ *input->info(), *weights->info(), info, out_dims, invalid_right, invalid_bottom, pad_left,
+ pad_right, pad_top, pad_bottom);
+
+ const PadStrideInfo upsample_info(stride_x, stride_y, pad_left, pad_right, pad_top, pad_bottom,
+ DimensionRoundingType::FLOOR);
+
+ TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(),
+ input->info()->quantization_info());
+ scale_out_info.set_data_layout(data_layout);
+ _scaled_output.allocator()->init(scale_out_info);
+
+ _upsample_f.configure(input, &_scaled_output, upsample_info);
+
+ _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info);
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ axis_data[0] = static_cast<uint32_t>(width_idx);
+ axis_data[1] = static_cast<uint32_t>(height_idx);
+
_scaled_output.allocator()->allocate();
}
@@ -275,22 +200,10 @@ void NETransposeConvLayer::run()
{
prepare();
- // MemoryGroupResourceScope scope_mg(_memory_group);
-
- // Permute input
- if (!_is_nchw)
- {
- _permute_input.run();
- }
+ MemoryGroupResourceScope scope_mg(_memory_group);
_upsample_f.run();
_conv_f.run();
-
- // Permute output
- if (!_is_nchw)
- {
- _permute_output.run();
- }
}
void NETransposeConvLayer::prepare()
@@ -301,22 +214,12 @@ void NETransposeConvLayer::prepare()
// Run weights flipping and mark original weights tensor as unused
_weights_flipped.allocator()->allocate();
- // Permute weights
- if (!_is_nchw)
- {
- _permute_weights.run();
- }
- NEScheduler::get().schedule(&_flip_weights, Window::DimZ);
+ _flip_weights.run();
_original_weights->mark_as_unused();
// Prepare convolution
_conv_f.prepare();
- if (!_weights_flipped.is_used())
- {
- _weights_flipped.allocator()->free();
- }
-
_is_prepared = true;
}
}
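
After this rewrite, NETransposeConvLayer uses a single layout-agnostic pipeline: zero-insertion upsampling of the input, a width/height flip of the weights (driven by the new _flip_axis tensor), then a stride-1 convolution. The equivalence is easiest to see in 1-D; the toy below (plain C++, no ACL types, padding omitted — a simplification of what the layer supports) computes a transposed convolution exactly that way:

    #include <cassert>
    #include <vector>

    // Toy 1-D transposed convolution: zero-insertion upsampling followed
    // by a stride-1 "full" convolution with the kernel. Assumes no padding.
    std::vector<float> transpose_conv_1d(const std::vector<float> &in,
                                         const std::vector<float> &kernel, int stride)
    {
      const int up_size = static_cast<int>(in.size() - 1) * stride + 1;
      std::vector<float> up(up_size, 0.0f);
      for (std::size_t i = 0; i < in.size(); ++i)
        up[i * stride] = in[i]; // insert stride-1 zeros between samples

      const int k = static_cast<int>(kernel.size());
      // Full convolution of the upsampled signal; output length is
      // up_size + k - 1 = (in - 1) * stride + k, the transposed-conv size.
      std::vector<float> out(up_size + k - 1, 0.0f);
      for (int o = 0; o < static_cast<int>(out.size()); ++o)
        for (int j = 0; j < k; ++j)
        {
          const int idx = o - j;
          if (idx >= 0 && idx < up_size)
            out[o] += up[idx] * kernel[j];
        }
      return out;
    }

    int main()
    {
      // stride 2, kernel size 3: output length (2 - 1) * 2 + 3 = 5.
      auto out = transpose_conv_1d({1.0f, 2.0f}, {1.0f, 0.5f, 0.25f}, 2);
      assert(out.size() == 5);
    }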
diff --git a/compute/cker/CMakeLists.txt b/compute/cker/CMakeLists.txt
index 09f67259c..609dd45a3 100644
--- a/compute/cker/CMakeLists.txt
+++ b/compute/cker/CMakeLists.txt
@@ -8,6 +8,9 @@ target_link_libraries(nnfw_lib_cker INTERFACE gemmlowp)
target_link_libraries(nnfw_lib_cker INTERFACE ruy)
target_link_libraries(nnfw_lib_cker INTERFACE ruy_instrumentation)
target_compile_definitions(nnfw_lib_cker INTERFACE USE_RUY_GEMV)
+if(EXPERIMENTAL_RUY_FEATURE)
+ target_compile_definitions(nnfw_lib_cker INTERFACE EXPERIMENTAL_RUY_FEATURE)
+endif(EXPERIMENTAL_RUY_FEATURE)
if(PROFILE_RUY)
target_link_libraries(nnfw_lib_cker INTERFACE ruy_profiler)
endif(PROFILE_RUY)
diff --git a/compute/cker/include/cker/NeonTensorUtils.h b/compute/cker/include/cker/NeonTensorUtils.h
index 5c38bc6f3..246fd9a46 100644
--- a/compute/cker/include/cker/NeonTensorUtils.h
+++ b/compute/cker/include/cker/NeonTensorUtils.h
@@ -546,7 +546,7 @@ bool NeonIsZeroVector(const float *vector, int v_size)
void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
const int8_t *input_to_gate_weights, int32_t n_batch, int32_t n_input,
- int32_t n_output, int32_t, int32_t *scratch)
+ int32_t n_output, int32_t, int32_t *scratch, ruy::Context *ruy_context)
{
MatrixParams<int8_t> lhs_params;
lhs_params.order = Order::kRowMajor;
@@ -571,8 +571,6 @@ void NeonCpuBackendGemm(const int8_t *input, const int32_t *bias,
}
// Below code is from tflite::cpu_backend_gemm::detail::GemmImplUsingRuy
- ruy::Context *ruy_context = ruy_support::GetRuyContext();
-
ruy::Matrix<int8_t> ruy_lhs;
ruy::Matrix<int8_t> ruy_rhs;
ruy::Matrix<int32_t> ruy_dst;
@@ -851,13 +849,13 @@ void NeonMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
const int m_cols, const int8_t *__restrict__ vectors,
const float *scaling_factors, int n_batch,
int32_t *scratch, float *__restrict__ result,
- int result_stride)
+ int result_stride, ruy::Context *ruy_context)
{
if (m_rows % 4 == 0 && result_stride == 1)
{
const int32_t *bias = static_cast<const int32_t *>(nullptr);
NeonCpuBackendGemm(vectors, bias, matrix, n_batch, m_cols, m_rows,
- /*output_zp =*/0, scratch);
+ /*output_zp =*/0, scratch, ruy_context);
// Multiply by float scaling factors and write to result
const int total_size = n_batch * m_rows;
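
The signature change above threads a ruy::Context in from the caller instead of fetching a global context inside NeonCpuBackendGemm, so the runtime can own one context (with its thread pool and caches) per backend. A sketch of the resulting calling pattern; the buffer setup is elided, and only the default constructibility of ruy::Context is assumed:

    #include <ruy/context.h>

    // Sketch of the new calling convention: the context is created and
    // owned by the caller (e.g. per backend) and passed down to the GEMM.
    void run_hybrid_gemm_example()
    {
      ruy::Context ruy_context; // caller-owned; reused across GEMM calls
      // ... prepare matrix / vector / scratch buffers, then:
      // MatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vectors,
      //                                     scaling_factors, n_batch, scratch,
      //                                     result, /*result_stride=*/1,
      //                                     &ruy_context);
    }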
diff --git a/compute/cker/include/cker/PortableTensorUtils.h b/compute/cker/include/cker/PortableTensorUtils.h
index 9769d4ba6..54714e214 100644
--- a/compute/cker/include/cker/PortableTensorUtils.h
+++ b/compute/cker/include/cker/PortableTensorUtils.h
@@ -20,6 +20,7 @@
#include "cker/Types.h"
#include "cker/neon/neon_check.h"
+#include <ruy/context.h>
#include <cstring>
#include <cmath>
@@ -142,7 +143,7 @@ void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matr
const int8_t *__restrict__ vector,
const float *scaling_factors, int n_batch,
int32_t *, float *__restrict__ result,
- int result_stride)
+ int result_stride, ruy::Context *)
{
PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors,
n_batch, result, result_stride);
diff --git a/compute/cker/include/cker/TensorUtils.h b/compute/cker/include/cker/TensorUtils.h
index 6b23c0b30..e07c91239 100644
--- a/compute/cker/include/cker/TensorUtils.h
+++ b/compute/cker/include/cker/TensorUtils.h
@@ -73,10 +73,10 @@ void MatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_
void MatrixBatchVectorMultiplyAccumulate(const int8_t *matrix, const int m_rows, const int m_cols,
const int8_t *vectors, const float *scaling_factors,
int n_batch, int32_t *scratch, float *result,
- int result_stride)
+ int result_stride, ruy::Context *ruy_context)
{
NEON_OR_PORTABLE(MatrixBatchVectorMultiplyAccumulate, matrix, m_rows, m_cols, vectors,
- scaling_factors, n_batch, scratch, result, result_stride);
+ scaling_factors, n_batch, scratch, result, result_stride, ruy_context);
}
void ZeroVector(float *vector, int v_size) { PortableZeroVector(vector, v_size); }
diff --git a/compute/cker/include/cker/Types.h b/compute/cker/include/cker/Types.h
index 41b1916cf..886ce5e5e 100644
--- a/compute/cker/include/cker/Types.h
+++ b/compute/cker/include/cker/Types.h
@@ -259,6 +259,12 @@ struct FullyConnectedParams
// FullyConnectedWeightsFormat weights_format;
};
+struct L2NormParams
+{
+ // uint8 inference params.
+ int32_t input_zero_point;
+};
+
struct GatherParams
{
int32_t axis;
@@ -271,6 +277,14 @@ struct InstanceNormParams
float float_activation_max;
};
+struct ResizeBilinearParams
+{
+ int32_t output_height;
+ int32_t output_width;
+ bool align_corners;
+ bool half_pixel_centers;
+};
+
struct TransposeConvParams
{
PaddingType padding_type;
@@ -325,6 +339,12 @@ struct SplitParams
int16_t axis;
};
+struct SplitVParams
+{
+ uint16_t num_split;
+ int16_t axis;
+};
+
struct FusedBatchNormParams
{
bool is_training;
@@ -338,6 +358,11 @@ struct SpaceToBatchParams
int32_t output_offset;
};
+struct SpaceToDepthParams
+{
+ int32_t block_size;
+};
+
enum class Order
{
kColMajor,
diff --git a/compute/cker/include/cker/Utils.h b/compute/cker/include/cker/Utils.h
index b69d55c26..2abb998d0 100644
--- a/compute/cker/include/cker/Utils.h
+++ b/compute/cker/include/cker/Utils.h
@@ -123,6 +123,68 @@ inline int CountLeadingZeros(uint32_t integer_input)
return leading_zeros;
}
+inline void GetInvSqrtQuantizedMultiplierExp(int32_t input, int reverse_shift,
+ int32_t *output_inv_sqrt, int *output_shift)
+{
+ assert(input >= 0);
+ if (input <= 1)
+ {
+ // Handle the input value 1 separately to avoid overflow in that case
+ // in the general computation below (b/143972021). Also handle 0 as if it
+ // were a 1. 0 is an invalid input here (divide by zero) and 1 is a valid
+ // but rare/unrealistic input value. We can expect both to occur in some
+ // incompletely trained models, but probably not in fully trained models.
+ *output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
+ *output_shift = 0;
+ return;
+ }
+ assert(input > 1);
+ *output_shift = 11;
+ while (input >= (1 << 29))
+ {
+ input /= 4;
+ ++*output_shift;
+ }
+ const unsigned max_left_shift_bits = CountLeadingZeros(static_cast<uint32_t>(input)) - 1;
+ const unsigned max_left_shift_bit_pairs = max_left_shift_bits / 2;
+ const unsigned left_shift_bit_pairs = max_left_shift_bit_pairs - 1;
+ *output_shift -= left_shift_bit_pairs;
+ input <<= 2 * left_shift_bit_pairs;
+ assert(input >= (1 << 27));
+ assert(input < (1 << 29));
+ using gemmlowp::FixedPoint;
+ using gemmlowp::Rescale;
+ using gemmlowp::SaturatingRoundingMultiplyByPOT;
+ // Using 3 integer bits gives us enough room for the internal arithmetic in
+ // this Newton-Raphson iteration.
+ using F3 = FixedPoint<int32_t, 3>;
+ using F0 = FixedPoint<int32_t, 0>;
+ const F3 fixedpoint_input = F3::FromRaw(input >> 1);
+ const F3 fixedpoint_half_input = SaturatingRoundingMultiplyByPOT<-1>(fixedpoint_input);
+ const F3 fixedpoint_half_three =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F3, (1 << 28) + (1 << 27), 1.5);
+ // Newton-Raphson iteration
+ // Naive unoptimized starting guess: x = 1
+ F3 x = F3::One();
+ // Naive unoptimized number of iterations: 5
+ for (int i = 0; i < 5; i++)
+ {
+ const F3 x3 = Rescale<3>(x * x * x);
+ x = Rescale<3>(fixedpoint_half_three * x - fixedpoint_half_input * x3);
+ }
+ const F0 fixedpoint_half_sqrt_2 =
+ GEMMLOWP_CHECKED_FIXEDPOINT_CONSTANT(F0, 1518500250, std::sqrt(2.) / 2.);
+ x = x * fixedpoint_half_sqrt_2;
+ *output_inv_sqrt = x.raw();
+ if (*output_shift < 0)
+ {
+ *output_inv_sqrt <<= -*output_shift;
+ *output_shift = 0;
+ }
+ // Convert right shift (right is positive) to left shift.
+ *output_shift *= reverse_shift;
+}
+
// Comment from tensorflow lite:
//
// DO NOT USE THIS STRUCT FOR NEW FUNCTIONALITY BEYOND IMPLEMENTING
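
GetInvSqrtQuantizedMultiplierExp runs five steps of the classic Newton-Raphson inverse-square-root recurrence in 3-integer-bit fixed point: x_{n+1} = x_n * (3 - y * x_n^2) / 2 = 1.5 * x_n - (y / 2) * x_n^3, which the code writes as fixedpoint_half_three * x - fixedpoint_half_input * x3. A floating-point sketch of the same iteration (the input normalization, fixed-point scaling, and final sqrt(2)/2 correction are omitted here):

    #include <cassert>
    #include <cmath>

    // Floating-point version of the recurrence used above:
    // x <- 1.5 * x - (y / 2) * x^3 converges to 1 / sqrt(y).
    double inv_sqrt_newton(double y)
    {
      double x = 1.0; // same naive starting guess as the fixed-point code
      for (int i = 0; i < 5; ++i)
        x = 1.5 * x - 0.5 * y * x * x * x;
      return x;
    }

    int main()
    {
      // The fixed-point code first normalizes its input into [0.25, 0.5);
      // for such inputs five iterations are plenty.
      assert(std::fabs(inv_sqrt_newton(0.5) - 1.0 / std::sqrt(0.5)) < 1e-6);
    }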
diff --git a/compute/cker/include/cker/operation/BatchToSpaceND.h b/compute/cker/include/cker/operation/BatchToSpaceND.h
new file mode 100644
index 000000000..e33b2fba5
--- /dev/null
+++ b/compute/cker/include/cker/operation/BatchToSpaceND.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_BATCH_TO_SPACE_ND_H__
+#define __NNFW_CKER_BATCH_TO_SPACE_ND_H__
+
+#include "cker/Shape.h"
+
+#define UNUSED(x) ((void)(x))
+
+namespace nnfw
+{
+namespace cker
+{
+
+// Helper method for BatchToSpaceND.
+// `spatial_index_dim` specifies the post-crop offset index in this spatial
+// dimension, i.e. the spatial offset introduced by flattening the batch into
+// the spatial dimension, minus the crop size at the beginning.
+// `block_shape_dim` is the block size in the current dimension. `input_dim`
+// and `output_dim` are the input and output sizes of the BatchToSpaceND
+// operation in the current dimension.
+// The output start index is inclusive and the end index is exclusive.
+inline void GetIndexRange(int spatial_index_dim, int block_shape_dim, int input_dim, int output_dim,
+ int *start_index, int *end_index)
+{
+ // (*start_index) * block_shape_dim is effectively rounded up to the next
+ // multiple of block_shape_dim by the integer division.
+ *start_index = std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+ // Similarly, (*end_index) * block_shape_dim is rounded up too (note that
+ // end_index is exclusive).
+ *end_index =
+ std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
+}
+
+template <typename T>
+inline void BatchToSpaceND(const Shape &unextended_input1_shape, const T *input1_data,
+ const int32_t *block_shape_data, const int32_t *crops_data,
+ const Shape &unextended_output_shape, T *output_data)
+{
+ auto input_dim = unextended_input1_shape.DimensionsCount();
+ auto output_dim = unextended_output_shape.DimensionsCount();
+
+ assert(input_dim == 3 || input_dim == 4);
+ assert(input_dim == output_dim);
+
+ UNUSED(input_dim);
+ UNUSED(output_dim);
+
+ // Extends the input/output shape from 3D to 4D if needed, NHC -> NH1C.
+ auto extend_shape = [](const Shape &shape) {
+ if (shape.DimensionsCount() == 4)
+ {
+ return shape;
+ }
+ Shape new_shape(4, 1);
+ new_shape.SetDim(0, shape.Dims(0));
+ new_shape.SetDim(1, shape.Dims(1));
+ new_shape.SetDim(3, shape.Dims(2));
+ return new_shape;
+ };
+ const Shape input1_shape = extend_shape(unextended_input1_shape);
+ const Shape output_shape = extend_shape(unextended_output_shape);
+
+ const int32_t output_width = output_shape.Dims(2);
+ const int32_t output_height = output_shape.Dims(1);
+ const int32_t output_batch_size = output_shape.Dims(0);
+
+ const int32_t depth = input1_shape.Dims(3);
+ const int32_t input_width = input1_shape.Dims(2);
+ const int32_t input_height = input1_shape.Dims(1);
+ const int32_t input_batch_size = input1_shape.Dims(0);
+
+ const int32_t block_shape_height = block_shape_data[0];
+ const int32_t block_shape_width = block_shape_data[1];
+
+ const int32_t crops_top = crops_data[0];
+ const int32_t crops_left = crops_data[2];
+
+ for (int in_batch = 0; in_batch < input_batch_size; ++in_batch)
+ {
+ const int out_batch = in_batch % output_batch_size;
+ const int spatial_offset = in_batch / output_batch_size;
+
+ int in_h_start = 0;
+ int in_h_end = 0;
+ // GetIndexRange ensures start and end indices are in [0, output_height).
+ GetIndexRange(spatial_offset / block_shape_width - crops_top, block_shape_height, input_height,
+ output_height, &in_h_start, &in_h_end);
+
+ for (int in_h = in_h_start; in_h < in_h_end; ++in_h)
+ {
+ const int out_h = in_h * block_shape_height + spatial_offset / block_shape_width - crops_top;
+ assert(out_h >= 0);
+ assert(out_h < output_height);
+
+ int in_w_start = 0;
+ int in_w_end = 0;
+ // GetIndexRange ensures start and end indices are in [0, output_width).
+ GetIndexRange(spatial_offset % block_shape_width - crops_left, block_shape_width, input_width,
+ output_width, &in_w_start, &in_w_end);
+
+ for (int in_w = in_w_start; in_w < in_w_end; ++in_w)
+ {
+ const int out_w =
+ in_w * block_shape_width + spatial_offset % block_shape_width - crops_left;
+ assert(out_w >= 0);
+ assert(out_w < output_width);
+ T *out = output_data + Offset(output_shape, out_batch, out_h, out_w, 0);
+ const T *in = input1_data + Offset(input1_shape, in_batch, in_h, in_w, 0);
+ memcpy(out, in, depth * sizeof(T));
+ }
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_BATCH_TO_SPACE_ND_H__
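
GetIndexRange inverts the mapping out = in * block_shape_dim + spatial_index_dim: using ceiling division, it returns the half-open range of input indices whose mapped output index lands in [0, output_dim). A standalone check, with the helper restated verbatim so the snippet compiles on its own:

    #include <algorithm>
    #include <cassert>

    // Same arithmetic as cker's GetIndexRange: smallest/largest input index
    // whose mapped output index stays inside [0, output_dim).
    void get_index_range(int spatial_index_dim, int block_shape_dim, int input_dim,
                         int output_dim, int *start_index, int *end_index)
    {
      *start_index = std::max(0, (-spatial_index_dim + block_shape_dim - 1) / block_shape_dim);
      *end_index = std::min(input_dim, (output_dim - spatial_index_dim + block_shape_dim - 1) /
                                           block_shape_dim);
    }

    int main()
    {
      // block 2, offset -1 (one row cropped at the top), input 4, output 6:
      // in = 0 maps to out = -1 (cropped away); in = 1..3 map to out = 1, 3, 5.
      int start = 0, end = 0;
      get_index_range(-1, 2, 4, 6, &start, &end);
      assert(start == 1 && end == 4);
    }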
diff --git a/compute/cker/include/cker/operation/FullyConnected.h b/compute/cker/include/cker/operation/FullyConnected.h
index 9bcf3fd82..4280c9ae2 100644
--- a/compute/cker/include/cker/operation/FullyConnected.h
+++ b/compute/cker/include/cker/operation/FullyConnected.h
@@ -18,6 +18,7 @@
#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
#define __NNFW_CKER_FULLY_CONNECTED_H__
+#include <ruy/context.h>
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
@@ -78,8 +79,11 @@ inline void FullyConnected(const FullyConnectedParams &params, const Shape &inpu
MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
output_data, /*result_stride=*/1);
- // Apply activation function
- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
}
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
@@ -140,7 +144,7 @@ inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape
const float *input_data, const Shape &filter_shape,
const int8_t *filter_data, const Shape &, const float *bias_data,
const Shape &output_shape, float *output_data,
- FCTempArena &temp_arena)
+ FCTempArena &temp_arena, ruy::Context *ruy_context)
{
int total_input_size = input_shape.FlatSize();
const int input_size = filter_shape.Dims(1);
@@ -186,19 +190,72 @@ inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape
int32_t *scratch = temp_arena.accum_scratch.data();
MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
scaling_factors_ptr, batch_size, scratch, output_data,
- /*result_stride=*/1);
+ /*result_stride=*/1, ruy_context);
#else
MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
scaling_factors_ptr, batch_size, output_data,
/*result_stride=*/1);
+ UNUSED_RELEASE(ruy_context);
UNUSED_RELEASE(output_shape);
#endif
// Apply activation function to floats.
- ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
+ }
return;
}
+inline void FullyConnectedSparseWeight(const FullyConnectedParams &params, const Shape &input_shape,
+ const float *input_data, const Shape &weights_shape,
+ const float *weights_data, const Shape &bias_shape,
+ const float *bias_data, const Shape &output_shape,
+ float *output_data, int w0_size, const uint16_t *w1_segments,
+ const uint16_t *w1_indices)
+{
+ UNUSED_RELEASE(params);
+ UNUSED_RELEASE(input_shape);
+
+ assert(weights_shape.DimensionsCount() == 2);
+ assert(output_shape.DimensionsCount() == 2);
+
+ const int output_dims_count = output_shape.DimensionsCount();
+ const int weights_dims_count = weights_shape.DimensionsCount();
+ const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
+ const int output_depth =
+ MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
+ const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
+
+ UNUSED_RELEASE(bias_shape);
+ if (bias_data)
+ {
+ VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
+ }
+ else
+ {
+ ZeroVector(output_data, batches * output_depth);
+ }
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int idx_0 = 0; idx_0 < w0_size; ++idx_0)
+ {
+ for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
+ {
+ int idx_1 = w1_indices[pw1];
+ output_data[b * output_depth + idx_0] +=
+ weights_data[pw1] * input_data[b * accum_depth + idx_1];
+ }
+ }
+ }
+ if (params.activation != FusedActivationFunctionType::kNone)
+ {
+ // Apply activation function
+ ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
+ }
+}
+
} // namespace cker
} // namespace nnfw
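
The new FullyConnectedSparseWeight walks the weights in a CSR-like layout: w1_segments[r] .. w1_segments[r + 1] delimit the stored (nonzero) weights of output row r, and w1_indices[p] names the input column of the p-th stored weight. A self-contained toy of the same traversal; the names mirror the cker parameters but the snippet is illustrative only:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // CSR-style sparse matrix * vector, matching the loop structure of
    // FullyConnectedSparseWeight: visit only the stored weights per row.
    std::vector<float> sparse_matvec(int num_rows, const std::vector<uint16_t> &segments,
                                     const std::vector<uint16_t> &indices,
                                     const std::vector<float> &values,
                                     const std::vector<float> &x)
    {
      std::vector<float> y(num_rows, 0.0f);
      for (int r = 0; r < num_rows; ++r)
        for (int p = segments[r]; p < segments[r + 1]; ++p)
          y[r] += values[p] * x[indices[p]];
      return y;
    }

    int main()
    {
      // 2x3 matrix [[1 0 2], [0 3 0]] stored as CSR.
      std::vector<uint16_t> segments = {0, 2, 3};
      std::vector<uint16_t> indices = {0, 2, 1};
      std::vector<float> values = {1.0f, 2.0f, 3.0f};
      auto y = sparse_matvec(2, segments, indices, values, {1.0f, 1.0f, 1.0f});
      assert(y[0] == 3.0f && y[1] == 3.0f);
    }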
diff --git a/compute/cker/include/cker/operation/Helper/PhiloxRandom.h b/compute/cker/include/cker/operation/Helper/PhiloxRandom.h
new file mode 100644
index 000000000..8e8879ce9
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/PhiloxRandom.h
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_PHILOX_RANDOM_H__
+#define __NNFW_CKER_HELPER_PHILOX_RANDOM_H__
+
+#include <stdlib.h>
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+// Function qualifiers that need to work on both CPU and GPU.
+#if defined(__CUDACC__) || defined(__HIPCC__)
+// For nvcc.
+#define PHILOX_DEVICE_FUNC __host__ __device__
+#define PHILOX_INLINE __inline__
+#else
+// For non-nvcc.
+#define PHILOX_DEVICE_FUNC
+#define PHILOX_INLINE inline
+#endif
+#define PHILOX_DEVICE_INLINE PHILOX_DEVICE_FUNC PHILOX_INLINE
+
+#include <math.h>
+
+namespace nnfw
+{
+namespace cker
+{
+namespace random
+{
+
+// A class that represents an inline array. It can be used on both CPU and GPU,
+// and is trivially copyable between the two.
+// Arguments:
+// T: the array element type;
+// ElementCount: the fixed size of the array;
+template <typename T, int ElementCount> class Array
+{
+public:
+ static constexpr int kElementCount = ElementCount;
+ PHILOX_DEVICE_INLINE Array()
+ {
+ for (int i = 0; i < ElementCount; ++i)
+ {
+ data_[i] = T(0);
+ }
+ }
+
+ PHILOX_DEVICE_INLINE const T &operator[](int index) const { return data_[index]; }
+
+ PHILOX_DEVICE_INLINE T &operator[](int index) { return data_[index]; }
+
+ size_t size() const { return ElementCount; }
+
+private:
+ T data_[ElementCount];
+};
+
+// A class that encapsulates all the state for a random number generator using
+// the philox_4x32_10 algorithm. Each invocation returns 128 random bits in
+// the form of four uint32 values.
+// There are multiple variants of this algorithm; we picked the 4x32_10
+// version as the one best suited to our applications.
+// Since this class is meant to be copied between CPU and GPU, it maintains
+// value semantics.
+//
+// For example: To use this class and populate an array of 1024 randoms on CPU
+// with two threads,
+//
+// void Fill(PhiloxRandom rnd, uint32* output, int start, int limit) {
+// assert(start % 4 == 0);
+// assert(limit % 4 == 0);
+// rnd.Skip(start / 4);
+// for (int i = start; i < limit; i += 4) {
+// auto sample = rnd();
+// ... copy sample[0..3] to output[i..i+3]
+// }
+// }
+//
+// PhiloxRandom rng(seed);
+// PhiloxRandom rng_copy = rng;
+// rng.Skip(1000/4);
+//
+// ... schedule Fill(rng_copy, output, 0, 512) in thread 1;
+// ... schedule Fill(rng_copy, output, 512, 1024) in thread 2;
+// ... wait for thread 1 & 2 to finish executing Fill().
+//
+// NOTE:
+// 1. PhiloxRandom is trivially copyable.
+// 2. PhiloxRandom is compilable by gcc and nvcc.
+class PhiloxRandom
+{
+public:
+ using ResultType = Array<uint32_t, 4>;
+ using ResultElementType = uint32_t;
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = 4;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 10;
+ // The type for the 64-bit key, stored as two 32-bit uints
+ // that are used in the diffusion process.
+ using Key = Array<uint32_t, 2>;
+
+ PHILOX_DEVICE_INLINE
+ PhiloxRandom() {}
+
+ PHILOX_DEVICE_INLINE
+ explicit PhiloxRandom(uint64_t seed)
+ {
+ key_[0] = static_cast<uint32_t>(seed);
+ key_[1] = static_cast<uint32_t>(seed >> 32);
+ }
+
+ PHILOX_DEVICE_INLINE
+ explicit PhiloxRandom(uint64_t seed_lo, uint64_t seed_hi)
+ {
+ key_[0] = static_cast<uint32_t>(seed_lo);
+ key_[1] = static_cast<uint32_t>(seed_lo >> 32);
+ counter_[2] = static_cast<uint32_t>(seed_hi);
+ counter_[3] = static_cast<uint32_t>(seed_hi >> 32);
+ }
+
+ PHILOX_DEVICE_INLINE
+ PhiloxRandom(ResultType counter, Key key) : counter_(counter), key_(key) {}
+
+ PHILOX_DEVICE_INLINE
+ ResultType const &counter() const { return counter_; }
+
+ PHILOX_DEVICE_INLINE
+ Key const &key() const { return key_; }
+
+ // Skip the specified number of samples of 128-bits in the current stream.
+ PHILOX_DEVICE_INLINE
+ void Skip(uint64_t count)
+ {
+ const uint32_t count_lo = static_cast<uint32_t>(count);
+ uint32_t count_hi = static_cast<uint32_t>(count >> 32);
+
+ counter_[0] += count_lo;
+ if (counter_[0] < count_lo)
+ {
+ ++count_hi;
+ }
+
+ counter_[1] += count_hi;
+ if (counter_[1] < count_hi)
+ {
+ if (++counter_[2] == 0)
+ {
+ ++counter_[3];
+ }
+ }
+ }
+
+ // Returns a group of four random numbers using the underlying Philox
+ // algorithm.
+ PHILOX_DEVICE_INLINE ResultType operator()()
+ {
+ ResultType counter = counter_;
+ Key key = key_;
+
+ // Run the single round ten times. The loop is manually unrolled
+ // for better performance.
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+ RaiseKey(&key);
+ counter = ComputeSingleRound(counter, key);
+
+ SkipOne();
+
+ return counter;
+ }
+
+private:
+ // We use the same constants as recommended by the original paper.
+ static constexpr uint32_t kPhiloxW32A = 0x9E3779B9;
+ static constexpr uint32_t kPhiloxW32B = 0xBB67AE85;
+ static constexpr uint32_t kPhiloxM4x32A = 0xD2511F53;
+ static constexpr uint32_t kPhiloxM4x32B = 0xCD9E8D57;
+
+ // Helper function to skip the next sample of 128-bits in the current stream.
+ PHILOX_DEVICE_INLINE void SkipOne()
+ {
+ if (++counter_[0] == 0)
+ {
+ if (++counter_[1] == 0)
+ {
+ if (++counter_[2] == 0)
+ {
+ ++counter_[3];
+ }
+ }
+ }
+ }
+
+ // Helper function to return the lower and upper 32 bits of the product of
+ // two 32-bit integers.
+ PHILOX_DEVICE_INLINE
+ static void MultiplyHighLow(uint32_t a, uint32_t b, uint32_t *result_low, uint32_t *result_high)
+ {
+#ifndef __CUDA_ARCH__
+ const uint64_t product = static_cast<uint64_t>(a) * b;
+ *result_low = static_cast<uint32_t>(product);
+ *result_high = static_cast<uint32_t>(product >> 32);
+#else
+ *result_low = a * b;
+ *result_high = __umulhi(a, b);
+#endif
+ }
+
+ // Helper function for a single round of the underlying Philox algorithm.
+ PHILOX_DEVICE_INLINE static ResultType ComputeSingleRound(const ResultType &counter,
+ const Key &key)
+ {
+ uint32_t lo0;
+ uint32_t hi0;
+ MultiplyHighLow(kPhiloxM4x32A, counter[0], &lo0, &hi0);
+
+ uint32_t lo1;
+ uint32_t hi1;
+ MultiplyHighLow(kPhiloxM4x32B, counter[2], &lo1, &hi1);
+
+ ResultType result;
+ result[0] = hi1 ^ counter[1] ^ key[0];
+ result[1] = lo1;
+ result[2] = hi0 ^ counter[3] ^ key[1];
+ result[3] = lo0;
+ return result;
+ }
+
+ PHILOX_DEVICE_INLINE void RaiseKey(Key *key)
+ {
+ (*key)[0] += kPhiloxW32A;
+ (*key)[1] += kPhiloxW32B;
+ }
+
+private:
+ ResultType counter_;
+ Key key_;
+};
+
+} // namespace random
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_HELPER_PHILOX_RANDOM_H__
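
Because the generator is counter-based, a copy plus Skip() lands at an exact point in the stream, which is what the Fill() example in the class comment relies on for multi-threaded generation. A minimal usage sketch against this header (the include path is inferred from this diff):

    #include <cassert>
    #include <cstdint>
    #include "cker/operation/Helper/PhiloxRandom.h"

    int main()
    {
      using nnfw::cker::random::PhiloxRandom;

      PhiloxRandom rng(/*seed=*/42);
      PhiloxRandom rng_copy = rng; // value semantics: an independent copy

      // Consume three 128-bit samples from the original...
      rng();
      rng();
      rng();
      auto fourth = rng();

      // ...and reach the same point in the copy via Skip().
      rng_copy.Skip(3);
      auto fourth_again = rng_copy();
      for (int i = 0; i < PhiloxRandom::kResultElementCount; ++i)
        assert(fourth[i] == fourth_again[i]);
    }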
diff --git a/compute/cker/include/cker/operation/Helper/RandomDistributions.h b/compute/cker/include/cker/operation/Helper/RandomDistributions.h
new file mode 100644
index 000000000..baeafd7c9
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/RandomDistributions.h
@@ -0,0 +1,778 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
+#define __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
+
+#include <string.h>
+
+#include <cmath>
+
+#include <algorithm>
+#include <type_traits>
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/eigen/EigenSupport.h"
+#include "cker/operation/Helper/PhiloxRandom.h"
+
+namespace nnfw
+{
+namespace cker
+{
+namespace random
+{
+
+// Helper function to convert a 16-bit integer to a half in [0, 1).
+PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x);
+// Helper function to convert a 16-bit integer to a bfloat16 in [0, 1).
+// PHILOX_DEVICE_INLINE bfloat16 Uint16ToGfloat16(uint16 x);
+// Helper function to convert a 32-bit integer to a float in [0, 1).
+PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x);
+// Helper function to convert two 32-bit integers to a double in [0, 1).
+PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1);
+
+// Computes a + b. Requires that the result is representable in the destination
+// type and that b is not maximal (i.e. b + 1 is not 0). Notably, the addend b
+// need *not* be representable in that type. (The condition on b excludes the
+// extremal case INT_MIN + UINT_MAX = INT_MAX, which this function cannot
+// compute.)
+template <typename Int>
+PHILOX_DEVICE_INLINE Int SignedAdd(Int a, typename std::make_unsigned<Int>::type b)
+{
+ // Implementation note: both b_div_2 and b - b_div_2 are positive and
+ // representable as Int.
+ auto b_div_2 = b >> 1;
+ return a + static_cast<Int>(b_div_2) + static_cast<Int>(b - b_div_2);
+}
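
SignedAdd avoids signed overflow in the intermediate sums: because b is required to be non-maximal, both halves b_div_2 and b - b_div_2 individually fit in the signed type, so each partial addition stays in range whenever the mathematical result does. Restated standalone for illustration:

    #include <cassert>
    #include <cstdint>

    // Same trick as SignedAdd above: add an unsigned offset to a signed
    // value without signed overflow, as long as the true result fits.
    int32_t signed_add(int32_t a, uint32_t b)
    {
      uint32_t b_div_2 = b >> 1;
      return a + static_cast<int32_t>(b_div_2) + static_cast<int32_t>(b - b_div_2);
    }

    int main()
    {
      // b does not fit in int32_t, yet the result does: a single cast of b
      // would be implementation-defined, and a + b in signed math overflows.
      assert(signed_add(-2000000000, 4000000000u) == 2000000000);
    }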
+
+// A class that generates uniform distribution random numbers from the
+// underlying random integer generator.
+// Arguments:
+// Generator: a generator type that returns a number of uint32 upon each
+// invocation. It needs to define kResultElementCount for the
+// sample count for each invocation, and ResultType for the
+// actual returned sample type.
+// RealType: the data type of the real numbers that will be returned by the
+// distribution. This could be either float or double for now.
+// This class is meant to be implemented through specialization. The default
+// is not defined by design.
+template <class Generator, typename RealType> class UniformDistribution;
+
+template <class Generator> class UniformDistribution<Generator, Eigen::half>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicates whether this distribution may take a variable number of
+ // samples at runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<Eigen::half, kResultElementCount> ResultType;
+ typedef Eigen::half ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = Uint16ToHalf(sample[i]); // Truncate the upper 16 bits.
+ }
+ return result;
+ }
+};
+
+template <class Generator> class UniformDistribution<Generator, float>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicates whether this distribution may take a variable number of
+ // samples at runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<float, kResultElementCount> ResultType;
+ typedef float ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = Uint32ToFloat(sample[i]);
+ }
+ return result;
+ }
+};
+
+template <class Generator> class UniformDistribution<Generator, double>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicates whether this distribution may take a variable number of
+ // samples at runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<double, kResultElementCount> ResultType;
+ typedef double ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = Uint64ToDouble(sample[2 * i], sample[2 * i + 1]);
+ }
+ return result;
+ }
+};
+
+template <class Generator> class UniformDistribution<Generator, int32_t>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicates whether this distribution may take a variable number of
+ // samples at runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<int32_t, kResultElementCount> ResultType;
+ typedef int32_t ResultElementType;
+
+ // Must have lo < hi
+ UniformDistribution(int32_t lo, int32_t hi)
+ : lo_(lo), range_(static_cast<uint32_t>(hi) - static_cast<uint32_t>(lo))
+ {
+ }
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = SignedAdd(lo_, sample[i] % range_);
+ }
+ return result;
+ }
+
+private:
+ // Note that lo_ is intentionally signed while range_ is intentionally
+ // unsigned. This is because hi - lo can overflow signed integers if
+ // lo < 0 < hi, but always fits in unsigned.
+ int32_t lo_;
+ uint32_t range_;
+};
+
+template <class Generator> class UniformDistribution<Generator, int64_t>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicates whether this distribution may take a variable number of
+ // samples at runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<int64_t, kResultElementCount> ResultType;
+ typedef int64_t ResultElementType;
+
+ // Must have lo < hi
+ UniformDistribution(int64_t lo, int64_t hi)
+ : lo_(lo), range_(static_cast<uint64_t>(hi) - static_cast<uint64_t>(lo))
+ {
+ }
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ auto bits = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
+ result[i] = SignedAdd(lo_, bits % range_);
+ }
+ return result;
+ }
+
+private:
+ // Note that lo_ is intentionally signed while range_ is intentionally
+ // unsigned. This is because hi - lo can overflow signed integers if
+ // lo < 0 < hi, but always fits in unsigned.
+ int64_t lo_;
+ uint64_t range_;
+};
+
+// Similar to `UniformDistribution`, except that instead of generating numbers
+// in the range [low, high), it generates numbers covering the whole range of
+// the integer type.
+template <typename Generator, typename IntType> class UniformFullIntDistribution;
+
+template <typename Generator, typename IntType> class UniformFullIntDistribution32
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicates whether this distribution may take a variable number of
+ // samples at runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<IntType, kResultElementCount> ResultType;
+ typedef IntType ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = sample[i];
+ }
+ return result;
+ }
+};
+
+template <typename Generator, typename IntType> class UniformFullIntDistribution64
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 3;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<IntType, kResultElementCount> ResultType;
+ typedef IntType ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; ++i)
+ {
+ result[i] = sample[2 * i] | static_cast<uint64_t>(sample[2 * i + 1]) << 32;
+ }
+ return result;
+ }
+};
+
+template <typename Generator>
+class UniformFullIntDistribution<Generator, int32_t>
+ : public UniformFullIntDistribution32<Generator, int32_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, uint32_t>
+ : public UniformFullIntDistribution32<Generator, uint32_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, int64_t>
+ : public UniformFullIntDistribution64<Generator, int64_t>
+{
+};
+template <typename Generator>
+class UniformFullIntDistribution<Generator, uint64_t>
+ : public UniformFullIntDistribution64<Generator, uint64_t>
+{
+};
+
+// A class that adapts a generator which natively produces multiple samples per
+// invocation into one that returns a single sample at a time.
+template <class Generator> class SingleSampleAdapter
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = 1;
+ // The number of elements that will be returned by the underlying generator.
+ static constexpr int kNativeElementCount = Generator::kResultElementCount;
+ typedef typename Generator::ResultElementType ResultType;
+ typedef typename Generator::ResultElementType ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ explicit SingleSampleAdapter(Generator *gen)
+ : generator_(gen), used_result_index_(Generator::kResultElementCount)
+ {
+ }
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()()
+ {
+ if (used_result_index_ == Generator::kResultElementCount)
+ {
+ unused_results_ = (*generator_)();
+ used_result_index_ = 0;
+ }
+
+ return unused_results_[used_result_index_++];
+ }
+
+ PHILOX_DEVICE_INLINE
+ void Skip(uint64_t num_skips)
+ {
+ if (!num_skips)
+ {
+ return;
+ }
+ int num_unused_results = kNativeElementCount - used_result_index_;
+ if (num_skips <= static_cast<uint64_t>(num_unused_results))
+ {
+ used_result_index_ += num_skips;
+ return;
+ }
+ num_skips -= num_unused_results;
+ used_result_index_ = kNativeElementCount;
+ SkipFromGenerator(num_skips / kNativeElementCount);
+ num_skips = num_skips % kNativeElementCount;
+ if (num_skips)
+ {
+ unused_results_ = (*generator_)();
+ used_result_index_ = num_skips;
+ }
+ }
+
+private:
+ // This implementation iteratively skips over `num_skips` samples
+ // from `generator_`. There is an O(1) implementation for PhiloxRandom
+ // in random_distributions.cc.
+ PHILOX_DEVICE_INLINE
+ void SkipFromGenerator(uint64_t num_skips)
+ {
+ while (num_skips--)
+ {
+ (*generator_)();
+ }
+ }
+
+ Generator *generator_;
+ typename Generator::ResultType unused_results_;
+ int used_result_index_;
+};
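+
+// [Illustrative sketch, not from the upstream patch] Minimal usage, assuming
+// the PhiloxRandom generator from PhiloxRandom.h (four uint32 values per
+// invocation). The adapter buffers one native block and hands values out
+// one at a time, refilling as needed:
+//
+//   PhiloxRandom gen(/*seed=*/42);
+//   SingleSampleAdapter<PhiloxRandom> single(&gen);
+//   uint32_t a = single();  // stream index 0 (first element of first block)
+//   single.Skip(6);         // skips stream indices 1..6, crossing one block
+//   uint32_t b = single();  // stream index 7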
+
+// A class that generates unit normal distribution random numbers from the
+// underlying random integer generator.
+// Arguments:
+// Generator: a generator type that returns a number of uint32 values upon
+// each invocation. It needs to define kResultElementCount for the
+// sample count for each invocation, and ResultType for actual
+// returned sample type.
+// RealType: the data type of the real numbers that will be returned by the
+// distribution. This could be either float or double for now.
+// This class is meant to be implemented through specialization. The default
+// is not defined by design.
+template <class Generator, typename RealType> class NormalDistribution;
+
+PHILOX_DEVICE_INLINE
+void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1);
+
+PHILOX_DEVICE_INLINE
+void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1);
+
+// Exactly like the float version, except that we convert to half afterwards;
+// since we don't have half-precision sin/cos even on GPUs, there's nothing to
+// gain from working in half internally.
+template <class Generator> class NormalDistribution<Generator, Eigen::half>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 70;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<Eigen::half, kResultElementCount> ResultType;
+ typedef Eigen::half ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; i += 2)
+ {
+ float f[2];
+ BoxMullerFloat(sample[i], sample[i + 1], &f[0], &f[1]);
+ result[i] = Eigen::half(f[0]);
+ result[i + 1] = Eigen::half(f[1]);
+ }
+ return result;
+ }
+};
+
+template <class Generator> class NormalDistribution<Generator, float>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 70;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<float, kResultElementCount> ResultType;
+ typedef float ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; i += 2)
+ {
+ BoxMullerFloat(sample[i], sample[i + 1], &result[i], &result[i + 1]);
+ }
+ return result;
+ }
+};
+
+template <class Generator> class NormalDistribution<Generator, double>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = Generator::kResultElementCount / 2;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 70;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = false;
+ typedef Array<double, kResultElementCount> ResultType;
+ typedef double ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(Generator *gen)
+ {
+ typename Generator::ResultType sample = (*gen)();
+ ResultType result;
+ for (int i = 0; i < kResultElementCount; i += 2)
+ {
+ const int i2 = 2 * i;
+ BoxMullerDouble(sample[i2], sample[i2 + 1], sample[i2 + 2], sample[i2 + 3], &result[i],
+ &result[i + 1]);
+ }
+ return result;
+ }
+};
+
+// A class that returns samples from a standard normal distribution, truncated
+// to the range [-kTruncateValue, kTruncateValue].
+// Arguments:
+// Generator: a generator type that returns a number of uint32 values upon
+// each invocation. It needs to define kResultElementCount for the
+// sample count for each invocation, and ResultType for actual
+// returned sample type.
+// RealType: the data type of the real numbers that will be returned by the
+// distribution. This could be either float or double for now.
+// This class is meant to be implemented through specialization. The default
+// is not defined by design.
+template <class SingleSampleGenerator, typename RealType> class TruncatedNormalDistribution;
+
+// Exactly like the float version, except that we convert to half afterwards;
+// since we don't have half-precision sin/cos even on GPUs, there's nothing to
+// gain from working in half internally.
+template <class SingleSampleGenerator>
+class TruncatedNormalDistribution<SingleSampleGenerator, Eigen::half>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 90;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = true;
+ // The threshold where the normal distribution is truncated.
+ const float kTruncateValue = 2.0f;
+
+ typedef Array<Eigen::half, kResultElementCount> ResultType;
+ typedef Eigen::half ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(SingleSampleGenerator *gen)
+ {
+ ResultType results;
+ int index = 0;
+ while (true)
+ {
+ // Repeatedly take samples from the normal distribution, until we have
+ // the desired number of elements that fall within the pre-defined cutoff
+ // threshold.
+ const uint32_t x0 = (*gen)();
+ const uint32_t x1 = (*gen)();
+ float f[2];
+ BoxMullerFloat(x0, x1, &f[0], &f[1]);
+
+ if (Eigen::numext::abs(f[0]) < kTruncateValue)
+ {
+ results[index++] = Eigen::half(f[0]);
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ if (Eigen::numext::abs(f[1]) < kTruncateValue)
+ {
+ results[index++] = Eigen::half(f[1]);
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ }
+ }
+};
+
+// Partial specialization for float.
+template <class SingleSampleGenerator>
+class TruncatedNormalDistribution<SingleSampleGenerator, float>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = SingleSampleGenerator::kNativeElementCount;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 90;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = true;
+ // The threshold where the normal distribution is truncated.
+ const float kTruncateValue = 2.0f;
+
+ typedef Array<float, kResultElementCount> ResultType;
+ typedef float ResultElementType;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(SingleSampleGenerator *gen)
+ {
+ ResultType results;
+ int index = 0;
+ while (true)
+ {
+ // Repeatedly take samples from the normal distribution, until we have
+ // the desired number of elements that fall within the pre-defined cutoff
+ // threshold.
+ const uint32_t x0 = (*gen)();
+ const uint32_t x1 = (*gen)();
+ float f[2];
+ BoxMullerFloat(x0, x1, &f[0], &f[1]);
+
+ if (Eigen::numext::abs(f[0]) < kTruncateValue)
+ {
+ results[index++] = f[0];
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ if (Eigen::numext::abs(f[1]) < kTruncateValue)
+ {
+ results[index++] = f[1];
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ }
+ }
+};
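+
+// [Editorial note, not from the upstream patch] The loop above is plain
+// rejection sampling: Box-Muller pairs are drawn until kResultElementCount
+// values land inside (-kTruncateValue, kTruncateValue). About 95.4% of a unit
+// normal lies within two standard deviations, so the expected number of draws
+// per accepted value is roughly 1 / 0.954, but the worst case is unbounded,
+// which is exactly why kVariableSamplesPerOutput is true here:
+//
+//   SingleSampleAdapter<PhiloxRandom> single(&gen);     // assumed setup
+//   TruncatedNormalDistribution<decltype(single), float> trunc;
+//   auto batch = trunc(&single);  // every element is inside (-2, 2)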
+
+// Partial specialization for double.
+template <class SingleSampleGenerator>
+class TruncatedNormalDistribution<SingleSampleGenerator, double>
+{
+public:
+ // The number of elements that will be returned.
+ static constexpr int kResultElementCount = (SingleSampleGenerator::kNativeElementCount > 1)
+ ? SingleSampleGenerator::kNativeElementCount / 2
+ : 1;
+ // Cost of generation of a single element (in cycles).
+ static constexpr int kElementCost = 90;
+ // Indicate that this distribution may take variable number of samples
+ // during the runtime.
+ static constexpr bool kVariableSamplesPerOutput = true;
+ typedef Array<double, kResultElementCount> ResultType;
+ typedef double ResultElementType;
+ const double kTruncateValue = 2.0;
+
+ PHILOX_DEVICE_INLINE
+ ResultType operator()(SingleSampleGenerator *gen)
+ {
+ ResultType results;
+ int index = 0;
+ while (true)
+ {
+ const uint32_t x0 = (*gen)();
+ const uint32_t x1 = (*gen)();
+ const uint32_t x2 = (*gen)();
+ const uint32_t x3 = (*gen)();
+ double d[2];
+ BoxMullerDouble(x0, x1, x2, x3, &d[0], &d[1]);
+
+ if (Eigen::numext::abs(d[0]) < kTruncateValue)
+ {
+ results[index++] = d[0];
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ if (Eigen::numext::abs(d[1]) < kTruncateValue)
+ {
+ results[index++] = d[1];
+ if (index >= kResultElementCount)
+ {
+ return results;
+ }
+ }
+ }
+ }
+};
+
+// Helper function to convert two 32-bit uniform integers to two floats
+// under the unit normal distribution.
+PHILOX_DEVICE_INLINE
+void BoxMullerFloat(uint32_t x0, uint32_t x1, float *f0, float *f1)
+{
+ // This function implements the Box-Muller transform:
+ // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
+ // Do not send a really small number to log().
+ // We cannot mark "epsilon" as "static const" because NVCC would complain
+ const float epsilon = 1.0e-7f;
+ float u1 = Uint32ToFloat(x0);
+ if (u1 < epsilon)
+ {
+ u1 = epsilon;
+ }
+ const float v1 = 2.0f * M_PI * Uint32ToFloat(x1);
+ const float u2 = Eigen::numext::sqrt(-2.0f * Eigen::numext::log(u1));
+#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
+ *f0 = Eigen::numext::sin(v1);
+ *f1 = Eigen::numext::cos(v1);
+#else
+ sincosf(v1, f0, f1);
+#endif
+ *f0 *= u2;
+ *f1 *= u2;
+}
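+
+// [Editorial note] In formula form, with u1, u2 uniform in [0, 1), the
+// transform implemented above is:
+//
+//   r  = sqrt(-2 * ln(u1))
+//   f0 = r * sin(2 * pi * u2)
+//   f1 = r * cos(2 * pi * u2)
+//
+// which yields two independent unit-normal samples; the epsilon clamp only
+// keeps ln(u1) finite when u1 == 0.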
+
+// Helper function to convert four 32-bit uniform integers to two doubles
+// under the unit normal distribution.
+PHILOX_DEVICE_INLINE
+void BoxMullerDouble(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3, double *d0, double *d1)
+{
+ // This function implements the Box-Muller transform:
+ // http://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform#Basic_form
+ // Do not send a really small number to log().
+ // We cannot mark "epsilon" as "static const" because NVCC would complain
+ const double epsilon = 1.0e-7;
+ double u1 = Uint64ToDouble(x0, x1);
+ if (u1 < epsilon)
+ {
+ u1 = epsilon;
+ }
+ const double v1 = 2 * M_PI * Uint64ToDouble(x2, x3);
+ const double u2 = Eigen::numext::sqrt(-2.0 * Eigen::numext::log(u1));
+#if defined(TENSORFLOW_USE_SYCL) || !defined(__linux__)
+ *d0 = Eigen::numext::sin(v1);
+ *d1 = Eigen::numext::cos(v1);
+#else
+ sincos(v1, d0, d1);
+#endif
+ *d0 *= u2;
+ *d1 *= u2;
+}
+
+// Helper function to convert a 16-bit integer to a half in [0, 1).
+PHILOX_DEVICE_INLINE Eigen::half Uint16ToHalf(uint16_t x)
+{
+ // IEEE754 halfs are formatted as follows (MSB first):
+ // sign(1) exponent(5) mantissa(10)
+ // Conceptually construct the following:
+ // sign == 0
+ // exponent == 15 -- an excess 15 representation of a zero exponent
+ // mantissa == 10 random bits
+ const uint16_t man = x & 0x3ffu; // 10 bit mantissa
+ const uint16_t exp = static_cast<uint16_t>(15);
+ const uint16_t val = (exp << 10) | man;
+
+ Eigen::half result;
+ result.x = val;
+ return result - Eigen::half(1.0);
+}
+
+// Helper function to convert a 32-bit integer to a float in [0, 1).
+PHILOX_DEVICE_INLINE float Uint32ToFloat(uint32_t x)
+{
+ // IEEE754 floats are formatted as follows (MSB first):
+ // sign(1) exponent(8) mantissa(23)
+ // Conceptually construct the following:
+ // sign == 0
+ // exponent == 127 -- an excess 127 representation of a zero exponent
+ // mantissa == 23 random bits
+ const uint32_t man = x & 0x7fffffu; // 23 bit mantissa
+ const uint32_t exp = static_cast<uint32_t>(127);
+ const uint32_t val = (exp << 23) | man;
+
+ // Assumes that endian-ness is same for float and uint32.
+ float result;
+ memcpy(&result, &val, sizeof(val));
+ return result - 1.0f;
+}
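+
+// [Illustrative sketch, not from the upstream patch] A worked example of the
+// bit trick above: forcing the exponent field to 127 builds a float in
+// [1.0, 2.0), and subtracting 1.0f shifts the result into [0.0, 1.0):
+//
+//   Uint32ToFloat(0x00000000u);  // val == 0x3f800000 == 1.0f -> returns 0.0f
+//   Uint32ToFloat(0x007fffffu);  // just below 2.0f -> returns 1.0f - 2^-23,
+//                                // the largest representable result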
+
+// Helper function to convert two 32-bit integers to a double in [0, 1).
+PHILOX_DEVICE_INLINE double Uint64ToDouble(uint32_t x0, uint32_t x1)
+{
+ // IEEE754 doubles are formatted as follows (MSB first):
+ // sign(1) exponent(11) mantissa(52)
+ // Conceptually construct the following:
+ // sign == 0
+ // exponent == 1023 -- an excess 1023 representation of a zero exponent
+ // mantissa == 52 random bits
+ const uint32_t mhi = x0 & 0xfffffu; // upper 20 bits of mantissa
+ const uint32_t mlo = x1; // lower 32 bits of mantissa
+ const uint64_t man = (static_cast<uint64_t>(mhi) << 32) | mlo; // mantissa
+ const uint64_t exp = static_cast<uint64_t>(1023);
+ const uint64_t val = (exp << 52) | man;
+ // Assumes that endian-ness is same for double and uint64.
+ double result;
+ memcpy(&result, &val, sizeof(val));
+ return result - 1.0;
+}
+
+} // namespace random
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_HELPER_RANDOM_DISTRIBUTIONS_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOp.h b/compute/cker/include/cker/operation/Helper/RandomOp.h
new file mode 100644
index 000000000..7dc51fe94
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/RandomOp.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_RANDOM_OP_H__
+#define __NNFW_CKER_HELPER_RANDOM_OP_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/operation/Helper/RandomDistributions.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+namespace functor
+{
+
+template <typename Device, class Distribution> struct FillPhiloxRandom;
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+// Declares the partially CPU-specialized functor struct.
+//
+// NOTE: Due to inlining done by the compiler, you may need to add
+// explicit instantiation of the functor in random_op.cc. See example
+// functor::FillPhiloxRandom<CPUDevice, random::UniformDistribution>.
+template <class Distribution> struct FillPhiloxRandom<CPUDevice, Distribution>
+{
+ void operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *data,
+ int64_t size, Distribution dist);
+};
+
+} // namespace functor
+} // namespace cker
+} // namespace nnfw
+#endif // __NNFW_CKER_HELPER_RANDOM_OP_H__
diff --git a/compute/cker/include/cker/operation/Helper/RandomOpCpu.h b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
new file mode 100644
index 000000000..85d267723
--- /dev/null
+++ b/compute/cker/include/cker/operation/Helper/RandomOpCpu.h
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
+#define __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
+
+#define EIGEN_USE_THREADS
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/eigen/EigenSupport.h"
+
+#include "cker/operation/Helper/PhiloxRandom.h"
+#include "cker/operation/Helper/RandomOp.h"
+#include "cker/operation/Helper/RandomDistributions.h"
+
+#if EIGEN_COMP_GNUC && __cplusplus > 199711L
+#define DISABLE_FLOAT_EQUALITY_WARNING \
+ _Pragma("GCC diagnostic push") _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"")
+#define ENABLE_FLOAT_EQUALITY_WARNING _Pragma("GCC diagnostic pop")
+#else
+#define DISABLE_FLOAT_EQUALITY_WARNING
+#define ENABLE_FLOAT_EQUALITY_WARNING
+#endif
+
+namespace nnfw
+{
+namespace cker
+{
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+namespace functor
+{
+using random::PhiloxRandom;
+using random::SingleSampleAdapter;
+
+// The default implementation of the functor, which should never be invoked.
+// But we still need to provide an implementation for now for the linker to work,
+// since we do not support all the distributions yet.
+template <typename Device, class Distribution> struct FillPhiloxRandom
+{
+ typedef typename Distribution::ResultElementType T;
+ void operator()() {}
+};
+
+// A class to fill a specified range of random groups
+template <class Distribution, bool VariableSamplesPerOutput> struct FillPhiloxRandomTask;
+
+// Specialization for distribution that takes a fixed number of samples for
+// each output.
+template <class Distribution> struct FillPhiloxRandomTask<Distribution, false>
+{
+ typedef typename Distribution::ResultElementType T;
+ static void Run(random::PhiloxRandom gen, T *data, int64_t size, Distribution dist)
+ {
+ const int kGroupSize = Distribution::kResultElementCount;
+ gen.Skip(0);
+ int64_t offset = 0;
+
+ // First fill all the full-size groups
+ int64_t limit_group_full = size / kGroupSize;
+ for (int64_t index = 0; index < limit_group_full; ++index)
+ {
+ auto samples = dist(&gen);
+ std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+ offset += kGroupSize;
+ }
+
+ int64_t remaining_size = size - limit_group_full * kGroupSize;
+
+ // If there are any remaining elements that need to be filled, process them
+ if (remaining_size > 0)
+ {
+ auto samples = dist(&gen);
+ std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+ }
+ }
+};
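+
+// [Illustrative sketch, not from the upstream patch] For the fixed-sample-count
+// specialization above, filling 10 outputs with a distribution whose
+// kResultElementCount is 4 proceeds as:
+//
+//   // 10 / 4 == 2 full groups -> elements [0, 8) copied in two draws;
+//   // remaining_size == 2     -> one extra draw, of which 2 values are kept.
+//   // Three generator draws total; the tail of the last draw is discarded.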
+
+// Specialization for distribution that takes a variable number of samples for
+// each output. This will be slower due to the generality.
+template <class Distribution> struct FillPhiloxRandomTask<Distribution, true>
+{
+ typedef typename Distribution::ResultElementType T;
+ static constexpr int64_t kReservedSamplesPerOutput = 256;
+
+ static void Run(random::PhiloxRandom base_gen, T *data, int64_t size, Distribution dist)
+ {
+ const int kGroupSize = Distribution::kResultElementCount;
+ static const int kGeneratorSkipPerOutputGroup =
+ kGroupSize * kReservedSamplesPerOutput / PhiloxRandom::kResultElementCount;
+
+ int64_t offset = 0;
+
+ // First fill all the full-size groups
+ int64_t limit_group_full = size / kGroupSize;
+ int64_t group_index;
+ for (group_index = 0; group_index < limit_group_full; ++group_index)
+ {
+ // Reset the generator to the beginning of the output group region
+ // This is necessary if we want the results to be independent of order
+ // of work
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ auto samples = dist(&single_samples);
+ std::copy(&samples[0], &samples[0] + kGroupSize, data + offset);
+ offset += kGroupSize;
+ }
+
+ int64_t remaining_size = size - limit_group_full * kGroupSize;
+ // If there are any remaining elements that need to be filled, process them
+ if (remaining_size > 0)
+ {
+ PhiloxRandom gen = base_gen;
+ gen.Skip(group_index * kGeneratorSkipPerOutputGroup);
+ SingleSampleAdapter<PhiloxRandom> single_samples(&gen);
+
+ auto samples = dist(&single_samples);
+ std::copy(&samples[0], &samples[0] + remaining_size, data + offset);
+ }
+ }
+};
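+
+// [Editorial note] In the variable-sample case each output group re-derives
+// its generator position from base_gen, so the result is independent of the
+// order in which groups are processed. kReservedSamplesPerOutput == 256
+// budgets up to 256 single samples per output element, so with PhiloxRandom
+// producing 4 values per invocation the per-group skip works out to:
+//
+//   kGeneratorSkipPerOutputGroup = kGroupSize * 256 / 4;  // invocations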
+
+// Partial specialization for CPU to fill the entire region with random values.
+// This port fills the region in a single task; the upstream TensorFlow version
+// splits the work into several tasks and runs them in parallel.
+template <class Distribution>
+void FillPhiloxRandom<CPUDevice, Distribution>::
+operator()(random::PhiloxRandom gen, typename Distribution::ResultElementType *data, int64_t size,
+ Distribution dist)
+{
+ FillPhiloxRandomTask<Distribution, Distribution::kVariableSamplesPerOutput>::Run(gen, data, size,
+ dist);
+}
+
+} // namespace functor
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_HELPER_RANDOM_OP_CPU_H__
diff --git a/compute/cker/include/cker/operation/L2Normalize.h b/compute/cker/include/cker/operation/L2Normalize.h
new file mode 100644
index 000000000..a0075c3d0
--- /dev/null
+++ b/compute/cker/include/cker/operation/L2Normalize.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_L2NORMALIZE_H__
+#define __NNFW_CKER_L2NORMALIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+void L2NormalizeFloat32(const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ const float epsilon = 1e-6f;
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+ for (int i = 0; i < outer_size; ++i)
+ {
+ float squared_l2_norm = 0;
+ for (int c = 0; c < depth; ++c)
+ {
+ const float val = input_data[c];
+ squared_l2_norm += val * val;
+ }
+ float l2_norm = std::sqrt(squared_l2_norm);
+ l2_norm = std::max(l2_norm, epsilon);
+ for (int c = 0; c < depth; ++c)
+ {
+ *output_data = *input_data / l2_norm;
+ ++output_data;
+ ++input_data;
+ }
+ }
+}
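+
+// [Illustrative sketch, not from the upstream patch] L2NormalizeFloat32
+// divides each innermost-dimension slice by max(||x||_2, epsilon). Assuming
+// Shape supports initializer-list construction, a 3-4-5 triangle row
+// normalizes to unit length:
+//
+//   float in[4] = {3.f, 4.f, 0.f, 0.f};  // ||in||_2 == 5
+//   float out[4];
+//   L2NormalizeFloat32(Shape({1, 4}), in, Shape({1, 4}), out);
+//   // out == {0.6f, 0.8f, 0.f, 0.f}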
+
+void L2NormalizeQuant8(L2NormParams &params, const Shape &input_shape, const uint8_t *input_data,
+ const Shape &output_shape, uint8_t *output_data)
+{
+ const int trailing_dim = input_shape.DimensionsCount() - 1;
+ const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
+ const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
+ const int32_t input_zero_point = params.input_zero_point;
+
+ for (int i = 0; i < outer_size; ++i)
+ {
+ int32_t square_l2_norm = 0;
+ for (int c = 0; c < depth; c++)
+ {
+ // Note that input_data advances by depth in the second pass below.
+ int32_t diff = input_data[c] - input_zero_point;
+ square_l2_norm += diff * diff;
+ }
+ int32_t inv_l2norm_multiplier;
+ int inv_l2norm_shift;
+ GetInvSqrtQuantizedMultiplierExp(square_l2_norm, -1, &inv_l2norm_multiplier, &inv_l2norm_shift);
+ for (int c = 0; c < depth; c++)
+ {
+ int32_t diff = *input_data - input_zero_point;
+ int32_t rescaled_diff = MultiplyByQuantizedMultiplierSmallerThanOneExp(
+ 128 * diff, inv_l2norm_multiplier, inv_l2norm_shift);
+ int32_t unclamped_output_val = 128 + rescaled_diff;
+ int32_t output_val = std::min(static_cast<int32_t>(255),
+ std::max(static_cast<int32_t>(0), unclamped_output_val));
+ *output_data = static_cast<uint8_t>(output_val);
+ ++input_data;
+ ++output_data;
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_L2NORMALIZE_H__
diff --git a/compute/cker/include/cker/operation/Logistic.h b/compute/cker/include/cker/operation/Logistic.h
index 7477858fc..3d3e59e55 100644
--- a/compute/cker/include/cker/operation/Logistic.h
+++ b/compute/cker/include/cker/operation/Logistic.h
@@ -32,18 +32,9 @@ namespace cker
inline void Logistic(const Shape &input_shape, const float *input_data, const Shape &output_shape,
float *output_data)
{
-#ifdef __aarch64__
auto input_map = MapAsVector(input_data, input_shape);
auto output_map = MapAsVector(output_data, output_shape);
output_map.array() = input_map.array().unaryExpr(Eigen::internal::scalar_logistic_op<float>());
-#else
- // Note, this can be done using TANH: (1/2) + (1/2) * TANH(x/2)
- const int size = MatchingFlatSize(input_shape, output_shape);
- for (int i = 0; i < size; i++)
- {
- output_data[i] = 1.f / (1.f + std::exp(-input_data[i]));
- }
-#endif
}
} // namespace cker
diff --git a/compute/cker/include/cker/operation/MatrixBandPart.h b/compute/cker/include/cker/operation/MatrixBandPart.h
index 9f49c8fdd..5674ff3ef 100644
--- a/compute/cker/include/cker/operation/MatrixBandPart.h
+++ b/compute/cker/include/cker/operation/MatrixBandPart.h
@@ -32,10 +32,10 @@ void MatrixBandPart(const T num_lower_diags, const T num_upper_diags, const Shap
{
auto last_dim = input_shape.DimensionsCount() - 1;
- T batch_num = 0;
- for (int dim = 0; dim < last_dim - 2; dim++)
+ T batch_num = 1;
+ for (int dim = 0; dim < input_shape.DimensionsCount() - 2; dim++)
{
- batch_num += input_shape.Dims(dim);
+ batch_num *= input_shape.Dims(dim);
}
const T row_num = input_shape.Dims(last_dim - 1);
diff --git a/compute/cker/include/cker/operation/Pad.h b/compute/cker/include/cker/operation/Pad.h
index af432f3a8..4a2732d82 100644
--- a/compute/cker/include/cker/operation/Pad.h
+++ b/compute/cker/include/cker/operation/Pad.h
@@ -26,9 +26,10 @@ namespace nnfw
{
namespace cker
{
+template <typename T>
inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &input_shape,
- const float *input_data, const Shape &output_shape, float *output_data,
- const float *constant_value_data)
+ const T *input_data, const Shape &output_shape, T *output_data,
+ const T *constant_value_data)
{
// Note, this is pad with mode=`CONSTANT`: it doesn't support `REFLECT` and `SYMMETRIC`
// TODO: come up with more subtle solution that uses subtensors like arm compute
@@ -38,7 +39,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
/** List of padding information */
using PaddingList = std::vector<PaddingInfo>;
- auto constant_value = constant_value_data ? *constant_value_data : 0;
+ const T constant_value = constant_value_data ? *constant_value_data : 0;
assert(output_shape.DimensionsCount() == input_shape.DimensionsCount());
PaddingList padding_list(pad_rank);
@@ -64,7 +65,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
{
const int32_t in_row_len = input_shape.Dims(0);
std::fill_n(output_data, padding_list[0].first, constant_value);
- std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(float));
+ std::memcpy(output_data + padding_list[0].first, input_data, in_row_len * sizeof(T));
std::fill_n(output_data + padding_list[0].first + in_row_len, padding_list[0].second,
constant_value);
break;
@@ -89,7 +90,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
out_offset += padding_list[1].first;
// copy a row of input data
- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
out_offset += in_row_len;
@@ -132,7 +133,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
out_offset += padding_list[2].first;
// copy a row of input data
- memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(float));
+ memcpy(output_data + out_offset, input_data + in_offset, in_row_len * sizeof(T));
out_offset += in_row_len;
@@ -191,7 +192,7 @@ inline void Pad(const int32_t *padding_data, int32_t pad_rank, const Shape &inpu
out_c_offset += padding_list[3].first;
// copy a row of input data
- memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(float));
+ memcpy(output_data + out_c_offset, input_data + in_offset, in_row_len * sizeof(T));
out_c_offset += in_row_len;
diff --git a/compute/cker/include/cker/operation/Quantize.h b/compute/cker/include/cker/operation/Quantize.h
new file mode 100644
index 000000000..5c82d111f
--- /dev/null
+++ b/compute/cker/include/cker/operation/Quantize.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_QUANTIZE_H__
+#define __NNFW_CKER_QUANTIZE_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include "cker/Utils.h"
+#include <algorithm>
+#include <cmath>
+#include <limits>
+namespace nnfw
+{
+namespace cker
+{
+template <typename InputT, typename OutputT>
+inline void Quantize(const Shape &input_shape, const InputT *input_data, const Shape &output_shape,
+ OutputT *output_data, const float output_scale, const int32_t output_offset)
+{
+ const int flat_size = MatchingFlatSize(input_shape, output_shape);
+ const int32_t min_val = std::numeric_limits<OutputT>::min();
+ const int32_t max_val = std::numeric_limits<OutputT>::max();
+
+ for (int i = 0; i < flat_size; i++)
+ {
+ int32_t unclamped = static_cast<int32_t>(std::round(input_data[i] / output_scale)) + output_offset;
+ int32_t clamped = std::min(std::max(unclamped, min_val), max_val);
+ output_data[i] = static_cast<OutputT>(clamped);
+ }
+}
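+
+// [Illustrative sketch, not from the upstream patch] Worked example of the
+// affine mapping above for OutputT == uint8_t, output_scale == 0.5f and
+// output_offset == 128:
+//
+//   // q = round(x / scale) + offset, clamped to [0, 255]
+//   // x =   1.0f -> round( 2.0)  + 128 == 130
+//   // x =  -0.3f -> round(-0.6)  + 128 == 127
+//   // x = 200.0f -> round(400.0) + 128 == 528 -> clamped to 255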
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_QUANTIZE_H__
diff --git a/compute/cker/include/cker/operation/ReLU6.h b/compute/cker/include/cker/operation/ReLU6.h
new file mode 100644
index 000000000..20df561dc
--- /dev/null
+++ b/compute/cker/include/cker/operation/ReLU6.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RELU6_H__
+#define __NNFW_CKER_RELU6_H__
+
+#include "cker/Shape.h"
+#include "cker/eigen/Utils.h"
+
+#include <cmath>
+#include <Eigen/Core>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void ReLU6(const Shape &input_shape, const float *input_data, float *output_data)
+{
+ int size = input_shape.FlatSize();
+
+ for (int i = 0; i < size; ++i)
+ {
+ if (input_data[i] <= 0)
+ {
+ output_data[i] = 0;
+ }
+ else if (input_data[i] > 6.0)
+ {
+ output_data[i] = 6.0;
+ }
+ else
+ {
+ output_data[i] = input_data[i];
+ }
+ }
+}
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_RELU6_H__
diff --git a/compute/cker/include/cker/operation/Reduce.h b/compute/cker/include/cker/operation/Reduce.h
index 4ba3652d3..cf9634a67 100644
--- a/compute/cker/include/cker/operation/Reduce.h
+++ b/compute/cker/include/cker/operation/Reduce.h
@@ -159,6 +159,92 @@ public:
num_resolved_axis, temp_index_data(), reducer, output_data);
}
+ // Computes the mean of elements across dimensions given in axis.
+ // It does so in two stages: first it calculates the sum of elements along
+ // the axis, then divides that sum by the number of elements in the axis.
+ // For quantized values, the result is rescaled to the output scale.
+ template <typename T, typename U>
+ inline bool QuantizedMeanOrSum(const T *input_data, int32_t input_zero_point, float input_scale,
+ const Shape &input_shape, T *output_data,
+ int32_t output_zero_point, float output_scale,
+ const Shape &output_shape, const std::vector<int> &axes,
+ bool /*keep_dims*/, U *temp_sum, bool compute_sum,
+ U reducer(const U current, const T in))
+ {
+ // Reset output data.
+ size_t num_outputs = 1;
+ for (int idx = 0; idx < output_shape.DimensionsCount(); ++idx)
+ {
+ size_t current = static_cast<size_t>(output_shape.Dims(idx));
+ // Overflow prevention.
+ if (num_outputs > std::numeric_limits<size_t>::max() / current)
+ {
+ return false;
+ }
+ num_outputs *= current;
+ }
+ for (size_t idx = 0; idx < num_outputs; ++idx)
+ {
+ output_data[idx] = T();
+ temp_sum[idx] = U();
+ }
+
+ // Resolve axis.
+ int num_resolved_axis = 0;
+ if (!ResolveAxis(input_shape.DimensionsCount(), axes, resolved_axis_data(), &num_resolved_axis))
+ {
+ return false;
+ }
+
+ if (!ReduceImpl<T, U>(input_data, input_shape, output_shape, resolved_axis_data(),
+ num_resolved_axis, temp_index_data(), reducer, temp_sum))
+ {
+ return false;
+ }
+
+ // Calculate mean by dividing output_data by num of aggregated element.
+ U num_elements_in_axis = 1;
+ for (int idx = 0; idx < num_resolved_axis; ++idx)
+ {
+ size_t current = static_cast<size_t>(input_shape.Dims(resolved_axis_data()[idx]));
+ // Overflow prevention.
+ if (current > static_cast<size_t>(std::numeric_limits<U>::max() / num_elements_in_axis))
+ {
+ return false;
+ }
+ num_elements_in_axis *= current;
+ }
+
+ if (num_elements_in_axis > 0)
+ {
+ const float scale = input_scale / output_scale;
+ if (compute_sum)
+ {
+ // TODO(b/116341117): Eliminate float and do this completely in 8bit.
+ const float bias = -input_zero_point * scale * num_elements_in_axis + 0.5f;
+ for (size_t idx = 0; idx < num_outputs; ++idx)
+ {
+ const U value =
+ static_cast<U>(std::round(temp_sum[idx] * scale + bias)) + output_zero_point;
+ output_data[idx] = static_cast<T>(value);
+ }
+ }
+ else
+ {
+ const float bias = -input_zero_point * scale + 0.5f;
+ for (size_t idx = 0; idx < num_outputs; ++idx)
+ {
+ float float_mean =
+ static_cast<float>(temp_sum[idx]) / static_cast<float>(num_elements_in_axis);
+ float result = std::min(std::round(float_mean * scale + bias) + output_zero_point,
+ static_cast<float>(std::numeric_limits<T>::max()));
+ result = std::max(result, static_cast<float>(std::numeric_limits<T>::min()));
+ output_data[idx] = static_cast<T>(result);
+ }
+ }
+ }
+ return true;
+ }
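+
+ // [Editorial note] With s = input_scale / output_scale and n elements
+ // reduced per output, the arithmetic above computes
+ //   sum:  round(s * (temp_sum - n * in_zp) + 0.5) + out_zp
+ //   mean: round(s * (temp_sum / n - in_zp) + 0.5) + out_zp, then clamped,
+ // i.e. the zero-point correction is folded into one precomputed bias.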
+
inline int32_t *resolved_axis_data(void)
{
return _resolved_axis.size() ? _resolved_axis.data() : _resolved_axis_small;
diff --git a/compute/cker/include/cker/operation/ResizeBilinear.h b/compute/cker/include/cker/operation/ResizeBilinear.h
new file mode 100644
index 000000000..7fc1e9123
--- /dev/null
+++ b/compute/cker/include/cker/operation/ResizeBilinear.h
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_RESIZEBILINEAR_H__
+#define __NNFW_CKER_RESIZEBILINEAR_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+#include <cmath>
+
+namespace nnfw
+{
+namespace cker
+{
+
+inline void ResizeBilinearKernel2x2(int32_t x0, int32_t x1, int32_t y0, int32_t y1, int32_t x,
+ int32_t y, int32_t depth, int32_t batch,
+ const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ const int32_t input_width = input_shape.Dims(2);
+ const int32_t output_width = output_shape.Dims(2);
+
+ const int32_t input_x_offset = (x1 - x0) * depth;
+ const int32_t input_y_offset = (y1 - y0) * depth * input_width;
+ const int32_t output_x_offset = depth;
+ const int32_t output_y_offset = depth * output_width;
+
+ for (int ch = 0; ch < depth; ch++)
+ {
+ const int32_t input_offset = Offset(input_shape, batch, y0, x0, ch);
+
+ float x0y0 = input_data[input_offset];
+ float x1y0 = input_data[input_offset + input_x_offset];
+ float x0y1 = input_data[input_offset + input_y_offset];
+ float x1y1 = input_data[input_offset + input_x_offset + input_y_offset];
+
+ // Top left corner.
+ const int32_t output_offset = Offset(output_shape, batch, y, x, ch);
+ output_data[output_offset] = x0y0;
+
+ // Top right corner.
+ output_data[output_offset + output_x_offset] = (x0y0 + x1y0) / 2;
+
+ // Bottom left corner.
+ float output = (x0y0 + x0y1) / 2;
+ output_data[output_offset + output_y_offset] = output;
+
+ // Bottom right corner.
+ output_data[output_offset + output_x_offset + output_y_offset] =
+ (output + ((x1y0 + x1y1) / 2)) / 2;
+ }
+}
+
+inline void ResizeBilinear2x2(int32_t batches, int32_t input_height, int32_t input_width,
+ int32_t depth, int32_t output_height, int32_t output_width,
+ const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ for (int b = 0; b < batches; b++)
+ {
+ for (int y0 = 0, y = 0; y <= output_height - 2; y += 2, y0++)
+ {
+ for (int x0 = 0, x = 0; x <= output_width - 2; x += 2, x0++)
+ {
+ int32_t x1 = std::min(x0 + 1, input_width - 1);
+ int32_t y1 = std::min(y0 + 1, input_height - 1);
+ ResizeBilinearKernel2x2(x0, x1, y0, y1, x, y, depth, b, input_shape, input_data,
+ output_shape, output_data);
+ }
+ }
+ }
+}
+
+inline void ResizeBilinearKernel(const float *input_ptr, int32_t depth, float scale,
+ float *output_ptr)
+{
+ for (int32_t i = 0; i < depth; i++)
+ {
+ *output_ptr += *input_ptr * scale;
+ output_ptr++;
+ input_ptr++;
+ }
+}
+
+inline void ComputeInterpolationValues(const float value, const float scale,
+ const bool half_pixel_centers, int32_t input_size,
+ float *scaled_value, int32_t *lower_bound,
+ int32_t *upper_bound)
+{
+ if (half_pixel_centers)
+ {
+ *scaled_value = (value + 0.5f) * scale - 0.5f;
+ }
+ else
+ {
+ *scaled_value = value * scale;
+ }
+ float scaled_value_floor = std::floor(*scaled_value);
+ *lower_bound = std::max(static_cast<int32_t>(scaled_value_floor), static_cast<int32_t>(0));
+ *upper_bound = std::min(static_cast<int32_t>(std::ceil(*scaled_value)), input_size - 1);
+}
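+
+// [Illustrative sketch, not from the upstream patch] The two coordinate
+// mappings above differ by half a pixel. Upscaling width 4 to width 8
+// (scale == 0.5) at output column x == 3:
+//
+//   // half_pixel_centers == false: scaled = 3 * 0.5               == 1.5
+//   // half_pixel_centers == true:  scaled = (3 + 0.5) * 0.5 - 0.5 == 1.25
+//   // Both interpolate between input columns 1 and 2, with different weights.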
+
+inline void ResizeBilinearGeneric(int32_t batches, int32_t input_height, int32_t input_width,
+ int32_t depth, int32_t output_height, int32_t output_width,
+ float height_scale, float width_scale, const Shape &input_shape,
+ const float *input_data, float *output_data,
+ const bool half_pixel_centers)
+{
+ memset(output_data, 0, batches * output_height * output_width * depth * sizeof(float));
+
+ int32_t output_offset = 0;
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int y = 0; y < output_height; ++y)
+ {
+ float input_y;
+ int32_t y0, y1;
+ ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
+ &y1);
+ for (int x = 0; x < output_width; ++x)
+ {
+ float input_x;
+ int32_t x0, x1;
+ ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
+ &x1);
+ float *output_ptr = &output_data[output_offset];
+
+ // Run kernel on the 4 corners of the bilinear resize algorithm.
+ int32_t input_offset = Offset(input_shape, b, y0, x0, 0);
+ float scale = (1 - (input_y - y0)) * (1 - (input_x - x0));
+ const float *input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_shape, b, y0, x1, 0);
+ scale = (1 - (input_y - y0)) * (input_x - x0);
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_shape, b, y1, x0, 0);
+ scale = (input_y - y0) * (1 - (input_x - x0));
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ input_offset = Offset(input_shape, b, y1, x1, 0);
+ scale = (input_y - y0) * (input_x - x0);
+ input_ptr = &input_data[input_offset];
+ ResizeBilinearKernel(input_ptr, depth, scale, output_ptr);
+
+ output_offset += depth;
+ }
+ }
+ }
+}
+
+template <typename T>
+inline void ResizeBilinearGenericSmallChannel(int32_t batches, int32_t input_height,
+ int32_t input_width, int32_t depth,
+ int32_t output_height, int32_t output_width,
+ float height_scale, float width_scale,
+ const Shape &input_shape, const T *input_data,
+ T *output_data, const bool half_pixel_centers)
+{
+ T *output_ptr = &output_data[0];
+ for (int b = 0; b < batches; ++b)
+ {
+ for (int y = 0; y < output_height; ++y)
+ {
+ float input_y;
+ int32_t y0, y1;
+ ComputeInterpolationValues(y, height_scale, half_pixel_centers, input_height, &input_y, &y0,
+ &y1);
+ for (int x = 0; x < output_width; ++x)
+ {
+ float input_x;
+ int32_t x0, x1;
+ ComputeInterpolationValues(x, width_scale, half_pixel_centers, input_width, &input_x, &x0,
+ &x1);
+
+ int32_t input_offset[4] = {
+ Offset(input_shape, b, y0, x0, 0), Offset(input_shape, b, y0, x1, 0),
+ Offset(input_shape, b, y1, x0, 0), Offset(input_shape, b, y1, x1, 0)};
+ float scale[4] = {(1 - (input_y - y0)) * (1 - (input_x - x0)),
+ (1 - (input_y - y0)) * (input_x - x0),
+ (input_y - y0) * (1 - (input_x - x0)), (input_y - y0) * (input_x - x0)};
+
+ for (int d = 0; d < depth; d++)
+ {
+ const T *input_ptr = &input_data[d];
+ *output_ptr++ = static_cast<T>(
+ input_ptr[input_offset[0]] * scale[0] + input_ptr[input_offset[1]] * scale[1] +
+ input_ptr[input_offset[2]] * scale[2] + input_ptr[input_offset[3]] * scale[3]);
+ }
+ }
+ }
+ }
+}
+
+void ResizeBilinear(ResizeBilinearParams &params, const Shape &input_shape, const float *input_data,
+ const Shape &output_shape, float *output_data)
+{
+ int32_t batches = static_cast<int32_t>(MatchingDim(input_shape, 0, output_shape, 0));
+ int32_t input_height = input_shape.Dims(1);
+ int32_t input_width = input_shape.Dims(2);
+ int32_t depth = static_cast<int32_t>(MatchingDim(input_shape, 3, output_shape, 3));
+
+ // Specialize for 2x2 upsample.
+ if (!params.align_corners && !params.half_pixel_centers &&
+ params.output_height == 2 * input_height && params.output_width == 2 * input_width)
+ {
+ ResizeBilinear2x2(batches, input_height, input_width, depth, params.output_height,
+ params.output_width, input_shape, input_data, output_shape, output_data);
+ }
+ else
+ {
+ float height_scale = static_cast<float>(input_height) / params.output_height;
+ float width_scale = static_cast<float>(input_width) / params.output_width;
+ if (params.align_corners && params.output_height > 1)
+ {
+ height_scale = static_cast<float>(input_height - 1) / (params.output_height - 1);
+ }
+ if (params.align_corners && params.output_width > 1)
+ {
+ width_scale = static_cast<float>(input_width - 1) / (params.output_width - 1);
+ }
+
+ ResizeBilinearGeneric(batches, input_height, input_width, depth, params.output_height,
+ params.output_width, height_scale, width_scale, input_shape, input_data,
+ output_data, params.half_pixel_centers);
+ }
+}
+
+void ResizeBilinear(ResizeBilinearParams &params, const Shape &input_shape,
+ const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
+{
+ int32_t batches = MatchingDim(input_shape, 0, output_shape, 0);
+ int32_t input_height = input_shape.Dims(1);
+ int32_t input_width = input_shape.Dims(2);
+ int32_t depth = MatchingDim(input_shape, 3, output_shape, 3);
+
+ float height_scale = (params.align_corners && params.output_height > 1)
+ ? (static_cast<float>(input_height - 1) / (params.output_height - 1))
+ : (static_cast<float>(input_height) / params.output_height);
+
+ float width_scale = (params.align_corners && params.output_width > 1)
+ ? (static_cast<float>(input_width - 1) / (params.output_width - 1))
+ : (static_cast<float>(input_width) / params.output_width);
+
+ ResizeBilinearGenericSmallChannel<uint8_t>(
+ batches, input_height, input_width, depth, params.output_height, params.output_width,
+ height_scale, width_scale, input_shape, input_data, output_data, params.half_pixel_centers);
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_RESIZEBILINEAR_H__
diff --git a/compute/cker/include/cker/operation/SpaceToDepth.h b/compute/cker/include/cker/operation/SpaceToDepth.h
new file mode 100644
index 000000000..ef679315e
--- /dev/null
+++ b/compute/cker/include/cker/operation/SpaceToDepth.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SPACE_TO_DEPTH_H__
+#define __NNFW_CKER_SPACE_TO_DEPTH_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename T>
+inline void SpaceToDepth(const SpaceToDepthParams &params, const Shape &unextended_input_shape,
+ const T *input_data, const Shape &unextended_output_shape, T *output_data)
+{
+ assert(unextended_input_shape.DimensionsCount() <= 4);
+ assert(unextended_output_shape.DimensionsCount() <= 4);
+ const Shape input_shape = Shape::ExtendedShape(4, unextended_input_shape);
+ const Shape output_shape = Shape::ExtendedShape(4, unextended_output_shape);
+
+ const int output_depth = output_shape.Dims(3);
+ const int output_width = output_shape.Dims(2);
+ const int output_height = output_shape.Dims(1);
+
+ const int input_depth = input_shape.Dims(3);
+ const int batch_size = input_shape.Dims(0);
+
+ // Number of contiguous values that we can copy in one iteration.
+ const int stride = params.block_size * input_depth;
+
+ for (int batch = 0; batch < batch_size; ++batch)
+ {
+ for (int out_h = 0; out_h < output_height; ++out_h)
+ {
+ T *output_ptr = output_data + Offset(output_shape, batch, out_h, 0, 0);
+ for (int offset_h = 0; offset_h < params.block_size; ++offset_h)
+ {
+ T *dst = output_ptr;
+ for (int out_w = 0; out_w < output_width; ++out_w)
+ {
+ memcpy(dst, input_data, stride * sizeof(T));
+ input_data += stride;
+ dst += output_depth;
+ }
+ output_ptr += stride;
+ }
+ }
+ }
+}
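+
+// [Illustrative sketch, not from the upstream patch] For block_size == 2 each
+// 2x2 spatial patch becomes one output pixel with four times the channels.
+// With a 1x4x4x1 input whose values are 0..15 in row-major order, the
+// 1x2x2x4 output is:
+//
+//   // input rows            output pixels (4 channels each)
+//   //  0  1  2  3           (0,0): { 0,  1,  4,  5}   (0,1): { 2,  3,  6,  7}
+//   //  4  5  6  7           (1,0): { 8,  9, 12, 13}   (1,1): {10, 11, 14, 15}
+//   //  8  9 10 11
+//   // 12 13 14 15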
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SPACE_TO_DEPTH_H__
diff --git a/compute/cker/include/cker/operation/SplitV.h b/compute/cker/include/cker/operation/SplitV.h
new file mode 100644
index 000000000..9e46f4b04
--- /dev/null
+++ b/compute/cker/include/cker/operation/SplitV.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_SPLIT_V_H__
+#define __NNFW_CKER_SPLIT_V_H__
+
+#include "cker/Shape.h"
+#include "cker/Types.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+template <typename Scalar>
+void SplitV(const SplitVParams &params, const Shape &input_shape, const Scalar *input_data,
+ std::vector<nnfw::cker::Shape> &output_shapes, Scalar *const *output_data)
+{
+ const int split_dimensions = input_shape.DimensionsCount();
+ int axis = params.axis < 0 ? params.axis + split_dimensions : params.axis;
+ int outputs_count = params.num_split;
+
+ int64_t split_size = 0;
+
+ for (int i = 0; i < outputs_count; i++)
+ {
+ // TFLITE_DCHECK_EQ(output_shapes[i]->DimensionsCount(), split_dimensions);
+ for (int j = 0; j < split_dimensions; j++)
+ {
+ if (j != axis)
+ {
+ MatchingDim(output_shapes[i], j, input_shape, j);
+ }
+ }
+ split_size += output_shapes[i].Dims(axis);
+ }
+
+ int64_t outer_size = 1;
+ for (int i = 0; i < axis; ++i)
+ {
+ outer_size *= input_shape.Dims(i);
+ }
+ // For all output arrays,
+ // FlatSize() = outer_size * Dims(axis) * base_inner_size;
+ int64_t base_inner_size = 1;
+ for (int i = axis + 1; i < split_dimensions; ++i)
+ {
+ base_inner_size *= input_shape.Dims(i);
+ }
+
+ const Scalar *input_ptr = input_data;
+ int copy_size = 0;
+ for (int k = 0; k < outer_size; k++)
+ {
+ for (int i = 0; i < outputs_count; ++i)
+ {
+ copy_size = output_shapes[i].Dims(axis) * base_inner_size;
+ memcpy(output_data[i] + k * copy_size, input_ptr, copy_size * sizeof(Scalar));
+ input_ptr += copy_size;
+ }
+ }
+}
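+
+// [Editorial note] Example of the copy pattern above: splitting a 2x5 matrix
+// along axis 1 into widths {2, 3} gives outer_size == 2 (rows) and
+// base_inner_size == 1, so each row contributes a 2-element copy to output 0
+// and a 3-element copy to output 1. Output 0 ends up with columns {0, 1} and
+// output 1 with columns {2, 3, 4} of every row.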
+
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_SPLIT_V_H__
diff --git a/compute/cker/include/cker/operation/StatelessRandomUniform.h b/compute/cker/include/cker/operation/StatelessRandomUniform.h
new file mode 100644
index 000000000..d5952ae23
--- /dev/null
+++ b/compute/cker/include/cker/operation/StatelessRandomUniform.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__
+#define __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__
+
+#include "cker/Types.h"
+#include "cker/Shape.h"
+#include "cker/Utils.h"
+
+#include "cker/eigen/EigenSupport.h"
+
+#include "cker/operation/Helper/Tensor.h"
+#include "cker/operation/Helper/PhiloxRandom.h"
+#include "cker/operation/Helper/RandomOpCpu.h"
+#include "cker/operation/Helper/RandomDistributions.h"
+
+namespace nnfw
+{
+namespace cker
+{
+
+void GenerateKey(Tensor seed, random::PhiloxRandom::Key *out_key,
+ random::PhiloxRandom::ResultType *out_counter)
+{
+ // Grab the two seeds
+ uint32_t seed0;
+ uint32_t seed1;
+
+ const auto seed_vals = seed.flat<int32_t>();
+
+ seed0 = seed_vals(0);
+ seed1 = seed_vals(1);
+ // Scramble the seeds so that the user doesn't need to worry about which
+ // part of the seed needs to be strong.
+ (*out_key)[0] = 0x3ec8f720;
+ (*out_key)[1] = 0x02461e29;
+ (*out_counter)[0] = static_cast<uint32_t>(seed0);
+ (*out_counter)[1] = (*out_counter)[3] = 0;
+ (*out_counter)[2] = static_cast<uint32_t>(seed1);
+ const auto mix = random::PhiloxRandom(*out_counter, *out_key)();
+ (*out_key)[0] = mix[0];
+ (*out_key)[1] = mix[1];
+ (*out_counter)[0] = (*out_counter)[1] = 0;
+ (*out_counter)[2] = mix[2];
+ (*out_counter)[3] = mix[3];
+}
+
+template <typename Device, class Distribution>
+void Fill(random::PhiloxRandom random, Tensor *output)
+{
+ // Build distribution
+ typedef typename Distribution::ResultElementType T;
+
+ auto flat = output->flat<T>();
+ // Reuse the compute kernels from the stateful random ops
+ functor::FillPhiloxRandom<Device, Distribution>()(random, flat.data(), flat.size(),
+ Distribution());
+}
+
+inline void StatelessRandomUniform(const Shape &shape_shape, const int *shape_data,
+ const Shape &seed_shape, const int *seed_data,
+ const Shape &output_shape, float *output_data)
+{
+ Tensor shape_t;
+ Tensor seed_t;
+
+ shape_t.shape.ReplaceWith(shape_shape.DimensionsCount(), shape_shape.DimsData());
+ shape_t.buffer = (void *)shape_data;
+
+ seed_t.shape.ReplaceWith(seed_shape.DimensionsCount(), seed_shape.DimsData());
+ seed_t.buffer = (void *)seed_data;
+
+ Tensor output_t;
+ output_t.shape.ReplaceWith(output_shape.DimensionsCount(), output_shape.DimsData());
+ output_t.buffer = output_data;
+
+ random::PhiloxRandom::Key key;
+ random::PhiloxRandom::ResultType counter;
+
+ GenerateKey(seed_t, &key, &counter);
+
+ Fill<Eigen::ThreadPoolDevice, random::UniformDistribution<random::PhiloxRandom, float>>(
+ random::PhiloxRandom(counter, key), &output_t);
+}
+} // namespace cker
+} // namespace nnfw
+
+#endif // __NNFW_CKER_STATELESS_RANDOM_UNIFORM_H__
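
A hypothetical usage sketch of the new kernel (not part of the patch; it assumes the `Shape::ReplaceWith` API used above). Shape and seed arrive as 1-D int tensors, and because the op is stateless, the same `{seed0, seed1}` pair always fills the output buffer identically.

```cpp
#include <cstdint>
#include <iostream>
#include "cker/Shape.h"
#include "cker/operation/StatelessRandomUniform.h"

int main()
{
  using nnfw::cker::Shape;

  // Request a 2x3 float tensor; shape and seed are passed as 1-D int tensors.
  const int32_t shape_dims[] = {2};
  const int32_t seed_dims[] = {2};
  const int32_t out_dims[] = {2, 3};
  const int shape_data[] = {2, 3}; // requested output dimensions
  const int seed_data[] = {42, 7}; // {seed0, seed1}
  float output[6];

  Shape shape_shape, seed_shape, output_shape;
  shape_shape.ReplaceWith(1, shape_dims);
  seed_shape.ReplaceWith(1, seed_dims);
  output_shape.ReplaceWith(2, out_dims);

  // Deterministic: rerunning with the same seeds yields the same six floats.
  nnfw::cker::StatelessRandomUniform(shape_shape, shape_data, seed_shape, seed_data,
                                     output_shape, output);
  std::cout << output[0] << std::endl;
  return 0;
}
```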
diff --git a/compute/cker/include/cker/ruy/RuySupport.h b/compute/cker/include/cker/ruy/RuySupport.h
index 432b181bd..9612dd517 100644
--- a/compute/cker/include/cker/ruy/RuySupport.h
+++ b/compute/cker/include/cker/ruy/RuySupport.h
@@ -22,11 +22,6 @@
#include <ruy/context.h>
#include "cker/Types.h"
-namespace
-{
-const int kDefaultNumThreadpoolThreads = 4;
-}
-
namespace nnfw
{
namespace cker
@@ -34,42 +29,6 @@ namespace cker
namespace ruy_support
{
-struct RuyContext
-{
-public:
- RuyContext() : ruy_context_(new ruy::Context)
- {
- SetMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS));
-#ifdef USE_RUY_GEMV
- ruy_context_->cache_policy = ruy::kCacheLHSOnNarrowMul;
-#endif
- };
-
- ruy::Context *ruy_context() const { return ruy_context_.get(); }
-
- static inline RuyContext &GetRuyContext()
- {
- static thread_local RuyContext instance;
- return instance;
- }
-
- void SetMaxNumThreads(int max_num_threads)
- {
- const int target_num_threads =
- max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
- ruy_context_->max_num_threads = target_num_threads;
- }
-
-private:
- const std::unique_ptr<ruy::Context> ruy_context_;
-};
-
-inline ruy::Context *GetRuyContext()
-{
- auto &ctx = RuyContext::GetRuyContext();
- return ctx.ruy_context();
-}
-
template <typename Scalar, typename DataPointer>
void MakeRuyMatrix(const MatrixParams<Scalar> &params, DataPointer data_ptr,
ruy::Matrix<Scalar> *dst)
diff --git a/docs/conf.py b/docs/conf.py
index 3abe4f4c2..649b677a9 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -21,7 +21,7 @@ copyright = '2020, Samsung Research & contributors'
author = 'Samsung Research & contributors'
# The full version, including alpha/beta/rc tags
-release = '1.7.0'
+release = '1.8.0'
# -- General configuration ---------------------------------------------------
diff --git a/docs/howto/how-to-build-runtime.md b/docs/howto/how-to-build-runtime.md
index 2bfd14c63..f4751198e 100644
--- a/docs/howto/how-to-build-runtime.md
+++ b/docs/howto/how-to-build-runtime.md
@@ -13,7 +13,7 @@ In the Ubuntu, you can easily install it with the following command.
```
$ sudo apt-get install cmake libboost-all-dev
-```
+```
If your linux system does not have the basic development configuration, you will need to install more packages. A list of all packages needed to configure the development environment can be found in the https://github.com/Samsung/ONE/blob/master/infra/docker/Dockerfile.1804 file.
@@ -44,7 +44,7 @@ python3-venv \
scons \
software-properties-common \
unzip \
-wget
+wget
$ mkdir /tmp/gtest
$ cd /tmp/gtest
@@ -63,7 +63,7 @@ In a typical linux development environment, including Ubuntu, you can build the
```
$ git clone https://github.com/Samsung/ONE.git one
$ cd one
-$ cp -n Makefile.template Makefile; make install
+$ make -f Makefile.template install
```
Unfortunately, the debug build on the x86_64 architecture currently has an error. To solve the problem, you must use gcc version 9 or higher. Another workaround is to do a release build rather than a debug build. This is not a suitable method for debugging during development, but it is enough to check the function of the runtime. To release build the runtime, add the environment variable `BUILD_TYPE=release` to the build command as follows.
@@ -107,7 +107,7 @@ $ tree -L 3 ./Product/out
│   │   ├── NeuralNetworksEx.h
│   │   ├── NeuralNetworksExtensions.h
│   │   ├── NeuralNetworks.h
-│   │   ├── nnfw_dev.h
+│   │   ├── nnfw_experimental.h
│   │   └── nnfw.h
│   └── onert
│   ├── backend
diff --git a/docs/howto/how-to-use-nnfw-api.md b/docs/howto/how-to-use-nnfw-api.md
index 6c0fb4924..1198a31d5 100644
--- a/docs/howto/how-to-use-nnfw-api.md
+++ b/docs/howto/how-to-use-nnfw-api.md
@@ -23,8 +23,8 @@ nnfw_load_model_from_file(session, nnpackage_path);
```
3) (Optional) Assign a specific backend to operations
``` c
- // Use acl_neon backend for CONV_2D and acl_cl for otherwise.
- // Note that defalut backend is acl_cl
+ // Use 'acl_neon' backend for CONV_2D and 'cpu' otherwise.
+ // Note that the default backend is 'cpu'.
nnfw_set_op_backend(session, "CONV_2D", "acl_neon");
```
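
For context, that call sits in the middle of the usual session lifecycle. Below is a condensed sketch of the full flow, assuming the standard `nnfw.h` entry points (buffer binding and error checking elided):

```cpp
// Hypothetical end-to-end flow; mirrors the numbered steps in this how-to.
#include "nnfw.h"

int run(const char *nnpackage_path)
{
  nnfw_session *session = nullptr;
  nnfw_create_session(&session);
  nnfw_load_model_from_file(session, nnpackage_path);

  // (Optional) pin CONV_2D to 'acl_neon' as above, before preparation.
  nnfw_set_op_backend(session, "CONV_2D", "acl_neon");

  nnfw_prepare(session); // compiles the model for the selected backends
  // ... bind buffers with nnfw_set_input(...) / nnfw_set_output(...) here ...
  nnfw_run(session);
  nnfw_close_session(session);
  return 0;
}
```

Note that the backend assignment happens before `nnfw_prepare`, which is when the model is compiled for the selected backends.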
diff --git a/docs/nnfw/howto/CrossBuildForAndroid.md b/docs/nnfw/howto/CrossBuildForAndroid.md
index d7e48c89a..08d5fd680 100644
--- a/docs/nnfw/howto/CrossBuildForAndroid.md
+++ b/docs/nnfw/howto/CrossBuildForAndroid.md
@@ -44,11 +44,9 @@ Different from cross build for linux,
Here is an example of using Makefile.
```bash
-cp -n Makefile.template Makefile
-
TARGET_OS=android \
CROSS_BUILD=1 \
NDK_DIR=/path/android-tools/r20/ndk \
EXT_ACL_FOLDER=/path/arm_compute-v19.11.1-bin-android/lib/android-arm64-v8a-neon-cl \
-make install
+make -f Makefile.template install
```
diff --git a/docs/overview/supported-operations.md b/docs/overview/supported-operations.md
index 6120e24f8..1d9050a72 100644
--- a/docs/overview/supported-operations.md
+++ b/docs/overview/supported-operations.md
@@ -106,7 +106,7 @@ SELECT_V2 | O |   |  
SHAPE | O | O | O
SIN | O | O | O
SKIP_GRAM | O |   |  
-SLICE | O | O |  
+SLICE | O | O | O
SOFTMAX | O | O | O
SPACE_TO_BATCH_ND | O | O | O
SPACE_TO_DEPTH | O | O | O
diff --git a/docs/release/1.7/release-note-1.7.0.md b/docs/release/1.7/release-note-1.7.0.md
deleted file mode 100644
index c1a4f50b2..000000000
--- a/docs/release/1.7/release-note-1.7.0.md
+++ /dev/null
@@ -1,46 +0,0 @@
-## Feature Highlights
-
-- **ONE** Compiler
- - Compiler supports more operations
- - New command line interface for user interface consistancy
-- **ONE** Runtime
- - Runtime CPU backend supports more operations
- - Runtime CPU backend supports more quant8 operations
- - API changes
- - New optimization
-
-## ONE Compiler
-
-### Compiler supports more operations
-
-- MatrixDiag, MatrixSetDiag, ReverseSequence, ReverseV2, SegmentSum, SelectV2, SparseToDense, Where
-
-### New command line interface for user interface consistancy
-
-- one-import: imports conventional model files to circle
- - one-import-tf: imports TensorFlow model to circle
- - one-import-tflite: imports TensorFlow lite model to circle
-- one-optimize: circle optimize command
-- one-quantize: circle quantize command
- - supports float32 to uint8, layer wise (for Conv series)
-- one-pack: package command
-- one-prepare-venv: prepares python virtual environment for importing TensorFlow model
-- one-codegen: backend(if available) code generator
-
-## ONE Runtime
-
-### Runtime CPU backend supports more operations
-
-- LogSoftmax, SpaceToBatchND
-
-### Runtime CPU backend supports more quant8 operations
-
-- Logistic, Mul, Tanh, SpaceToBatchND, Transpose, Sub, Max, Min, Less, Greater, GreaterEqual, LessEqual, Equal, NotEqual
-
-### API changes
-
-- Introduce basic asynchronous execution API
-
-### New optimization
-
-- Remove dynamic tensor overhead from static models
diff --git a/docs/release/1.8/release-note-1.8.0.md b/docs/release/1.8/release-note-1.8.0.md
new file mode 100644
index 000000000..1cbbd0b70
--- /dev/null
+++ b/docs/release/1.8/release-note-1.8.0.md
@@ -0,0 +1,42 @@
+# Release Note 1.8.0
+
+## Feature Highlights
+
+- **ONE** Compiler
+ - Support new command line interface
+
+- **ONE** Runtime
+ - CPU backend supports 7 more operations
+ - CPU backend supports 9 more quant8 operations
+
+## ONE Compiler
+
+### New command line interface for user interface consistency
+
+- `one-import-bcq` : imports a BCQ (binary coding quantized) TensorFlow model
+- Commands now support the `--version` option to show the version number
+
+### Changes
+
+- Experimental support for TensorFlow 2.x has been updated to 2.3.0 (TensorFlow 1.3.2 remains our officially supported version)
+- Support more operators in luci-interpreter
+- Enhance one-quantizer
+
+## ONE Runtime
+
+### Rename headers
+
+- Rename `nnfw_dev.h` to `nnfw_experimental.h`
+
+### Optimization
+
+- Remove copies for model input/outputs whenever possible
+
+### Support CPU backend operations
+
+- BatchToSpaceND, L2Normalization, ReLU6, ResizeBilinear, SpaceToDepth, SplitV, StatelessRandomUniform
+
+### Support CPU backend quant8 operations
+
+- BatchToSpaceND, L2Normalization, Pad, PadV2, ResizeBilinear, Slice, Quantize, SpaceToDepth, Sum
+
diff --git a/docs/runtime/api-layered-arch.png b/docs/runtime/api-layered-arch.png
new file mode 100644
index 000000000..86eda7513
--- /dev/null
+++ b/docs/runtime/api-layered-arch.png
Binary files differ
diff --git a/docs/runtime/api.md b/docs/runtime/api.md
index 593279293..3ff9ff056 100644
--- a/docs/runtime/api.md
+++ b/docs/runtime/api.md
@@ -1 +1,35 @@
# API
+
+## Runtime Layered Architecture
+
+Here is a figure of the runtime's layered architecture.
+
+![Layered Architecture](api-layered-arch.png)
+
+There are three parts - Frontend, Core and Backend. Core works with the Frontend and Backend APIs. Frontend takes user inputs (neural network models) and Backend does the actual computation.
+
+## Frontend API
+
+Frontend API covers everything from creating/loading a model to running inferences with it.
+
+Runtime supports two (frontend) APIs - NN API and NNFW API.
+
+### NN API
+
+NN API stands for Android Neural Networks API. It is part of the Android Open Source Project, and we provide a binding between NN API and ONE Runtime.
+
+For usage, refer to [Howto : NN API](../howto/how-to-use-nnapi-binding.md).
+
+### NNFW API
+
+NNFW API is ONE's own API. It supports loading models from NN Packages. As it is our own API, it can expose most of the functionality that ONE Runtime offers; notably, it provides functions for execution with multiple backends.
+
+For usage, refer to [Howto : NNFW API](../howto/how-to-use-nnfw-api.md).
+
+## Backend API
+
+Backend API is defined by ONE Runtime.
+
+Backend API is about the actual computation of operations and memory management for operands. In order to allow different kinds of computation units or computation libraries, ONE Runtime defines the Backend API to support user-defined operation kernels and memory managers. It consists of many C++ headers, which are subject to change.
+
+For detailed descriptions, refer to [Backend API](../runtime/backend-api.md).
diff --git a/docs/runtime/core.md b/docs/runtime/core.md
index 42ba75f02..64a6c620c 100644
--- a/docs/runtime/core.md
+++ b/docs/runtime/core.md
@@ -68,7 +68,7 @@ Let's say we have some functions written in a certain programming language. Then
With generated tensors and kernels, the compiler creates executor objects. There are 3 types of executors are supported - Linear, Dataflow, and Parallel. Linear executor is the default executor and Dataflow Executor and Parallel Executor are experimental.
-For more about executors, please refer to [Executors](./executors.md) document.
+For more about executors, please refer to [Executors](executors.md) document.
### Module `exec`
@@ -83,4 +83,4 @@ For more about executors, please refer to [Executors](./executors.md) document.
Backends are plugins and they are loaded dynamically(via `dlopen`). So this module is a set of interface classes for backend implementation. `compiler` can compile with a variety of backends without knowing specific backend implementation.
-Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](./backend-api.md) document.
+Backend interface classes are mostly about memory management and kernel generation. For more, please refer to [Backend API](backend-api.md) document.
diff --git a/docs/runtime/heterogeneous-execution.md b/docs/runtime/heterogeneous-execution.md
index dc39dae59..e7a5e2734 100644
--- a/docs/runtime/heterogeneous-execution.md
+++ b/docs/runtime/heterogeneous-execution.md
@@ -12,11 +12,11 @@ Here is another case. Let's say we have a model that is not sequential so there
![Add-3Conv model](heterogeneous-execution-add-3-conv-model.png)
-Say we have 3 backends that are based on CPU, GPU and NPU(Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](./executors.md#parallel-executor-experimental). For this case we may get performance gain regardless of kernels' speed as those are run in parallel independently.
+Say we have 3 backends that are based on CPU, GPU and NPU (Neural Processing Unit) respectively. After executing Add, 3 Conv2D operations are ready to run. We may utilize those backends with [Parallel Executor (experimental)](executors.md#parallel-executor-experimental). In this case we may get a performance gain regardless of kernel speed, as the operations run independently in parallel.
## Graph Transformation
-Unfortunately it is not that simple to get performance gain. As each backend has its own memory management module, a copy must be done between backend boundaries. Plus, it may require layout changes so "Permute" operations are added from `PermutationInsertionPass`. This process is done from [Lowering](./core.md#1-lowering) phase of compilation.
+Unfortunately it is not that simple to get a performance gain. As each backend has its own memory management module, a copy must be done at backend boundaries. Plus, layout changes may be required, so "Permute" operations are added by `PermutationInsertionPass`. This process is done during the [Lowering](core.md#1-lowering) phase of compilation.
Here is an example of that. Let's say we have assigned different backends for Add and Conv2D. So a Permute operation is inserted between them.
diff --git a/infra/cmake/packages/ARMComputeSourceConfig.cmake b/infra/cmake/packages/ARMComputeSourceConfig.cmake
index 51a235a35..adec1f91b 100644
--- a/infra/cmake/packages/ARMComputeSourceConfig.cmake
+++ b/infra/cmake/packages/ARMComputeSourceConfig.cmake
@@ -8,7 +8,7 @@ function(_ARMComputeSource_import)
nnas_include(OptionTools)
envoption(EXTERNAL_DOWNLOAD_SERVER "https://github.com")
- set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v19.11.1.tar.gz)
+ set(ARMCOMPUTE_URL ${EXTERNAL_DOWNLOAD_SERVER}/ARM-software/ComputeLibrary/archive/v20.05.tar.gz)
ExternalSource_Download(ARMCOMPUTE ${ARMCOMPUTE_URL})
set(ARMComputeSource_DIR ${ARMCOMPUTE_SOURCE_DIR} PARENT_SCOPE)
diff --git a/infra/cmake/packages/FarmhashSourceConfig.cmake b/infra/cmake/packages/FarmhashSourceConfig.cmake
index 8a9a384df..a19c8b992 100644
--- a/infra/cmake/packages/FarmhashSourceConfig.cmake
+++ b/infra/cmake/packages/FarmhashSourceConfig.cmake
@@ -9,7 +9,7 @@ function(_FarmhashSource_import)
# NOTE TensorFlow 1.12 downloads farmhash from the following URL
# TensorFlow 1.13.1 downloads farmhash from the following URL
- # TensorFlow 2.3-rc0 downloads farmhash from the following URL
+ # TensorFlow 2.3.0 downloads farmhash from the following URL
envoption(FARMHASH_1_12_URL https://github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz)
ExternalSource_Download(FARMHASH ${FARMHASH_1_12_URL})
diff --git a/infra/cmake/packages/FlatBuffersConfig.cmake b/infra/cmake/packages/FlatBuffersConfig.cmake
index ab0b7708f..da084e7d3 100644
--- a/infra/cmake/packages/FlatBuffersConfig.cmake
+++ b/infra/cmake/packages/FlatBuffersConfig.cmake
@@ -25,7 +25,8 @@ function(_FlatBuffers_build)
BUILD_DIR ${CMAKE_BINARY_DIR}/externals/FLATBUFFERS/build
INSTALL_DIR ${EXT_OVERLAY_DIR}
BUILD_FLAGS ${ADDITIONAL_CXX_FLAGS}
- IDENTIFIER "1.10-fix1"
+ IDENTIFIER "1.10-fix2"
+ EXTRA_OPTS "-DFLATBUFFERS_BUILD_TESTS:BOOL=OFF"
PKG_NAME "FLATBUFFERS")
endfunction(_FlatBuffers_build)
diff --git a/infra/cmake/packages/HDF5Config.cmake b/infra/cmake/packages/HDF5Config.cmake
index e282e0bc9..19803f1ea 100644
--- a/infra/cmake/packages/HDF5Config.cmake
+++ b/infra/cmake/packages/HDF5Config.cmake
@@ -27,6 +27,7 @@ _HDF5_build()
find_path(HDF5_CONFIG_DIR "hdf5-config.cmake"
PATHS ${EXT_OVERLAY_DIR}
PATH_SUFFIXES
+ cmake
share/cmake
share/cmake/hdf5
cmake/hdf5
diff --git a/infra/cmake/packages/Pybind11Config.cmake b/infra/cmake/packages/Pybind11Config.cmake
new file mode 100644
index 000000000..b6d500496
--- /dev/null
+++ b/infra/cmake/packages/Pybind11Config.cmake
@@ -0,0 +1,22 @@
+function(_Pybind11_import)
+ nnas_find_package(Pybind11Source QUIET)
+
+ if(NOT Pybind11Source_FOUND)
+ set(Pybind11_FOUND FALSE PARENT_SCOPE)
+ return()
+ endif(NOT Pybind11Source_FOUND)
+
+ nnas_include(ExternalBuildTools)
+ ExternalBuild_CMake(CMAKE_DIR ${Pybind11Source_DIR}
+ BUILD_DIR ${CMAKE_BINARY_DIR}/externals/PYBIND11/build
+ INSTALL_DIR ${EXT_OVERLAY_DIR}
+ IDENTIFIER "2.5.0"
+ PKG_NAME "PYBIND11"
+ EXTRA_OPTS "-DPYBIND11_TEST:BOOL=OFF")
+
+ find_path(Pybind11_INCLUDE_DIRS NAMES pybind11.h PATHS ${EXT_OVERLAY_DIR} PATH_SUFFIXES include/pybind11)
+
+ set(Pybind11_FOUND TRUE PARENT_SCOPE)
+endfunction(_Pybind11_import)
+
+_Pybind11_import()
diff --git a/infra/cmake/packages/Pybind11SourceConfig.cmake b/infra/cmake/packages/Pybind11SourceConfig.cmake
new file mode 100644
index 000000000..76f51e4d3
--- /dev/null
+++ b/infra/cmake/packages/Pybind11SourceConfig.cmake
@@ -0,0 +1,18 @@
+function(_Pybind11Source_import)
+ if(NOT DOWNLOAD_PYBIND11)
+ set(Pybind11Source_FOUND FALSE PARENT_SCOPE)
+ return()
+ endif(NOT DOWNLOAD_PYBIND11)
+
+ nnas_include(ExternalSourceTools)
+ nnas_include(OptionTools)
+
+ envoption(PYBIND11_URL https://github.com/pybind/pybind11/archive/v2.5.0.tar.gz)
+
+ ExternalSource_Download(PYBIND11 ${PYBIND11_URL})
+
+ set(Pybind11Source_DIR ${PYBIND11_SOURCE_DIR} PARENT_SCOPE)
+ set(Pybind11Source_FOUND TRUE PARENT_SCOPE)
+endfunction(_Pybind11Source_import)
+
+_Pybind11Source_import()
diff --git a/infra/cmake/packages/TensorFlowEigenSource-2.3.0/TensorFlowEigenSourceConfig.cmake b/infra/cmake/packages/TensorFlowEigenSource-2.3.0/TensorFlowEigenSourceConfig.cmake
new file mode 100644
index 000000000..d50d04508
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowEigenSource-2.3.0/TensorFlowEigenSourceConfig.cmake
@@ -0,0 +1,21 @@
+function(_TensorFlowEigenSource_import)
+ if(NOT DOWNLOAD_EIGEN)
+ set(TensorFlowEigenSource_FOUND FALSE PARENT_SCOPE)
+ return()
+ endif(NOT DOWNLOAD_EIGEN)
+
+ nnas_include(ExternalSourceTools)
+ nnas_include(OptionTools)
+
+ # Exact version used by TensorFlow v2.3.0.
+ # See tensorflow/tensorflow/workspace.bzl.
+ envoption(EXTERNAL_DOWNLOAD_SERVER "https://gitlab.com")
+ envoption(TENSORFLOW_2_3_0_EIGEN_URL ${EXTERNAL_DOWNLOAD_SERVER}/libeigen/eigen/-/archive/386d809bde475c65b7940f290efe80e6a05878c4/eigen-386d809bde475c65b7940f290efe80e6a05878c4.tar.gz)
+
+ ExternalSource_Download(EIGEN DIRNAME TENSORFLOW-2.3.0-EIGEN ${TENSORFLOW_2_3_0_EIGEN_URL})
+
+ set(TensorFlowEigenSource_DIR ${EIGEN_SOURCE_DIR} PARENT_SCOPE)
+ set(TensorFlowEigenSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_TensorFlowEigenSource_import)
+
+_TensorFlowEigenSource_import()
diff --git a/infra/cmake/packages/TensorFlowEigenSource-2.3.0/TensorFlowEigenSourceConfigVersion.cmake b/infra/cmake/packages/TensorFlowEigenSource-2.3.0/TensorFlowEigenSourceConfigVersion.cmake
new file mode 100644
index 000000000..04df5eb6d
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowEigenSource-2.3.0/TensorFlowEigenSourceConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "2.3.0")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+ set(PACKAGE_VERSION_EXACT TRUE)
+ set(PACKAGE_VERSION_COMPATIBLE TRUE)
+ set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/cmake/packages/TensorFlowSource-2.3.0/TensorFlowSourceConfig.cmake b/infra/cmake/packages/TensorFlowSource-2.3.0/TensorFlowSourceConfig.cmake
new file mode 100644
index 000000000..5c3a0f8cc
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowSource-2.3.0/TensorFlowSourceConfig.cmake
@@ -0,0 +1,18 @@
+function(_TensorFlowSource_import)
+ if(NOT DOWNLOAD_TENSORFLOW)
+ set(TensorFlowSource_FOUND FALSE PARENT_SCOPE)
+ return()
+ endif(NOT DOWNLOAD_TENSORFLOW)
+
+ nnas_include(ExternalSourceTools)
+ nnas_include(OptionTools)
+
+ envoption(TENSORFLOW_2_3_0_URL https://github.com/tensorflow/tensorflow/archive/v2.3.0.tar.gz)
+
+ ExternalSource_Download(TENSORFLOW DIRNAME TENSORFLOW-2.3.0 ${TENSORFLOW_2_3_0_URL})
+
+ set(TensorFlowSource_DIR ${TENSORFLOW_SOURCE_DIR} PARENT_SCOPE)
+ set(TensorFlowSource_FOUND TRUE PARENT_SCOPE)
+endfunction(_TensorFlowSource_import)
+
+_TensorFlowSource_import()
diff --git a/infra/cmake/packages/TensorFlowSource-2.3.0/TensorFlowSourceConfigVersion.cmake b/infra/cmake/packages/TensorFlowSource-2.3.0/TensorFlowSourceConfigVersion.cmake
new file mode 100644
index 000000000..04df5eb6d
--- /dev/null
+++ b/infra/cmake/packages/TensorFlowSource-2.3.0/TensorFlowSourceConfigVersion.cmake
@@ -0,0 +1,10 @@
+set(PACKAGE_VERSION "2.3.0")
+set(PACKAGE_VERSION_EXACT FALSE)
+set(PACKAGE_VERSION_COMPATIBLE FALSE)
+set(PACKAGE_VERSION_UNSUITABLE TRUE)
+
+if(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
+ set(PACKAGE_VERSION_EXACT TRUE)
+ set(PACKAGE_VERSION_COMPATIBLE TRUE)
+ set(PACKAGE_VERSION_UNSUITABLE FALSE)
+endif(PACKAGE_FIND_VERSION VERSION_EQUAL PACKAGE_VERSION)
diff --git a/infra/docker/Dockerfile b/infra/docker/Dockerfile
index e675b53ad..052cc4fb6 100644
--- a/infra/docker/Dockerfile
+++ b/infra/docker/Dockerfile
@@ -1,8 +1,6 @@
FROM ubuntu:16.04
ARG UBUNTU_MIRROR
-ENV http_proxy $http_proxy
-ENV https_proxy $https_proxy
RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi
RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi
@@ -22,6 +20,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler
# Additonal tools
RUN apt-get update && apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint
+RUN pip3 install --upgrade pip
RUN pip3 install yapf==0.22.0 numpy
# Install google test (source)
diff --git a/infra/docker/Dockerfile.1804 b/infra/docker/Dockerfile.1804
index fc6fc9a1a..cc31bba1f 100644
--- a/infra/docker/Dockerfile.1804
+++ b/infra/docker/Dockerfile.1804
@@ -1,12 +1,6 @@
FROM ubuntu:18.04
ARG UBUNTU_MIRROR
-ENV http_proxy $http_proxy
-ENV https_proxy $https_proxy
-
-RUN if [ -n "$http_proxy" ] ; then echo "Acquire::http::proxy \"${http_proxy}\";" >> /etc/apt/apt.conf ; fi
-RUN if [ -n "$https_proxy" ] ; then echo "Acquire::https::proxy \"${https_proxy}\";" >> /etc/apt/apt.conf ; fi
-RUN if [ -n "$UBUNTU_MIRROR" ] ; then sed "s/archive.ubuntu.com/${UBUNTU_MIRROR}/g" -i /etc/apt/sources.list ; fi
# Install 'add-apt-repository'
RUN apt-get update && apt-get -qqy install software-properties-common
@@ -22,6 +16,7 @@ RUN apt-get update && apt-get -qqy install libprotobuf-dev protobuf-compiler
# Additonal tools
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get -qqy install doxygen graphviz wget unzip clang-format-3.9 python3 python3-pip python3-venv hdf5-tools pylint
+RUN pip3 install --upgrade pip
RUN pip3 install yapf==0.22.0 numpy
# Install google test (source)
diff --git a/infra/nncc/CMakeLists.txt b/infra/nncc/CMakeLists.txt
index 3ac6680de..0be6885e2 100644
--- a/infra/nncc/CMakeLists.txt
+++ b/infra/nncc/CMakeLists.txt
@@ -98,6 +98,7 @@ option(DOWNLOAD_CAFFE "Download Caffe source" ON)
option(DOWNLOAD_PYTORCH "Download Pytorch source" ON)
option(DOWNLOAD_ONNX "Download ONNX source" ON)
option(DOWNLOAD_ABSEIL "Download Abseil-cpp source" ON)
+option(DOWNLOAD_PYBIND11 "Download Pybind11 source" ON)
option(DOWNLOAD_GTEST "Download Google Test source" ON)
option(BUILD_GTEST "Build Google Test from the downloaded source" ON)
diff --git a/infra/nncc/command/utcount b/infra/nncc/command/utcount
index d4610e3f0..d06c5c9de 100644
--- a/infra/nncc/command/utcount
+++ b/infra/nncc/command/utcount
@@ -13,7 +13,7 @@ BUILD_ITEMS="angkor cwrap pepper-str pepper-strcast pp stdex \
oops pepper-assert \
hermes hermes-std \
loco locop locomotiv logo-core logo \
-foder souschef arser \
+foder souschef arser vconone \
safemain mio-circle mio-tflite \
tflite2circle \
luci \
diff --git a/infra/nnfw/cmake/CfgOptionFlags.cmake b/infra/nnfw/cmake/CfgOptionFlags.cmake
index d1395f871..3c6b7d960 100644
--- a/infra/nnfw/cmake/CfgOptionFlags.cmake
+++ b/infra/nnfw/cmake/CfgOptionFlags.cmake
@@ -23,7 +23,7 @@ CMAKE_DEPENDENT_OPTION(BUILD_RUNTIME_NNAPI_TEST "Build Runtime NN API Generated
OFF)
option(BUILD_RUNTIME_NNFW_API_TEST "Build Runtime NNFW API Tests" ON)
option(BUILD_TFLITE_RUN "Build tflite-run" ON)
-option(BUILD_TFLITE_RUN_2_2_0 "Build tflite-run 2.2.0" OFF)
+option(BUILD_TFLITE_VANILLA_RUN "Build tflite-vanilla-run" OFF)
option(BUILD_TFLITE_BENCHMARK_MODEL "Build tflite benchmark model" OFF)
option(BUILD_NNAPI_TEST "Build nnapi_test" ON)
option(BUILD_NNPACKAGE_RUN "Build nnpackge_run" ON)
@@ -70,7 +70,7 @@ option(DOWNLOAD_BOOST "Download boost source" OFF)
option(DOWNLOAD_RUY "Download ruy source" ON)
option(BUILD_BOOST "Build boost source" OFF)
option(BUILD_TENSORFLOW_LITE "Build TensorFlow Lite from the downloaded source" ON)
-option(BUILD_TENSORFLOW_LITE_2_2_0 "Build TensorFlow Lite from the downloaded source" OFF)
+option(BUILD_TENSORFLOW_LITE_2_3_0 "Build TensorFlow Lite 2.3.0 from the downloaded source" OFF)
option(BUILD_GTEST "Download and build Google Test" ON)
option(BUILD_ARMCOMPUTE "Build ARM Compute from the downloaded source" ON)
option(BUILD_RUY "Build ruy library from the downloaded source" ON)
diff --git a/infra/nnfw/cmake/packages/EigenConfig.cmake b/infra/nnfw/cmake/packages/EigenConfig.cmake
index f37d653cf..e71830a16 100644
--- a/infra/nnfw/cmake/packages/EigenConfig.cmake
+++ b/infra/nnfw/cmake/packages/EigenConfig.cmake
@@ -1,5 +1,5 @@
function(_Eigen_import)
- nnas_find_package(TensorFlowEigenSource-2.3.0-rc0 QUIET)
+ nnas_find_package(TensorFlowEigenSource EXACT 2.3.0 QUIET)
if(NOT TensorFlowEigenSource_FOUND)
set(Eigen_FOUND FALSE PARENT_SCOPE)
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0Config.cmake b/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0Config.cmake
deleted file mode 100644
index e698235f4..000000000
--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0Config.cmake
+++ /dev/null
@@ -1,92 +0,0 @@
-if(BUILD_TENSORFLOW_LITE_2_2_0)
- macro(return_unless VAR)
- if(NOT ${VAR})
- message("${VAR} NOT TRUE")
- set(TensorFlowLite_2_2_0_FOUND PARENT_SCOPE)
- return()
- endif(NOT ${VAR})
- endmacro(return_unless)
-
- nnas_include(ExternalSourceTools)
- nnas_include(OptionTools)
-
- # Below urls come from https://github.com/tensorflow/tensorflow/blob/v2.2.0/tensorflow/lite/tools/make/Makefile
-
- set(absl_url "https://github.com/abseil/abseil-cpp/archive/43ef2148c0936ebf7cb4be6b19927a9d9d145b8f.tar.gz")
- ExternalSource_Download("tflite220_Absl" ${absl_url})
- set(TFLite220AbslSource_DIR "${tflite220_Absl_SOURCE_DIR}")
- if (NOT TFLite220AbslSource_DIR STREQUAL "")
- set(TFLite220AbslSource_FOUND TRUE)
- endif()
- return_unless(TFLite220AbslSource_FOUND)
-
- set(eigen_url "https://gitlab.com/libeigen/eigen/-/archive/52a2fbbb008a47c5e3fb8ac1c65c2feecb0c511c/eigen-52a2fbbb008a47c5e3fb8ac1c65c2feecb0c511c.tar.gz")
- ExternalSource_Download("tflite220_Eigen" ${eigen_url})
- set(TFLite220EigenSource_DIR "${tflite220_Eigen_SOURCE_DIR}")
- if (NOT TFLite220EigenSource_DIR STREQUAL "")
- set(TFLite220EigenSource_FOUND TRUE)
- endif()
- return_unless(TFLite220EigenSource_FOUND)
-
- set(farmhash_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz")
- ExternalSource_Download("tflite220_Farmhash" ${farmhash_url})
- set(TFLite220FarmhashSource_DIR "${tflite220_Farmhash_SOURCE_DIR}")
- if (NOT TFLite220FarmhashSource_DIR STREQUAL "")
- set(TFLite220FarmhashSource_FOUND TRUE)
- endif()
- return_unless(TFLite220FarmhashSource_FOUND)
-
- set(fft2d_url "https://storage.googleapis.com/mirror.tensorflow.org/www.kurims.kyoto-u.ac.jp/~ooura/fft2d.tgz")
- ExternalSource_Download("tflite220_FFT2D" ${fft2d_url})
- set(TFLite220FFT2DSource_DIR "${tflite220_FFT2D_SOURCE_DIR}")
- if (NOT TFLite220FFT2DSource_DIR STREQUAL "")
- set(TFLite220FFT2DSource_FOUND TRUE)
- endif()
- return_unless(TFLite220FFT2DSource_FOUND)
-
- set(flatbuffers_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.11.0.tar.gz")
- ExternalSource_Download("tflite220_FlatBuffers" ${flatbuffers_url})
- set(TFLite220FlatBuffersSource_DIR "${tflite220_FlatBuffers_SOURCE_DIR}")
- if (NOT TFLite220FlatBuffersSource_DIR STREQUAL "")
- set(TFLite220FlatBuffersSource_FOUND TRUE)
- endif()
- return_unless(TFLite220FlatBuffersSource_FOUND)
-
- set(fp16_url "https://github.com/Maratyszcza/FP16/archive/febbb1c163726b5db24bed55cc9dc42529068997.zip")
- ExternalSource_Download("tflite220_FP16" ${fp16_url})
- set(TFLite220FP16Source_DIR "${tflite220_FP16_SOURCE_DIR}")
- if (NOT TFLite220FP16Source_DIR STREQUAL "")
- set(TFLite220FP16Source_FOUND TRUE)
- endif()
- return_unless(TFLite220FP16Source_FOUND)
-
- set(gemmlowp_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/12fed0cd7cfcd9e169bf1925bc3a7a58725fdcc3.zip")
- ExternalSource_Download("tflite220_GEMMLowp" ${gemmlowp_url})
- set(TFLite220GEMMLowpSource_DIR "${tflite220_GEMMLowp_SOURCE_DIR}")
- if (NOT TFLite220GEMMLowpSource_DIR STREQUAL "")
- set(TFLite220GEMMLowpSource_FOUND TRUE)
- endif()
- return_unless(TFLite220GEMMLowpSource_FOUND)
-
- set(neon2sse_url "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/master.zip")
- ExternalSource_Download("tflite220_NEON2SSE" ${neon2sse_url})
- set(TFLite220NEON2SSESource_DIR "${tflite220_NEON2SSE_SOURCE_DIR}")
- if (NOT TFLite220NEON2SSESource_DIR STREQUAL "")
- set(TFLite220NEON2SSESource_FOUND TRUE)
- endif()
- return_unless(TFLite220NEON2SSESource_FOUND)
-
- set(tensorflow_url "https://github.com/tensorflow/tensorflow/archive/v2.2.0.tar.gz")
- ExternalSource_Download("tflite220_TensorFlow" ${tensorflow_url})
- set(TFLite220TensorFlowSource_DIR "${tflite220_TensorFlow_SOURCE_DIR}")
- if (NOT TFLite220TensorFlowSource_DIR STREQUAL "")
- set(TFLite220TensorFlowSource_FOUND TRUE)
- endif()
- return_unless(TFLite220TensorFlowSource_FOUND)
-
- nnas_include(ExternalProjectTools)
- add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/TensorFlowLite-2.2.0" tflite-2.2.0)
-
- set(TensorFlowLite_2_2_0_FOUND TRUE)
- return()
-endif()
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt
index 8e7f78eb1..20547b92d 100644
--- a/infra/nnfw/cmake/packages/TensorFlowLite-2.2.0/CMakeLists.txt
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0/CMakeLists.txt
@@ -1,8 +1,8 @@
-# Reference: https://github.com/tensorflow/tensorflow/blob/v2.2.0/tensorflow/lite/tools/make/Makefile
+# Reference: https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/lite/tools/make/Makefile
#
-# Tensorflow Lite library 2.2.0
+# Tensorflow Lite library 2.3.0
#
-set(TENSORFLOW_LITE_BASE ${TFLite220TensorFlowSource_DIR}/tensorflow/lite)
+set(TENSORFLOW_LITE_BASE ${TFLiteVanillaTensorFlowSource_DIR}/tensorflow/lite)
file(GLOB TFLITE_CORE_SRCS "${TENSORFLOW_LITE_BASE}/*.c"
"${TENSORFLOW_LITE_BASE}/*.cc"
@@ -18,8 +18,7 @@ file(GLOB TFLITE_API_SRCS "${TENSORFLOW_LITE_BASE}/core/api/*.c"
list(APPEND TFLITE_PROFILING_SRCS "${TENSORFLOW_LITE_BASE}/profiling/memory_info.cc")
list(APPEND TFLITE_PROFILING_SRCS "${TENSORFLOW_LITE_BASE}/profiling/time.cc")
-file(GLOB TFLITE_EXPERIMENTAL_SRCS "${TENSORFLOW_LITE_BASE}/experimental/resource/*.cc"
- "${TENSORFLOW_LITE_BASE}/experimental/ruy/*.cc")
+file(GLOB TFLITE_EXPERIMENTAL_SRCS "${TENSORFLOW_LITE_BASE}/experimental/resource/*.cc")
file(GLOB TFLITE_SPARSITY_SRCS "${TENSORFLOW_LITE_BASE}/tools/optimize/sparsity/*.cc")
@@ -32,24 +31,37 @@ list(APPEND TFLITE_SRCS ${TFLITE_EXPERIMENTAL_SRCS})
list(APPEND TFLITE_SRCS ${TFLITE_SPARSITY_SRCS})
# externals
-list(APPEND TFLITE_SRCS "${TFLite220FarmhashSource_DIR}/src/farmhash.cc")
-list(APPEND TFLITE_SRCS "${TFLite220FFT2DSource_DIR}/fftsg.c")
-list(APPEND TFLITE_SRCS "${TFLite220FFT2DSource_DIR}/fftsg2d.c")
-list(APPEND TFLITE_SRCS "${TFLite220FlatBuffersSource_DIR}/src/util.cpp")
+list(APPEND TFLITE_SRCS "${TFLiteVanillaFarmhashSource_DIR}/src/farmhash.cc")
+list(APPEND TFLITE_SRCS "${TFLiteVanillaFFT2DSource_DIR}/fftsg.c")
+list(APPEND TFLITE_SRCS "${TFLiteVanillaFFT2DSource_DIR}/fftsg2d.c")
+list(APPEND TFLITE_SRCS "${TFLiteVanillaFlatBuffersSource_DIR}/src/util.cpp")
# externals - absl
-file(GLOB_RECURSE ABSL_SRCS "${TFLite220AbslSource_DIR}/absl/*.cc")
-file(GLOB_RECURSE ABSL_EXCLS "${TFLite220AbslSource_DIR}/absl/*test*.cc"
- "${TFLite220AbslSource_DIR}/absl/*benchmark*.cc"
- "${TFLite220AbslSource_DIR}/absl/synchronization/*.cc"
- "${TFLite220AbslSource_DIR}/absl/debugging/*.cc"
- "${TFLite220AbslSource_DIR}/absl/hash/*.cc"
- "${TFLite220AbslSource_DIR}/absl/flags/*.cc")
+file(GLOB_RECURSE ABSL_SRCS "${TFLiteVanillaAbslSource_DIR}/absl/*.cc")
+file(GLOB_RECURSE ABSL_EXCLS "${TFLiteVanillaAbslSource_DIR}/absl/*test*.cc"
+ "${TFLiteVanillaAbslSource_DIR}/absl/*benchmark*.cc"
+ "${TFLiteVanillaAbslSource_DIR}/absl/synchronization/*.cc"
+ "${TFLiteVanillaAbslSource_DIR}/absl/debugging/*.cc"
+ "${TFLiteVanillaAbslSource_DIR}/absl/hash/*.cc"
+ "${TFLiteVanillaAbslSource_DIR}/absl/flags/*.cc"
+ "${TFLiteVanillaAbslSource_DIR}/absl/random/*.cc")
list(REMOVE_ITEM ABSL_SRCS ${ABSL_EXCLS})
list(APPEND TFLITE_SRCS ${ABSL_SRCS})
+# externals - ruy
+file(GLOB RUY_SRCS "${TFLiteVanillaRuySource_DIR}/ruy/*.cc")
+file(GLOB_RECURSE RUY_EXCLS "${TFLiteVanillaRuySource_DIR}/ruy/*test*.cc"
+ "${TFLiteVanillaRuySource_DIR}/ruy/*benchmark*.cc"
+ "${TFLiteVanillaRuySource_DIR}/ruy/*example*.cc")
+list(REMOVE_ITEM RUY_SRCS ${RUY_EXCLS})
+# Temporary fix for ruy compilation error.
+# TODO(b/158800055): Remove this hack once the ruy version is correctly bumped.
+list(REMOVE_ITEM RUY_SRCS "${TFLiteVanillaRuySource_DIR}/ruy/prepare_packed_matrices.cc")
+list(APPEND TFLITE_SRCS ${RUY_SRCS})
+
+
# Build with mmap? true
-# caution: v2.2.0's Makefile has wrong code on this part. This is fixed on master branch.
+# caution: v2.3.0's Makefile has incorrect code for this part. This is fixed on the master branch.
set(BUILD_WITH_MMAP TRUE)
if(${BUILD_WITH_MMAP})
list(REMOVE_ITEM TFLITE_SRCS "${TENSORFLOW_LITE_BASE}/mmap_allocation_disabled.cc")
@@ -86,25 +98,26 @@ file(GLOB_RECURSE TFLITE_EXCLS "${TENSORFLOW_LITE_BASE}/*test*.cc"
list(REMOVE_ITEM TFLITE_SRCS ${TFLITE_EXCLS})
# include headers
-list(APPEND TFLITE_INCLUDES "${TFLite220TensorFlowSource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLite220EigenSource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLite220AbslSource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLite220GEMMLowpSource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLite220NEON2SSESource_DIR}")
-list(APPEND TFLITE_INCLUDES "${TFLite220FarmhashSource_DIR}/src")
-list(APPEND TFLITE_INCLUDES "${TFLite220FlatBuffersSource_DIR}/include")
-list(APPEND TFLITE_INCLUDES "${TFLite220FP16Source_DIR}/include")
-
-add_library(tensorflow-lite-2.2.0 STATIC ${TFLITE_SRCS})
-target_include_directories(tensorflow-lite-2.2.0 SYSTEM PUBLIC ${TFLITE_INCLUDES})
-target_compile_definitions(tensorflow-lite-2.2.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV")
-set_property(TARGET tensorflow-lite-2.2.0 PROPERTY POSITION_INDEPENDENT_CODE ON)
-target_link_libraries(tensorflow-lite-2.2.0 eigen ${LIB_PTHREAD} dl)
-if(${BUILD_WITH_NNAPI})
- target_link_libraries(tensorflow-lite-2.2.0 rt)
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaTensorFlowSource_DIR}")
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaEigenSource_DIR}")
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaAbslSource_DIR}")
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaGEMMLowpSource_DIR}")
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaNEON2SSESource_DIR}")
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaFarmhashSource_DIR}/src")
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaFlatBuffersSource_DIR}/include")
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaFP16Source_DIR}/include")
+list(APPEND TFLITE_INCLUDES "${TFLiteVanillaRuySource_DIR}")
+
+add_library(tensorflow-lite-2.3.0 STATIC ${TFLITE_SRCS})
+target_include_directories(tensorflow-lite-2.3.0 SYSTEM PUBLIC ${TFLITE_INCLUDES})
+target_compile_definitions(tensorflow-lite-2.3.0 PUBLIC "GEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK -DTFLITE_WITH_RUY -DTFLITE_WITH_RUY_GEMV")
+set_property(TARGET tensorflow-lite-2.3.0 PROPERTY POSITION_INDEPENDENT_CODE ON)
+target_link_libraries(tensorflow-lite-2.3.0 eigen ${LIB_PTHREAD} dl)
+if(NOT ANDROID AND ${BUILD_WITH_NNAPI})
+ target_link_libraries(tensorflow-lite-2.3.0 rt)
endif()
if(ANDROID)
- target_link_libraries(tensorflow-lite-2.2.0 log)
- target_include_directories(tensorflow-lite-2.2.0 PUBLIC "${NDK_DIR}/..")
+ target_link_libraries(tensorflow-lite-2.3.0 log)
+ target_include_directories(tensorflow-lite-2.3.0 PUBLIC "${NDK_DIR}/..")
endif()
diff --git a/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake
new file mode 100644
index 000000000..d00ca96a6
--- /dev/null
+++ b/infra/nnfw/cmake/packages/TensorFlowLite-2.3.0Config.cmake
@@ -0,0 +1,100 @@
+if(BUILD_TENSORFLOW_LITE_2_3_0)
+ macro(return_unless VAR)
+ if(NOT ${VAR})
+ message("${VAR} NOT TRUE")
+ set(TensorFlowLite_2_3_0_FOUND PARENT_SCOPE)
+ return()
+ endif(NOT ${VAR})
+ endmacro(return_unless)
+
+ nnas_include(ExternalSourceTools)
+ nnas_include(OptionTools)
+
+ # Below urls come from https://github.com/tensorflow/tensorflow/blob/v2.3.0/tensorflow/lite/tools/make/Makefile
+
+ set(absl_url "https://github.com/abseil/abseil-cpp/archive/df3ea785d8c30a9503321a3d35ee7d35808f190d.tar.gz")
+ ExternalSource_Download("TFLiteVanilla_Absl" ${absl_url})
+ set(TFLiteVanillaAbslSource_DIR "${TFLiteVanilla_Absl_SOURCE_DIR}")
+ if (NOT TFLiteVanillaAbslSource_DIR STREQUAL "")
+ set(TFLiteVanillaAbslSource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaAbslSource_FOUND)
+
+ set(eigen_url "https://gitlab.com/libeigen/eigen/-/archive/386d809bde475c65b7940f290efe80e6a05878c4/eigen-386d809bde475c65b7940f290efe80e6a05878c4.tar.gz")
+ ExternalSource_Download("TFLiteVanilla_Eigen" ${eigen_url})
+ set(TFLiteVanillaEigenSource_DIR "${TFLiteVanilla_Eigen_SOURCE_DIR}")
+ if (NOT TFLiteVanillaEigenSource_DIR STREQUAL "")
+ set(TFLiteVanillaEigenSource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaEigenSource_FOUND)
+
+ set(farmhash_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/farmhash/archive/816a4ae622e964763ca0862d9dbd19324a1eaf45.tar.gz")
+ ExternalSource_Download("TFLiteVanilla_Farmhash" ${farmhash_url})
+ set(TFLiteVanillaFarmhashSource_DIR "${TFLiteVanilla_Farmhash_SOURCE_DIR}")
+ if (NOT TFLiteVanillaFarmhashSource_DIR STREQUAL "")
+ set(TFLiteVanillaFarmhashSource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaFarmhashSource_FOUND)
+
+ set(fft2d_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/petewarden/OouraFFT/archive/v1.0.tar.gz")
+ ExternalSource_Download("TFLiteVanilla_FFT2D" ${fft2d_url})
+ set(TFLiteVanillaFFT2DSource_DIR "${TFLiteVanilla_FFT2D_SOURCE_DIR}")
+ if (NOT TFLiteVanillaFFT2DSource_DIR STREQUAL "")
+ set(TFLiteVanillaFFT2DSource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaFFT2DSource_FOUND)
+
+ set(flatbuffers_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/flatbuffers/archive/v1.12.0.tar.gz")
+ ExternalSource_Download("TFLiteVanilla_FlatBuffers" ${flatbuffers_url})
+ set(TFLiteVanillaFlatBuffersSource_DIR "${TFLiteVanilla_FlatBuffers_SOURCE_DIR}")
+ if (NOT TFLiteVanillaFlatBuffersSource_DIR STREQUAL "")
+ set(TFLiteVanillaFlatBuffersSource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaFlatBuffersSource_FOUND)
+
+ set(fp16_url "https://github.com/Maratyszcza/FP16/archive/4dfe081cf6bcd15db339cf2680b9281b8451eeb3.zip")
+ ExternalSource_Download("TFLiteVanilla_FP16" ${fp16_url})
+ set(TFLiteVanillaFP16Source_DIR "${TFLiteVanilla_FP16_SOURCE_DIR}")
+ if (NOT TFLiteVanillaFP16Source_DIR STREQUAL "")
+ set(TFLiteVanillaFP16Source_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaFP16Source_FOUND)
+
+ set(gemmlowp_url "https://storage.googleapis.com/mirror.tensorflow.org/github.com/google/gemmlowp/archive/fda83bdc38b118cc6b56753bd540caa49e570745.zip")
+ ExternalSource_Download("TFLiteVanilla_GEMMLowp" ${gemmlowp_url})
+ set(TFLiteVanillaGEMMLowpSource_DIR "${TFLiteVanilla_GEMMLowp_SOURCE_DIR}")
+ if (NOT TFLiteVanillaGEMMLowpSource_DIR STREQUAL "")
+ set(TFLiteVanillaGEMMLowpSource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaGEMMLowpSource_FOUND)
+
+ set(neon2sse_url "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz")
+ ExternalSource_Download("TFLiteVanilla_NEON2SSE" ${neon2sse_url})
+ set(TFLiteVanillaNEON2SSESource_DIR "${TFLiteVanilla_NEON2SSE_SOURCE_DIR}")
+ if (NOT TFLiteVanillaNEON2SSESource_DIR STREQUAL "")
+ set(TFLiteVanillaNEON2SSESource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaNEON2SSESource_FOUND)
+
+ set(tensorflow_url "https://github.com/tensorflow/tensorflow/archive/v2.3.0.tar.gz")
+ ExternalSource_Download("TFLiteVanilla_TensorFlow" ${tensorflow_url})
+ set(TFLiteVanillaTensorFlowSource_DIR "${TFLiteVanilla_TensorFlow_SOURCE_DIR}")
+ if (NOT TFLiteVanillaTensorFlowSource_DIR STREQUAL "")
+ set(TFLiteVanillaTensorFlowSource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaTensorFlowSource_FOUND)
+
+ set(ruy_url "https://github.com/google/ruy/archive/34ea9f4993955fa1ff4eb58e504421806b7f2e8f.zip")
+ ExternalSource_Download("TFLiteVanilla_Ruy" ${ruy_url})
+ set(TFLiteVanillaRuySource_DIR "${TFLiteVanilla_Ruy_SOURCE_DIR}")
+ if (NOT TFLiteVanillaRuySource_DIR STREQUAL "")
+ set(TFLiteVanillaRuySource_FOUND TRUE)
+ endif()
+ return_unless(TFLiteVanillaRuySource_FOUND)
+
+ nnas_include(ExternalProjectTools)
+ add_extdirectory("${CMAKE_CURRENT_LIST_DIR}/TensorFlowLite-2.3.0" tflite-2.3.0)
+
+ set(TensorFlowLite_2_3_0_FOUND TRUE)
+ return()
+endif()
diff --git a/infra/nnfw/config/gbs.conf b/infra/nnfw/config/gbs.conf
index 515cadaba..bad9eb204 100644
--- a/infra/nnfw/config/gbs.conf
+++ b/infra/nnfw/config/gbs.conf
@@ -5,7 +5,7 @@ profile = profile.tizen
[profile.tizen]
user=obs_viewer
obs = obs.tizen
-repos = repo.tizen_base,repo.tizen_mobile
+repos = repo.tizen_one,repo.tizen_base,repo.tizen_mobile
buildroot = /home/GBS-ROOT/
[obs.tizen]
@@ -15,6 +15,8 @@ url = http://api.tizen.org
url = http://download.tizen.org/snapshots/tizen/unified/latest/repos/standard/packages/
[repo.tizen_base]
-url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/
+url = http://download.tizen.org/snapshots/tizen/base/latest/repos/standard/packages/
+[repo.tizen_one]
+url = http://nnfw.mooo.com/archive/tizen/
diff --git a/infra/packaging/build b/infra/packaging/build
index 036c2d575..e941a724b 100644
--- a/infra/packaging/build
+++ b/infra/packaging/build
@@ -85,7 +85,8 @@ function join_by
# Invoke "preset_configure" function that the preset provides
preset_configure
-NPROC=$(cat /proc/cpuinfo | grep -c processor)
+NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)}
+echo "[BUILD] \"make\" with -j${NPROC} option. You can specify the number of jobs by defining NPROC"
cmake --build . -- -j$((NPROC/2)) all
cmake --build . -- install
# Install NN Package tools
diff --git a/infra/packaging/preset/20200630 b/infra/packaging/preset/20200630
index e1599357a..5d1635809 100644
--- a/infra/packaging/preset/20200630
+++ b/infra/packaging/preset/20200630
@@ -14,6 +14,7 @@ function preset_configure()
REQUIRED_UNITS+=("souschef")
REQUIRED_UNITS+=("safemain")
REQUIRED_UNITS+=("arser")
+ REQUIRED_UNITS+=("vconone")
# Hermes Logging Framework
REQUIRED_UNITS+=("hermes" "hermes-std")
# loco IR and related utilities
@@ -27,12 +28,16 @@ function preset_configure()
REQUIRED_UNITS+=("tf2tfliteV2" "luci-interpreter" "circle-verify")
REQUIRED_UNITS+=("record-minmax" "circle-quantizer")
REQUIRED_UNITS+=("one-cmds")
+ REQUIRED_UNITS+=("bcq-tools")
+
+ NPROC=${NPROC:-$(cat /proc/cpuinfo | grep -c processor)}
# TODO Use "nncc configure" and "nncc build"
cmake \
-DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \
-DCMAKE_BUILD_TYPE=release \
-DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \
+ -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \
${EXTRA_OPTIONS[@]} \
"${NNAS_PROJECT_PATH}/infra/nncc"
}
@@ -44,14 +49,4 @@ function preset_install()
# Install tf2nnpkg
install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.${PRESET}" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg"
-
- # Create python virtual enviornment
- python3 -m venv "${NNAS_INSTALL_PREFIX}/bin/venv"
-
- # Install tensorflow
- source "${NNAS_INSTALL_PREFIX}/bin/venv/bin/activate"
- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
- install -U pip setuptools
- python -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhost.org \
- install tensorflow-cpu==2.3.0rc0
}
diff --git a/infra/packaging/preset/20200731_windows b/infra/packaging/preset/20200731_windows
new file mode 100644
index 000000000..65d179eaf
--- /dev/null
+++ b/infra/packaging/preset/20200731_windows
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+function preset_configure()
+{
+ REQUIRED_UNITS=()
+ # Common Libraries
+ REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex")
+ REQUIRED_UNITS+=("oops" "pepper-assert" "foder")
+ REQUIRED_UNITS+=("souschef")
+ REQUIRED_UNITS+=("safemain")
+ REQUIRED_UNITS+=("arser")
+ REQUIRED_UNITS+=("vconone")
+ # Hermes Logging Framework
+ REQUIRED_UNITS+=("hermes" "hermes-std")
+ # loco IR and related utilities
+ REQUIRED_UNITS+=("loco" "locop" "locomotiv" "logo-core" "logo")
+ # Flatbuffer I/O
+ REQUIRED_UNITS+=("mio-tflite" "mio-circle")
+ # Circle compiler library (.circle -> .circle)
+ REQUIRED_UNITS+=("luci")
+ # Tools
+ REQUIRED_UNITS+=("tflite2circle" "circle2circle" "tflchef" "circlechef")
+ REQUIRED_UNITS+=("tf2tfliteV2" "luci-interpreter" "circle-verify")
+ REQUIRED_UNITS+=("record-minmax" "circle-quantizer")
+ REQUIRED_UNITS+=("one-cmds")
+
+ NPROC=$(cat /proc/cpuinfo | grep -c processor)
+
+ # TODO Use "nncc configure" and "nncc build"
+ cmake \
+ -G "MSYS Makefiles" \
+ -DTF2NNPKG_FOR_WINDOWS=ON \
+ -DUSE_PROTOBUF_LEGACY_IMPORT=ON \
+ -DCMAKE_EXE_LINKER_FLAGS="-Wl,--allow-multiple-definition" \
+ -DCMAKE_SHARED_LINKER_FLAGS="-Wl,--allow-multiple-definition" \
+ -DENABLE_TEST=OFF \
+ -DDOWNLOAD_GTEST=OFF \
+ -DBUILD_GTEST=OFF \
+ -DCMAKE_C_COMPILER=gcc \
+ -DCMAKE_CXX_COMPILER=g++ \
+ -DCMAKE_INSTALL_PREFIX="${NNCC_INSTALL_PREFIX}" \
+ -DCMAKE_BUILD_TYPE=release \
+ -DBUILD_WHITELIST=$(join_by ";" "${REQUIRED_UNITS[@]}") \
+ -DEXTERNALS_BUILD_THREADS=$((NPROC/2)) \
+ ${EXTRA_OPTIONS[@]} \
+ "${NNAS_PROJECT_PATH}/infra/nncc"
+}
+
+function preset_install()
+{
+ # Install libraries to bin/ for Windows release
+ mv ${NNCC_INSTALL_PREFIX}/lib/*.dll ${NNCC_INSTALL_PREFIX}/bin
+ rm -rf ${NNCC_INSTALL_PREFIX}/lib
+
+ install -t "${NNPKG_INSTALL_PREFIX}/bin" -D \
+ "${NNAS_PROJECT_PATH}/tools/nnpackage_tool/model2nnpkg/model2nnpkg.sh"
+
+ # Install tf2nnpkg
+ install -T -m 755 -D "${SCRIPT_PATH}/res/tf2nnpkg.20200630" "${NNAS_INSTALL_PREFIX}/bin/tf2nnpkg"
+
+ # Although you need TensorFlow to run 'tf2tfliteV2', TensorFlow cannot be
+ # installed under MinGW. First, install TensorFlow from a native Windows CMD
+ # (run as administrator) into a python virtual environment.
+ # Then copy that environment to "${NNAS_INSTALL_PREFIX}/bin/venv".
+}
diff --git a/infra/packaging/res/tf2nnpkg.20200630 b/infra/packaging/res/tf2nnpkg.20200630
index 9101f8273..7846fd388 100644
--- a/infra/packaging/res/tf2nnpkg.20200630
+++ b/infra/packaging/res/tf2nnpkg.20200630
@@ -14,10 +14,16 @@ command_exists() {
usage()
{
echo "Convert TensorFlow model to nnpackage."
- echo "Usage: tf2nnpkg --info <path/to/info> --graphdef <path/to/pb> [OPTION] -o <path/to/nnpkg/directory>"
- exit 0
+ echo "Usage: tf2nnpkg"
+ echo " --info <path/to/info>"
+ echo " --graphdef <path/to/pb>"
+ echo " -o <path/to/nnpkg/directory>"
+ echo " --v2 (optional) Use TF 2.x interface"
+ exit 255
}
+TF_INTERFACE="--v1"
+
# Parse command-line arguments
#
while [ "$#" -ne 0 ]; do
@@ -39,6 +45,10 @@ while [ "$#" -ne 0 ]; do
export OUTPUT_DIR="$2"
shift 2
;;
+ '--v2')
+ TF_INTERFACE="--v2"
+ shift
+ ;;
*)
echo "${CUR}"
shift
@@ -83,10 +93,7 @@ OUTPUT=$(awk -F, '/^output/ { print $2 }' ${INFO_FILE} | cut -d: -f1 | tr -d ' '
INPUT_SHAPES=$(grep ^input ${INFO_FILE} | cut -d "[" -f2 | cut -d "]" -f1 | tr -d ' ' | xargs | tr ' ' ':')
# generate tflite file
-python "${ROOT}/bin/tf2tfliteV2.py" --v2 --input_path ${GRAPHDEF_FILE} \
---output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
---input_arrays ${INPUT} --output_arrays ${OUTPUT} || \
-python "${ROOT}/bin/tf2tfliteV2.py" --v1 --input_path ${GRAPHDEF_FILE} \
+python "${ROOT}/bin/tf2tfliteV2.py" ${TF_INTERFACE} --input_path ${GRAPHDEF_FILE} \
--output_path "${TMPDIR}/${MODEL_NAME}.tflite" \
--input_arrays ${INPUT} --input_shapes ${INPUT_SHAPES} \
--output_arrays ${OUTPUT}
diff --git a/infra/scripts/build-tcm.sh b/infra/scripts/build-tcm.sh
new file mode 100755
index 000000000..22fb33558
--- /dev/null
+++ b/infra/scripts/build-tcm.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# STEP 1
+# Download latest TCM tool from
+# https://github.sec.samsung.net/RS-TCM/tca-standalone/releases/download/v0.0.8/tca-standalone-0.0.8.jar
+#
+# STEP 2
+# Create symbolic link `./src` for source directory to be analyzed which has `.ahub` configuration.
+#
+# STEP 3
+# run this `build-tcm.sh` script.
+#
+# See the following link for additional details.
+# https://github.sec.samsung.net/RS-TCM/tca-standalone/wiki/Tutorials-CPP-Gtest
+#
+
+echo ${PROJECT_DIR:=${PWD}}
+
+java -jar $PROJECT_DIR/tca-standalone-0.0.8.jar \
+ --outdir=$PROJECT_DIR/tcm-output \
+ --config=$PROJECT_DIR/.ahub/tcchecker-tca/config.yaml \
+ --local=$PROJECT_DIR/src \
+ --logfile=$PROJECT_DIR/tcm-output/tcm.log \
+ --debug
diff --git a/infra/scripts/common.sh b/infra/scripts/common.sh
index 28aa213ec..a10aac271 100755
--- a/infra/scripts/common.sh
+++ b/infra/scripts/common.sh
@@ -15,15 +15,18 @@
# TFLiteModelVerification $1 $2 $3
# Run ./tests/scripts/test-driver.sh script verification test
#
-# Unittests $1 $2 $3
-# Run ./tests/scripts/test-driver.sh script unittest
+# NNAPIGTest $1 $2 $3
+# Run [INSTALL_PATH]/test/onert-test unittest command for nnapi gtest
#
# NNPackageTest $1 $2
-# Run ./tests/scripts/nnpkg_test.sh script nnpackage test
+# Run [INSTALL_PATH]/test/onert-test nnpkg-test command
CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$(cd ${CURRENT_PATH}/../../ && pwd)"
+# Install path on CI
+INSTALL_PATH=$ROOT_PATH/Product/out
+
function CheckTestPrepared()
{
# Model download server setting
@@ -47,16 +50,12 @@ function TFLiteModelVerification()
export BACKENDS=$1
if [[ "$2" == "" ]]; then
- ./tests/scripts/test-driver.sh \
- --reportdir=$ROOT_PATH/$3 \
- --verification \
- .
+ $INSTALL_PATH/test/onert-test verify-tflite --api=nnapi \
+ --reportdir=$ROOT_PATH/$3
else
- ./tests/scripts/test-driver.sh \
- --frameworktest_list_file=$2 \
- --reportdir=$ROOT_PATH/$3 \
- --verification \
- .
+ $INSTALL_PATH/test/onert-test verify-tflite --api=nnapi \
+ --list=$2 \
+ --reportdir=$ROOT_PATH/$3
fi
unset BACKENDS
@@ -64,10 +63,10 @@ function TFLiteModelVerification()
}
# $1: (required) backend
-# $2: (required) unittest skiplist file relative path from nnfw root directory
+# $2: (required) nnapi gtest skiplist file relative path from nnfw root directory
# pass empty string if there is no test list
# $3: (required) relative path for report from nnfw root directory
-function Unittests()
+function NNAPIGTest()
{
[[ $# -ne 3 ]] && echo "Invalid function argument setting" && exit 1
@@ -75,7 +74,7 @@ function Unittests()
# Backup original nnapi_gtest.skip
# TODO Pass skiplist to test-driver.sh
- SKIPLIST_FILE="${ROOT_PATH}/Product/out/unittest/nnapi_gtest.skip"
+ SKIPLIST_FILE="${INSTALL_PATH}/unittest/nnapi_gtest.skip"
BACKUP_FILE="${SKIPLIST_FILE}.backup"
if [[ "$2" != "" ]]; then
cp ${SKIPLIST_FILE} ${BACKUP_FILE}
@@ -83,10 +82,9 @@ function Unittests()
fi
export BACKENDS=$1
- ./tests/scripts/test-driver.sh \
+ $INSTALL_PATH/test/onert-test unittest \
--reportdir=$ROOT_PATH/$3 \
- --unittest \
- .
+ --unittestdir=$INSTALL_PATH/unittest
unset BACKENDS
# TODO Pass skiplist to test-driver.sh
@@ -115,7 +113,7 @@ function NNPackageTest()
do
for entry in "nnpkg-tcs"/$f; do
if [ -e $entry ]; then
- BACKENDS="$1" tests/scripts/nnpkg_test.sh -d -i nnpkg-tcs $(basename "$entry")
+ BACKENDS="$1" $INSTALL_PATH/test/onert-test nnpkg-test -d -i nnpkg-tcs $(basename "$entry")
fi
done
EXITCODE_F=$?
@@ -144,16 +142,11 @@ function TFLiteLoaderTest()
export BACKENDS=$1
if [[ "$2" == "" ]]; then
- ./tests/scripts/test-driver.sh \
- --frameworktest \
- --framework_driverbin="$ROOT_PATH/Product/out/bin/tflite_loader_test_tool" \
+ $INSTALL_PATH/test/onert-test verify-tflite --api=loader \
--reportdir=$ROOT_PATH/$3
- .
else
- ./tests/scripts/test-driver.sh \
- --frameworktest \
- --framework_driverbin="$ROOT_PATH/Product/out/bin/tflite_loader_test_tool" \
- --frameworktest_list_file=tests/scripts/list/tflite_loader_list.${TEST_ARCH}.txt \
+ $INSTALL_PATH/test/onert-test verify-tflite --api=loader \
+ --list=$2 \
--reportdir=$ROOT_PATH/$3
fi
unset BACKENDS
diff --git a/infra/scripts/compiler_modules.sh b/infra/scripts/compiler_modules.sh
index d436e8a1f..a0323e0a0 100644
--- a/infra/scripts/compiler_modules.sh
+++ b/infra/scripts/compiler_modules.sh
@@ -7,7 +7,7 @@ DEBUG_BUILD_ITEMS="angkor;cwrap;pepper-str;pepper-strcast;pp;stdex"
DEBUG_BUILD_ITEMS+=";oops;pepper-assert"
DEBUG_BUILD_ITEMS+=";hermes;hermes-std"
DEBUG_BUILD_ITEMS+=";loco;locop;locomotiv;logo-core;logo"
-DEBUG_BUILD_ITEMS+=";foder;souschef;arser"
+DEBUG_BUILD_ITEMS+=";foder;souschef;arser;vconone"
DEBUG_BUILD_ITEMS+=";safemain;mio-circle;mio-tflite"
DEBUG_BUILD_ITEMS+=";tflite2circle"
DEBUG_BUILD_ITEMS+=";luci"
diff --git a/infra/scripts/docker_build_cross_aarch64_runtime.sh b/infra/scripts/docker_build_cross_aarch64_runtime.sh
index 7da673601..011d14c18 100755
--- a/infra/scripts/docker_build_cross_aarch64_runtime.sh
+++ b/infra/scripts/docker_build_cross_aarch64_runtime.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_build_cross_arm_runtime.sh b/infra/scripts/docker_build_cross_arm_runtime.sh
index f1f666aa3..551fb5700 100755
--- a/infra/scripts/docker_build_cross_arm_runtime.sh
+++ b/infra/scripts/docker_build_cross_arm_runtime.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_build_cross_arm_runtime_release.sh b/infra/scripts/docker_build_cross_arm_runtime_release.sh
index ea66f1774..876f318f4 100755
--- a/infra/scripts/docker_build_cross_arm_runtime_release.sh
+++ b/infra/scripts/docker_build_cross_arm_runtime_release.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_build_cross_coverage.sh b/infra/scripts/docker_build_cross_coverage.sh
index 08244e5d8..f42251baa 100755
--- a/infra/scripts/docker_build_cross_coverage.sh
+++ b/infra/scripts/docker_build_cross_coverage.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_build_nncc.sh b/infra/scripts/docker_build_nncc.sh
index 418b50dfe..046bc8a4c 100755
--- a/infra/scripts/docker_build_nncc.sh
+++ b/infra/scripts/docker_build_nncc.sh
@@ -54,7 +54,18 @@ pushd $ROOT_PATH > /dev/null
mkdir -p ${NNCC_INSTALL_PREFIX}
./nncc docker-run ./nnas create-package --prefix "${PWD}/${NNCC_INSTALL_PREFIX}" -- "${CONFIG_OPTIONS}"
+# create python virtual environment
+./nncc docker-run python3 -m venv "${NNCC_INSTALL_PREFIX}/bin/venv"
+
+./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \
+ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhosted.org \
+ install -U pip setuptools
+./nncc docker-run "${NNCC_INSTALL_PREFIX}/bin/venv/bin/python" \
+ -m pip --default-timeout=1000 --trusted-host pypi.org --trusted-host files.pythonhosted.org \
+ install tensorflow-cpu==2.3.0
+
mkdir -p ${ARCHIVE_PATH}
-tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} ./
+tar -zcf ${ARCHIVE_PATH}/nncc-package.tar.gz -C ${NNCC_INSTALL_PREFIX} --exclude "bin/venv" ./
+tar -zcf ${ARCHIVE_PATH}/nncc-venv-package.tar.gz -C ${NNCC_INSTALL_PREFIX} bin/venv
popd > /dev/null
diff --git a/infra/scripts/docker_build_tizen_cross.sh b/infra/scripts/docker_build_tizen_cross.sh
index 18809ad07..ee0f183f1 100755
--- a/infra/scripts/docker_build_tizen_cross.sh
+++ b/infra/scripts/docker_build_tizen_cross.sh
@@ -6,7 +6,7 @@ CURRENT_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
ROOT_PATH="$CURRENT_PATH/../../"
# prepare rootfs
-if [ ! -d $ROOTFS_DIR ]; then
+if [ -z "$ROOTFS_DIR" ] || [ ! -d $ROOTFS_DIR ]; then
echo "It will use default rootfs path"
else
DOCKER_VOLUMES+=" -v $ROOTFS_DIR:/opt/rootfs"
diff --git a/infra/scripts/docker_collect_nnpkg_resources.sh b/infra/scripts/docker_collect_nnpkg_resources.sh
index 556c5bd74..55adaa15d 100755
--- a/infra/scripts/docker_collect_nnpkg_resources.sh
+++ b/infra/scripts/docker_collect_nnpkg_resources.sh
@@ -60,7 +60,7 @@ pushd $ROOT_PATH > /dev/null
REQUIRED_UNITS=()
# Common Libraries
REQUIRED_UNITS+=("angkor" "cwrap" "pepper-str" "pepper-strcast" "pp" "stdex")
-REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "oops")
+REQUIRED_UNITS+=("oops" "safemain" "foder" "arser" "vconone")
# Hermes Logging Framework
REQUIRED_UNITS+=("hermes" "hermes-std")
# loco IR and related utilities
diff --git a/infra/scripts/test_arm_nnpkg.sh b/infra/scripts/test_arm_nnpkg.sh
index 68adaf913..d00eb730f 100755
--- a/infra/scripts/test_arm_nnpkg.sh
+++ b/infra/scripts/test_arm_nnpkg.sh
@@ -7,10 +7,10 @@ BACKENDS=("acl_cl" "acl_neon" "cpu")
for BACKEND in "${BACKENDS[@]}";
do
- NNPackageTest ${BACKEND} "tests/scripts/list/nnpkg_test_list.armv7l-linux.${BACKEND}"
+ NNPackageTest ${BACKEND} "Product/out/test/list/nnpkg_test_list.armv7l-linux.${BACKEND}"
done
# Interpreter test
export DISABLE_COMPILE=1
-NNPackageTest "interp" "tests/scripts/list/nnpkg_test_list.noarch.interp"
+NNPackageTest "interp" "Product/out/test/list/nnpkg_test_list.noarch.interp"
unset DISABLE_COMPILE
diff --git a/infra/scripts/test_coverage.sh b/infra/scripts/test_coverage.sh
index c3dc04821..12a9942ab 100755
--- a/infra/scripts/test_coverage.sh
+++ b/infra/scripts/test_coverage.sh
@@ -32,7 +32,7 @@ export GCOV_PREFIX_STRIP=`cat $ROOT_PATH/tests/scripts/build_path_depth.txt`
TENSOR_LOGGING=trace_log.txt ONERT_LOG_ENABLE=1 GRAPH_DOT_DUMP=1 ./infra/scripts/test_ubuntu_runtime_mixed.sh
# Enable trace event (acl_cl default backend)
export TRACE_FILEPATH=trace.json
-TFLiteModelVerification "acl_cl" "tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt" "report/acl_cl/trace"
+TFLiteModelVerification "acl_cl" "Product/out/test/list/frameworktest_list.armv7l.acl_cl.txt" "report/acl_cl/trace"
unset TRACE_FILEPATH
# Interpreter
diff --git a/infra/scripts/test_ubuntu_runtime.sh b/infra/scripts/test_ubuntu_runtime.sh
index 76e567a29..f250df5a0 100755
--- a/infra/scripts/test_ubuntu_runtime.sh
+++ b/infra/scripts/test_ubuntu_runtime.sh
@@ -68,7 +68,7 @@ else
fi
UNITTEST_SKIPLIST="Product/out/unittest/nnapi_gtest.skip.${TEST_PLATFORM}.${BACKEND}"
-FRAMEWORK_TESTLIST="tests/scripts/list/frameworktest_list.${TEST_ARCH}.${BACKEND}.txt"
+FRAMEWORK_TESTLIST="Product/out/test/list/frameworktest_list.${TEST_ARCH}.${BACKEND}.txt"
REPORT_BASE="report/${BACKEND}"
EXECUTORS=("Linear" "Dataflow" "Parallel")
@@ -91,7 +91,7 @@ do
export EXECUTOR="${EXECUTOR}"
fi
- Unittests "${BACKEND}" "${UNITTEST_SKIPLIST}" "${REPORT_PATH}"
+ NNAPIGTest "${BACKEND}" "${UNITTEST_SKIPLIST}" "${REPORT_PATH}"
TFLiteModelVerification "${BACKEND}" "${FRAMEWORK_TESTLIST}" "${REPORT_PATH}"
if [ $EXECUTOR = "Interpreter" ]; then
@@ -103,12 +103,7 @@ done
# Currently supports the acl_cl backend testlist only
# TODO Support more backends
-TFLITE_LOADER_TESTLIST="tests/scripts/list/tflite_loader_list.${TEST_ARCH}.txt"
+TFLITE_LOADER_TESTLIST="Product/out/test/list/tflite_loader_list.${TEST_ARCH}.txt"
if [[ $TFLITE_LOADER = "1" ]]; then
TFLiteLoaderTest "${BACKEND}" "${TFLITE_LOADER_TESTLIST}" "${REPORT_BASE}/loader/${EXECUTOR}"
-
- # Test custom op
- pushd ${ROOT_PATH} > /dev/null
- ./Product/out/tests/FillFrom_runner
- popd > /dev/null
fi
diff --git a/infra/scripts/test_ubuntu_runtime_mixed.sh b/infra/scripts/test_ubuntu_runtime_mixed.sh
index 265a2acff..24fde8896 100755
--- a/infra/scripts/test_ubuntu_runtime_mixed.sh
+++ b/infra/scripts/test_ubuntu_runtime_mixed.sh
@@ -14,32 +14,26 @@ TEST_OS="linux"
# This test requires test model installation
pushd ${ROOT_PATH} > /dev/null
-echo
-echo "==== Run nnfw_api_gtest begin ===="
-echo
-NNFW_API_TEST_MODEL_INSTALLER=tests/scripts/nnfw_api_gtest/install_nnfw_api_gtest_nnpackages.sh
-TEST_BIN=Product/out/unittest_standalone/nnfw_api_gtest
-$NNFW_API_TEST_MODEL_INSTALLER --install-dir ${TEST_BIN}_models
-${TEST_BIN}
-echo
-echo "==== Run nnfw_api_gtest end ===="
-echo
+echo ""
+echo "==== Run standalone unittest begin ===="
+echo ""
+Product/out/test/onert-test prepare-model --model=nnpackage
+Product/out/test/onert-test unittest --unittestdir=Product/out/unittest_standalone
+echo ""
+echo "==== Run standalone unittest end ===="
+echo ""
+
+# Test custom op
+pushd ${ROOT_PATH} > /dev/null
+./Product/out/test/FillFrom_runner
popd > /dev/null
-Product/out/unittest_standalone/test_compute
-Product/out/unittest_standalone/test_onert
-Product/out/unittest_standalone/test_onert_backend_cpu_common
-Product/out/unittest_standalone/test_onert_frontend_nnapi
-Product/out/unittest_standalone/tflite_test
-
-pushd ${ROOT_PATH}
-
# NOTE Fixed backend assignment by type of operation
# TODO Enhance this with randomized test
BACKENDS=(acl_cl acl_neon cpu)
# Get the intersection of the framework test list files
-TESTLIST_PREFIX="tests/scripts/list/frameworktest_list.${TEST_ARCH}"
+TESTLIST_PREFIX="Product/out/test/list/frameworktest_list.${TEST_ARCH}"
SKIPLIST_PREFIX="Product/out/unittest/nnapi_gtest.skip.${TEST_ARCH}-${TEST_OS}"
sort $TESTLIST_PREFIX.${BACKENDS[0]}.txt > $TESTLIST_PREFIX.intersect.txt
sort $SKIPLIST_PREFIX.${BACKENDS[0]} > $SKIPLIST_PREFIX.union
@@ -65,5 +59,5 @@ export OP_BACKEND_Conv2D="cpu"
export OP_BACKEND_MaxPool2D="acl_cl"
export OP_BACKEND_AvgPool2D="acl_neon"
export ACL_LAYOUT="NCHW"
-Unittests "acl_cl;acl_neon;cpu" "Product/out/unittest/nnapi_gtest.skip.${TEST_ARCH}-${TEST_OS}.union" "report/mixed"
+NNAPIGTest "acl_cl;acl_neon;cpu" "Product/out/unittest/nnapi_gtest.skip.${TEST_ARCH}-${TEST_OS}.union" "report/mixed"
TFLiteModelVerification "acl_cl;acl_neon;cpu" "${TESTLIST_PREFIX}.intersect.txt" "report/mixed"
diff --git a/infra/scripts/tizen_xu4_test.sh b/infra/scripts/tizen_xu4_test.sh
index 5521b5fdc..8f9e86fb0 100755
--- a/infra/scripts/tizen_xu4_test.sh
+++ b/infra/scripts/tizen_xu4_test.sh
@@ -23,19 +23,21 @@ function install_model()
{
# download tflite model files
pushd $HOST_HOME
- tests/scripts/framework/run_test.sh --download=on
+ tests/scripts/models/run_test.sh --download=on --run=off
# TODO Since this command removes the model file (.zip),
# we must always re-download it, unlike the .tflite model file,
# because caching applies only to .tflite files.
find tests -name "*.zip" -exec rm {} \;
- tar -zcf cache.tar.gz tests/scripts/framework/cache
+ tar -zcf cache.tar.gz -C tests/scripts/models cache
$SDB_CMD push cache.tar.gz $TEST_ROOT/.
rm -rf cache.tar.gz
- $SDB_CMD shell tar -zxf $TEST_ROOT/cache.tar.gz -C $TEST_ROOT
+ $SDB_CMD shell tar -zxf $TEST_ROOT/cache.tar.gz -C $TEST_ROOT/Product/out/test/models
# download api test model file for nnfw_api_gtest
MODEL_CACHE_DIR=$(mktemp -d)
- tests/scripts/nnfw_api_gtest/install_nnfw_api_gtest_nnpackages.sh --install-dir $MODEL_CACHE_DIR
+ tests/scripts/models/run_test.sh --download=on --run=off \
+ --configdir=tests/scripts/nnfw_api_gtest/models \
+ --cachedir=$MODEL_CACHE_DIR
tar -zcf $MODEL_CACHE_DIR/api_model_test.tar.gz -C $MODEL_CACHE_DIR .
$SDB_CMD push $MODEL_CACHE_DIR/api_model_test.tar.gz $TEST_ROOT/Product/out/unittest_standalone/nnfw_api_gtest_models/
$SDB_CMD shell tar -zxf $TEST_ROOT/Product/out/unittest_standalone/nnfw_api_gtest_models/api_model_test.tar.gz \
@@ -157,7 +159,7 @@ else
rm -rf ${GCOV_DIR}/*
pushd ${GCOV_DIR}
- sdb pull ${TEST_ROOT}/tests/scripts/build_path.txt
+ sdb pull ${TEST_ROOT}/Product/out/test/build_path.txt
SRC_PREFIX=`cat build_path.txt`
GCOV_PREFIX_STRIP=`echo "${SRC_PREFIX}" | grep -o '/' | wc -l`
GCOV_DATA_PATH="/opt/usr/nnfw-gcov"
diff --git a/nnpackage/spec/30_custom_op.md b/nnpackage/spec/30_custom_op.md
index 504695fdf..d98521b1d 100644
--- a/nnpackage/spec/30_custom_op.md
+++ b/nnpackage/spec/30_custom_op.md
@@ -42,7 +42,7 @@ typedef void (*nnfw_custom_eval)(nnfw_custom_kernel_params *params, char *userda
```
The structures and relevant APIs are defined in nnfw APIs.
-Please see `nnfw_dev.h` for detail.
+Please see `nnfw_experimental.h` for details.
You can find example in `nnfw` repository.
diff --git a/packaging/nnapi_test_generated.tar.gz b/packaging/nnapi_test_generated.tar.gz
index ebbb8496c..504dbf9a6 100644
--- a/packaging/nnapi_test_generated.tar.gz
+++ b/packaging/nnapi_test_generated.tar.gz
Binary files differ
diff --git a/packaging/nnfw.spec b/packaging/nnfw.spec
index ce1cd0b92..a1157c702 100644
--- a/packaging/nnfw.spec
+++ b/packaging/nnfw.spec
@@ -1,6 +1,6 @@
Name: nnfw
Summary: nnfw
-Version: 1.7.0
+Version: 1.8.0
Release: 1
Group: Development
License: Apache-2.0 and MIT and BSD-2-Clause
@@ -30,7 +30,7 @@ BuildRequires: flatbuffers-devel
%ifarch %{arm} aarch64
# Require python for acl-ex library build pre-process
BuildRequires: python
-BuildRequires: libarmcl-devel
+BuildRequires: libarmcl-devel >= v20.05
%endif
Requires(post): /sbin/ldconfig
@@ -62,6 +62,12 @@ Requires: %{name}-devel = %{version}-%{release}
%description plugin-devel
NNFW development package for backend plugin developer
+%package minimal-app
+Summary: Minimal test binary for VD manual test
+
+%description minimal-app
+Minimal test binary for VD manual test
+
%if %{test_build} == 1
%package test
Summary: NNFW Test
@@ -83,7 +89,7 @@ NNFW test rpm. It does not depend on nnfw rpm since it contains nnfw runtime.
%define install_dir %{_prefix}
%define install_path %{buildroot}%{install_dir}
%define build_env NNFW_WORKSPACE=build
-%define build_options -DCMAKE_BUILD_TYPE=%{build_type} -DTARGET_ARCH=%{target_arch} -DTARGET_OS=tizen -DENABLE_TEST=off
+%define build_options -DCMAKE_BUILD_TYPE=%{build_type} -DTARGET_ARCH=%{target_arch} -DTARGET_OS=tizen -DENABLE_TEST=off -DBUILD_MINIMAL_SAMPLE=on
# Set option for test build (and coverage test build)
%define test_install_home /opt/usr/nnfw-test
@@ -126,7 +132,7 @@ tar -xf %{SOURCE1005} -C ./externals
%if %{coverage_build} == 1
pwd > tests/scripts/build_path.txt
%endif # coverage_build
-tar -zcf test-suite.tar.gz infra/scripts tests/scripts
+tar -zcf test-suite.tar.gz infra/scripts
%endif # test_build
%endif # arm armv7l aarch64
@@ -134,8 +140,10 @@ tar -zcf test-suite.tar.gz infra/scripts tests/scripts
%ifarch arm armv7l aarch64
mkdir -p %{buildroot}%{_libdir}
+mkdir -p %{buildroot}%{_bindir}
mkdir -p %{buildroot}%{_includedir}
install -m 644 build/out/lib/*.so %{buildroot}%{_libdir}
+install -m 755 build/out/bin/onert-minimal-app %{buildroot}%{_bindir}
cp -r build/out/include/* %{buildroot}%{_includedir}/
# For developer
@@ -154,13 +162,14 @@ install -m 0644 ./nnfw-plugin.pc.in %{buildroot}%{_libdir}/pkgconfig/nnfw-plugin
%if %{test_build} == 1
%{test_build_env} ./nnfw install
# Share test script with ubuntu (ignore error if there is no list for target)
-cp tests/nnapi/nnapi_gtest.skip.* %{buildroot}%{test_install_dir}/unittest/.
+cp tests/nnapi/nnapi_gtest.skip.%{target_arch}-* %{buildroot}%{test_install_dir}/unittest/.
cp %{buildroot}%{test_install_dir}/unittest/nnapi_gtest.skip.%{target_arch}-linux.cpu %{buildroot}%{test_install_dir}/unittest/nnapi_gtest.skip
tar -zxf test-suite.tar.gz -C %{buildroot}%{test_install_home}
%if %{coverage_build} == 1
mkdir -p %{buildroot}%{test_install_home}/gcov
find . -name "*.gcno" -exec xargs cp {} %{buildroot}%{test_install_home}/gcov/. \;
+install -m 0644 ./tests/scripts/build_path.txt %{buildroot}%{test_install_dir}/test/build_path.txt
%endif # coverage_build
%endif # test_build
@@ -189,11 +198,16 @@ find . -name "*.gcno" -exec xargs cp {} %{buildroot}%{test_install_home}/gcov/.
%manifest %{name}.manifest
%defattr(-,root,root,-)
%ifarch arm armv7l aarch64
-%dir %{_includedir}/nnfw
+%dir %{_includedir}/onert
%{_includedir}/onert/*
%{_libdir}/pkgconfig/nnfw-plugin.pc
%endif
+%files minimal-app
+%manifest %{name}.manifest
+%defattr(-,root,root,-)
+%{_bindir}/onert-minimal-app
+
%if %{test_build} == 1
%files test
%manifest %{name}.manifest
diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe
new file mode 100644
index 000000000..7322e90a4
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.recipe
@@ -0,0 +1,26 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 1 dim: 8 dim: 8 dim: 1 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 7 dim: 7 dim: 1 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operation {
+ type: "AveragePool2D"
+ averagepool2d_options {
+ padding: VALID
+ stride_w: 1
+ stride_h: 1
+ filter_width: 2
+ filter_height: 2
+ }
+ input: "ifm"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/AveragePool2D_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe
new file mode 100644
index 000000000..a09afc1de
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.recipe
@@ -0,0 +1,44 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 4 dim: 5 dim: 5 }
+}
+operand {
+ name: "ker"
+ type: FLOAT32
+ shape { dim: 1 dim: 1 dim: 2 dim: 25 }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 25 }
+ filler {
+ tag: "constant"
+ arg: "1.1"
+ }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 2 dim: 2 dim: 25 }
+}
+operation {
+ type: "DepthwiseConv2D"
+ version: 2
+ depthwiseconv2d_options {
+ padding: VALID
+ stride_w: 2
+ stride_h: 2
+ dilation_w_factor: 2
+ dilation_h_factor: 1
+ depth_multiplier: 5
+ activation : RELU6
+ }
+ input: "ifm"
+ input: "ker"
+ input: "bias"
+ output: "ofm"
+}
+input: "ifm"
+input: "ker"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.reverse
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule
new file mode 100644
index 000000000..edfabc64e
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_003/test.rule
@@ -0,0 +1,3 @@
+# To check if DEPTHWISE_CONV_2D version is 2
+
+RULE "OP_VERSION_CHECK" $(op_version DEPTHWISE_CONV_2D) '=' 2
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe
new file mode 100644
index 000000000..5e0b6b543
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.recipe
@@ -0,0 +1,61 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 1 dim: 112 dim: 112 dim: 4 }
+ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 }
+}
+operand {
+ name: "ker"
+ type: UINT8
+ shape { dim: 1 dim: 3 dim: 3 dim: 4 }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "1.0"
+ }
+ quant {
+ min: -30.3175 min: -0.779597 min: -10.2751 min: -10.8594
+ max: 4.35049 max: 2.70807 max: 11.0269 max: 20.97
+ scale:0.135953 scale: 0.0136771 scale: 0.0835375 scale: 0.124821
+ zero_point:223 zero_point: 57 zero_point: 123 zero_point: 87
+ quantized_dimension: 3
+ }
+}
+operand {
+ name: "bias"
+ type: INT32
+ shape { dim: 4 }
+ filler {
+ tag: "gaussian"
+ arg: "0"
+ arg: "1.0"
+ }
+ quant {
+ scale: 1.4758e-16 scale: 3.15185e-05 scale: 2.20685e-05 scale: 1.72205e-16
+ zero_point: 0 zero_point: 0 zero_point: 0 zero_point: 0
+ }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 112 dim: 112 dim: 4 }
+ quant { min: 0 max: 6 scale: 0.0235294 zero_point: 0 }
+
+}
+operation {
+ type: "DepthwiseConv2D"
+ depthwiseconv2d_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ depth_multiplier: 1
+ activation : RELU6
+ }
+ input: "ifm"
+ input: "ker"
+ input: "bias"
+ output: "ofm"
+}
+input: "ifm"
+input: "ker"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/DepthwiseConv2D_U8_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/FullyConnected_003/test.recipe b/res/TensorFlowLiteRecipes/FullyConnected_003/test.recipe
new file mode 100644
index 000000000..0ecb5618b
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/FullyConnected_003/test.recipe
@@ -0,0 +1,55 @@
+operand {
+ name: "in"
+ type: FLOAT32
+ shape { dim: 1 dim: 16 }
+}
+operand {
+ name: "weight"
+ type: FLOAT32
+ shape { dim: 4 dim: 16 }
+ filler {
+ tag: "explicit"
+ arg: "1" arg: "2" arg: "-3" arg: "-4"
+ arg: "-5" arg: "6" arg: "-7" arg: "8"
+ arg: "4" arg: "-2" arg: "3" arg: "-1"
+ arg: "-8" arg: "-6" arg: "7" arg: "5"
+ arg: "1" arg: "2" arg: "-3" arg: "-4"
+ arg: "-5" arg: "6" arg: "-7" arg: "8"
+ arg: "4" arg: "-2" arg: "3" arg: "-1"
+ arg: "-8" arg: "-6" arg: "7" arg: "5"
+ arg: "1" arg: "2" arg: "-3" arg: "-4"
+ arg: "-5" arg: "6" arg: "-7" arg: "8"
+ arg: "4" arg: "-2" arg: "3" arg: "-1"
+ arg: "-8" arg: "-6" arg: "7" arg: "5"
+ arg: "1" arg: "2" arg: "-3" arg: "-4"
+ arg: "-5" arg: "6" arg: "-7" arg: "8"
+ arg: "4" arg: "-2" arg: "3" arg: "-1"
+ arg: "-8" arg: "-6" arg: "7" arg: "5"
+ }
+}
+operand {
+ name: "bias"
+ type: FLOAT32
+ shape { dim: 4 }
+ filler {
+ tag: "explicit"
+ arg: "1" arg: "-2" arg: "-3" arg: "4"
+ }
+}
+operand {
+ name: "out"
+ type: FLOAT32
+ shape { dim: 1 dim: 4 }
+}
+operation {
+ type: "FullyConnected"
+ fullyconnected_options {
+ activation: NONE
+ }
+ input: "in"
+ input: "weight"
+ input: "bias"
+ output: "out"
+}
+input: "in"
+output: "out"
diff --git a/res/TensorFlowLiteRecipes/FullyConnected_003/test.reverse b/res/TensorFlowLiteRecipes/FullyConnected_003/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/FullyConnected_003/test.reverse
diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe
new file mode 100644
index 000000000..3fff5cd6d
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.recipe
@@ -0,0 +1,22 @@
+operand {
+ name: "ifm1"
+ type: UINT8
+ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128}
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+ quant { min: 0 max: 2 scale: 0.0078125 zero_point: 128}
+}
+operation {
+ type: "L2Normalize"
+ l2norm_options {
+ activation: NONE
+ }
+ input: "ifm1"
+ output: "ofm"
+}
+input: "ifm1"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/L2Normalize_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe
new file mode 100644
index 000000000..7b2a84de7
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.recipe
@@ -0,0 +1,19 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+ quant { min: 0 max: 1 scale: 0.00390625 zero_point: -128 }
+}
+operation {
+ type: "Logistic"
+ input: "ifm"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Logistic_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Net_TConv_BN_000/test.recipe b/res/TensorFlowLiteRecipes/Net_TConv_BN_000/test.recipe
new file mode 100644
index 000000000..65248f23b
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Net_TConv_BN_000/test.recipe
@@ -0,0 +1,149 @@
+operand {
+ name: "Const_transposed"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 3
+ dim: 3
+ dim: 1
+ }
+ filler {
+ tag: "gaussian"
+ arg: "0.0"
+ arg: "0.1"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "FusedBatchNormV3"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 4
+ dim: 4
+ dim: 1
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "FusedBatchNormV3_add_param"
+ type: FLOAT32
+ shape {
+ dim: 1
+ }
+ filler {
+ tag: "explicit"
+ arg: "-2.04724"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "FusedBatchNormV3_mul_0"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 4
+ dim: 4
+ dim: 1
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "FusedBatchNormV3_mul_0_param"
+ type: FLOAT32
+ shape {
+ dim: 1
+ }
+ filler {
+ tag: "explicit"
+ arg: "2.00834"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "Hole"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 2
+ dim: 2
+ dim: 1
+ }
+ quant {
+ min: 0
+ max: 255
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "conv2d_transpose"
+ type: FLOAT32
+ shape {
+ dim: 1
+ dim: 4
+ dim: 4
+ dim: 1
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operand {
+ name: "conv2d_transpose/input_sizes"
+ type: INT32
+ shape {
+ dim: 4
+ }
+ filler {
+ tag: "explicit"
+ arg: "1"
+ arg: "4"
+ arg: "4"
+ arg: "1"
+ }
+ quant {
+ quantized_dimension: 0
+ }
+}
+operation {
+ type: "TransposeConv"
+ input: "conv2d_transpose/input_sizes"
+ input: "Const_transposed"
+ input: "Hole"
+ output: "conv2d_transpose"
+ transpose_conv_options {
+ padding: VALID
+ stride_w: 1
+ stride_h: 1
+ }
+}
+operation {
+ type: "Mul"
+ input: "conv2d_transpose"
+ input: "FusedBatchNormV3_mul_0_param"
+ output: "FusedBatchNormV3_mul_0"
+ mul_options {
+ activation: NONE
+ }
+}
+operation {
+ type: "Add"
+ input: "FusedBatchNormV3_mul_0"
+ input: "FusedBatchNormV3_add_param"
+ output: "FusedBatchNormV3"
+ add_options {
+ activation: NONE
+ }
+}
+input: "Hole"
+output: "FusedBatchNormV3"
diff --git a/res/TensorFlowLiteRecipes/ResizeBilinear_U8_000/test.recipe b/res/TensorFlowLiteRecipes/ResizeBilinear_U8_000/test.recipe
new file mode 100644
index 000000000..6ae87b9d6
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/ResizeBilinear_U8_000/test.recipe
@@ -0,0 +1,32 @@
+operand {
+ name: "ifm1"
+ type: UINT8
+ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+ name: "size"
+ type: INT32
+ shape { dim: 2 }
+ filler {
+ tag: "constant" arg: "16" arg: "16"
+ }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 16 dim: 16 dim: 3 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operation {
+ type: "ResizeBilinear"
+ input: "ifm1"
+ input: "size"
+ output: "ofm"
+ resize_bilinear_options {
+ align_corners: false
+ half_pixel_centers: false
+ }
+}
+input: "ifm1"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/ResizeBilinear_U8_000/test.reverse b/res/TensorFlowLiteRecipes/ResizeBilinear_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/ResizeBilinear_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/SpaceToDepth_U8_000/test.recipe b/res/TensorFlowLiteRecipes/SpaceToDepth_U8_000/test.recipe
new file mode 100644
index 000000000..ec403dd86
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/SpaceToDepth_U8_000/test.recipe
@@ -0,0 +1,22 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 1 dim: 4 dim: 4 dim: 3 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { dim: 1 dim: 2 dim: 2 dim: 12 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operation {
+ type: "SpaceToDepth"
+ space_to_depth_options {
+ block_size: 2
+ }
+ input: "ifm"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/SpaceToDepth_U8_000/test.reverse b/res/TensorFlowLiteRecipes/SpaceToDepth_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/SpaceToDepth_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
index 79271a45f..1313e2683 100644
--- a/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
+++ b/res/TensorFlowLiteRecipes/TransposeConv_000/test.recipe
@@ -10,7 +10,7 @@ operand {
operand {
name: "ker"
type: FLOAT32
- shape { dim: 1 dim: 3 dim: 3 dim: 1 }
+ shape { dim: 3 dim: 1 dim: 1 dim: 3 }
filler {
tag: "gaussian"
arg: "0.0"
diff --git a/res/TensorFlowLiteRecipes/TransposeConv_001/test.recipe b/res/TensorFlowLiteRecipes/TransposeConv_001/test.recipe
new file mode 100644
index 000000000..ad76100d2
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/TransposeConv_001/test.recipe
@@ -0,0 +1,45 @@
+operand {
+ name: "out_shape"
+ type: INT32
+ shape { dim: 4 }
+ filler {
+ tag: "explicit"
+ arg: "1" arg: "4" arg: "4" arg: "1"
+ }
+}
+operand {
+ name: "ker"
+ type: FLOAT32
+ shape { dim: 1 dim: 3 dim: 3 dim: 2 }
+ filler {
+ tag: "explicit"
+ arg: "1" arg: "2" arg: "-3" arg: "-4" arg: "5" arg: "-6"
+ arg: "7" arg: "8" arg: "-9" arg: "-10" arg: "11" arg: "-12"
+ arg: "13" arg: "14" arg: "-15" arg: "-16" arg: "17" arg: "-18"
+ }
+}
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 1 dim: 4 dim: 4 dim: 2 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { dim: 1 dim: 4 dim: 4 dim: 1 }
+}
+
+operation {
+ type: "TransposeConv"
+ transpose_conv_options {
+ padding: SAME
+ stride_w: 1
+ stride_h: 1
+ }
+ input: "out_shape"
+ input: "ker"
+ input: "ifm"
+ output: "ofm"
+}
+input: "ifm"
+output: "ofm"
diff --git a/res/TensorFlowLiteRecipes/TransposeConv_001/test.reverse b/res/TensorFlowLiteRecipes/TransposeConv_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/TransposeConv_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_000/test.recipe
new file mode 100644
index 000000000..887380c48
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_000/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 4 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT32
+ shape { dim: 4 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT32
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_001/test.recipe
new file mode 100644
index 000000000..9beb51690
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_001/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: FLOAT32
+ shape { dim: 4 }
+}
+operand {
+ name: "ofm"
+ type: FLOAT32
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT64
+ shape { dim: 4 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT64
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_001/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.recipe b/res/TensorFlowLiteRecipes/Unique_002/test.recipe
new file mode 100644
index 000000000..67b947ff8
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_002/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: INT32
+ shape { dim: 5 }
+}
+operand {
+ name: "ofm"
+ type: INT32
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT32
+ shape { dim: 5 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT32
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_002/test.reverse b/res/TensorFlowLiteRecipes/Unique_002/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_002/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.recipe b/res/TensorFlowLiteRecipes/Unique_003/test.recipe
new file mode 100644
index 000000000..375db66e8
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_003/test.recipe
@@ -0,0 +1,27 @@
+operand {
+ name: "ifm"
+ type: INT32
+ shape { dim: 5 }
+}
+operand {
+ name: "ofm"
+ type: INT32
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT64
+ shape { dim: 5 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT64
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_003/test.reverse b/res/TensorFlowLiteRecipes/Unique_003/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_003/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
new file mode 100644
index 000000000..d3985e401
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.recipe
@@ -0,0 +1,28 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 4 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT32
+ shape { dim: 4 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT32
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_U8_000/test.reverse
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
new file mode 100644
index 000000000..b08dd85cc
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.recipe
@@ -0,0 +1,28 @@
+operand {
+ name: "ifm"
+ type: UINT8
+ shape { dim: 5 }
+ quant { min: 0 max: 255 scale: 1.0 zero_point: 0 }
+}
+operand {
+ name: "ofm"
+ type: UINT8
+ shape { }
+}
+operand {
+ name: "ofm_idx"
+ type: INT64
+ shape { dim: 5 }
+}
+operation {
+ type: "Unique"
+ unique_options {
+ idx_out_type: INT64
+ }
+ input: "ifm"
+ output: "ofm"
+ output: "ofm_idx"
+}
+input: "ifm"
+output: "ofm"
+output: "ofm_idx"
diff --git a/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse
new file mode 100644
index 000000000..e69de29bb
--- /dev/null
+++ b/res/TensorFlowLiteRecipes/Unique_U8_001/test.reverse
diff --git a/res/TensorFlowLiteSchema/2.3.0/schema.fbs b/res/TensorFlowLiteSchema/2.3.0/schema.fbs
new file mode 100644
index 000000000..b7f41c756
--- /dev/null
+++ b/res/TensorFlowLiteSchema/2.3.0/schema.fbs
@@ -0,0 +1,1094 @@
+// Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Revision History
+// Version 0: Initial version.
+// Version 1: Add subgraphs to schema.
+// Version 2: Rename operators to conform to NN API.
+// Version 3: Move buffer data from Model.Subgraph.Tensors to Model.Buffers.
+
+namespace tflite;
+
+// This corresponds to the version.
+file_identifier "TFL3";
+// File extension of any written files.
+file_extension "tflite";
+
+// IMPORTANT: All new members of tables, enums and unions must be added at the
+// end to ensure backwards compatibility.
+
+// The type of data stored in a tensor.
+enum TensorType : byte {
+ FLOAT32 = 0,
+ FLOAT16 = 1,
+ INT32 = 2,
+ UINT8 = 3,
+ INT64 = 4,
+ STRING = 5,
+ BOOL = 6,
+ INT16 = 7,
+ COMPLEX64 = 8,
+ INT8 = 9,
+ FLOAT64 = 10,
+}
+
+// Custom quantization parameters for experimenting with new quantization
+// techniques.
+table CustomQuantization {
+ custom:[ubyte] (force_align: 16);
+}
+
+// Represents a specific quantization technique's parameters.
+union QuantizationDetails {
+ CustomQuantization,
+}
+
+// Parameters for converting a quantized tensor back to float.
+table QuantizationParameters {
+ // These four parameters are the asymmetric linear quantization parameters.
+ // Given a quantized value q, the corresponding float value f should be:
+ // f = scale * (q - zero_point)
+ // For other quantization types, the QuantizationDetails below is used.
+ min:[float]; // For importing back into tensorflow.
+ max:[float]; // For importing back into tensorflow.
+ scale:[float]; // For dequantizing the tensor's values.
+ zero_point:[long];
+
+ // If this is not none, the other quantization parameters (i.e. min, max,
+ // scale, zero_point fields above) are ignored and the value of the
+ // QuantizationDetails union should be used.
+ details:QuantizationDetails;
+
+ // Specifies the dimension of the Tensor's shape that the scales and
+ // zero_points correspond to. For example, a tensor t, with dims=[4, 3, 2, 1]
+ // with quantization params:
+// scale=[1.0, 2.0, 3.0], zero_point=[1, 2, 3], quantized_dimension=1
+ // will be quantized across the second dimension of t.
+ // t[:, 0, :, :] will have scale[0]=1.0, zero_point[0]=1
+// t[:, 1, :, :] will have scale[1]=2.0, zero_point[1]=2
+// t[:, 2, :, :] will have scale[2]=3.0, zero_point[2]=3
+ quantized_dimension:int;
+}
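+// Worked example (illustrative, not from the upstream schema): with
+// scale=[0.5], zero_point=[128] and quantized_dimension=0 (per-tensor), a
+// quantized value q=130 dequantizes to f = 0.5 * (130 - 128) = 1.0, and
+// q=128 maps back to f = 0.0.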
+
+// Sparse tensors.
+// We use a modification of the TACO format.
+// Reference: http://tensor-compiler.org/kjolstad-oopsla17-tensor-compiler.pdf
+//
+// To encode a conceptual n-dimensional dense tensor with dims (d0, ..., dn-1),
+// potentially with a k-dimensional block (0 <= k <= n) with dims
+// (dn, ..., dn+k-1), the format needs to specify:
+// 1. In what order to traverse these dimensions. For example, to store a 2-D
+// matrix in row major order, the traversal order would be (d0, d1),
+// whereas to store it in column major order, the traversal order would be
+// (d1, d0). If the 2-D matrix has a 2-D inner block, the traversal order
+// could be (d0, d1, d2, d3).
+// 2. How each block dimension in (dn, ..., dn+k-1) maps to the original
+// tensor dimension in (d0, ..., dn-1).
+// 3. In the traversal order defined above, the format (dense vs. sparse) and
+// index metadata for each dimension. For a dense dimension, this is just
+// the size of that dimension. For a sparse dimension, it's the same as
+// the compressed index defined in the Compressed Sparse Row (CSR) format.
+// (http://scipy-lectures.org/advanced/scipy_sparse/csr_matrix.html)
+
+// The storage type for a dimension. Currently we support:
+// 1. DENSE: each coordinate in this dimension is stored implicitly.
+// 2. SPARSE_CSR: only the coordinates with non-zero elements are stored. The
+// compression technique is the same as the one CSR uses.
+// More types like a sparse dimension with a different compression technique
+// could be added to the list in the future.
+enum DimensionType : byte {
+ DENSE = 0,
+ SPARSE_CSR = 1,
+}
+
+table Int32Vector {
+ values:[int];
+}
+
+table Uint16Vector {
+ values:[ushort] (force_align: 4);
+}
+
+table Uint8Vector {
+ values:[ubyte] (force_align: 4);
+}
+
+// Variable-typed buffer to store the index metadata for a sparse dimension.
+// The widest type is Int32 instead of UInt32 because a tensor's shape is an int32
+// vector. We don't want the per-dimensional index to overflow that range.
+union SparseIndexVector {
+ Int32Vector,
+ Uint16Vector,
+ Uint8Vector
+}
+
+table DimensionMetadata {
+ // Whether a dimension is dense or sparse.
+ format:DimensionType;
+ // Index metadata used for a dimension.
+ // - If format is DimensionType.DENSE then we use the dense_size field to
+ // store the size of that dimension. Each index in that dimension is
+ // stored implicitly.
+ // - If format is DimensionType.SPARSE_CSR then we use array_segments and
+ // array_indices to encode that dimension. array_segments represents how
+ // to segment the indices array, each segment corresponds to one element
+ // in the previous dimension. array_indices represents the index of the
+ // non-zero elements within this dimension (as those in the CSR matrix
+ // format, where the first array is row pointers and the second array is
+ // column indices).
+ dense_size:int;
+ array_segments:SparseIndexVector;
+ array_indices:SparseIndexVector;
+}
+
+// Parameters to encode a sparse TfLite tensor.
+table SparsityParameters {
+ // The traversal order of the dimensions defined in the `shape` field of the
+ // conceptual dense tensor. For an n-dimensional tensor with dims (d0, d1,
+ // ..., dn-1),
+ // - if not block sparse, the traversal_order is just a permutation of (d0,
+ // ..., dn-1). For example, a 2-D matrix stored in row-major order would
+ // have traversal_order = (d0, d1).
+ // - if block sparse with a k-dimensional block (0 <= k <= n), the
+ // traversal_order has n + k elements. The first n elements are still a
+ // permutation of (d0, ..., dn-1). The last k elements are a permutation
+ // of (dn, ..., dn+k-1), defining how to traverse a block internally. For
+ // example, a 2-D matrix with 2-D blocks, both stored in row-major order
+ // would have traversal_order = (d0, d1, d2, d3).
+ traversal_order:[int];
+ // For an n-dimensional tensor with a k-dimensional block (0 <= k <= n),
+ // stores how a block dimension in (dn, ..., dn+k-1) maps to the original
+ // tensor dimension in (d0, ..., dn).
+ // It's stored in the order of (dn, ..., dn+k-1).
+ // If not block-sparse, this field is NULL.
+ block_map:[int];
+ // In the traversal order defined above, the metadata needed for
+ // each dimension to locate the non-zero values in the original dense tensor.
+ // The size of the dim_metadata array = the size of the traversal_order array
+ // = n + k.
+ dim_metadata:[DimensionMetadata];
+}
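+// Worked example (illustrative, not from the upstream schema): the 4x4 dense
+// matrix
+//   [[1 0 0 0]
+//    [0 2 3 0]
+//    [0 0 0 0]
+//    [0 0 4 0]]
+// with a dense d0 and a sparse d1 would be encoded as traversal_order=[0, 1],
+// block_map=NULL (not block-sparse), and dim_metadata:
+//   d0: format=DENSE, dense_size=4
+//   d1: format=SPARSE_CSR, array_segments=[0, 1, 3, 3, 4],
+//       array_indices=[0, 1, 2, 2]
+// so the tensor's buffer stores only the non-zero values [1, 2, 3, 4].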
+
+table Tensor {
+ // The tensor shape. The meaning of each entry is operator-specific but
+ // builtin ops use: [batch size, height, width, number of channels] (That's
+ // Tensorflow's NHWC).
+ shape:[int];
+ type:TensorType;
+ // An index that refers to the buffers table at the root of the model. Or,
+ // if there is no data buffer associated (i.e. intermediate results), then
+ // this is 0 (which refers to an always existent empty buffer).
+ //
+ // The data_buffer itself is an opaque container, with the assumption that the
+ // target device is little-endian. In addition, all builtin operators assume
+ // the memory is ordered such that if `shape` is [4, 3, 2], then index
+ // [i, j, k] maps to data_buffer[i*3*2 + j*2 + k].
+ buffer:uint;
+ name:string; // For debugging and importing back into tensorflow.
+ quantization:QuantizationParameters; // Optional.
+
+ is_variable:bool = false;
+
+ // Parameters to encode a sparse tensor. See the example in
+ // tensorflow/lite/testdata/sparse_tensor.json.
+ sparsity:SparsityParameters; // Optional.
+
+ // Encodes `shape` with unknown dimensions. Unknown dimensions are
+ // represented with -1.
+ shape_signature:[int]; // Optional.
+}
+
+// A list of builtin operators. Builtin operators are slightly faster than custom
+// ones, but not by much. Moreover, while custom operators accept an opaque
+// object containing configuration parameters, builtins have a predetermined
+// set of acceptable options.
+
+enum BuiltinOperator : byte {
+ ADD = 0,
+ AVERAGE_POOL_2D = 1,
+ CONCATENATION = 2,
+ CONV_2D = 3,
+ DEPTHWISE_CONV_2D = 4,
+ DEPTH_TO_SPACE = 5,
+ DEQUANTIZE = 6,
+ EMBEDDING_LOOKUP = 7,
+ FLOOR = 8,
+ FULLY_CONNECTED = 9,
+ HASHTABLE_LOOKUP = 10,
+ L2_NORMALIZATION = 11,
+ L2_POOL_2D = 12,
+ LOCAL_RESPONSE_NORMALIZATION = 13,
+ LOGISTIC = 14,
+ LSH_PROJECTION = 15,
+ LSTM = 16,
+ MAX_POOL_2D = 17,
+ MUL = 18,
+ RELU = 19,
+ // NOTE(aselle): RELU_N1_TO_1 used to be called RELU1, but it was renamed
+ // since different model developers use RELU1 in different ways. Never
+ // create another op called RELU1.
+ RELU_N1_TO_1 = 20,
+ RELU6 = 21,
+ RESHAPE = 22,
+ RESIZE_BILINEAR = 23,
+ RNN = 24,
+ SOFTMAX = 25,
+ SPACE_TO_DEPTH = 26,
+ SVDF = 27,
+ TANH = 28,
+ // TODO(aselle): Consider rename to CONCATENATE_EMBEDDINGS
+ CONCAT_EMBEDDINGS = 29,
+ SKIP_GRAM = 30,
+ CALL = 31,
+ CUSTOM = 32,
+ EMBEDDING_LOOKUP_SPARSE = 33,
+ PAD = 34,
+ UNIDIRECTIONAL_SEQUENCE_RNN = 35,
+ GATHER = 36,
+ BATCH_TO_SPACE_ND = 37,
+ SPACE_TO_BATCH_ND = 38,
+ TRANSPOSE = 39,
+ MEAN = 40,
+ SUB = 41,
+ DIV = 42,
+ SQUEEZE = 43,
+ UNIDIRECTIONAL_SEQUENCE_LSTM = 44,
+ STRIDED_SLICE = 45,
+ BIDIRECTIONAL_SEQUENCE_RNN = 46,
+ EXP = 47,
+ TOPK_V2 = 48,
+ SPLIT = 49,
+ LOG_SOFTMAX = 50,
+ // DELEGATE is a special op type for the operations which are delegated to
+ // other backends.
+ // WARNING: Experimental interface, subject to change
+ DELEGATE = 51,
+ BIDIRECTIONAL_SEQUENCE_LSTM = 52,
+ CAST = 53,
+ PRELU = 54,
+ MAXIMUM = 55,
+ ARG_MAX = 56,
+ MINIMUM = 57,
+ LESS = 58,
+ NEG = 59,
+ PADV2 = 60,
+ GREATER = 61,
+ GREATER_EQUAL = 62,
+ LESS_EQUAL = 63,
+ SELECT = 64,
+ SLICE = 65,
+ SIN = 66,
+ TRANSPOSE_CONV = 67,
+ SPARSE_TO_DENSE = 68,
+ TILE = 69,
+ EXPAND_DIMS = 70,
+ EQUAL = 71,
+ NOT_EQUAL = 72,
+ LOG = 73,
+ SUM = 74,
+ SQRT = 75,
+ RSQRT = 76,
+ SHAPE = 77,
+ POW = 78,
+ ARG_MIN = 79,
+ FAKE_QUANT = 80,
+ REDUCE_PROD = 81,
+ REDUCE_MAX = 82,
+ PACK = 83,
+ LOGICAL_OR = 84,
+ ONE_HOT = 85,
+ LOGICAL_AND = 86,
+ LOGICAL_NOT = 87,
+ UNPACK = 88,
+ REDUCE_MIN = 89,
+ FLOOR_DIV = 90,
+ REDUCE_ANY = 91,
+ SQUARE = 92,
+ ZEROS_LIKE = 93,
+ FILL = 94,
+ FLOOR_MOD = 95,
+ RANGE = 96,
+ RESIZE_NEAREST_NEIGHBOR = 97,
+ LEAKY_RELU = 98,
+ SQUARED_DIFFERENCE = 99,
+ MIRROR_PAD = 100,
+ ABS = 101,
+ SPLIT_V = 102,
+ UNIQUE = 103,
+ CEIL = 104,
+ REVERSE_V2 = 105,
+ ADD_N = 106,
+ GATHER_ND = 107,
+ COS = 108,
+ WHERE = 109,
+ RANK = 110,
+ ELU = 111,
+ REVERSE_SEQUENCE = 112,
+ MATRIX_DIAG = 113,
+ QUANTIZE = 114,
+ MATRIX_SET_DIAG = 115,
+ ROUND = 116,
+ HARD_SWISH = 117,
+ IF = 118,
+ WHILE = 119,
+ NON_MAX_SUPPRESSION_V4 = 120,
+ NON_MAX_SUPPRESSION_V5 = 121,
+ SCATTER_ND = 122,
+ SELECT_V2 = 123,
+ DENSIFY = 124,
+ SEGMENT_SUM = 125,
+ BATCH_MATMUL = 126
+}
+
+
+// Options for the builtin operators.
+union BuiltinOptions {
+ Conv2DOptions,
+ DepthwiseConv2DOptions,
+ ConcatEmbeddingsOptions,
+ LSHProjectionOptions,
+ Pool2DOptions,
+ SVDFOptions,
+ RNNOptions,
+ FullyConnectedOptions,
+ SoftmaxOptions,
+ ConcatenationOptions,
+ AddOptions,
+ L2NormOptions,
+ LocalResponseNormalizationOptions,
+ LSTMOptions,
+ ResizeBilinearOptions,
+ CallOptions,
+ ReshapeOptions,
+ SkipGramOptions,
+ SpaceToDepthOptions,
+ EmbeddingLookupSparseOptions,
+ MulOptions,
+ PadOptions,
+ GatherOptions,
+ BatchToSpaceNDOptions,
+ SpaceToBatchNDOptions,
+ TransposeOptions,
+ ReducerOptions,
+ SubOptions,
+ DivOptions,
+ SqueezeOptions,
+ SequenceRNNOptions,
+ StridedSliceOptions,
+ ExpOptions,
+ TopKV2Options,
+ SplitOptions,
+ LogSoftmaxOptions,
+ CastOptions,
+ DequantizeOptions,
+ MaximumMinimumOptions,
+ ArgMaxOptions,
+ LessOptions,
+ NegOptions,
+ PadV2Options,
+ GreaterOptions,
+ GreaterEqualOptions,
+ LessEqualOptions,
+ SelectOptions,
+ SliceOptions,
+ TransposeConvOptions,
+ SparseToDenseOptions,
+ TileOptions,
+ ExpandDimsOptions,
+ EqualOptions,
+ NotEqualOptions,
+ ShapeOptions,
+ PowOptions,
+ ArgMinOptions,
+ FakeQuantOptions,
+ PackOptions,
+ LogicalOrOptions,
+ OneHotOptions,
+ LogicalAndOptions,
+ LogicalNotOptions,
+ UnpackOptions,
+ FloorDivOptions,
+ SquareOptions,
+ ZerosLikeOptions,
+ FillOptions,
+ BidirectionalSequenceLSTMOptions,
+ BidirectionalSequenceRNNOptions,
+ UnidirectionalSequenceLSTMOptions,
+ FloorModOptions,
+ RangeOptions,
+ ResizeNearestNeighborOptions,
+ LeakyReluOptions,
+ SquaredDifferenceOptions,
+ MirrorPadOptions,
+ AbsOptions,
+ SplitVOptions,
+ UniqueOptions,
+ ReverseV2Options,
+ AddNOptions,
+ GatherNdOptions,
+ CosOptions,
+ WhereOptions,
+ RankOptions,
+ ReverseSequenceOptions,
+ MatrixDiagOptions,
+ QuantizeOptions,
+ MatrixSetDiagOptions,
+ HardSwishOptions,
+ IfOptions,
+ WhileOptions,
+ DepthToSpaceOptions,
+ NonMaxSuppressionV4Options,
+ NonMaxSuppressionV5Options,
+ ScatterNdOptions,
+ SelectV2Options,
+ DensifyOptions,
+ SegmentSumOptions,
+ BatchMatMulOptions
+}
+
+enum Padding : byte { SAME, VALID }
+
+enum ActivationFunctionType : byte {
+ NONE = 0,
+ RELU = 1,
+ RELU_N1_TO_1 = 2,
+ RELU6 = 3,
+ TANH = 4,
+ SIGN_BIT = 5,
+}
+
+table Conv2DOptions {
+ padding:Padding;
+ stride_w:int;
+ stride_h:int;
+ fused_activation_function:ActivationFunctionType;
+ dilation_w_factor:int = 1;
+ dilation_h_factor:int = 1;
+}
+
+table Pool2DOptions {
+ padding:Padding;
+ stride_w:int;
+ stride_h:int;
+ filter_width:int;
+ filter_height:int;
+ fused_activation_function:ActivationFunctionType;
+}
+
+table DepthwiseConv2DOptions {
+ // Parameters for DepthwiseConv version 1 or above.
+ padding:Padding;
+ stride_w:int;
+ stride_h:int;
+ // `depth_multiplier` is redundant. It's used by CPU kernels in
+ // TensorFlow 2.0 or below, but ignored in versions above.
+ // See comments in lite/c/builtin_op_data.h for more details.
+ depth_multiplier:int;
+ fused_activation_function:ActivationFunctionType;
+ // Parameters for DepthwiseConv version 2 or above.
+ dilation_w_factor:int = 1;
+ dilation_h_factor:int = 1;
+}
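+// Worked example (illustrative): a depthwise convolution over an input with 5
+// channels and depth_multiplier=5 produces 5 * 5 = 25 output channels, as in
+// the DepthwiseConv2D_003 recipe above.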
+
+table ConcatEmbeddingsOptions {
+ num_channels:int;
+ num_columns_per_channel:[int];
+ embedding_dim_per_channel:[int]; // This could be inferred from parameters.
+}
+
+enum LSHProjectionType: byte {
+ UNKNOWN = 0,
+ SPARSE = 1,
+ DENSE = 2,
+}
+
+table LSHProjectionOptions {
+ type: LSHProjectionType;
+}
+
+table SVDFOptions {
+ rank:int;
+ fused_activation_function:ActivationFunctionType;
+ // For weights-only quantization, use asymmetric quantization for non
+ // constant inputs at evaluation time.
+ asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow RNNCell.
+table RNNOptions {
+ fused_activation_function:ActivationFunctionType;
+ asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow dynamic_rnn with RNNCell.
+table SequenceRNNOptions {
+ time_major:bool;
+ fused_activation_function:ActivationFunctionType;
+ asymmetric_quantize_inputs:bool;
+}
+
+// An implementation of TensorFlow bidirectional_dynamic_rnn with RNNCell.
+table BidirectionalSequenceRNNOptions {
+ time_major:bool;
+ fused_activation_function:ActivationFunctionType;
+ merge_outputs: bool;
+ asymmetric_quantize_inputs:bool;
+}
+
+enum FullyConnectedOptionsWeightsFormat: byte {
+ DEFAULT = 0,
+ SHUFFLED4x16INT8 = 1,
+}
+
+// An implementation of TensorFlow fully_connected (a.k.a Dense) layer.
+table FullyConnectedOptions {
+ // Parameters for FullyConnected version 1 or above.
+ fused_activation_function:ActivationFunctionType;
+
+ // Parameters for FullyConnected version 2 or above.
+ weights_format:FullyConnectedOptionsWeightsFormat = DEFAULT;
+
+ // Parameters for FullyConnected version 5 or above.
+ // If set to true, then the number of dimensions is preserved. Furthermore,
+ // all but the last dimension of the input and output shapes will be equal.
+ keep_num_dims: bool;
+
+ // Parameters for FullyConnected version 7 or above.
+ // If set to true, then weights-only op will use asymmetric quantization for
+ // inputs.
+ asymmetric_quantize_inputs: bool;
+}
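+// Worked example (illustrative, not from the upstream schema): for an input
+// of shape [2, 3, 16] and weights of shape [4, 16], keep_num_dims=true yields
+// an output of shape [2, 3, 4], while keep_num_dims=false collapses it to
+// [6, 4].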
+
+table SoftmaxOptions {
+ beta: float;
+}
+
+// An implementation of TensorFlow concat.
+table ConcatenationOptions {
+ axis:int;
+ fused_activation_function:ActivationFunctionType;
+}
+
+table AddOptions {
+ fused_activation_function:ActivationFunctionType;
+}
+
+table MulOptions {
+ fused_activation_function:ActivationFunctionType;
+}
+
+table L2NormOptions {
+ fused_activation_function:ActivationFunctionType;
+}
+
+table LocalResponseNormalizationOptions {
+ radius:int;
+ bias:float;
+ alpha:float;
+ beta:float;
+}
+
+enum LSTMKernelType : byte {
+ // Full LSTM kernel which supports peephole and projection.
+ FULL = 0,
+ // Basic LSTM kernels. Equivalent to TensorFlow BasicLSTMCell.
+ BASIC = 1,
+}
+
+// An implementation of TensorFlow LSTMCell and CoupledInputForgetGateLSTMCell
+table LSTMOptions {
+ // Parameters for LSTM version 1 or above.
+ fused_activation_function:ActivationFunctionType;
+ cell_clip: float; // Optional, 0.0 means no clipping
+ proj_clip: float; // Optional, 0.0 means no clipping
+
+ // Parameters for LSTM version 2 or above.
+ // Basic kernel is only supported in version 2 or above.
+ kernel_type: LSTMKernelType = FULL;
+
+ // Parameters for LSTM version 4 or above.
+ asymmetric_quantize_inputs: bool;
+}
+
+// An implementation of TensorFlow dynamic_rnn with LSTMCell.
+table UnidirectionalSequenceLSTMOptions {
+ fused_activation_function:ActivationFunctionType;
+ cell_clip: float; // Optional, 0.0 means no clipping
+ proj_clip: float; // Optional, 0.0 means no clipping
+
+ // If true then first dimension is sequence, otherwise batch.
+ time_major:bool;
+
+ // Parameter for Unidirectional Sequence LSTM version 4.
+ asymmetric_quantize_inputs:bool;
+}
+
+table BidirectionalSequenceLSTMOptions {
+ // Parameters supported by version 1:
+ fused_activation_function:ActivationFunctionType;
+ cell_clip: float; // Optional, 0.0 means no clipping
+ proj_clip: float; // Optional, 0.0 means no clipping
+
+ // If true, store the outputs of both directions into the first output.
+ merge_outputs: bool;
+
+ // Parameters supported by version 2:
+ // If true then first dimension is sequence, otherwise batch.
+ // Version 1 implementations assumed time_major to be true, so this default
+ // value should never change.
+ time_major: bool = true;
+
+ // Parameters for version 3 or above.
+ asymmetric_quantize_inputs:bool;
+}
+
+table ResizeBilinearOptions {
+ new_height: int (deprecated);
+ new_width: int (deprecated);
+ align_corners: bool;
+ half_pixel_centers: bool;
+}
+
+table ResizeNearestNeighborOptions {
+ align_corners: bool;
+ half_pixel_centers: bool;
+}
+
+// Options for a call operation.
+table CallOptions {
+ // The subgraph index that needs to be called.
+ subgraph:uint;
+}
+
+table PadOptions {
+}
+
+table PadV2Options {
+}
+
+table ReshapeOptions {
+ new_shape:[int];
+}
+
+table SpaceToBatchNDOptions {
+}
+
+table BatchToSpaceNDOptions {
+}
+
+table SkipGramOptions {
+ ngram_size: int;
+ max_skip_size: int;
+ include_all_ngrams: bool;
+}
+
+table SpaceToDepthOptions {
+ block_size: int;
+}
+
+table DepthToSpaceOptions {
+ block_size: int;
+}
+
+table SubOptions {
+ fused_activation_function:ActivationFunctionType;
+}
+
+table DivOptions {
+ fused_activation_function:ActivationFunctionType;
+}
+
+table TopKV2Options {
+}
+
+enum CombinerType : byte {
+ SUM = 0,
+ MEAN = 1,
+ SQRTN = 2,
+}
+
+table EmbeddingLookupSparseOptions {
+ combiner:CombinerType;
+}
+
+table GatherOptions {
+ axis: int;
+}
+
+table TransposeOptions {
+}
+
+table ExpOptions {
+}
+
+table CosOptions {
+}
+
+table ReducerOptions {
+ keep_dims: bool;
+}
+
+table SqueezeOptions {
+ squeeze_dims:[int];
+}
+
+table SplitOptions {
+ num_splits: int;
+}
+
+table SplitVOptions {
+ num_splits: int;
+}
+
+table StridedSliceOptions {
+ begin_mask: int;
+ end_mask: int;
+ ellipsis_mask: int;
+ new_axis_mask: int;
+ shrink_axis_mask: int;
+}
+
+table LogSoftmaxOptions {
+}
+
+table CastOptions {
+ in_data_type: TensorType;
+ out_data_type: TensorType;
+}
+
+table DequantizeOptions {
+}
+
+table MaximumMinimumOptions {
+}
+
+table TileOptions {
+}
+
+table ArgMaxOptions {
+ output_type : TensorType;
+}
+
+table ArgMinOptions {
+ output_type : TensorType;
+}
+
+table GreaterOptions {
+}
+
+table GreaterEqualOptions {
+}
+
+table LessOptions {
+}
+
+table LessEqualOptions {
+}
+
+table NegOptions {
+}
+
+table SelectOptions {
+}
+
+table SliceOptions {
+}
+
+table TransposeConvOptions {
+ padding:Padding;
+ stride_w:int;
+ stride_h:int;
+}
+
+table ExpandDimsOptions {
+}
+
+table SparseToDenseOptions {
+ validate_indices:bool;
+}
+
+table EqualOptions {
+}
+
+table NotEqualOptions {
+}
+
+table ShapeOptions {
+ // Optional output type of the operation (int32 or int64). Defaults to int32.
+ out_type : TensorType;
+}
+
+table RankOptions {
+}
+
+table PowOptions {
+}
+
+table FakeQuantOptions {
+ // Parameters supported by version 1:
+ min:float;
+ max:float;
+ num_bits:int;
+
+ // Parameters supported by version 2:
+ narrow_range:bool;
+}
+
+table PackOptions {
+ values_count:int;
+ axis:int;
+}
+
+table LogicalOrOptions {
+}
+
+table OneHotOptions {
+ axis:int;
+}
+
+table AbsOptions {
+}
+
+
+table HardSwishOptions {
+}
+
+table LogicalAndOptions {
+}
+
+table LogicalNotOptions {
+}
+
+table UnpackOptions {
+ num:int;
+ axis:int;
+}
+
+table FloorDivOptions {
+}
+
+table SquareOptions {
+}
+
+table ZerosLikeOptions {
+}
+
+table FillOptions {
+}
+
+table FloorModOptions {
+}
+
+table RangeOptions {
+}
+
+table LeakyReluOptions {
+ alpha:float;
+}
+
+table SquaredDifferenceOptions {
+}
+
+enum MirrorPadMode : byte {
+ // Doesn't include borders.
+ REFLECT = 0,
+ // Includes borders.
+ SYMMETRIC = 1,
+}
+
+table MirrorPadOptions {
+ mode:MirrorPadMode;
+}
+
+table UniqueOptions {
+ idx_out_type:TensorType = INT32;
+}
+
+table ReverseV2Options {
+}
+
+table AddNOptions {
+}
+
+table GatherNdOptions {
+}
+
+table WhereOptions {
+}
+
+table ReverseSequenceOptions {
+ seq_dim:int;
+ batch_dim:int = 0;
+}
+
+table MatrixDiagOptions {
+}
+
+table QuantizeOptions {
+}
+
+table MatrixSetDiagOptions {
+}
+
+table IfOptions {
+ then_subgraph_index:int;
+ else_subgraph_index:int;
+}
+
+table WhileOptions {
+ cond_subgraph_index:int;
+ body_subgraph_index:int;
+}
+
+table NonMaxSuppressionV4Options {
+}
+
+table NonMaxSuppressionV5Options {
+}
+
+table ScatterNdOptions {
+}
+
+table SelectV2Options {
+}
+
+table DensifyOptions {
+}
+
+table SegmentSumOptions {
+}
+
+table BatchMatMulOptions {
+ adj_x:bool;
+ adj_y:bool;
+}
+
+// An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
+// builtin, or a string if the operator is custom.
+table OperatorCode {
+ builtin_code:BuiltinOperator;
+ custom_code:string;
+
+ // The version of the operator. The version needs to be bumped whenever new
+ // parameters are introduced into an op.
+ version:int = 1;
+}
+
+enum CustomOptionsFormat : byte {
+ FLEXBUFFERS = 0,
+}
+
+// An operator takes tensors as inputs and produces tensors as outputs. The type
+// of operation being performed is determined by an index into the list of valid
+// OperatorCodes, while the specifics of each operation are configured using
+// builtin_options or custom_options.
+table Operator {
+ // Index into the operator_codes array. Using an integer here avoids
+ // complicated map lookups.
+ opcode_index:uint;
+
+ // Optional inputs are indicated by -1.
+ inputs:[int];
+ outputs:[int];
+
+ builtin_options:BuiltinOptions;
+ custom_options:[ubyte];
+ custom_options_format:CustomOptionsFormat;
+
+ // A list of booleans indicating the input tensors which are being mutated by
+ // this operator (e.g. used by RNN and LSTM).
+ // For example, if the "inputs" array refers to 5 tensors and the second and
+ // fifth are mutable variables, then this list will contain
+ // [false, true, false, false, true].
+ //
+ // If the list is empty, no variable is mutated in this operator.
+ // The list either has the same length as `inputs`, or is empty.
+ mutating_variable_inputs:[bool];
+
+ // A list of indices to the subgraph's "tensors" that are internal to an Op.
+ // Internal tensors are those that do not flow in or out of the operation,
+ // but instead are part of internal computation. As such, the operation's
+ // implementation may manage its memory more efficiently. They are needed
+ // however (i.e. not just an implementation detail) since they are part of the
+ // computation, which may require relevant metadata such as quantization
+ // parameters.
+ intermediates:[int];
+}
+
+// The root type, defining a subgraph, which typically represents an entire
+// model.
+table SubGraph {
+ // A list of all tensors used in this subgraph.
+ tensors:[Tensor];
+
+ // Indices of the tensors that are inputs into this subgraph. Note this is
+ // the list of non-static tensors that feed into the subgraph for inference.
+ inputs:[int];
+
+ // Indices of the tensors that are outputs out of this subgraph. Note this is
+ // the list of output tensors that are considered the product of the
+ // subgraph's inference.
+ outputs:[int];
+
+ // All operators, in execution order.
+ operators:[Operator];
+
+ // Name of this subgraph (used for debugging).
+ name:string;
+}
+
+// Table of raw data buffers (used for constant tensors). Referenced by tensors
+// by index. The generous alignment accommodates mmap-friendly data structures.
+table Buffer {
+ data:[ubyte] (force_align: 16);
+}
+
+table Metadata {
+ // A human-readable string to uniquely identify a Metadata.
+ name:string;
+ // An index to the buffers table.
+ buffer:uint;
+}
+
+table Model {
+ // Version of the schema.
+ version:uint;
+
+ // A list of all operator codes used in this model. This is
+ // kept in order because operators carry an index into this
+ // vector.
+ operator_codes:[OperatorCode];
+
+ // All the subgraphs of the model. The 0th is assumed to be the main
+ // model.
+ subgraphs:[SubGraph];
+
+ // A description of the model.
+ description:string;
+
+ // Buffers of the model.
+ // Note the 0th entry of this array must be an empty buffer (sentinel).
+ // This is a convention so that tensors without a buffer can provide 0 as
+ // their buffer.
+ buffers:[Buffer];
+
+ // Metadata about the model. Indirects into the existing buffers list.
+ // Deprecated, prefer to use metadata field.
+ metadata_buffer:[int];
+
+ // Metadata about the model.
+ metadata:[Metadata];
+}
+
+root_type Model;
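For readers of this schema, here is a minimal C++ sketch of walking a serialized
model. It is illustrative only, not part of this patch: it assumes the schema has
been compiled with flatc --cpp into a header named schema_generated.h, that the
schema's namespace is tflite as in upstream TensorFlow Lite, and that
"model.tflite" is a placeholder path.

#include <cstdio>
#include <fstream>
#include <iterator>
#include <vector>

#include "schema_generated.h" // assumed output of: flatc --cpp schema.fbs

int main()
{
  // Read a serialized Model; "model.tflite" is a placeholder path.
  std::ifstream file("model.tflite", std::ios::binary);
  std::vector<char> buf((std::istreambuf_iterator<char>(file)),
                        std::istreambuf_iterator<char>());

  // GetModel is generated by flatc because of the root_type declaration above.
  const tflite::Model *model = tflite::GetModel(buf.data());
  std::printf("schema version: %u\n", model->version());

  // By convention, the 0th subgraph is the main model.
  const tflite::SubGraph *graph = model->subgraphs()->Get(0);
  for (const tflite::Operator *op : *graph->operators())
  {
    // opcode_index points into Model.operator_codes.
    const tflite::OperatorCode *code = model->operator_codes()->Get(op->opcode_index());
    std::printf("builtin_code=%d, %u inputs\n", static_cast<int>(code->builtin_code()),
                op->inputs()->size());
  }
  return 0;
}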
diff --git a/res/TensorFlowLiteSchema/SCHEMA.lst b/res/TensorFlowLiteSchema/SCHEMA.lst
index 2044e2e0c..73dfacd7b 100644
--- a/res/TensorFlowLiteSchema/SCHEMA.lst
+++ b/res/TensorFlowLiteSchema/SCHEMA.lst
@@ -5,3 +5,4 @@ VERSION,URL
2.1.0,https://raw.githubusercontent.com/tensorflow/tensorflow/v2.1.0/tensorflow/lite/schema/schema.fbs
2.2.0,https://raw.githubusercontent.com/tensorflow/tensorflow/v2.2.0/tensorflow/lite/schema/schema.fbs
2.3.0-rc0,https://raw.githubusercontent.com/tensorflow/tensorflow/v2.3.0-rc0/tensorflow/lite/schema/schema.fbs
+2.3.0,https://raw.githubusercontent.com/tensorflow/tensorflow/v2.3.0/tensorflow/lite/schema/schema.fbs
diff --git a/res/TensorFlowPythonExamples/examples/while_2/__init__.py b/res/TensorFlowPythonExamples/examples/while_2/__init__.py
new file mode 100644
index 000000000..af1c74582
--- /dev/null
+++ b/res/TensorFlowPythonExamples/examples/while_2/__init__.py
@@ -0,0 +1,32 @@
+import tensorflow as tf
+
+i = tf.constant(0, shape=[1, 0], dtype=tf.int32, name='i')
+x = tf.compat.v1.placeholder(shape=[1, 1], dtype=tf.int32, name='Hole')
+
+c = lambda i: tf.compat.v1.less(tf.compat.v1.size(i[0]), 10)
+b = lambda i: tf.concat([i, x], axis=1)
+
+# this loop changes i's shape from [1, 0] -> [1, 1] -> [1, 2] -> ... -> [1, 10]
+r = tf.compat.v1.while_loop(
+ c, b, [i], name="While", shape_invariants=[tf.TensorShape([1, None])])
+
+output = tf.compat.v1.identity(r, name="Output")
+
+# by adding the following code, [[1 1 1 1 1 1 1 1 1 1]] and (1, 10) will be printed
+#
+# import numpy as np
+# x_val = np.array([[1]])
+# with tf.compat.v1.Session() as sess:
+# result = sess.run(r, feed_dict={x:x_val})
+# print(result)
+# print(result.shape)
+
+# with TF 2.3, tf2tflite throws the following error
+#
+# Exception: venv/tf-2.3/lib/python3.6/site-packages/tensorflow/python/eager/lift_to_graph.py:339:0:
+# error: body function result type tensor<1x1xi32> is incompatible with result type tensor<1x0xi32>
+# at index 0
+# ...
+# note: see current operation: %1:2 = "tf.While"(%0, %arg0)
+# {body = @_functionalize_body_00, cond = @_functionalize_cond_00, device = "", is_stateless = false, output_shapes = [], parallel_iterations = 10 : i64}
+# : (tensor<1x0xi32>, tensor<1x1xi32>) -> (tensor<1x0xi32>, tensor<1x1xi32>)
diff --git a/res/TensorFlowPythonExamples/examples/while_3/__init__.py b/res/TensorFlowPythonExamples/examples/while_3/__init__.py
new file mode 100644
index 000000000..840846e7e
--- /dev/null
+++ b/res/TensorFlowPythonExamples/examples/while_3/__init__.py
@@ -0,0 +1,33 @@
+import tensorflow as tf
+
+x = tf.compat.v1.placeholder(shape=[1, None], dtype=tf.int32, name='Hole')
+i = tf.compat.v1.placeholder(shape=[1, None], dtype=tf.int32, name='Hole_2')
+
+
+def c(ii):
+ rs = tf.compat.v1.shape(ii)
+ r1 = rs[1]
+ return tf.compat.v1.less(r1, 10)
+
+
+def b(ii):
+ return tf.concat([ii, x], axis=1)
+
+
+# this loop changes i's shape from [1, 0] -> [1, 1] -> [1, 2] -> ... -> [1, 10]
+r = tf.compat.v1.while_loop(
+ c, b, [i], name="While", shape_invariants=[tf.TensorShape([1, None])])
+
+output = tf.compat.v1.identity(r, name="Output")
+
+# by adding the following code, [[123 1 2 3 1 2 3 1 2 3]] and (1, 10) will be printed
+#
+'''
+import numpy as np
+i_val = np.array([[123]], dtype=np.int32)
+x_val = np.array([[1, 2, 3]], dtype=np.int32)
+with tf.compat.v1.Session() as sess:
+ result = sess.run(r, feed_dict={x:x_val, i:i_val})
+ print(result)
+ print(result.shape)
+'''
diff --git a/res/TensorFlowPythonModels/examples/tconv-bn/__init__.py b/res/TensorFlowPythonModels/examples/tconv-bn/__init__.py
new file mode 100644
index 000000000..ae034e8bf
--- /dev/null
+++ b/res/TensorFlowPythonModels/examples/tconv-bn/__init__.py
@@ -0,0 +1,27 @@
+import tensorflow as tf
+import numpy as np
+
+input_ = tf.compat.v1.placeholder(tf.float32, shape=(1, 2, 2, 1), name="Hole")
+W = np.ones(9).reshape((3, 3, 1, 1))
+filter_ = tf.compat.v1.constant(W, dtype=tf.float32)
+tconv_ = tf.compat.v1.nn.conv2d_transpose(
+ input_, filter_, output_shape=(1, 4, 4, 1), strides=[1, 1, 1, 1], padding='VALID')
+
+scale_ = tf.compat.v1.constant([1.0177339315414429], dtype=tf.float32)
+offset_ = tf.compat.v1.constant([0.015628524124622345], dtype=tf.float32)
+mean_ = tf.compat.v1.constant([1.027155211195349693], dtype=tf.float32)
+variance_ = tf.compat.v1.constant([0.25580066442489624], dtype=tf.float32)
+bn_out, _, _ = tf.compat.v1.nn.fused_batch_norm(
+ tconv_,
+ scale_,
+ offset_,
+ mean=mean_,
+ variance=variance_,
+ epsilon=0.0010000000474974513,
+ is_training=False)
+'''
+python ../../compiler/tf2tfliteV2/tf2tfliteV2.py --v1 \
+-i tconv-bn.pbtxt \
+-o tconv-bn.tflite \
+-I Hole -O FusedBatchNorm
+'''
diff --git a/runtime/contrib/android/api/build.gradle b/runtime/contrib/android/api/build.gradle
index 70eb802ab..5c17043eb 100644
--- a/runtime/contrib/android/api/build.gradle
+++ b/runtime/contrib/android/api/build.gradle
@@ -8,7 +8,7 @@ android {
minSdkVersion 26
targetSdkVersion 29
versionCode 1
- versionName "1.7.0"
+ versionName "1.8.0"
externalNativeBuild {
ndkBuild {
diff --git a/runtime/contrib/android_benchmark_app/README.md b/runtime/contrib/android_benchmark_app/README.md
index ce165cd5d..19640e32f 100644
--- a/runtime/contrib/android_benchmark_app/README.md
+++ b/runtime/contrib/android_benchmark_app/README.md
@@ -28,7 +28,7 @@ make TARGET_OS=android \
EXT_ACL_FOLDER=/home/hanjoung/ws/temp/arm_compute-v19.05-bin-android/lib/android-arm64-v8a-neon-cl \
ANDROID_BUILD_TOOLS_DIR=/home/hanjoung/ws/android-tools/sdk/build-tools/27.0.3/ \
ANDROID_SDK_DIR=/home/hanjoung/ws/android-tools/sdk \
- TFLITE_MODEL_PATH=/Users/hanjoung/ws/ghent/STAR/nnfw/tests/scripts/framework/cache/MODELS/mobilenet/mobilenet_v1_0.25_128.tflite \
+ TFLITE_MODEL_PATH=/Users/hanjoung/ws/ghent/STAR/nnfw/tests/scripts/models/cache/MODELS/mobilenet/mobilenet_v1_0.25_128.tflite \
ANDROID_BOOST_ROOT=/home/hanjoung/ws/gh/moritz-wundke/Boost-for-Android/build/out/arm64-v8a
```
diff --git a/runtime/libs/benchmark/CMakeLists.txt b/runtime/libs/benchmark/CMakeLists.txt
index 2af0ffaa3..748b2d13f 100644
--- a/runtime/libs/benchmark/CMakeLists.txt
+++ b/runtime/libs/benchmark/CMakeLists.txt
@@ -1,6 +1,5 @@
file(GLOB_RECURSE SOURCES "src/*.cpp")
-add_library(nnfw_lib_benchmark SHARED ${SOURCES})
+add_library(nnfw_lib_benchmark STATIC ${SOURCES})
target_include_directories(nnfw_lib_benchmark PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include)
target_link_libraries(nnfw_lib_benchmark PRIVATE ${LIB_PTHREAD})
-install(TARGETS nnfw_lib_benchmark DESTINATION lib)
diff --git a/runtime/libs/benchmark/src/Result.cpp b/runtime/libs/benchmark/src/Result.cpp
index 7a3f9a572..df573da92 100644
--- a/runtime/libs/benchmark/src/Result.cpp
+++ b/runtime/libs/benchmark/src/Result.cpp
@@ -166,7 +166,7 @@ Result::Result(const Phases &phases)
if (option.memory)
{
print_memory = true;
- for (int i = PhaseEnum::MODEL_LOAD; i <= PhaseEnum::EXECUTE; ++i)
+ for (int i = PhaseEnum::MODEL_LOAD; i < PhaseEnum::EXECUTE; ++i)
{
auto phase = phases.at(gPhaseStrings[i]);
for (int j = MemoryType::RSS; j <= MemoryType::PSS; ++j)
diff --git a/runtime/libs/misc/include/misc/polymorphic_downcast.h b/runtime/libs/misc/include/misc/polymorphic_downcast.h
index ee885eb70..412b864e6 100644
--- a/runtime/libs/misc/include/misc/polymorphic_downcast.h
+++ b/runtime/libs/misc/include/misc/polymorphic_downcast.h
@@ -27,7 +27,9 @@ namespace misc
template <typename DstType, typename SrcType> inline DstType polymorphic_downcast(SrcType *x)
{
+#ifndef __ANDROID__
assert(dynamic_cast<DstType>(x) == x);
+#endif
return static_cast<DstType>(x);
}
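For illustration, a small usage sketch of this helper follows; the tensor types
are made up, and the enclosing namespace is assumed to be nnfw::misc per this
library's layout. Off Android, debug builds still verify the cast with
dynamic_cast, while Android and release builds pay only for the static_cast.

#include "misc/polymorphic_downcast.h"

struct ITensor { virtual ~ITensor() = default; };
struct CLTensor : ITensor { int handle = 42; };

int use(ITensor *base)
{
  // The caller guarantees that base really points at a CLTensor; the assert
  // inside polymorphic_downcast checks this in non-Android debug builds only.
  auto *cl = nnfw::misc::polymorphic_downcast<CLTensor *>(base);
  return cl->handle;
}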
diff --git a/runtime/nnapi-header/include/NeuralNetworksEx.h b/runtime/nnapi-header/include/NeuralNetworksEx.h
index 87f0e305f..d15262e17 100644
--- a/runtime/nnapi-header/include/NeuralNetworksEx.h
+++ b/runtime/nnapi-header/include/NeuralNetworksEx.h
@@ -558,7 +558,26 @@ typedef enum {
* Outputs:
* * 0: The sum, a tensor of the same type as input0.
*/
- ANEURALNETWORKS_ADDV2_EX = 50039
+ ANEURALNETWORKS_ADDV2_EX = 50039,
+
+ ANEURALNETWORKS_STATELESS_RANDOM_UNIFORM_EX = 50040,
+
+ /** Splits a tensor value into a list of sub tensors.
+ *
+ * Supported tensor {@link OperandCode}:
+ * * {@link ANEURALNETWORKS_TENSOR_FLOAT32}
+ * * {@link ANEURALNETWORKS_TENSOR_INT32}
+ *
+ * Supported tensor rank: up to 4
+ *
+ * Inputs:
+ * * 0: A tensor to split.
+ * * 1: A tensor containing the sizes of each output tensor along split_dim.
+ * * 2: The dimension along which to split.
+ *
+ * Outputs:
+ * * 0: Tensor objects resulting from splitting value.
+ */
+ ANEURALNETWORKS_SPLIT_V_EX = 50041
} OperationCodeEx; // extends OperationCode
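A hedged sketch of how the new code could be wired into a model through the
stock NNAPI model-building call; the operand indices, the two-way split, and the
helper name are illustrative, and the extension value is cast to
ANeuralNetworksOperationType as is usual for *_EX codes.

#include <cstdint>

#include "NeuralNetworksEx.h"

// Assumes operands 0..4 were added earlier with ANeuralNetworksModel_addOperand:
// 0: the tensor to split, 1: size_splits, 2: split_dim, 3..4: the outputs.
int add_split_v(ANeuralNetworksModel *model)
{
  const uint32_t inputs[3] = {0, 1, 2};
  const uint32_t outputs[2] = {3, 4};
  return ANeuralNetworksModel_addOperation(
      model, static_cast<ANeuralNetworksOperationType>(ANEURALNETWORKS_SPLIT_V_EX), 3, inputs, 2,
      outputs);
}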
diff --git a/runtime/onert/api/CMakeLists.txt b/runtime/onert/api/CMakeLists.txt
index 0cce3386a..49a5aa071 100644
--- a/runtime/onert/api/CMakeLists.txt
+++ b/runtime/onert/api/CMakeLists.txt
@@ -4,9 +4,9 @@ set(ONERT_DEV nnfw-dev)
add_library(${ONERT_DEV} SHARED ${API_SRC})
# Public headers to publish
-# nnfw_debug.h is header for runtime developer, so it will not be installed
-# But runtime developer can use nnfw_debug.h by linking nnfw-dev
-set(NNFW_API_HEADERS include/nnfw.h include/nnfw_dev.h)
+# nnfw_internal.h is header for runtime developer, so it will not be installed
+# But runtime developer can use nnfw_internal.h by linking nnfw-dev
+set(NNFW_API_HEADERS include/nnfw.h include/nnfw_experimental.h)
target_link_libraries(${ONERT_DEV} PUBLIC nnfw-nnapi-header)
target_link_libraries(${ONERT_DEV} PUBLIC onert_core)
diff --git a/runtime/onert/api/include/nnfw.h b/runtime/onert/api/include/nnfw.h
index 031aabd51..ef3678b0d 100644
--- a/runtime/onert/api/include/nnfw.h
+++ b/runtime/onert/api/include/nnfw.h
@@ -99,6 +99,10 @@ typedef enum {
NNFW_STATUS_ERROR = 1,
/** Unexpected null argument is given. */
NNFW_STATUS_UNEXPECTED_NULL = 2,
+ /** When a function is called that is not valid for the current session state. */
+ NNFW_STATUS_INVALID_STATE = 3,
+ /** When a memory allocation fails. */
+ NNFW_STATUS_OUT_OF_MEMORY = 4,
} NNFW_STATUS;
/**
@@ -432,10 +436,10 @@ NNFW_STATUS nnfw_output_tensorinfo(nnfw_session *session, uint32_t index,
*
 * <p>Supported backends differ on each platform.
* For example, `x86_64` supports "cpu" only.
- * Can set multiple backends by semicolon (ex: "acl_cl;cpu").
- * Among the multiple backends, the 1st element is used as default backend.</p>
- *
- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon", "srcn"
+ * Multiple backends can be set, separated by semicolons (e.g. "acl_cl;cpu").
+ * For each backend string, `libbackend_{backend}.so` will be dynamically loaded during
+ * {@link nnfw_prepare}.
+ * Among the multiple backends, the first element is used as the default backend.</p>
*
 * @param[in] session session to which available backends are set
 * @param[in] backends available backends that nnfw uses
@@ -449,12 +453,10 @@ NNFW_STATUS nnfw_set_available_backends(nnfw_session *session, const char *backe
*
* This function should be called before {@link nnfw_prepare} is invoked.
*
- * <p>Supported backends differs on each platforms.
- * For example, `x86_64` supports "cpu" only.
- * The backend for op has higher priority than available backends specified by
- * nnfw_set_available_backends.</p>
+ * <p>The backend for op has higher priority than available backends specified by
+ * {@link nnfw_set_available_backends}.</p>
*
- * @note Possible backend strings are: "cpu", "acl_cl", "acl_neon"
+ * @deprecated Deprecated since 1.8.0.
*
* @param[in] session session to be modified
* @param[in] op operation to be set
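A minimal end-to-end sketch of the backend-selection flow documented above; the
nnpackage path and backend list are illustrative and error handling is trimmed.

#include "nnfw.h"

int main()
{
  nnfw_session *session = nullptr;
  nnfw_create_session(&session);

  // "./nnpkg" is a placeholder for an actual nnpackage directory.
  nnfw_load_model_from_file(session, "./nnpkg");

  // acl_cl is tried first (the default backend); cpu is the fallback.
  // libbackend_acl_cl.so / libbackend_cpu.so are loaded during nnfw_prepare.
  nnfw_set_available_backends(session, "acl_cl;cpu");

  if (nnfw_prepare(session) != NNFW_STATUS_NO_ERROR)
    return 1;

  // ... set inputs/outputs, then nnfw_run(session) ...
  nnfw_close_session(session);
  return 0;
}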
diff --git a/runtime/onert/api/include/nnfw_dev.h b/runtime/onert/api/include/nnfw_experimental.h
index ecf0597cf..4cd5c585a 100644
--- a/runtime/onert/api/include/nnfw_dev.h
+++ b/runtime/onert/api/include/nnfw_experimental.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __NNFW_DEV_H__
-#define __NNFW_DEV_H__
+#ifndef __NNFW_EXPERIMENTAL_H__
+#define __NNFW_EXPERIMENTAL_H__
#include "nnfw.h"
@@ -62,4 +62,4 @@ typedef struct
NNFW_STATUS nnfw_register_custom_op_info(nnfw_session *session, const char *id,
custom_kernel_registration_info *info);
-#endif // __NNFW_DEV_H__
+#endif // __NNFW_EXPERIMENTAL_H__
diff --git a/runtime/onert/api/include/nnfw_debug.h b/runtime/onert/api/include/nnfw_internal.h
index 7af06a202..eb4b6d629 100644
--- a/runtime/onert/api/include/nnfw_debug.h
+++ b/runtime/onert/api/include/nnfw_internal.h
@@ -14,8 +14,8 @@
* limitations under the License.
*/
-#ifndef __NNFW_DEBUG_H__
-#define __NNFW_DEBUG_H__
+#ifndef __NNFW_INTERNAL_H__
+#define __NNFW_INTERNAL_H__
#include "nnfw.h"
@@ -23,4 +23,16 @@ NNFW_STATUS nnfw_set_config(nnfw_session *session, const char *key, const char *
NNFW_STATUS nnfw_get_config(nnfw_session *session, const char *key, char *value, size_t value_size);
-#endif // __NNFW_DEBUG_H__
+/**
+ * @brief Load a circle model from buffer.
+ *
+ * The buffer must outlive the session.
+ *
+ * @param[in] session session
+ * @param[in] buffer Pointer to the buffer
+ * @param[in] size Buffer size
+ * @return NNFW_STATUS
+ */
+NNFW_STATUS nnfw_load_circle_from_buffer(nnfw_session *session, uint8_t *buffer, size_t size);
+
+#endif // __NNFW_INTERNAL_H__
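A usage sketch of the new internal entry point; the helper name and file path
are illustrative. Note the lifetime rule documented above: the caller must keep
the bytes alive for the whole life of the session.

#include <fstream>
#include <iterator>
#include <vector>

#include "nnfw_internal.h"

// `buffer` must stay alive until the session is closed, per the doc above.
NNFW_STATUS load_circle(nnfw_session *session, std::vector<uint8_t> &buffer)
{
  std::ifstream file("model.circle", std::ios::binary); // placeholder path
  buffer.assign(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
  return nnfw_load_circle_from_buffer(session, buffer.data(), buffer.size());
}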
diff --git a/runtime/onert/api/include/nnfw_version.h b/runtime/onert/api/include/nnfw_version.h
index d7878708a..320271a26 100644
--- a/runtime/onert/api/include/nnfw_version.h
+++ b/runtime/onert/api/include/nnfw_version.h
@@ -21,6 +21,6 @@
* NNFW_VERSION is a uint32 value representing nnfw runtime version
* in 0xMMmmmmPP, where MM = major, mmmm = minor, PP = patch
*/
-#define NNFW_VERSION 0x01000700
+#define NNFW_VERSION 0x01000800
#endif // __NNFW_VERSION_H__
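As a worked decoding of the packed value: 0x01000800 splits into major 0x01,
minor 0x0008, patch 0x00, i.e. release 1.8.0. A small sketch:

#include <cstdint>
#include <cstdio>

int main()
{
  const uint32_t v = 0x01000800; // NNFW_VERSION after this patch
  std::printf("%u.%u.%u\n", v >> 24, (v >> 8) & 0xFFFF, v & 0xFF);
  // prints: 1.8.0
  return 0;
}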
diff --git a/runtime/onert/api/src/CustomKernel.h b/runtime/onert/api/src/CustomKernel.h
index b4fec87fd..a42f7a639 100644
--- a/runtime/onert/api/src/CustomKernel.h
+++ b/runtime/onert/api/src/CustomKernel.h
@@ -17,7 +17,7 @@
#ifndef __ONERT_BACKEND_CUSTOM_KERNEL_H__
#define __ONERT_BACKEND_CUSTOM_KERNEL_H__
-#include "nnfw_dev.h"
+#include "nnfw_experimental.h"
#include "backend/CustomKernelBuilder.h"
#include "exec/IFunction.h"
diff --git a/runtime/onert/api/src/nnfw_api.cc b/runtime/onert/api/src/nnfw_api.cc
index 074758374..d65158fd8 100644
--- a/runtime/onert/api/src/nnfw_api.cc
+++ b/runtime/onert/api/src/nnfw_api.cc
@@ -31,6 +31,8 @@ STATIC_ASSERT_ENUM_CHECK(NNFW_TYPE_TENSOR_INT64, 5);
STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_NO_ERROR, 0);
STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_ERROR, 1);
STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_UNEXPECTED_NULL, 2);
+STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_INVALID_STATE, 3);
+STATIC_ASSERT_ENUM_CHECK(NNFW_STATUS_OUT_OF_MEMORY, 4);
STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_NONE, 0);
STATIC_ASSERT_ENUM_CHECK(NNFW_LAYOUT_CHANNELS_LAST, 1);
@@ -57,8 +59,9 @@ NNFW_STATUS nnfw_create_session(nnfw_session **session)
{
NNFW_RETURN_ERROR_IF_NULL(session);
- *session = new nnfw_session();
-
+ *session = new (std::nothrow) nnfw_session();
+ if (*session == nullptr)
+ return NNFW_STATUS_OUT_OF_MEMORY;
return NNFW_STATUS_NO_ERROR;
}
@@ -338,3 +341,9 @@ NNFW_STATUS nnfw_query_info_u32(nnfw_session *session, NNFW_INFO_ID id, uint32_t
// It should not be reached.
return NNFW_STATUS_ERROR;
}
+
+NNFW_STATUS nnfw_load_circle_from_buffer(nnfw_session *session, uint8_t *buffer, size_t size)
+{
+ NNFW_RETURN_ERROR_IF_NULL(session);
+ return session->load_circle_from_buffer(buffer, size);
+}
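With the new status codes, API misuse is now distinguishable from genuine
failures. A hedged sketch of what a caller can do differently; the helper name
and retry policy are illustrative, not part of this patch.

#include "nnfw.h"

// Retry nnfw_run once if the session merely was not prepared yet.
NNFW_STATUS run_checked(nnfw_session *session)
{
  NNFW_STATUS status = nnfw_run(session);
  if (status == NNFW_STATUS_INVALID_STATE)
  {
    // Not prepared yet: prepare first, then run again.
    if (nnfw_prepare(session) != NNFW_STATUS_NO_ERROR)
      return NNFW_STATUS_ERROR;
    status = nnfw_run(session);
  }
  return status;
}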
diff --git a/runtime/onert/api/src/nnfw_api_internal.cc b/runtime/onert/api/src/nnfw_api_internal.cc
index d03ddd427..eb0b743d3 100644
--- a/runtime/onert/api/src/nnfw_api_internal.cc
+++ b/runtime/onert/api/src/nnfw_api_internal.cc
@@ -73,15 +73,33 @@ nnfw_session::nnfw_session()
nnfw_session::~nnfw_session() = default;
-NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir)
+NNFW_STATUS nnfw_session::load_circle_from_buffer(uint8_t *buffer, size_t size)
{
if (!isStateInitialized())
+ return NNFW_STATUS_INVALID_STATE;
+
+ if (!buffer)
+ return NNFW_STATUS_UNEXPECTED_NULL;
+
+ if (size == 0)
return NNFW_STATUS_ERROR;
+ _subgraphs = onert::circle_loader::loadModel(buffer, size);
+ _compiler = std::make_unique<onert::compiler::Compiler>(_subgraphs);
+
+ _state = State::MODEL_LOADED;
+ return NNFW_STATUS_NO_ERROR;
+}
+
+NNFW_STATUS nnfw_session::load_model_from_file(const char *package_dir)
+{
+ if (!isStateInitialized())
+ return NNFW_STATUS_INVALID_STATE;
+
if (!package_dir)
{
std::cerr << "package_dir is null." << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_UNEXPECTED_NULL;
}
if (!null_terminating(package_dir, MAX_PATH_LENGTH))
@@ -156,7 +174,7 @@ NNFW_STATUS nnfw_session::prepare()
std::cerr << "invalid state";
}
std::cerr << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
if (!_subgraphs || !primary_subgraph() || primary_subgraph()->isBuildingPhase())
@@ -188,7 +206,7 @@ NNFW_STATUS nnfw_session::run()
{
std::cerr << "Error during nnfw_session::run : "
<< "run should be run after prepare" << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
try
@@ -211,7 +229,7 @@ NNFW_STATUS nnfw_session::run_async()
{
std::cerr << "Error during nnfw_session::run_async : "
<< "run_async should be run after prepare" << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
_execution->startExecute();
@@ -241,7 +259,7 @@ NNFW_STATUS nnfw_session::set_input(uint32_t index, NNFW_TYPE /*type*/, const vo
if (!isStatePreparedOrFinishedRun())
{
std::cerr << "Error during nnfw_session::set_input : invalid state" << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
if (!buffer && length != 0)
@@ -270,7 +288,7 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b
if (!isStatePreparedOrFinishedRun())
{
std::cerr << "Error during nnfw_session::set_output : invalid state" << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
if (!buffer && length != 0)
@@ -296,14 +314,14 @@ NNFW_STATUS nnfw_session::set_output(uint32_t index, NNFW_TYPE /*type*/, void *b
NNFW_STATUS nnfw_session::input_size(uint32_t *number)
{
if (isStateInitialized()) // Model is not loaded
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
try
{
if (number == nullptr)
{
std::cerr << "Error during nnfw_session::input_size, number is null pointer." << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_UNEXPECTED_NULL;
}
*number = primary_subgraph()->getInputs().size();
}
@@ -318,14 +336,14 @@ NNFW_STATUS nnfw_session::input_size(uint32_t *number)
NNFW_STATUS nnfw_session::output_size(uint32_t *number)
{
if (isStateInitialized()) // Model is not loaded
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
try
{
if (number == nullptr)
{
std::cerr << "Error during nnfw_session::output_size, number is null pointer." << std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_UNEXPECTED_NULL;
}
*number = primary_subgraph()->getOutputs().size();
}
@@ -410,7 +428,7 @@ NNFW_STATUS nnfw_session::apply_tensorinfo(uint32_t index, nnfw_tensorinfo ti)
{
std::cerr << "Error during set_input_tensorinfo : should be run after load_model"
<< std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
}
if (ti.rank <= 0 || ti.rank > NNFW_MAX_RANK)
@@ -463,13 +481,16 @@ NNFW_STATUS nnfw_session::set_input_tensorinfo(uint32_t index, const nnfw_tensor
NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
{
+ if (isStateInitialized())
+ return NNFW_STATUS_INVALID_STATE;
+
try
{
if (ti == nullptr)
{
std::cerr << "Error during nnfw_session::input_tensorinfo, tensorinfo is null pointer."
<< std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_UNEXPECTED_NULL;
}
if (index >= primary_subgraph()->getInputs().size())
{
@@ -499,13 +520,13 @@ NNFW_STATUS nnfw_session::input_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
NNFW_STATUS nnfw_session::output_tensorinfo(uint32_t index, nnfw_tensorinfo *ti)
{
if (isStateInitialized())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
if (ti == nullptr)
{
std::cerr << "Error during nnfw_session::output_tensorinfo, tensorinfo is null pointer."
<< std::endl;
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_UNEXPECTED_NULL;
}
if (index >= primary_subgraph()->getOutputs().size())
@@ -570,14 +591,14 @@ static std::string get_op_backend_string(std::string op)
NNFW_STATUS nnfw_session::set_available_backends(const char *backends)
{
if (!isStateModelLoaded())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
try
{
- if (!backends || null_terminating(backends, MAX_BACKEND_NAME_LENGTH) == false)
- {
+ if (!backends)
+ return NNFW_STATUS_UNEXPECTED_NULL;
+ if (null_terminating(backends, MAX_BACKEND_NAME_LENGTH) == false)
return NNFW_STATUS_ERROR;
- }
auto &options = _compiler->options();
@@ -596,15 +617,15 @@ NNFW_STATUS nnfw_session::set_available_backends(const char *backends)
NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend)
{
if (!isStateModelLoaded())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
try
{
- if (!op || !null_terminating(op, MAX_OP_NAME_LENGTH) || !backend ||
+ if (!op || !backend)
+ return NNFW_STATUS_UNEXPECTED_NULL;
+ if (!null_terminating(op, MAX_OP_NAME_LENGTH) ||
!null_terminating(backend, MAX_BACKEND_NAME_LENGTH))
- {
return NNFW_STATUS_ERROR;
- }
auto key = get_op_backend_string(op);
@@ -627,7 +648,10 @@ NNFW_STATUS nnfw_session::set_op_backend(const char *op, const char *backend)
NNFW_STATUS nnfw_session::set_config(const char *key, const char *value)
{
if (!isStateModelLoaded())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
+
+ if (!key || !value)
+ return NNFW_STATUS_UNEXPECTED_NULL;
auto &options = _compiler->options();
@@ -693,7 +717,10 @@ onert::ir::Graph *nnfw_session::primary_subgraph()
NNFW_STATUS nnfw_session::get_config(const char *key, char *value, size_t value_size)
{
if (!isStateModelLoaded())
- return NNFW_STATUS_ERROR;
+ return NNFW_STATUS_INVALID_STATE;
+
+ if (!key || !value)
+ return NNFW_STATUS_UNEXPECTED_NULL;
auto &options = _compiler->options();
diff --git a/runtime/onert/api/src/nnfw_api_internal.h b/runtime/onert/api/src/nnfw_api_internal.h
index 1154f0471..1c3c3706f 100644
--- a/runtime/onert/api/src/nnfw_api_internal.h
+++ b/runtime/onert/api/src/nnfw_api_internal.h
@@ -18,7 +18,7 @@
#define __API_NNFW_API_INTERNAL_H__
#include "nnfw.h"
-#include "nnfw_dev.h"
+#include "nnfw_experimental.h"
#include <util/GeneralConfigSource.h>
@@ -127,9 +127,15 @@ public:
NNFW_STATUS set_available_backends(const char *backends);
NNFW_STATUS set_op_backend(const char *op, const char *backend);
+ //
+ // Internal-only API
+ //
+
NNFW_STATUS set_config(const char *key, const char *value);
NNFW_STATUS get_config(const char *key, char *value, size_t value_size);
+ NNFW_STATUS load_circle_from_buffer(uint8_t *buffer, size_t size);
+
private:
onert::ir::Graph *primary_subgraph();
bool isStateInitialized();
diff --git a/runtime/onert/backend/acl_cl/KernelGenerator.cc b/runtime/onert/backend/acl_cl/KernelGenerator.cc
index 3ca405899..a84f983b4 100644
--- a/runtime/onert/backend/acl_cl/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_cl/KernelGenerator.cc
@@ -31,6 +31,7 @@
#include "exec/FunctionSequence.h"
#include "util/logging.h"
#include "util/Utils.h"
+#include "AclKernelGen.h"
namespace onert
{
@@ -76,15 +77,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
const auto block_size_index{
node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
assert(_ctx.at(block_size_index).data());
auto fn = std::make_unique<::arm_compute::CLBatchToSpaceLayer>();
- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -96,15 +97,27 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- const auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
- ? arm_compute::SubDataType::BOOL
- : arm_compute::SubDataType::NONE;
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- auto fn = std::make_unique<::arm_compute::CLCast>();
+ std::unique_ptr<::arm_compute::IFunction> fn;
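+ // When the input and output data types already match, Cast degenerates into a
+ // plain element-wise copy (CLCopy); otherwise CLCast performs the conversion.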
+ if (ifm_tensor->data_type() == ofm_tensor->data_type())
+ {
+ auto l = std::make_unique<::arm_compute::CLCopy>();
+
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ fn = std::move(l);
+ }
+ else
+ {
+ auto l = std::make_unique<::arm_compute::CLCast>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);
+ // TODO Support converting float to int32 with round-down
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
+
+ fn = std::move(l);
+ }
auto acl_fn = asAclClFunction(std::move(fn));
@@ -132,10 +145,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
ker_width, ker_height);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -143,8 +156,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
auto fn = std::make_unique<::arm_compute::CLConvolutionLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
+ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(),
+ ::arm_compute::Size2D(1U, 1U), act_info);
_return_fn = asAclClFunction(std::move(fn));
}
@@ -171,10 +185,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto multiplier = node.param().multiplier;
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -182,8 +196,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
auto fn = std::make_unique<::arm_compute::CLDepthwiseConvolutionLayer>();
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
- ofm_alloc->handle(), conv_info, multiplier, act_info);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
+ ofm_tensor->handle(), conv_info, multiplier, act_info);
_return_fn = asAclClFunction(std::move(fn));
}
@@ -191,88 +205,28 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
{
- const auto ofm_index{node.getOutputs().at(0)};
- const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)};
-
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
+ auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::MAX);
- const auto kh = node.param().kh;
- const auto kw = node.param().kw;
- const auto stride = node.param().stride;
- const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto ofm_index{node.getOutputs().at(0)};
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
const auto activation = node.param().activation;
-
- VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
- VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
- VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
- VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
- VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
- VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
- VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
- VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
- VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
- VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
- VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
- VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
-
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
-
- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
- ::arm_compute::Size2D{kw, kh},
- acl_common::asPadStrideInfo(padding, stride)};
-
- auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
-
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
-
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(raw_fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
{
- const auto ofm_index{node.getOutputs().at(0)};
- const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)};
-
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
+ auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::AVG);
- const auto kh = node.param().kh;
- const auto kw = node.param().kw;
- const auto stride = node.param().stride;
- const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto ofm_index{node.getOutputs().at(0)};
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
const auto activation = node.param().activation;
-
- VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
- VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
- VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
- VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
- VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
- VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
- VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
- VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
- VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
- VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
- VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
- VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
-
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
-
- ::arm_compute::PoolingLayerInfo info{
- ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
-
- auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
-
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
-
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(raw_fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Concat &node)
@@ -296,7 +250,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
return;
}
- auto output_alloc = _tensor_builder->at(ofm_index).get();
+ auto output_tensor = _tensor_builder->at(ofm_index).get();
std::vector<::arm_compute::ICLTensor *> input_tensors;
for (auto &ifm_ind : input_indexes)
input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());
@@ -305,7 +259,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
if (input_indexes.size() < 2)
{
auto l = std::make_unique<::arm_compute::CLCopy>();
- l->configure(input_tensors.at(0), output_alloc->handle());
+ l->configure(input_tensors.at(0), output_tensor->handle());
fn = std::move(l);
}
else
@@ -313,10 +267,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
auto l = std::make_unique<::arm_compute::CLConcatenateLayer>();
const auto rank = _ctx.at(ofm_index).shape().rank();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
const auto fixed_axis =
acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
- l->configure(input_tensors, output_alloc->handle(), fixed_axis);
+ l->configure(input_tensors, output_tensor->handle(), fixed_axis);
fn = std::move(l);
}
@@ -327,75 +281,15 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
- using ir::operation::FullyConnected;
-
const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
-
- const auto input_rank = _ctx.at(input_index).shape().rank();
-
- const auto output_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
- UNUSED_RELEASE(output_size);
- assert(_ctx.at(bias_index).shape().dim(0) == output_size);
- assert(_ctx.at(weight_index).shape().dim(0) == output_size);
- const auto batch_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
- const auto input_size =
- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);
-
- // Check for reshaping input's shape into rank-2
- bool needs_reshape = false;
- ir::Shape reshape(2);
- if (input_rank == 3 || input_rank == 4)
- {
- const auto &ifm_shape = _ctx.at(input_index).shape();
- auto feature_size = 1;
- for (int i = 0; i < ifm_shape.rank(); ++i)
- {
- feature_size *= ifm_shape.dim(i);
- }
-
- UNUSED_RELEASE(feature_size);
- assert(feature_size == batch_size * input_size);
-
- // for reshaping
- needs_reshape = true;
- reshape.dim(0) = batch_size; /* H */
- reshape.dim(1) = input_size; /* W */
- }
-
+ auto output_tensor = _tensor_builder->at(output_index).get();
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->at(output_index).get();
- const auto input_alloc = _tensor_builder->at(input_index).get();
- const auto weight_alloc = _tensor_builder->at(weight_index).get();
- const auto bias_alloc = _tensor_builder->at(bias_index).get();
- const auto frontend_layout = _current_op_seq_layout;
- const auto acl_layout = output_alloc->handle()->info()->data_layout();
-
- auto fn = std::make_unique<arm_compute::CLFullyConnectedReshapingLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
-
- arm_compute::CLFullyConnectedReshapingLayer::KernelType kernel_type =
- arm_compute::CLFullyConnectedReshapingLayer::KernelType::GENERAL;
- if (_ctx.at(weight_index).isConstant())
- {
- kernel_type = arm_compute::CLFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS;
- assert(_ctx.at(weight_index).data());
- }
- fn->configure(
- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
- needs_reshape,
- ::onert::backend::acl_common::asTensorShape(
- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
- kernel_type);
-
+ auto fn = acl_common::kernelGenFullyConnected<acl_common::AclClFunction, ::arm_compute::ICLTensor,
+ ::arm_compute::CLFullyConnectedReshapingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)),
- ActivationBuilder::generate(activation, output_alloc->handle()));
+ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Mul &node)
@@ -406,17 +300,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLPixelWiseMultiplication>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Reduce &node)
@@ -427,14 +322,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto keep_dims{node.param().keep_dims};
const auto reduce_type = node.param().reduce_type;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
// Convert to ACL axes taking into account negative values and possible duplicates.
const auto &axes = _ctx.at(axes_index);
const auto input_rank = _ctx.at(input_index).shape().rank();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = input_alloc->layout();
+ const auto backend_layout = input_tensor->layout();
std::unique_ptr<arm_compute::IFunction> fn;
if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
@@ -443,7 +338,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto acl_axes =
acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
- l->configure(input_alloc->handle(), acl_axes, keep_dims, output_alloc->handle());
+ l->configure(input_tensor->handle(), acl_axes, keep_dims, output_tensor->handle());
fn = std::move(l);
}
@@ -453,7 +348,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
const auto acl_axes = acl_common::asSet(axes, input_rank, frontend_layout, backend_layout);
- l->configure(input_alloc->handle(), output_alloc->handle(), acl_axes, keep_dims,
+ l->configure(input_tensor->handle(), output_tensor->handle(), acl_axes, keep_dims,
acl_common::convertReduceType(reduce_type));
fn = std::move(l);
@@ -469,13 +364,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
// NOTE This operation must not be changed the layout from frontend to backend
// So, PermutationOperationPass makes layouts of frontend and backend the same.
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
frontend_layout == backend_layout);
UNUSED_RELEASE(frontend_layout);
@@ -483,7 +378,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -503,10 +398,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
(void)dims;
(void)ndim;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::CLReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
_return_fn = std::move(acl_fn);
}
@@ -516,15 +411,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::CLActivationLayer>();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -538,13 +433,13 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
const auto beta = node.param().beta;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::CLSoftmaxLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), beta);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -558,10 +453,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
- auto outputData_alloc = _tensor_builder->at(output_index).get();
- auto inputData_alloc = _tensor_builder->at(input_index).get();
+ auto outputData_tensor = _tensor_builder->at(output_index).get();
+ auto inputData_tensor = _tensor_builder->at(input_index).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = inputData_alloc->layout();
+ const auto backend_layout = inputData_tensor->layout();
// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -613,7 +508,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
auto fn = std::make_unique<::arm_compute::CLSlice>();
- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);
+ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -628,10 +523,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
- auto outputData_alloc = _tensor_builder->at(output_index).get();
- auto inputData_alloc = _tensor_builder->at(input_index).get();
+ auto outputData_tensor = _tensor_builder->at(output_index).get();
+ auto inputData_tensor = _tensor_builder->at(input_index).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = inputData_alloc->layout();
+ const auto backend_layout = inputData_tensor->layout();
// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -704,7 +599,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
auto fn = std::make_unique<::arm_compute::CLStridedSlice>();
- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
+ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set,
strides_set, begin_mask, end_mask, shrink_axis_mask);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -720,10 +615,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
const auto rank = _ctx.at(ifm_idx).shape().rank();
- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = ifm_alloc->layout();
+ const auto backend_layout = ifm_tensor->layout();
std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
// Reversed
@@ -732,7 +627,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
auto fn = std::make_unique<::arm_compute::CLPermute>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -747,17 +642,18 @@ void KernelGenerator::visit(const ir::operation::Add &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLArithmeticAddition>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
arm_compute::ConvertPolicy::SATURATE);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Sub &node)
@@ -768,17 +664,18 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLArithmeticSubtraction>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
arm_compute::ConvertPolicy::SATURATE);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Div &node)
@@ -789,16 +686,17 @@ void KernelGenerator::visit(const ir::operation::Div &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLArithmeticDivision>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Exp &node)
@@ -806,12 +704,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::CLExpLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -823,12 +721,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::CLReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -842,20 +740,21 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto gamma_alloc = _tensor_builder->at(gamma_index).get();
- auto beta_alloc = _tensor_builder->at(beta_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto gamma_tensor = _tensor_builder->at(gamma_index).get();
+ auto beta_tensor = _tensor_builder->at(beta_index).get();
auto epsilon = node.param().epsilon;
auto activation = node.param().activation;
auto fn = std::make_unique<::arm_compute::CLInstanceNormalizationLayerEx>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
- beta_alloc->handle(), epsilon);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(),
+ beta_tensor->handle(), epsilon);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Logistic &node)
@@ -863,15 +762,15 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -884,13 +783,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::CLBinaryLogicalOp>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
::arm_compute::BinaryLogicalOperation::AND);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -900,159 +799,8 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
- // TODO Support dynamic rnn
- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
- const auto scratch_buffer_index{
- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
- const auto output_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
- const auto cell_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
-
- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
- const auto input_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
- const auto input_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
- const auto input_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
- const auto input_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
- const auto recurrent_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
- const auto recurrent_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
- const auto recurrent_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
- const auto recurrent_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
- const auto cell_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
- const auto cell_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
- const auto cell_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
- const auto input_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
- const auto forget_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
- const auto output_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
- const auto projection_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
- const auto projection_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
- const auto output_state_in_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
- const auto cell_threshold = node.param().cell_threshold;
- const auto projection_threshold = node.param().projection_threshold;
-
- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
- bool has_recurrent_to_input_weights =
- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
- _ctx.at(projection_weights_index).shape().dim(1) != 0;
- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
-
- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
- // true: no CIFG
- // false: CIFG
- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
-
- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
- // But the cell_to_input_weights does not exist in regular CIFG although peephole.
- // true: peephole
- // false: no peephole
- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
-
- // NOTE Although the projection weights has data the projection bias may not have data.
- bool has_projection_param = has_projection_weights;
-
- const auto activation = node.param().activation;
- const auto cell_clip = cell_threshold;
- const auto projection_clip = projection_threshold;
- assert(cell_clip >= 0.f && projection_clip >= 0.f);
-
- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
- auto output_alloc = _tensor_builder->at(output_index).get();
-
- auto input_alloc = _tensor_builder->at(input_index).get();
-
- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
- auto recurrent_to_forget_weights_alloc =
- _tensor_builder->at(recurrent_to_forget_weights_index).get();
- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
- auto recurrent_to_output_weights_alloc =
- _tensor_builder->at(recurrent_to_output_weights_index).get();
-
- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
-
- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
-
- auto fn = std::make_unique<::arm_compute::CLLSTMLayer>();
-
- ::arm_compute::LSTMParams<::arm_compute::ICLTensor> lstm_params{};
- if (has_cifg_param)
- {
- auto input_to_input_weights_alloc =
- _tensor_builder->at(input_to_input_weights_index).get(); // optional
- auto recurrent_to_input_weights_alloc =
- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
- auto cell_to_input_weights_handle =
- has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
- : nullptr; // optional (non-cifg && peephole)
- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
- recurrent_to_input_weights_alloc->handle(),
- cell_to_input_weights_handle, input_gate_bias_alloc->handle());
- }
- if (has_peephole_param)
- {
- auto cell_to_forget_weights_alloc =
- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
- auto cell_to_output_weights_alloc =
- _tensor_builder->at(cell_to_output_weights_index).get(); // optional
- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
- cell_to_output_weights_alloc->handle());
- }
- if (has_projection_param)
- {
- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
- auto projection_bias_handle = has_projection_bias
- ? _tensor_builder->at(projection_bias_index).get()->handle()
- : nullptr; // optional
- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
- }
-
- fn->configure(
- input_alloc->handle(), input_to_forget_weights_alloc->handle(),
- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
- lstm_params, act_info, cell_clip, projection_clip);
-
- auto acl_fn = asAclClFunction(std::move(fn));
-
- _return_fn = std::move(acl_fn);
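+ // Delegate to the shared acl_common helper; the CL backend supplies its function
+ // wrapper, tensor interface and ACL layer type as template arguments.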
+ _return_fn = acl_common::kernelGenLSTM<acl_common::AclClFunction, ::arm_compute::ICLTensor,
+ ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_builder);
}
void KernelGenerator::visit(const ir::operation::Comparison &node)
@@ -1063,13 +811,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
const auto comparison_type = node.param().comparison_type;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::CLComparison>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
(arm_compute::ComparisonOperation)comparison_type);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1107,13 +855,13 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
for (const auto &input_index : input_indexes)
{
size_t input_rank = _ctx.at(input_index).shape().rank();
- const auto &input_alloc = _tensor_builder->at(input_index);
- orig_inputs_acl_tensor_shapes.emplace_back(input_alloc->info()->tensor_shape());
- assert(input_rank == input_alloc->num_dimensions());
- if (input_rank != input_alloc->info()->num_dimensions())
+ const auto &input_tensor = _tensor_builder->at(input_index);
+ orig_inputs_acl_tensor_shapes.emplace_back(input_tensor->info()->tensor_shape());
+ assert(input_rank == input_tensor->num_dimensions());
+ if (input_rank != input_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
}
}
@@ -1135,8 +883,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
const auto ofm_idx{node.getOutputs().at(0)};
const auto ifm_idx{node.getInputs().at(0)};
const auto permute_type = node.getPermuteType();
- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
const auto rank = _ctx.at(ofm_idx).shape().rank();
assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
@@ -1149,7 +897,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
auto l = std::make_unique<::arm_compute::CLPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
fn = std::move(l);
}
@@ -1160,7 +908,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
auto l = std::make_unique<::arm_compute::CLPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
fn = std::move(l);
}
@@ -1168,7 +916,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
{
auto l = std::make_unique<::arm_compute::CLCopy>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
fn = std::move(l);
}
@@ -1183,12 +931,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLRsqrtLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclClFunction(std::move(fn));
}
@@ -1198,15 +946,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::CLActivationLayer>();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1219,12 +967,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLScale>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(),
::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
@@ -1238,15 +986,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1258,15 +1006,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1288,25 +1036,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
- auto weights_alloc = _tensor_builder->at(weights_index).get();
- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
+ auto weights_tensor = _tensor_builder->at(weights_index).get();
+ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
+ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get();
auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
auto copy_layer = std::make_unique<::arm_compute::CLCopy>();
- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
+ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
_return_fn = asAclClFunction(std::move(copy_layer));
- auto fn = std::make_unique<::arm_compute::CLRNNLayerEx>(
+ auto fn = std::make_unique<::arm_compute::CLRNNLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
- act_info);
+ fn->configure(input_tensor->handle(), weights_tensor->handle(),
+ recurrent_weights_tensor->handle(), bias_tensor->handle(),
+ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
_return_fn = asAclClFunction(std::move(fn));
}
@@ -1315,12 +1063,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLFloor>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1335,10 +1083,10 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
- auto paddings_alloc = _tensor_builder->at(paddings_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
+ auto paddings_tensor = _tensor_builder->at(paddings_index).get();
assert(_ctx.at(block_size_index).data());
assert(_ctx.at(paddings_index).data());
@@ -1346,8 +1094,8 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
std::unique_ptr<::arm_compute::IFunction> fn;
auto l = std::make_unique<::arm_compute::CLSpaceToBatchLayer>();
- l->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
- ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
+ ofm_tensor->handle());
fn = std::move(l);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1362,12 +1110,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
auto block_size = node.param().block_size;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- auto fn = std::make_unique<::arm_compute::CLSpaceToDepth>();
+ auto fn = std::make_unique<::arm_compute::CLSpaceToDepthLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1376,32 +1124,15 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
{
- const auto ofm_index{node.getOutputs().at(0)};
- const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)};
+ auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::CLPoolingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::L2);
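+ // kernelGenPool2D computes the padding/stride from the node parameters and
+ // configures the pooling layer; only the fused activation is appended here.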
- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
-
- uint32_t kw = node.param().kw;
- uint32_t kh = node.param().kh;
- const auto stride = node.param().stride;
- const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto ofm_index{node.getOutputs().at(0)};
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
const auto activation = node.param().activation;
-
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
-
- ::arm_compute::PoolingLayerInfo info{
- ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
- ::onert::backend::acl_common::asPadStrideInfo(padding, stride)};
-
- auto fn = std::make_unique<::arm_compute::CLPoolingLayer>();
-
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
-
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclClFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclClFunction(std::move(raw_fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
@@ -1410,13 +1141,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
- auto values_alloc = _tensor_builder->at(values_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
+ auto values_tensor = _tensor_builder->at(values_index).get();
auto fn = std::make_unique<::arm_compute::CLEmbeddingLookup>();
- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
+ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1442,15 +1173,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
float bias = 0.0f; // Don't offset the reduction.
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
radius, alpha, beta, bias, false);
auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1466,17 +1197,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto hits_alloc = _tensor_builder->at(hits_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto hits_tensor = _tensor_builder->at(hits_index).get();
- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
- auto keys_alloc = _tensor_builder->at(keys_index).get();
- auto values_alloc = _tensor_builder->at(values_index).get();
+ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
+ auto keys_tensor = _tensor_builder->at(keys_index).get();
+ auto values_tensor = _tensor_builder->at(values_index).get();
auto fn = std::make_unique<::arm_compute::CLHashtableLookup>();
- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
- output_alloc->handle(), hits_alloc->handle());
+ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
+ output_tensor->handle(), hits_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1489,13 +1220,13 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto alpha_alloc = _tensor_builder->at(alpha_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto alpha_tensor = _tensor_builder->at(alpha_index).get();
- auto fn = std::make_unique<::arm_compute::CLPReLU>();
+ auto fn = std::make_unique<::arm_compute::CLPReluLayer>();
- fn->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1518,7 +1249,6 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
(node.param().padding.type == ir::PaddingType::VALID));
auto padding = ir::calculatePadding(node.param().padding, ofm_shape, ifm_shape, stride,
ker_shape.W, ker_shape.H);
-
uint32_t invalid_horizontal = 0;
uint32_t invalid_vertical = 0;
if (node.param().padding.type == ir::PaddingType::VALID)
@@ -1528,17 +1258,17 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
}
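+ // invalid_horizontal/vertical measure how far the requested output extends beyond
+ // what a VALID transposed convolution would produce; the margins are passed to the layer below.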
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
auto fn = std::make_unique<::arm_compute::CLTransposeConvLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
- invalid_horizontal, invalid_vertical);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(),
+ tconv_info, invalid_horizontal, invalid_vertical);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1550,15 +1280,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1571,13 +1301,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::CLBitwiseOr>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1589,12 +1319,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::CLBitwiseNot>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1607,13 +1337,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLElementwiseSquaredDiff>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1634,13 +1364,13 @@ void KernelGenerator::visit(const ir::operation::TopKV2 &node)
const auto k = node.param().k;
- auto values_alloc = _tensor_builder->at(outputValues_index).get();
- auto indices_alloc = _tensor_builder->at(outputIndices_index).get();
- auto input_alloc = _tensor_builder->at(inputData_index).get();
+ auto values_tensor = _tensor_builder->at(outputValues_index).get();
+ auto indices_tensor = _tensor_builder->at(outputIndices_index).get();
+ auto input_tensor = _tensor_builder->at(inputData_index).get();
auto fn = std::make_unique<::arm_compute::CLTopKV2>();
- fn->configure(input_alloc->handle(), k, values_alloc->handle(), indices_alloc->handle());
+ fn->configure(input_tensor->handle(), k, values_tensor->handle(), indices_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1659,9 +1389,9 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
const auto axis_value = (axis_raw < 0 ? (ifm_rank + axis_raw) : axis_raw);
const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto indices_alloc = _tensor_builder->at(indices_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto indices_tensor = _tensor_builder->at(indices_index).get();
// NOTE The frontend layout and backend layout must be the same for this operation.
// If not the same, we have to add a stage(?) to perform permutation of output tensor. It
@@ -1671,43 +1401,43 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
// a model. For example, if a model in NHWC has this operation as output rank == 4, indices
// rank == 2 and axis == 2, this operation should work as the axis W and C, but the axis W
// and C are not sequential in NCHW. So the backend in NCHW cannot handle this case.
- const auto backend_layout = ofm_alloc->layout();
+ const auto backend_layout = ofm_tensor->layout();
UNUSED_RELEASE(backend_layout);
- assert(backend_layout == ifm_alloc->layout());
- assert(backend_layout == indices_alloc->layout());
+ assert(backend_layout == ifm_tensor->layout());
+ assert(backend_layout == indices_tensor->layout());
assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
auto fn = std::make_unique<::arm_compute::CLGatherEx>();
// input is n-D, indices k-D, output is (n + k - 1)-D
size_t n = ifm_rank;
- assert(n == ifm_alloc->num_dimensions());
+ assert(n == ifm_tensor->num_dimensions());
size_t k = _ctx.at(indices_index).shape().rank();
- assert(k == indices_alloc->num_dimensions());
+ assert(k == indices_tensor->num_dimensions());
// Disable applied dim_correction
- const auto orig_ifm_acl_tensor_shape = ifm_alloc->info()->tensor_shape();
- if (n != ifm_alloc->info()->num_dimensions())
+ const auto orig_ifm_acl_tensor_shape = ifm_tensor->info()->tensor_shape();
+ if (n != ifm_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
const auto ifm = _ctx.at(ifm_index);
- ifm_alloc->info()->set_tensor_shape(
+ ifm_tensor->info()->set_tensor_shape(
acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
}
- const auto orig_indice_acl_tensor_shape = indices_alloc->info()->tensor_shape();
- if (k != indices_alloc->info()->num_dimensions())
+ const auto orig_indice_acl_tensor_shape = indices_tensor->info()->tensor_shape();
+ if (k != indices_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and indices tensor is applied dim_correction
const auto indices = _ctx.at(indices_index);
- indices_alloc->info()->set_tensor_shape(
+ indices_tensor->info()->set_tensor_shape(
acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
}
- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
+ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
// Revert disabling applied dim_correction
- ifm_alloc->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
- indices_alloc->info()->set_tensor_shape(orig_indice_acl_tensor_shape);
+ ifm_tensor->info()->set_tensor_shape(orig_ifm_acl_tensor_shape);
+ indices_tensor->info()->set_tensor_shape(orig_indice_acl_tensor_shape);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1719,12 +1449,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLNeg>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1736,15 +1466,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
auto fn = std::make_unique<::arm_compute::CLActivationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1761,11 +1491,11 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
assert((ifm_shape.rank() - 1) == ofm_shape.rank());
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
auto frontend_layout = _current_op_seq_layout;
- auto backend_layout = ifm_alloc->layout();
+ auto backend_layout = ifm_tensor->layout();
int axis_value = node.param().axis;
if (axis_value < 0)
@@ -1776,10 +1506,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
auto acl_axis =
acl_common::ToARMComputeAxis(ifm_rank, axis_value, frontend_layout, backend_layout).value();
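+ // Replace the Ex-only CLArgOperation with the stock CLArgMinMaxLayer, which takes a
+ // single reduction axis and expresses argmax as ReductionOperation::ARG_IDX_MAX.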
- auto fn = std::make_unique<::arm_compute::CLArgOperation>();
+ auto fn = std::make_unique<::arm_compute::CLArgMinMaxLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), {acl_axis},
- ::arm_compute::ArgOperation::MAX);
+ fn->configure(ifm_tensor->handle(), acl_axis, ofm_tensor->handle(),
+ ::arm_compute::ReductionOperation::ARG_IDX_MAX);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1791,12 +1521,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
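+ // CLDequantizationLayer reads scale/zero-point from the input tensor's quantization
+ // info, so the SubDataType argument of the former CLCast call is no longer needed.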
- auto fn = std::make_unique<::arm_compute::CLCast>();
+ auto fn = std::make_unique<::arm_compute::CLDequantizationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), arm_compute::SubDataType::NONE);
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1814,15 +1544,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
auto beta = node.param().beta;
auto bias = node.param().bias;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto norm_info = ::arm_compute::NormalizationLayerInfo(
::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
auto fn = std::make_unique<::arm_compute::CLNormalizationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1837,12 +1567,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
auto block_size = node.param().block_size;
assert(block_size > 0);
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
- auto fn = std::make_unique<::arm_compute::CLDepthToSpace>();
+ auto fn = std::make_unique<::arm_compute::CLDepthToSpaceLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -1860,13 +1590,13 @@ void KernelGenerator::visit(const ir::operation::Split &node)
for (const auto &output : node.getOutputs())
output_indexes.emplace_back(output);
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- std::vector<arm_compute::ICLTensor *> output_allocs;
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ std::vector<arm_compute::ICLTensor *> output_tensors;
for (const auto &ofm_ind : output_indexes)
- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
+ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = ifm_alloc->layout();
+ const auto backend_layout = ifm_tensor->layout();
auto axis = node.param().axis;
if (axis < 0)
axis += ifm_rank;
@@ -1874,7 +1604,7 @@ void KernelGenerator::visit(const ir::operation::Split &node)
auto fn = std::make_unique<::arm_compute::CLSplit>();
- fn->configure(ifm_alloc->handle(), output_allocs, axis);
+ fn->configure(ifm_tensor->handle(), output_tensors, axis);
_return_fn = asAclClFunction(std::move(fn));
}
@@ -1906,13 +1636,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
for (const auto &output_index : output_indexes)
{
size_t output_rank = _ctx.at(output_index).shape().rank();
- const auto &output_alloc = _tensor_builder->at(output_index);
- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape());
- assert(output_rank == output_alloc->num_dimensions());
- if (output_rank != output_alloc->info()->num_dimensions())
+ const auto &output_tensor = _tensor_builder->at(output_index);
+ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
+ assert(output_rank == output_tensor->num_dimensions());
+ if (output_rank != output_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
}
}
@@ -1959,12 +1689,12 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
// Disable applied dim_correction
size_t input_rank = _ctx.at(input_index).shape().rank();
- const auto &input_alloc = _tensor_builder->at(input_index);
- assert(input_rank == input_alloc->num_dimensions());
- if (input_rank != input_alloc->info()->num_dimensions())
+ const auto &input_tensor = _tensor_builder->at(input_index);
+ assert(input_rank == input_tensor->num_dimensions());
+ if (input_rank != input_tensor->info()->num_dimensions())
{
// This means that high dimension's value is 1 and ifm tensor is applied dim_correction
- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(input_index).shape(), frontend_layout, backend_layout, false));
}
@@ -1982,13 +1712,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLElementwiseMin>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -2001,13 +1731,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::CLElementwiseMax>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclClFunction(std::move(fn));
@@ -2019,12 +1749,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp32ToFp16 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp32ToFp16::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE,
0);
auto acl_fn = asAclClFunction(std::move(fn));
@@ -2037,12 +1767,12 @@ void KernelGenerator::visit(const ir::operation::ConvertFp16ToFp32 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ConvertFp16ToFp32::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::CLDepthConvertLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), ::arm_compute::ConvertPolicy::SATURATE,
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), ::arm_compute::ConvertPolicy::SATURATE,
0);
auto acl_fn = asAclClFunction(std::move(fn));
diff --git a/runtime/onert/backend/acl_common/AclKernelGen.h b/runtime/onert/backend/acl_common/AclKernelGen.h
new file mode 100644
index 000000000..9f7ce3764
--- /dev/null
+++ b/runtime/onert/backend/acl_common/AclKernelGen.h
@@ -0,0 +1,315 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
+#define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
+
+#include <exec/IFunction.h>
+#include <ir/Operands.h>
+
+#include <ir/operation/LSTM.h>
+#include <arm_compute/runtime/CL/CLFunctions.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace acl_common
+{
+
+template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer,
+ typename T_TensorBuilder>
+std::unique_ptr<exec::IFunction>
+kernelGenLSTM(const ir::operation::LSTM &node, const ir::Operands &operands,
+ const std::shared_ptr<T_TensorBuilder> &tensor_builder)
+{
+ // TODO Support dynamic RNN
+ // TODO Fix a subtle error in the non-CIFG, non-peephole, no-projection case.
+ const auto scratch_buffer_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
+ const auto output_state_out_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
+ const auto cell_state_out_index{
+ node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
+ const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
+
+ const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
+ const auto input_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
+ const auto input_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
+ const auto input_to_cell_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
+ const auto input_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
+ const auto recurrent_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
+ const auto recurrent_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
+ const auto recurrent_to_cell_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
+ const auto recurrent_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
+ const auto cell_to_input_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
+ const auto cell_to_forget_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
+ const auto cell_to_output_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
+ const auto input_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
+ const auto forget_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
+ const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
+ const auto output_gate_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
+ const auto projection_weights_index{
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
+ const auto projection_bias_index{
+ node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
+ const auto output_state_in_index{
+ node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
+ const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
+ const auto cell_threshold = node.param().cell_threshold;
+ const auto projection_threshold = node.param().projection_threshold;
+
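+ // An omitted optional input is modeled as an operand with zero-sized dimensions,
+ // so a nonzero dimension indicates that the corresponding input actually exists.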
+ bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 &&
+ operands.at(input_to_input_weights_index).shape().dim(1) != 0;
+ bool has_recurrent_to_input_weights =
+ operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
+ operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
+ bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0;
+ bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0;
+ bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 &&
+ operands.at(projection_weights_index).shape().dim(1) != 0;
+ bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0) != 0;
+
+ // NOTE input_to_input_weights and recurrent_to_input_weights do not exist in CIFG.
+ // true: no CIFG
+ // false: CIFG
+ // NOTE cell_to_input_weights does not exist in the non-peephole case, even for a regular (non-CIFG) LSTM.
+ bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
+
+ // NOTE cell_to_forget_weights and cell_to_output_weights exist in the peephole case.
+ // However, cell_to_input_weights does not exist in CIFG even with peephole.
+ // true: peephole
+ // false: no peephole
+ bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
+
+ // NOTE Even when the projection weights have data, the projection bias may not.
+ bool has_projection_param = has_projection_weights;
+
+ const auto activation = node.param().activation;
+ const auto cell_clip = cell_threshold;
+ const auto projection_clip = projection_threshold;
+ assert(cell_clip >= 0.f && projection_clip >= 0.f);
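+ // A threshold of 0 is taken to mean "no clipping"; ACL applies the bounded
+ // activation only for nonzero thresholds.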
+
+ auto scratch_buffer_tensor = tensor_builder->at(scratch_buffer_index).get();
+ auto output_state_out_tensor = tensor_builder->at(output_state_out_index).get();
+ auto cell_state_out_tensor = tensor_builder->at(cell_state_out_index).get();
+ auto output_tensor = tensor_builder->at(output_index).get();
+
+ auto input_tensor = tensor_builder->at(input_index).get();
+
+ auto input_to_forget_weights_tensor = tensor_builder->at(input_to_forget_weights_index).get();
+ auto input_to_cell_weights_tensor = tensor_builder->at(input_to_cell_weights_index).get();
+ auto input_to_output_weights_tensor = tensor_builder->at(input_to_output_weights_index).get();
+ auto recurrent_to_forget_weights_tensor =
+ tensor_builder->at(recurrent_to_forget_weights_index).get();
+ auto recurrent_to_cell_weights_tensor = tensor_builder->at(recurrent_to_cell_weights_index).get();
+ auto recurrent_to_output_weights_tensor =
+ tensor_builder->at(recurrent_to_output_weights_index).get();
+
+ auto forget_gate_bias_tensor = tensor_builder->at(forget_gate_bias_index).get();
+ auto cell_bias_tensor = tensor_builder->at(cell_bias_index).get();
+ auto output_gate_bias_tensor = tensor_builder->at(output_gate_bias_index).get();
+ auto output_state_in_tensor = tensor_builder->at(output_state_in_index).get();
+ auto cell_state_in_tensor = tensor_builder->at(cell_state_in_index).get();
+
+ auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
+
+ auto fn = std::make_unique<T_ACLLayer>();
+
+ ::arm_compute::LSTMParams<T_Tensor> lstm_params{};
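+ // ACL infers the LSTM variant (CIFG / peephole / projection) from which of the
+ // optional parameter setters below are invoked on lstm_params.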
+ if (has_cifg_param)
+ {
+ auto input_to_input_weights_tensor =
+ tensor_builder->at(input_to_input_weights_index).get(); // optional
+ auto recurrent_to_input_weights_tensor =
+ tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
+ auto cell_to_input_weights_handle =
+ has_peephole_param ? tensor_builder->at(cell_to_input_weights_index).get()->handle()
+ : nullptr; // optional (non-cifg && peephole)
+ auto input_gate_bias_tensor = tensor_builder->at(input_gate_bias_index).get(); // optional
+ lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(),
+ recurrent_to_input_weights_tensor->handle(),
+ cell_to_input_weights_handle, input_gate_bias_tensor->handle());
+ }
+ if (has_peephole_param)
+ {
+ auto cell_to_forget_weights_tensor =
+ tensor_builder->at(cell_to_forget_weights_index).get(); // optional
+ auto cell_to_output_weights_tensor =
+ tensor_builder->at(cell_to_output_weights_index).get(); // optional
+ lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(),
+ cell_to_output_weights_tensor->handle());
+ }
+ if (has_projection_param)
+ {
+ auto projection_weights_tensor = tensor_builder->at(projection_weights_index).get(); // optional
+ auto projection_bias_handle = has_projection_bias
+ ? tensor_builder->at(projection_bias_index).get()->handle()
+ : nullptr; // optional
+ lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle);
+ }
+
+ fn->configure(input_tensor->handle(), input_to_forget_weights_tensor->handle(),
+ input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(),
+ recurrent_to_forget_weights_tensor->handle(),
+ recurrent_to_cell_weights_tensor->handle(),
+ recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(),
+ cell_bias_tensor->handle(), output_gate_bias_tensor->handle(),
+ output_state_in_tensor->handle(), cell_state_in_tensor->handle(),
+ scratch_buffer_tensor->handle(), output_state_out_tensor->handle(),
+ cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info,
+ cell_clip, projection_clip);
+
+ return std::make_unique<T_FunctionWrapper>(std::move(fn));
+}
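[Editor's note] For orientation, this shared generator is meant to be instantiated once per backend; a minimal sketch of the NEON instantiation, mirroring the acl_neon call that appears later in this diff:

// Sketch only: how acl_neon's KernelGenerator::visit(LSTM) delegates to the
// shared template (see the corresponding hunk in KernelGenerator.cc below).
_return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
                                       ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_builder);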
+
+template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer,
+ typename T_TensorBuilder>
+std::unique_ptr<exec::IFunction>
+kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands,
+ const std::shared_ptr<T_TensorBuilder> &tensor_builder, ir::Layout layout)
+{
+ using ir::operation::FullyConnected;
+
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
+ const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
+ const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
+
+ const auto input_rank = operands.at(input_index).shape().rank();
+
+ const auto output_size =
+ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1);
+ UNUSED_RELEASE(output_size);
+ assert(operands.at(bias_index).shape().dim(0) == output_size);
+ assert(operands.at(weight_index).shape().dim(0) == output_size);
+ const auto batch_size =
+ operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2);
+ const auto input_size =
+ operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1);
+
+ // Check whether the input's shape needs reshaping into rank-2
+ bool needs_reshape = false;
+ ir::Shape reshape(2);
+ if (input_rank == 3 || input_rank == 4)
+ {
+ const auto &ifm_shape = operands.at(input_index).shape();
+ auto feature_size = 1;
+ for (int i = 0; i < ifm_shape.rank(); ++i)
+ {
+ feature_size *= ifm_shape.dim(i);
+ }
+
+ UNUSED_RELEASE(feature_size);
+ assert(feature_size == batch_size * input_size);
+
+ // for reshaping
+ needs_reshape = true;
+ reshape.dim(0) = batch_size; /* H */
+ reshape.dim(1) = input_size; /* W */
+ }
+
+ auto output_tensor = tensor_builder->at(output_index).get();
+ const auto input_tensor = tensor_builder->at(input_index).get();
+ const auto weight_tensor = tensor_builder->at(weight_index).get();
+ const auto bias_tensor = tensor_builder->at(bias_index).get();
+ const auto frontend_layout = layout;
+ const auto acl_layout = output_tensor->handle()->info()->data_layout();
+
+ auto fn =
+ std::make_unique<T_ACLLayer>(tensor_builder->acl_tensor_manager()->internal_buffer_manager());
+
+ typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL;
+ if (operands.at(weight_index).isConstant())
+ {
+ kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS;
+ assert(operands.at(weight_index).data());
+ }
+
+ fn->configure(
+ input_tensor->handle(), weight_tensor->handle(), bias_tensor->handle(),
+ output_tensor->handle(), needs_reshape,
+ ::onert::backend::acl_common::asTensorShape(
+ reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
+ kernel_type);
+
+ return std::make_unique<T_FunctionWrapper>(std::move(fn));
+}
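[Editor's note] A brief worked example of the rank-2 reshape check above; the concrete shapes are illustrative assumptions, not taken from this diff:

// Sketch: rank-4 input [2, 1, 1, 64], weight [10, 64], output [2, 10].
//   batch_size   = output.dim(rank - 2)      -> 2
//   input_size   = weight.dim(rank - 1)      -> 64
//   feature_size = 2 * 1 * 1 * 64 = 128      == batch_size * input_size, so the
//   needs_reshape = true; reshape = [2 /* H */, 64 /* W */]
// ACL kernel then sees a rank-2 [batch_size, input_size] input.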
+
+template <typename T_ACLLayer, typename T_PoolOp, typename T_TensorBuilder>
+std::unique_ptr<::arm_compute::IFunction>
+kernelGenPool2D(const T_PoolOp &node, const ir::Operands &operands,
+ const std::shared_ptr<T_TensorBuilder> &tensor_builder, ir::Layout layout,
+ ::arm_compute::PoolingType pooling_type)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ const auto ifm_index{node.getInputs().at(0)};
+
+ const auto ofm_shape = operands.at(ofm_index).shape().asFeature(layout);
+ const auto ifm_shape = operands.at(ifm_index).shape().asFeature(layout);
+
+ const auto kh = node.param().kh;
+ const auto kw = node.param().kw;
+ const auto stride = node.param().stride;
+ const auto padding =
+ ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+
+ VERBOSE(Pool2DParam) << "IFM_H: " << ifm_shape.H << std::endl;
+ VERBOSE(Pool2DParam) << "IFM_W: " << ifm_shape.W << std::endl;
+ VERBOSE(Pool2DParam) << "OFM_H: " << ofm_shape.H << std::endl;
+ VERBOSE(Pool2DParam) << "OFM_W: " << ofm_shape.W << std::endl;
+ VERBOSE(Pool2DParam) << "KER_H: " << kh << std::endl;
+ VERBOSE(Pool2DParam) << "KER_W: " << kw << std::endl;
+ VERBOSE(Pool2DParam) << "STRIDE_H: " << stride.vertical << std::endl;
+ VERBOSE(Pool2DParam) << "STRIDE_W: " << stride.horizontal << std::endl;
+ VERBOSE(Pool2DParam) << "PAD(T): " << padding.top << std::endl;
+ VERBOSE(Pool2DParam) << "PAD(B): " << padding.bottom << std::endl;
+ VERBOSE(Pool2DParam) << "PAD(L): " << padding.left << std::endl;
+ VERBOSE(Pool2DParam) << "PAD(R): " << padding.right << std::endl;
+
+ auto ofm_tensor = tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = tensor_builder->at(ifm_index).get();
+
+ ::arm_compute::PoolingLayerInfo info{
+ pooling_type, ::arm_compute::Size2D{kw, kh}, ifm_tensor->info()->data_layout(),
+ acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
+
+ auto fn = std::make_unique<T_ACLLayer>();
+
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), info);
+
+ return fn;
+}
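[Editor's note] Unlike the other helpers, this one returns the raw ACL function, and callers wrap it themselves; a sketch of the expected usage, mirroring visit(MaxPool2D) in the acl_neon hunk below:

// Sketch only: pooling kernel plus fused activation, composed by the caller.
auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
    node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::MAX);
auto ofm_tensor = _tensor_builder->at(node.getOutputs().at(0)).get();
_return_fn = std::make_unique<exec::FunctionSequence>(
    asAclFunction(std::move(raw_fn)),
    ActivationBuilder::generate(node.param().activation, ofm_tensor->handle()));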
+
+} // namespace acl_common
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
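[Editor's note] The T_FunctionWrapper/T_Tensor/T_ACLLayer template parameters are what let the CL and NEON backends share these generators. Only the NEON instantiations appear in this diff (sketched above for LSTM); a hedged guess at the CL counterpart, with type names assumed from ARM Compute Library's CL*/NE* naming convention:

// Assumed CL-side instantiation (not part of this diff):
//   kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ICLTensor,
//                 ::arm_compute::CLLSTMLayer>(node, _ctx, _tensor_builder);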
diff --git a/runtime/onert/backend/acl_neon/KernelGenerator.cc b/runtime/onert/backend/acl_neon/KernelGenerator.cc
index e47186754..1195b83cc 100644
--- a/runtime/onert/backend/acl_neon/KernelGenerator.cc
+++ b/runtime/onert/backend/acl_neon/KernelGenerator.cc
@@ -31,6 +31,7 @@
#include "exec/NopFunction.h"
#include "util/logging.h"
#include "util/Utils.h"
+#include "AclKernelGen.h"
namespace onert
{
@@ -74,15 +75,15 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::ABS};
auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -96,10 +97,10 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
const auto ifm_rank = _ctx.at(ifm_index).shape().rank();
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto frontend_layout = _current_op_seq_layout;
- auto backend_layout = ifm_alloc->layout();
+ auto backend_layout = ifm_tensor->layout();
int axis_value = node.param().axis;
if (axis_value < 0)
@@ -112,7 +113,7 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
auto fn = std::make_unique<::arm_compute::NEArgMinMaxLayer>();
- fn->configure(ifm_alloc->handle(), fixed_axis, ofm_alloc->handle(),
+ fn->configure(ifm_tensor->handle(), fixed_axis, ofm_tensor->handle(),
arm_compute::ReductionOperation::ARG_IDX_MAX);
auto acl_fn = asAclFunction(std::move(fn));
@@ -127,15 +128,15 @@ void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
const auto block_size_index{
node.getInputs().at(ir::operation::BatchToSpaceND::Input::BLOCK_SIZE)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
assert(_ctx.at(block_size_index).data());
auto fn = std::make_unique<::arm_compute::NEBatchToSpaceLayer>();
- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -147,15 +148,26 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- auto fn = std::make_unique<::arm_compute::NECast>();
+ std::unique_ptr<::arm_compute::IFunction> fn;
+ if (ifm_tensor->data_type() == ofm_tensor->data_type())
+ {
+ auto l = std::make_unique<::arm_compute::NECopy>();
- auto input_sub_type = _ctx.at(ifm_index).typeInfo().type() == ir::DataType::BOOL8
- ? arm_compute::SubDataType::BOOL
- : arm_compute::SubDataType::NONE;
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), input_sub_type);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
+
+ fn = std::move(l);
+ }
+ else
+ {
+ auto l = std::make_unique<::arm_compute::NECast>();
+
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), arm_compute::ConvertPolicy::SATURATE);
+
+ fn = std::move(l);
+ }
auto acl_fn = asAclFunction(std::move(fn));
@@ -183,10 +195,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
ker_width, ker_height);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -194,8 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
auto fn = std::make_unique<::arm_compute::NEConvolutionLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(), ofm_alloc->handle(),
- conv_info, ::arm_compute::WeightsInfo(), ::arm_compute::Size2D(1U, 1U), act_info);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
+ ofm_tensor->handle(), conv_info, ::arm_compute::WeightsInfo(),
+ ::arm_compute::Size2D(1U, 1U), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -208,12 +221,12 @@ void KernelGenerator::visit(const ir::operation::DepthToSpace &node)
auto block_size = node.param().block_size;
assert(block_size > 0);
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
- auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayerEx>();
+ auto fn = std::make_unique<::arm_compute::NEDepthToSpaceLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), block_size);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), block_size);
auto acl_fn = asAclFunction(std::move(fn));
@@ -242,10 +255,10 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto multiplier = node.param().multiplier;
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
const auto conv_info = acl_common::asPadStrideInfo(padding, stride);
const auto act_info = acl_common::asActivationLayerInfo(activation);
@@ -253,8 +266,8 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
{
auto fn = std::make_unique<::arm_compute::NEDepthwiseConvolutionLayer>();
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), bias_alloc->handle(),
- ofm_alloc->handle(), conv_info, multiplier, act_info);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), bias_tensor->handle(),
+ ofm_tensor->handle(), conv_info, multiplier, act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -265,12 +278,12 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Dequantize::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::NEDequantizationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -279,88 +292,28 @@ void KernelGenerator::visit(const ir::operation::Dequantize &node)
void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
{
- const auto ofm_index{node.getOutputs().at(0)};
- const auto ifm_index{node.getInputs().at(ir::operation::MaxPool2D::Input::INPUT)};
-
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
+ auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::MAX);
- const auto kh = node.param().kh;
- const auto kw = node.param().kw;
- const auto stride = node.param().stride;
- const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto ofm_index{node.getOutputs().at(0)};
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
const auto activation = node.param().activation;
-
- VERBOSE(MaxPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
- VERBOSE(MaxPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
- VERBOSE(MaxPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
- VERBOSE(MaxPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
- VERBOSE(MaxPool2D) << "KER_H: " << kh << std::endl;
- VERBOSE(MaxPool2D) << "KER_W: " << kw << std::endl;
- VERBOSE(MaxPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
- VERBOSE(MaxPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
- VERBOSE(MaxPool2D) << "PAD(T): " << padding.top << std::endl;
- VERBOSE(MaxPool2D) << "PAD(B): " << padding.bottom << std::endl;
- VERBOSE(MaxPool2D) << "PAD(L): " << padding.left << std::endl;
- VERBOSE(MaxPool2D) << "PAD(R): " << padding.right << std::endl;
-
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
-
- ::arm_compute::PoolingLayerInfo info{::arm_compute::PoolingType::MAX,
- ::arm_compute::Size2D{kw, kh},
- acl_common::asPadStrideInfo(padding, stride)};
-
- auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
-
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
-
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(raw_fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
{
- const auto ofm_index{node.getOutputs().at(0)};
- const auto ifm_index{node.getInputs().at(ir::operation::AvgPool2D::Input::INPUT)};
-
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
+ auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::AVG);
- const auto kh = node.param().kh;
- const auto kw = node.param().kw;
- const auto stride = node.param().stride;
- const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto ofm_index{node.getOutputs().at(0)};
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
const auto activation = node.param().activation;
-
- VERBOSE(AvgPool2D) << "IFM_H: " << ifm_shape.H << std::endl;
- VERBOSE(AvgPool2D) << "IFM_W: " << ifm_shape.W << std::endl;
- VERBOSE(AvgPool2D) << "OFM_H: " << ofm_shape.H << std::endl;
- VERBOSE(AvgPool2D) << "OFM_W: " << ofm_shape.W << std::endl;
- VERBOSE(AvgPool2D) << "KER_H: " << kh << std::endl;
- VERBOSE(AvgPool2D) << "KER_W: " << kw << std::endl;
- VERBOSE(AvgPool2D) << "STRIDE_H: " << stride.vertical << std::endl;
- VERBOSE(AvgPool2D) << "STRIDE_W: " << stride.horizontal << std::endl;
- VERBOSE(AvgPool2D) << "PAD(T): " << padding.top << std::endl;
- VERBOSE(AvgPool2D) << "PAD(B): " << padding.bottom << std::endl;
- VERBOSE(AvgPool2D) << "PAD(L): " << padding.left << std::endl;
- VERBOSE(AvgPool2D) << "PAD(R): " << padding.right << std::endl;
-
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
-
- ::arm_compute::PoolingLayerInfo info{
- ::arm_compute::PoolingType::AVG, ::arm_compute::Size2D{kw, kh},
- acl_common::asPadStrideInfo(padding, stride), true /* exclude_padding */};
-
- auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
-
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
-
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(raw_fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Concat &node)
@@ -383,7 +336,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
return;
}
- auto output_alloc = _tensor_builder->at(ofm_index).get();
+ auto output_tensor = _tensor_builder->at(ofm_index).get();
std::vector<::arm_compute::ITensor *> input_tensors;
for (const auto &ifm_ind : input_indexes)
input_tensors.emplace_back(_tensor_builder->at(ifm_ind)->handle());
@@ -392,7 +345,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
if (input_indexes.size() < 2)
{
auto l = std::make_unique<::arm_compute::NECopy>();
- l->configure(input_tensors.at(0), output_alloc->handle());
+ l->configure(input_tensors.at(0), output_tensor->handle());
fn = std::move(l);
}
else
@@ -400,10 +353,10 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
auto l = std::make_unique<::arm_compute::NEConcatenateLayer>();
const auto rank = _ctx.at(ofm_index).shape().rank();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
const auto fixed_axis =
acl_common::ToARMComputeAxis(rank, axis, frontend_layout, backend_layout).value();
- l->configure(input_tensors, output_alloc->handle(), fixed_axis);
+ l->configure(input_tensors, output_tensor->handle(), fixed_axis);
fn = std::move(l);
}
@@ -418,13 +371,13 @@ void KernelGenerator::visit(const ir::operation::EmbeddingLookup &node)
const auto lookups_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::LOOKUPS)};
const auto values_index{node.getInputs().at(ir::operation::EmbeddingLookup::Input::VALUES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
- auto values_alloc = _tensor_builder->at(values_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
+ auto values_tensor = _tensor_builder->at(values_index).get();
auto fn = std::make_unique<::arm_compute::NEEmbeddingLookup>();
- fn->configure(values_alloc->handle(), output_alloc->handle(), lookups_alloc->handle());
+ fn->configure(values_tensor->handle(), output_tensor->handle(), lookups_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -436,12 +389,12 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::NEFloor>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -450,76 +403,15 @@ void KernelGenerator::visit(const ir::operation::Floor &node)
void KernelGenerator::visit(const ir::operation::FullyConnected &node)
{
- using ir::operation::FullyConnected;
-
const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
- const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
- const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
-
- const auto input_rank = _ctx.at(input_index).shape().rank();
-
- const auto output_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 1);
- UNUSED_RELEASE(output_size);
- assert(_ctx.at(bias_index).shape().dim(0) == output_size);
- assert(_ctx.at(weight_index).shape().dim(0) == output_size);
- const auto batch_size =
- _ctx.at(output_index).shape().dim(_ctx.at(output_index).shape().rank() - 2);
- const auto input_size =
- _ctx.at(weight_index).shape().dim(_ctx.at(weight_index).shape().rank() - 1);
-
- // Check for reshaping input's shape into rank-2
- bool needs_reshape = false;
- ir::Shape reshape(2);
- if (input_rank == 3 || input_rank == 4)
- {
- const auto &ifm_shape = _ctx.at(input_index).shape();
- auto feature_size = 1;
- for (int i = 0; i < ifm_shape.rank(); ++i)
- {
- feature_size *= ifm_shape.dim(i);
- }
-
- UNUSED_RELEASE(feature_size);
- assert(feature_size == batch_size * input_size);
-
- // for reshaping
- needs_reshape = true;
- reshape.dim(0) = batch_size; /* H */
- reshape.dim(1) = input_size; /* W */
- }
-
+ auto output_tensor = _tensor_builder->at(output_index).get();
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->at(output_index).get();
- const auto input_alloc = _tensor_builder->at(input_index).get();
- const auto weight_alloc = _tensor_builder->at(weight_index).get();
- const auto bias_alloc = _tensor_builder->at(bias_index).get();
- const auto frontend_layout = _current_op_seq_layout;
- const auto acl_layout = output_alloc->handle()->info()->data_layout();
-
- auto fn = std::make_unique<arm_compute::NEFullyConnectedReshapingLayer>(
- _tensor_builder->acl_tensor_manager()->internal_buffer_manager());
-
- arm_compute::NEFullyConnectedReshapingLayer::KernelType kernel_type =
- arm_compute::NEFullyConnectedReshapingLayer::KernelType::GENERAL;
- if (_ctx.at(weight_index).isConstant())
- {
- kernel_type = arm_compute::NEFullyConnectedReshapingLayer::KernelType::PREPROCESSED_WEIGHTS;
- assert(_ctx.at(weight_index).data());
- }
-
- fn->configure(
- input_alloc->handle(), weight_alloc->handle(), bias_alloc->handle(), output_alloc->handle(),
- needs_reshape,
- ::onert::backend::acl_common::asTensorShape(
- reshape, frontend_layout, ::onert::backend::acl_common::asRuntimeLayout(acl_layout)),
- kernel_type);
-
+ auto fn = acl_common::kernelGenFullyConnected<acl_common::AclFunction, ::arm_compute::ITensor,
+ ::arm_compute::NEFullyConnectedReshapingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)),
- ActivationBuilder::generate(activation, output_alloc->handle()));
+ std::move(fn), ActivationBuilder::generate(activation, output_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
@@ -531,17 +423,17 @@ void KernelGenerator::visit(const ir::operation::HashtableLookup &node)
const auto keys_index{node.getInputs().at(ir::operation::HashtableLookup::Input::KEYS)};
const auto values_index{node.getInputs().at(ir::operation::HashtableLookup::Input::VALUES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto hits_alloc = _tensor_builder->at(hits_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto hits_tensor = _tensor_builder->at(hits_index).get();
- auto lookups_alloc = _tensor_builder->at(lookups_index).get();
- auto keys_alloc = _tensor_builder->at(keys_index).get();
- auto values_alloc = _tensor_builder->at(values_index).get();
+ auto lookups_tensor = _tensor_builder->at(lookups_index).get();
+ auto keys_tensor = _tensor_builder->at(keys_index).get();
+ auto values_tensor = _tensor_builder->at(values_index).get();
auto fn = std::make_unique<::arm_compute::NEHashtableLookup>();
- fn->configure(lookups_alloc->handle(), keys_alloc->handle(), values_alloc->handle(),
- output_alloc->handle(), hits_alloc->handle());
+ fn->configure(lookups_tensor->handle(), keys_tensor->handle(), values_tensor->handle(),
+ output_tensor->handle(), hits_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -561,10 +453,10 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
// Converting in reverse order
const int axis = ::onert::backend::acl_common::ToARMComputeAxis(ifm_rank, axis_value).value();
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto indices_alloc = _tensor_builder->at(indices_index).get();
- const auto backend_layout = ofm_alloc->layout();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto indices_tensor = _tensor_builder->at(indices_index).get();
+ const auto backend_layout = ofm_tensor->layout();
UNUSED_RELEASE(backend_layout);
// NOTE The frontend layout and backend layout must be the same for this operation.
@@ -575,35 +467,35 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
// a model. For example, if a model in NHWC has this operation with output rank == 4, indices
// rank == 2 and axis == 2, this operation should work on the axes W and C, but the axes W
// and C are not sequential in NCHW. So a backend in NCHW cannot handle this case.
- assert(backend_layout == ifm_alloc->layout());
- assert(backend_layout == indices_alloc->layout());
+ assert(backend_layout == ifm_tensor->layout());
+ assert(backend_layout == indices_tensor->layout());
assert(ifm_rank < 4 || _current_op_seq_layout == backend_layout);
auto fn = std::make_unique<::arm_compute::NEGatherEx>();
// input is n-D, indices k-D, output is (n + k - 1)-D
size_t n = ifm_rank;
- assert(n == ifm_alloc->num_dimensions());
+ assert(n == ifm_tensor->num_dimensions());
size_t k = _ctx.at(indices_index).shape().rank();
- assert(k == indices_alloc->num_dimensions());
+ assert(k == indices_tensor->num_dimensions());
// Disable applied dim_correction
- if (n != ifm_alloc->info()->num_dimensions())
+ if (n != ifm_tensor->info()->num_dimensions())
{
// This means that a higher dimension's value is 1 and dim_correction has been applied to the ifm tensor
const auto ifm = _ctx.at(ifm_index);
- ifm_alloc->info()->set_tensor_shape(
+ ifm_tensor->info()->set_tensor_shape(
acl_common::asTensorShape(ifm.shape(), _current_op_seq_layout, backend_layout, false));
}
- if (k != indices_alloc->info()->num_dimensions())
+ if (k != indices_tensor->info()->num_dimensions())
{
// This means that a higher dimension's value is 1 and dim_correction has been applied to the indices tensor
const auto indices = _ctx.at(indices_index);
- indices_alloc->info()->set_tensor_shape(
+ indices_tensor->info()->set_tensor_shape(
acl_common::asTensorShape(indices.shape(), _current_op_seq_layout, backend_layout, false));
}
- fn->configure(ifm_alloc->handle(), indices_alloc->handle(), ofm_alloc->handle(), axis);
+ fn->configure(ifm_tensor->handle(), indices_tensor->handle(), ofm_tensor->handle(), axis);
// acl_neon does not revert the disabled dim_correction because acl_neon's kernels would
// use arm_compute::TensorInfo::offset_element_in_bytes()
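[Editor's note] A small worked example of the rank bookkeeping in this hunk; the shapes are illustrative assumptions:

// Sketch: input [4, 3] (n = 2), indices [2] (k = 1), axis = 0
//   -> output rank = n + k - 1 = 2, output shape [2, 3].
// dim_correction matters because ACL may report a shape such as [2, 1] with
// num_dimensions() == 1; set_tensor_shape() with dim_correction disabled
// restores the full-rank shape so the axis arithmetic above stays valid.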
@@ -621,20 +513,20 @@ void KernelGenerator::visit(const ir::operation::InstanceNorm &node)
const auto gamma_index{node.getInputs().at(ir::operation::InstanceNorm::Input::GAMMA)};
const auto beta_index{node.getInputs().at(ir::operation::InstanceNorm::Input::BETA)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto gamma_alloc = _tensor_builder->at(gamma_index).get();
- auto beta_alloc = _tensor_builder->at(beta_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto gamma_tensor = _tensor_builder->at(gamma_index).get();
+ auto beta_tensor = _tensor_builder->at(beta_index).get();
auto epsilon = node.param().epsilon;
auto activation = node.param().activation;
auto fn = std::make_unique<::arm_compute::NEInstanceNormalizationLayerEx>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), gamma_alloc->handle(),
- beta_alloc->handle(), epsilon);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), gamma_tensor->handle(),
+ beta_tensor->handle(), epsilon);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::L2Normalization &node)
@@ -656,15 +548,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
float beta = 0.5f; // pow(reduction, -0.5) = 1 / sqrt(reduction)
float bias = 0.0f; // Don't offset the reduction.
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto norm_info = ::arm_compute::NormalizationLayerInfo(::arm_compute::NormType::CROSS_MAP,
radius, alpha, beta, bias, false);
auto fn = std::make_unique<::arm_compute::NENormalizationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
auto acl_fn = asAclFunction(std::move(fn));
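[Editor's note] For reference, a hedged sketch of the math behind this lowering (assuming alpha is set to 1.0 just above this hunk, which is not shown here):

// NENormalizationLayer (unscaled, is_scaled = false) computes roughly:
//   out = in / (bias + alpha * sum(in^2))^beta
// With bias = 0.0 and beta = 0.5 as set above, and alpha = 1 (assumption):
//   out = in / sqrt(sum(in^2)), i.e. L2 normalization across the maps.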
@@ -673,32 +565,15 @@ void KernelGenerator::visit(const ir::operation::L2Normalization &node)
void KernelGenerator::visit(const ir::operation::L2Pool2D &node)
{
- const auto ofm_index{node.getOutputs().at(0)};
- const auto ifm_index{node.getInputs().at(ir::operation::L2Pool2D::Input::INPUT)};
-
- const auto ifm_shape = _ctx.at(ifm_index).shape().asFeature(_current_op_seq_layout);
- const auto ofm_shape = _ctx.at(ofm_index).shape().asFeature(_current_op_seq_layout);
+ auto raw_fn = acl_common::kernelGenPool2D<::arm_compute::NEPoolingLayer>(
+ node, _ctx, _tensor_builder, _current_op_seq_layout, ::arm_compute::PoolingType::L2);
- uint32_t kw = node.param().kw;
- uint32_t kh = node.param().kh;
- const auto stride = node.param().stride;
- const auto padding =
- ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
+ const auto ofm_index{node.getOutputs().at(0)};
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
const auto activation = node.param().activation;
-
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
-
- ::arm_compute::PoolingLayerInfo info{
- ::arm_compute::PoolingType::L2, ::arm_compute::Size2D{kw, kh},
- ::onert::backend::acl_common::asPadStrideInfo(padding, stride)};
-
- auto fn = std::make_unique<::arm_compute::NEPoolingLayer>();
-
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), info);
-
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(raw_fn)),
+ ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &node)
@@ -712,15 +587,15 @@ void KernelGenerator::visit(const ir::operation::LocalResponseNormalization &nod
auto beta = node.param().beta;
auto bias = node.param().bias;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const auto norm_info = ::arm_compute::NormalizationLayerInfo(
::arm_compute::NormType::CROSS_MAP, radius * 2 + 1, alpha, beta, bias, false);
auto fn = std::make_unique<::arm_compute::NENormalizationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), norm_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), norm_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -733,13 +608,13 @@ void KernelGenerator::visit(const ir::operation::LogicalAnd &node)
const auto input0_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT0)};
const auto input1_index{node.getInputs().at(ir::operation::LogicalAnd::Input::INPUT1)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::NELogicalAnd>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -751,12 +626,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::LogicalNot::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::NEBitwiseNot>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -769,13 +644,13 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
const auto input0_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT0)};
const auto input1_index{node.getInputs().at(ir::operation::LogicalOr::Input::INPUT1)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::NELogicalOr>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle());
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -787,8 +662,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC};
@@ -798,7 +673,7 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
// instead of 'INF', and then the result of this op will contain errors due to the 'NaN'.
auto fn = std::make_unique<::arm_compute::NEActivationLayerEx>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -807,159 +682,8 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
void KernelGenerator::visit(const ir::operation::LSTM &node)
{
- // TODO Support dynamic rnn
- // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
- const auto scratch_buffer_index{
- node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
- const auto output_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
- const auto cell_state_out_index{
- node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
- const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
-
- const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
- const auto input_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
- const auto input_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
- const auto input_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
- const auto input_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
- const auto recurrent_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
- const auto recurrent_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
- const auto recurrent_to_cell_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
- const auto recurrent_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
- const auto cell_to_input_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
- const auto cell_to_forget_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
- const auto cell_to_output_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
- const auto input_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
- const auto forget_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
- const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
- const auto output_gate_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
- const auto projection_weights_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
- const auto projection_bias_index{
- node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
- const auto output_state_in_index{
- node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
- const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
- const auto cell_threshold = node.param().cell_threshold;
- const auto projection_threshold = node.param().projection_threshold;
-
- bool has_input_to_input_weights = _ctx.at(input_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(input_to_input_weights_index).shape().dim(1) != 0;
- bool has_recurrent_to_input_weights =
- _ctx.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
- _ctx.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
- bool has_cell_to_forget_weights = _ctx.at(cell_to_forget_weights_index).shape().dim(0) != 0;
- bool has_cell_to_output_weights = _ctx.at(cell_to_output_weights_index).shape().dim(0) != 0;
- bool has_projection_weights = _ctx.at(projection_weights_index).shape().dim(0) != 0 &&
- _ctx.at(projection_weights_index).shape().dim(1) != 0;
- bool has_projection_bias = _ctx.at(projection_bias_index).shape().dim(0);
-
- // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
- // true: no CIFG
- // false: CIFG
- // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
- bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
-
- // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
- // But the cell_to_input_weights does not exist in regular CIFG although peephole.
- // true: peephole
- // false: no peephole
- bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
-
- // NOTE Although the projection weights has data the projection bias may not have data.
- bool has_projection_param = has_projection_weights;
-
- const auto activation = node.param().activation;
- const auto cell_clip = cell_threshold;
- const auto projection_clip = projection_threshold;
- assert(cell_clip >= 0.f && projection_clip >= 0.f);
-
- auto scratch_buffer_alloc = _tensor_builder->at(scratch_buffer_index).get();
- auto output_state_out_alloc = _tensor_builder->at(output_state_out_index).get();
- auto cell_state_out_alloc = _tensor_builder->at(cell_state_out_index).get();
- auto output_alloc = _tensor_builder->at(output_index).get();
-
- auto input_alloc = _tensor_builder->at(input_index).get();
-
- auto input_to_forget_weights_alloc = _tensor_builder->at(input_to_forget_weights_index).get();
- auto input_to_cell_weights_alloc = _tensor_builder->at(input_to_cell_weights_index).get();
- auto input_to_output_weights_alloc = _tensor_builder->at(input_to_output_weights_index).get();
- auto recurrent_to_forget_weights_alloc =
- _tensor_builder->at(recurrent_to_forget_weights_index).get();
- auto recurrent_to_cell_weights_alloc = _tensor_builder->at(recurrent_to_cell_weights_index).get();
- auto recurrent_to_output_weights_alloc =
- _tensor_builder->at(recurrent_to_output_weights_index).get();
-
- auto forget_gate_bias_alloc = _tensor_builder->at(forget_gate_bias_index).get();
- auto cell_bias_alloc = _tensor_builder->at(cell_bias_index).get();
- auto output_gate_bias_alloc = _tensor_builder->at(output_gate_bias_index).get();
- auto output_state_in_alloc = _tensor_builder->at(output_state_in_index).get();
- auto cell_state_in_alloc = _tensor_builder->at(cell_state_in_index).get();
-
- auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
-
- auto fn = std::make_unique<::arm_compute::NELSTMLayer>();
-
- ::arm_compute::LSTMParams<::arm_compute::ITensor> lstm_params{};
- if (has_cifg_param)
- {
- auto input_to_input_weights_alloc =
- _tensor_builder->at(input_to_input_weights_index).get(); // optional
- auto recurrent_to_input_weights_alloc =
- _tensor_builder->at(recurrent_to_input_weights_index).get(); // optional
- auto cell_to_input_weights_handle =
- has_peephole_param ? _tensor_builder->at(cell_to_input_weights_index).get()->handle()
- : nullptr; // optional (non-cifg && peephole)
- auto input_gate_bias_alloc = _tensor_builder->at(input_gate_bias_index).get(); // optional
- lstm_params.set_cifg_params(input_to_input_weights_alloc->handle(),
- recurrent_to_input_weights_alloc->handle(),
- cell_to_input_weights_handle, input_gate_bias_alloc->handle());
- }
- if (has_peephole_param)
- {
- auto cell_to_forget_weights_alloc =
- _tensor_builder->at(cell_to_forget_weights_index).get(); // optional
- auto cell_to_output_weights_alloc =
- _tensor_builder->at(cell_to_output_weights_index).get(); // optional
- lstm_params.set_peephole_params(cell_to_forget_weights_alloc->handle(),
- cell_to_output_weights_alloc->handle());
- }
- if (has_projection_param)
- {
- auto projection_weights_alloc = _tensor_builder->at(projection_weights_index).get(); // optional
- auto projection_bias_handle = has_projection_bias
- ? _tensor_builder->at(projection_bias_index).get()->handle()
- : nullptr; // optional
- lstm_params.set_projection_params(projection_weights_alloc->handle(), projection_bias_handle);
- }
-
- fn->configure(
- input_alloc->handle(), input_to_forget_weights_alloc->handle(),
- input_to_cell_weights_alloc->handle(), input_to_output_weights_alloc->handle(),
- recurrent_to_forget_weights_alloc->handle(), recurrent_to_cell_weights_alloc->handle(),
- recurrent_to_output_weights_alloc->handle(), forget_gate_bias_alloc->handle(),
- cell_bias_alloc->handle(), output_gate_bias_alloc->handle(), output_state_in_alloc->handle(),
- cell_state_in_alloc->handle(), scratch_buffer_alloc->handle(),
- output_state_out_alloc->handle(), cell_state_out_alloc->handle(), output_alloc->handle(),
- lstm_params, act_info, cell_clip, projection_clip);
-
- auto acl_fn = asAclFunction(std::move(fn));
-
- _return_fn = std::move(acl_fn);
+ _return_fn = acl_common::kernelGenLSTM<acl_common::AclFunction, ::arm_compute::ITensor,
+ ::arm_compute::NELSTMLayer>(node, _ctx, _tensor_builder);
}
void KernelGenerator::visit(const ir::operation::Mul &node)
@@ -970,18 +694,18 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEPixelWiseMultiplication>();
// For scale 1.0, the only allowed RoundingPolicy is TO_ZERO
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(), 1.0, // scale
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(), 1.0, // scale
arm_compute::ConvertPolicy::SATURATE, arm_compute::RoundingPolicy::TO_ZERO);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Neg &node)
@@ -989,12 +713,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::NENegLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1030,12 +754,12 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
for (const auto &input_index : input_indexes)
{
size_t input_rank = _ctx.at(input_index).shape().rank();
- const auto &input_alloc = _tensor_builder->at(input_index);
- assert(input_rank == input_alloc->num_dimensions());
- if (input_rank != input_alloc->info()->num_dimensions())
+ const auto &input_tensor = _tensor_builder->at(input_index);
+ assert(input_rank == input_tensor->num_dimensions());
+ if (input_rank != input_tensor->info()->num_dimensions())
{
// This means that a higher dimension's value is 1 and dim_correction has been applied to the ifm tensor
- input_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ input_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(input_index).shape(), _current_op_seq_layout, backend_layout, false));
}
}
@@ -1094,8 +818,8 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
const auto ofm_idx{node.getOutputs().at(0)};
const auto ifm_idx{node.getInputs().at(0)};
const auto permute_type = node.getPermuteType();
- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
- auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
const auto rank = _ctx.at(ofm_idx).shape().rank();
assert(_ctx.at(ifm_idx).shape().rank() == _ctx.at(ofm_idx).shape().rank());
@@ -1108,7 +832,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
auto l = std::make_unique<::arm_compute::NEPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
fn = std::move(l);
}
@@ -1119,7 +843,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
auto l = std::make_unique<::arm_compute::NEPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), pv);
fn = std::move(l);
}
@@ -1127,7 +851,7 @@ void KernelGenerator::visit(const ir::operation::Permute &node)
{
auto l = std::make_unique<::arm_compute::NECopy>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
fn = std::move(l);
}
@@ -1143,15 +867,15 @@ void KernelGenerator::visit(const ir::operation::PReLU &node)
const auto ifm_index{node.getInputs().at(ir::operation::PReLU::Input::INPUT)};
const auto alpha_index{node.getInputs().at(ir::operation::PReLU::Input::ALPHA)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto alpha_alloc = _tensor_builder->at(alpha_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto alpha_tensor = _tensor_builder->at(alpha_index).get();
std::unique_ptr<::arm_compute::IFunction> fn;
- auto l = std::make_unique<::arm_compute::NEPReLU>();
+ auto l = std::make_unique<::arm_compute::NEPReluLayer>();
- l->configure(ifm_alloc->handle(), alpha_alloc->handle(), ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), alpha_tensor->handle(), ofm_tensor->handle());
fn = std::move(l);
@@ -1166,14 +890,14 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto input_index{node.getInputs().at(ir::operation::Reduce::Input::INPUT)};
const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
// Convert to ACL axes taking into account negative values and possible duplicates.
const auto &axes = _ctx.at(axes_index);
const auto input_rank = _ctx.at(input_index).shape().rank();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = input_alloc->layout();
+ const auto backend_layout = input_tensor->layout();
const auto reduce_axes =
acl_common::asCoordinates(axes, input_rank, frontend_layout, backend_layout);
const auto reduce_type = node.param().reduce_type;
@@ -1182,11 +906,9 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
std::unique_ptr<::arm_compute::IFunction> fn;
if (reduce_type == ir::operation::Reduce::ReduceType::MEAN)
{
- // NOTE NEReduceMean has a bug that does not support NHWC layout
- // NEReduceMean intermediate tensors are always NCHW layout
- auto l = std::make_unique<::arm_compute::NEReduceMeanEx>();
+ auto l = std::make_unique<::arm_compute::NEReduceMean>();
- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle());
+ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle());
fn = std::move(l);
}
@@ -1194,7 +916,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
{
auto l = std::make_unique<::arm_compute::NEReduceSum>();
- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle());
+ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle());
fn = std::move(l);
}
@@ -1202,7 +924,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
{
auto l = std::make_unique<::arm_compute::NEReduceOperation>();
- l->configure(input_alloc->handle(), reduce_axes, keep_dims, output_alloc->handle(),
+ l->configure(input_tensor->handle(), reduce_axes, keep_dims, output_tensor->handle(),
acl_common::convertReduceType(reduce_type));
fn = std::move(l);
@@ -1218,15 +940,15 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::ReLU::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::NEActivationLayer>();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::RELU};
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1238,15 +960,15 @@ void KernelGenerator::visit(const ir::operation::ReLU1 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ReLU1::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 1.0f, -1.0f};
auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1258,15 +980,15 @@ void KernelGenerator::visit(const ir::operation::ReLU6 &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::ReLU6::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0f};
auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), act_info);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1278,13 +1000,13 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
// NOTE This operation must not change the layout from frontend to backend,
// so PermutationOperationPass makes the frontend and backend layouts the same.
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
assert((_ctx.at(input_index).shape().rank() < 4 && _ctx.at(output_index).shape().rank() < 4) ||
frontend_layout == backend_layout);
UNUSED_RELEASE(frontend_layout);
@@ -1292,7 +1014,7 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
auto fn = std::make_unique<arm_compute::NEReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1305,12 +1027,12 @@ void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
const auto ifm_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::NEScale>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(),
::arm_compute::InterpolationPolicy::BILINEAR, ::arm_compute::BorderMode::REPLICATE,
::arm_compute::PixelValue(0.f), ::arm_compute::SamplingPolicy::TOP_LEFT);
@@ -1334,25 +1056,25 @@ void KernelGenerator::visit(const ir::operation::RNN &node)
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto hidden_state_out_alloc = _tensor_builder->at(hidden_state_out_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto hidden_state_out_tensor = _tensor_builder->at(hidden_state_out_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
- auto weights_alloc = _tensor_builder->at(weights_index).get();
- auto recurrent_weights_alloc = _tensor_builder->at(recurrent_weights_index).get();
- auto bias_alloc = _tensor_builder->at(bias_index).get();
- auto hidden_state_in_alloc = _tensor_builder->at(hidden_state_in_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
+ auto weights_tensor = _tensor_builder->at(weights_index).get();
+ auto recurrent_weights_tensor = _tensor_builder->at(recurrent_weights_index).get();
+ auto bias_tensor = _tensor_builder->at(bias_index).get();
+ auto hidden_state_in_tensor = _tensor_builder->at(hidden_state_in_index).get();
auto act_info = ::onert::backend::acl_common::asActivationLayerInfo(activation);
auto copy_layer = std::make_unique<::arm_compute::NECopy>();
- copy_layer->configure(hidden_state_in_alloc->handle(), hidden_state_out_alloc->handle());
+ copy_layer->configure(hidden_state_in_tensor->handle(), hidden_state_out_tensor->handle());
_return_fn = asAclFunction(std::move(copy_layer));
- auto fn = std::make_unique<::arm_compute::NERNNLayerEx>(
+ auto fn = std::make_unique<::arm_compute::NERNNLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(input_alloc->handle(), weights_alloc->handle(), recurrent_weights_alloc->handle(),
- bias_alloc->handle(), hidden_state_out_alloc->handle(), output_alloc->handle(),
- act_info);
+ fn->configure(input_tensor->handle(), weights_tensor->handle(),
+ recurrent_weights_tensor->handle(), bias_tensor->handle(),
+ hidden_state_out_tensor->handle(), output_tensor->handle(), act_info);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1361,12 +1083,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
auto fn = std::make_unique<::arm_compute::NERsqrtLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle());
_return_fn = asAclFunction(std::move(fn));
}
@@ -1383,10 +1105,10 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
(void)dims;
(void)ndim;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::NEReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
_return_fn = std::move(acl_fn);
}
@@ -1396,15 +1118,15 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<arm_compute::NEActivationLayer>();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f};
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1417,13 +1139,25 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
const auto input_index{node.getInputs().at(ir::operation::Softmax::Input::INPUT)};
const auto beta = node.param().beta;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
+ const auto frontend_layout = _current_op_seq_layout;
+ const auto backend_layout = input_tensor->layout();
+
+ // Disable applied dim_correction
+ const size_t input_rank = _ctx.at(input_index).shape().rank();
+ if (input_rank != input_tensor->info()->num_dimensions())
+ {
+ // This means that a higher dimension has size 1 and dim_correction has been applied to the input tensor
+ const auto input = _ctx.at(input_index);
+ input_tensor->info()->set_tensor_shape(
+ acl_common::asTensorShape(input.shape(), frontend_layout, backend_layout, false));
+ }
auto fn = std::make_unique<::arm_compute::NESoftmaxLayer>(
_tensor_builder->acl_tensor_manager()->internal_buffer_manager());
- fn->configure(input_alloc->handle(), output_alloc->handle(), beta);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), beta);
auto acl_fn = asAclFunction(std::move(fn));
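What the rank check above guards against, assuming dim_correction collapses size-1 high dimensions as the comment describes (a sketch, not the repo's exact behavior):

#include <cstddef>
#include <vector>

// Sketch, not part of the commit: a frontend shape {1, 1, 4} may be held by
// ACL as a rank-1 tensor of 4 elements. When the model rank and the ACL rank
// disagree, the code above re-applies the full frontend shape with
// dim_correction disabled so NESoftmaxLayer sees the rank the model expects.
std::vector<int> dimCorrect(const std::vector<int> &shape)
{
  std::size_t i = 0;
  while (i + 1 < shape.size() && shape[i] == 1)
    ++i;                                   // {1, 1, 4} -> {4}
  return {shape.begin() + i, shape.end()};
}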
@@ -1438,20 +1172,18 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
node.getInputs().at(ir::operation::SpaceToBatchND::Input::BLOCK_SIZE)};
const auto paddings_index{node.getInputs().at(ir::operation::SpaceToBatchND::Input::PADDINGS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto block_size_alloc = _tensor_builder->at(block_size_index).get();
- auto paddings_alloc = _tensor_builder->at(paddings_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto block_size_tensor = _tensor_builder->at(block_size_index).get();
+ auto paddings_tensor = _tensor_builder->at(paddings_index).get();
assert(_ctx.at(block_size_index).data());
assert(_ctx.at(paddings_index).data());
- // NESpaceToBatchLayer has a bug that padding's values are 0 even when zero point of QASYMM8 is
- // not 0.
- auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayerEx>();
+ auto fn = std::make_unique<::arm_compute::NESpaceToBatchLayer>();
- fn->configure(ifm_alloc->handle(), block_size_alloc->handle(), paddings_alloc->handle(),
- ofm_alloc->handle());
+ fn->configure(ifm_tensor->handle(), block_size_tensor->handle(), paddings_tensor->handle(),
+ ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1465,12 +1197,12 @@ void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
auto block_size = node.param().block_size;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
- auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayerEx>();
+ auto fn = std::make_unique<::arm_compute::NESpaceToDepthLayer>();
- fn->configure(ifm_alloc->handle(), ofm_alloc->handle(), block_size);
+ fn->configure(ifm_tensor->handle(), ofm_tensor->handle(), block_size);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1489,13 +1221,13 @@ void KernelGenerator::visit(const ir::operation::Split &node)
for (const auto &output : node.getOutputs())
output_indexes.emplace_back(output);
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- std::vector<arm_compute::ITensor *> output_allocs;
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ std::vector<arm_compute::ITensor *> output_tensors;
for (const auto &ofm_ind : output_indexes)
- output_allocs.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
+ output_tensors.emplace_back(_tensor_builder->at(ofm_ind).get()->handle());
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = ifm_alloc->layout();
+ const auto backend_layout = ifm_tensor->layout();
auto axis = node.param().axis;
if (axis < 0)
axis += ifm_rank;
@@ -1503,7 +1235,7 @@ void KernelGenerator::visit(const ir::operation::Split &node)
auto fn = std::make_unique<::arm_compute::NESplit>();
- fn->configure(ifm_alloc->handle(), output_allocs, axis);
+ fn->configure(ifm_tensor->handle(), output_tensors, axis);
_return_fn = asAclFunction(std::move(fn));
}
@@ -1513,15 +1245,15 @@ void KernelGenerator::visit(const ir::operation::SQRT &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::SQRT::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
const ::arm_compute::ActivationLayerInfo act_info{
::arm_compute::ActivationLayerInfo::ActivationFunction::SQRT};
auto fn = std::make_unique<::arm_compute::NEActivationLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle(), act_info);
+ fn->configure(input_tensor->handle(), output_tensor->handle(), act_info);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1534,13 +1266,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseSquaredDiff>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1555,17 +1287,17 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEArithmeticSubtraction>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
arm_compute::ConvertPolicy::SATURATE);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Slice &node)
@@ -1575,10 +1307,10 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
- auto outputData_alloc = _tensor_builder->at(output_index).get();
- auto inputData_alloc = _tensor_builder->at(input_index).get();
+ auto outputData_tensor = _tensor_builder->at(output_index).get();
+ auto inputData_tensor = _tensor_builder->at(input_index).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = inputData_alloc->layout();
+ const auto backend_layout = inputData_tensor->layout();
// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -1628,7 +1360,7 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
auto fn = std::make_unique<::arm_compute::NESlice>();
- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set);
+ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1643,10 +1375,10 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
- auto outputData_alloc = _tensor_builder->at(output_index).get();
- auto inputData_alloc = _tensor_builder->at(input_index).get();
+ auto outputData_tensor = _tensor_builder->at(output_index).get();
+ auto inputData_tensor = _tensor_builder->at(input_index).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = inputData_alloc->layout();
+ const auto backend_layout = inputData_tensor->layout();
// Set initializers for indices data such as order of inputData
int input_rank = _ctx.at(input_index).shape().rank();
@@ -1715,7 +1447,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
auto fn = std::make_unique<::arm_compute::NEStridedSlice>();
- fn->configure(inputData_alloc->handle(), outputData_alloc->handle(), starts_set, ends_set,
+ fn->configure(inputData_tensor->handle(), outputData_tensor->handle(), starts_set, ends_set,
strides_set, begin_mask, end_mask, shrink_axis_mask);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1749,16 +1481,16 @@ void KernelGenerator::visit(const ir::operation::TransposeConv &node)
invalid_vertical = ofm_shape.H - (1 + (ifm_shape.H - 1) * stride.vertical) - (ker_shape.H - 1);
}
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto ifm_alloc = _tensor_builder->at(ifm_index).get();
- auto ker_alloc = _tensor_builder->at(ker_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->at(ifm_index).get();
+ auto ker_tensor = _tensor_builder->at(ker_index).get();
const auto tconv_info = acl_common::asPadStrideInfo(padding, stride);
auto fn = std::make_unique<::arm_compute::NETransposeConvLayer>();
- fn->configure(ifm_alloc->handle(), ker_alloc->handle(), nullptr, ofm_alloc->handle(), tconv_info,
- invalid_horizontal, invalid_vertical);
+ fn->configure(ifm_tensor->handle(), ker_tensor->handle(), nullptr, ofm_tensor->handle(),
+ tconv_info, invalid_horizontal, invalid_vertical);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1771,10 +1503,10 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
const auto ifm_idx{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
const auto &perm{node.param().perm};
- auto ofm_alloc = _tensor_builder->at(ofm_idx).get();
- const auto ifm_alloc = _tensor_builder->at(ifm_idx).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_idx).get();
+ const auto ifm_tensor = _tensor_builder->at(ifm_idx).get();
const auto frontend_layout = _current_op_seq_layout;
- const auto backend_layout = ifm_alloc->layout();
+ const auto backend_layout = ifm_tensor->layout();
const auto rank = _ctx.at(ifm_idx).shape().rank();
std::vector<std::int32_t> pv(perm.cbegin(), perm.cend());
@@ -1783,11 +1515,11 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
std::unique_ptr<::arm_compute::IFunction> fn;
- if (ifm_alloc->num_dimensions() <= 2 && ofm_alloc->num_dimensions() <= 2)
+ if (ifm_tensor->num_dimensions() <= 2 && ofm_tensor->num_dimensions() <= 2)
{
auto l = std::make_unique<::arm_compute::NETranspose>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle());
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle());
fn = std::move(l);
}
@@ -1795,7 +1527,7 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
{
auto l = std::make_unique<::arm_compute::NEPermute>();
- l->configure(ifm_alloc->handle(), ofm_alloc->handle(), backend_pv);
+ l->configure(ifm_tensor->handle(), ofm_tensor->handle(), backend_pv);
fn = std::move(l);
}
@@ -1834,13 +1566,13 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
for (const auto &output_index : output_indexes)
{
size_t output_rank = _ctx.at(output_index).shape().rank();
- const auto &output_alloc = _tensor_builder->at(output_index);
- orig_outputs_acl_tensor_shapes.emplace_back(output_alloc->info()->tensor_shape());
- assert(output_rank == output_alloc->num_dimensions());
- if (output_rank != output_alloc->info()->num_dimensions())
+ const auto &output_tensor = _tensor_builder->at(output_index);
+ orig_outputs_acl_tensor_shapes.emplace_back(output_tensor->info()->tensor_shape());
+ assert(output_rank == output_tensor->num_dimensions());
+ if (output_rank != output_tensor->info()->num_dimensions())
{
// This means that a higher dimension has size 1 and dim_correction has been applied to the ifm tensor
- output_alloc->info()->set_tensor_shape(acl_common::asTensorShape(
+ output_tensor->info()->set_tensor_shape(acl_common::asTensorShape(
_ctx.at(output_index).shape(), _current_op_seq_layout, backend_layout, false));
}
}
@@ -1858,17 +1590,17 @@ void KernelGenerator::visit(const ir::operation::Add &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEArithmeticAddition>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle(),
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle(),
arm_compute::ConvertPolicy::SATURATE);
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Div &node)
@@ -1879,16 +1611,16 @@ void KernelGenerator::visit(const ir::operation::Div &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseDivision>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
_return_fn = std::make_unique<exec::FunctionSequence>(
- asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_alloc->handle()));
+ asAclFunction(std::move(fn)), ActivationBuilder::generate(activation, ofm_tensor->handle()));
}
void KernelGenerator::visit(const ir::operation::Exp &node)
@@ -1896,12 +1628,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::NEExpLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1913,12 +1645,12 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input_tensor = _tensor_builder->at(input_index).get();
auto fn = std::make_unique<::arm_compute::NEReshapeLayer>();
- fn->configure(input_alloc->handle(), output_alloc->handle());
+ fn->configure(input_tensor->handle(), output_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1933,13 +1665,13 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
const auto comparison_type = node.param().comparison_type;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input0_alloc = _tensor_builder->at(input0_index).get();
- auto input1_alloc = _tensor_builder->at(input1_index).get();
+ auto output_tensor = _tensor_builder->at(output_index).get();
+ auto input0_tensor = _tensor_builder->at(input0_index).get();
+ auto input1_tensor = _tensor_builder->at(input1_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseComparison>();
- fn->configure(input0_alloc->handle(), input1_alloc->handle(), output_alloc->handle(),
+ fn->configure(input0_tensor->handle(), input1_tensor->handle(), output_tensor->handle(),
(arm_compute::ComparisonOperation)comparison_type);
auto acl_fn = asAclFunction(std::move(fn));
@@ -1953,13 +1685,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseMin>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
@@ -1972,13 +1704,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
- auto ofm_alloc = _tensor_builder->at(ofm_index).get();
- auto lhs_alloc = _tensor_builder->at(lhs_index).get();
- auto rhs_alloc = _tensor_builder->at(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->at(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->at(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->at(rhs_index).get();
auto fn = std::make_unique<::arm_compute::NEElementwiseMax>();
- fn->configure(lhs_alloc->handle(), rhs_alloc->handle(), ofm_alloc->handle());
+ fn->configure(lhs_tensor->handle(), rhs_tensor->handle(), ofm_tensor->handle());
auto acl_fn = asAclFunction(std::move(fn));
diff --git a/runtime/onert/backend/cpu/Backend.h b/runtime/onert/backend/cpu/Backend.h
index 2daf06aca..56bd352e0 100644
--- a/runtime/onert/backend/cpu/Backend.h
+++ b/runtime/onert/backend/cpu/Backend.h
@@ -17,6 +17,7 @@
#ifndef __ONERT_BACKEND_CPU_BACKEND_H__
#define __ONERT_BACKEND_CPU_BACKEND_H__
+#include "BackendContext.h"
#include "Config.h"
#include "ConstantInitializer.h"
#include "KernelGenerator.h"
@@ -39,9 +40,9 @@ public:
std::shared_ptr<IConfig> config() const override { return _config; }
- std::unique_ptr<BackendContext> newContext(const ir::Graph &graph,
- const std::shared_ptr<custom::IKernelBuilder> &kb,
- bool) const override
+ std::unique_ptr<onert::backend::BackendContext>
+ newContext(const ir::Graph &graph, const std::shared_ptr<custom::IKernelBuilder> &kb,
+ bool) const override
{
const auto &operands = graph.operands();
const auto &operations = graph.operations();
@@ -49,7 +50,8 @@ public:
auto tb = std::make_shared<TensorBuilder>();
context->tensor_builder = tb;
context->constant_initializer = std::make_shared<ConstantInitializer>(operands, tb);
- context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, kb);
+ context->kernel_gen = std::make_shared<KernelGenerator>(operands, operations, tb, kb,
+ context->external_context());
context->tensor_register = nullptr;
context->optimizer = nullptr;
return context;
diff --git a/runtime/onert/backend/cpu/BackendContext.h b/runtime/onert/backend/cpu/BackendContext.h
new file mode 100644
index 000000000..f314a8e39
--- /dev/null
+++ b/runtime/onert/backend/cpu/BackendContext.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__
+#define __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__
+
+#include <backend/BackendContext.h>
+#include "ExternalContext.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+
+class BackendContext : public onert::backend::BackendContext
+{
+public:
+ BackendContext(const Backend *backend, const ir::Graph *graph,
+ std::shared_ptr<ITensorBuilder> tensor_builder = nullptr,
+ std::shared_ptr<IConstantInitializer> constant_initializer = nullptr,
+ std::shared_ptr<IKernelGenerator> kernel_gen = nullptr,
+ std::shared_ptr<ITensorRegister> tensor_register = nullptr,
+ std::shared_ptr<IOptimizer> optimizer = nullptr)
+ : onert::backend::BackendContext(backend, graph, tensor_builder, constant_initializer,
+ kernel_gen, tensor_register, optimizer),
+ _external_context(new ExternalContext)
+ {
+ }
+
+ std::shared_ptr<ExternalContext> external_context() { return _external_context; }
+
+private:
+ // NOTE A ruy context owns a thread pool, so when multiple ruy contexts are
+ // created, the thread pool is duplicated as well
+ // TODO Create one ruy context per session
+ std::shared_ptr<ExternalContext> _external_context;
+};
+
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_BACKEND_CONTEXT_H__
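One way the TODO in BackendContext.h could be addressed, sketched with a hypothetical session-scoped holder (SessionState and this wiring are not onert API):

// Sketch, not part of the commit: hand every BackendContext the same
// ExternalContext so only one ruy thread pool exists per session
// (assumes the onert::backend::cpu namespace for ExternalContext).
#include <memory>

class SessionState
{
public:
  std::shared_ptr<ExternalContext> external_context()
  {
    if (!_external_context)
      _external_context = std::make_shared<ExternalContext>(); // single ruy pool
    return _external_context;
  }

private:
  std::shared_ptr<ExternalContext> _external_context; // shared by all backends
};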
diff --git a/runtime/onert/backend/cpu/CMakeLists.txt b/runtime/onert/backend/cpu/CMakeLists.txt
index e997a2291..01a3cd178 100644
--- a/runtime/onert/backend/cpu/CMakeLists.txt
+++ b/runtime/onert/backend/cpu/CMakeLists.txt
@@ -1,5 +1,7 @@
set(LIB_ONERT_BACKEND_CPU onert_backend_cpu)
+nnfw_find_package(Ruy REQUIRED)
+
file(GLOB_RECURSE SOURCES "*.cc")
add_library(${LIB_ONERT_BACKEND_CPU} SHARED ${SOURCES})
@@ -8,6 +10,8 @@ target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_lib_cker)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE onert_core)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_common)
target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE nnfw_coverage)
+target_link_libraries(${LIB_ONERT_BACKEND_CPU} PRIVATE ruy)
+target_link_libraries(${LIB_ONERT_BACKEND_CPU} INTERFACE ruy_instrumentation)
set_target_properties(${LIB_ONERT_BACKEND_CPU} PROPERTIES OUTPUT_NAME backend_cpu)
diff --git a/runtime/onert/backend/cpu/ConstantInitializer.cc b/runtime/onert/backend/cpu/ConstantInitializer.cc
index 71e313628..deb27f0fe 100644
--- a/runtime/onert/backend/cpu/ConstantInitializer.cc
+++ b/runtime/onert/backend/cpu/ConstantInitializer.cc
@@ -15,6 +15,7 @@
*/
#include "ConstantInitializer.h"
+#include "Tensor.h"
namespace onert
{
@@ -30,39 +31,61 @@ ConstantInitializer::ConstantInitializer(const ir::Operands &operands,
// DO NOTHING
}
+void ConstantInitializer::registerDefaultInitializer(const ir::OperandIndex &index,
+ const ir::Operand &obj)
+{
+ registerExternalInitializer(index, obj);
+}
+
+void ConstantInitializer::registerExternalInitializer(const ir::OperandIndex &index,
+ const ir::Operand &obj)
+{
+ // For CONSTANTS only
+ // TODO Add a check for whether the tensor has been allocated
+ if (!obj.isConstant())
+ return;
+
+ _init_map[index] = [](const onert::ir::Operand &model_obj, onert::backend::ITensor &itensor) {
+ auto data = model_obj.shareData();
+ assert(data && data->base());
+ ExternalTensor &tensor = dynamic_cast<ExternalTensor &>(itensor);
+ tensor.setData(data);
+ };
+}
+
void ConstantInitializer::visit(const ir::operation::Conv2D &node)
{
const auto &kernel_index = node.getInputs().at(ir::operation::Conv2D::KERNEL);
const auto &kernel_obj = _operands.at(kernel_index);
- registerCopyInitializer(kernel_index, kernel_obj);
+ registerExternalInitializer(kernel_index, kernel_obj);
const auto &bias_index = node.getInputs().at(ir::operation::Conv2D::BIAS);
const auto &bias_obj = _operands.at(bias_index);
- registerCopyInitializer(bias_index, bias_obj);
+ registerExternalInitializer(bias_index, bias_obj);
}
void ConstantInitializer::visit(const ir::operation::DepthwiseConv2D &node)
{
const auto &kernel_index = node.getInputs().at(ir::operation::DepthwiseConv2D::KERNEL);
const auto &kernel_obj = _operands.at(kernel_index);
- registerCopyInitializer(kernel_index, kernel_obj);
+ registerExternalInitializer(kernel_index, kernel_obj);
const auto &bias_index = node.getInputs().at(ir::operation::DepthwiseConv2D::BIAS);
const auto &bias_obj = _operands.at(bias_index);
- registerCopyInitializer(bias_index, bias_obj);
+ registerExternalInitializer(bias_index, bias_obj);
}
void ConstantInitializer::visit(const ir::operation::FullyConnected &node)
{
const auto &weight_index = node.getInputs().at(ir::operation::FullyConnected::WEIGHT);
const auto &weight_obj = _operands.at(weight_index);
- registerCopyInitializer(weight_index, weight_obj);
+ registerExternalInitializer(weight_index, weight_obj);
const auto &bias_index = node.getInputs().at(ir::operation::FullyConnected::BIAS);
if (!bias_index.undefined())
{
const auto &bias_obj = _operands.at(bias_index);
- registerCopyInitializer(bias_index, bias_obj);
+ registerExternalInitializer(bias_index, bias_obj);
}
}
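The practical effect of the switch above: registerCopyInitializer copies constant data into backend-allocated memory, whereas the external initializer only points the ExternalTensor at the model's existing buffer via shareData() and setData(). A stubbed sketch of that sharing (the types here are stand-ins, not the repo's):

#include <memory>

// Sketch, not part of the commit: setData() retains a reference to the model's
// buffer instead of copying it, so large constant weights are not duplicated.
struct DataStub { const unsigned char *base; };

struct ExternalTensorStub
{
  void setData(std::shared_ptr<const DataStub> data) { _data = std::move(data); }
  std::shared_ptr<const DataStub> _data; // shared with the ir::Operand's data
};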
diff --git a/runtime/onert/backend/cpu/ConstantInitializer.h b/runtime/onert/backend/cpu/ConstantInitializer.h
index bd06c64d1..de03a693a 100644
--- a/runtime/onert/backend/cpu/ConstantInitializer.h
+++ b/runtime/onert/backend/cpu/ConstantInitializer.h
@@ -36,6 +36,15 @@ public:
const std::shared_ptr<TensorBuilder> &tensor_builder);
public:
+ void registerDefaultInitializer(const ir::OperandIndex &index, const ir::Operand &obj) override;
+
+ // TODO: For now only the cpu backend supports constant tensors that use external data.
+ // If other backends add support (to do this,
+ // ExternalTensor should be abstracted, e.g. as an IExternal interface),
+ // this can become an interface of IConstantInitializer
+ void registerExternalInitializer(const ir::OperandIndex &, const ir::Operand &);
+
+public:
void visit(const ir::operation::Conv2D &) override;
void visit(const ir::operation::DepthwiseConv2D &) override;
void visit(const ir::operation::FullyConnected &) override;
diff --git a/runtime/onert/backend/cpu/ExternalContext.h b/runtime/onert/backend/cpu/ExternalContext.h
new file mode 100644
index 000000000..6627412d2
--- /dev/null
+++ b/runtime/onert/backend/cpu/ExternalContext.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_EXTERNAL_CONTEXT_H__
+#define __ONERT_BACKEND_CPU_EXTERNAL_CONTEXT_H__
+
+#include <backend/IExternalContext.h>
+#include <util/ConfigSource.h>
+#include <ruy/context.h>
+
+namespace
+{
+const int kDefaultNumThreadpoolThreads = 1;
+}
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+
+class ExternalContext : public IExternalContext
+{
+public:
+ ExternalContext() : _ruy_context(new ruy::Context)
+ {
+ setMaxNumThreads(onert::util::getConfigInt(onert::util::config::RUY_THREADS));
+#ifdef USE_RUY_GEMV
+ _ruy_context->cache_policy = ruy::kCacheLHSOnNarrowMul;
+#endif
+ }
+
+ void setMaxNumThreads(int max_num_threads)
+ {
+ const int target_num_threads =
+ max_num_threads > -1 ? max_num_threads : kDefaultNumThreadpoolThreads;
+ _ruy_context->max_num_threads = target_num_threads;
+ }
+
+ ruy::Context *ruy_context() const { return _ruy_context.get(); }
+
+private:
+ const std::unique_ptr<ruy::Context> _ruy_context;
+};
+
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_EXTERNAL_CONTEXT_H__
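Typical use of the class above, assuming the RUY_THREADS config key shown resolves to an integer and -1 means "unset" (values here are illustrative):

// Sketch, not part of the commit: capping ruy's thread pool via ExternalContext.
#include "ExternalContext.h"

void configureThreads()
{
  onert::backend::cpu::ExternalContext ctx; // reads RUY_THREADS at construction
  ctx.setMaxNumThreads(4);                  // cap the pool at 4 threads
  ctx.setMaxNumThreads(-1);                 // -1 falls back to the default of 1
  ruy::Context *rc = ctx.ruy_context();     // kernels pass this to ruy
  (void)rc;
}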
diff --git a/runtime/onert/backend/cpu/KernelGenerator.cc b/runtime/onert/backend/cpu/KernelGenerator.cc
index 72f960675..7939fe894 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.cc
+++ b/runtime/onert/backend/cpu/KernelGenerator.cc
@@ -20,6 +20,7 @@
#include "ops/AddLayer.h"
#include "ops/ArgMinMaxLayer.h"
#include "ops/AvgPoolLayer.h"
+#include "ops/BatchToSpaceNDLayer.h"
#include "ops/CastLayer.h"
#include "ops/CompareLayer.h"
#include "ops/ConcatLayer.h"
@@ -49,7 +50,9 @@
#include "ops/RangeLayer.h"
#include "ops/ReduceLayer.h"
#include "ops/ReLULayer.h"
+#include "ops/ReLU6Layer.h"
#include "ops/ReshapeLayer.h"
+#include "ops/ResizeBilinearLayer.h"
#include "ops/ReverseLayer.h"
#include "ops/RoundLayer.h"
#include "ops/RsqrtLayer.h"
@@ -60,7 +63,9 @@
#include "ops/SoftMaxLayer.h"
#include "ops/StridedSliceLayer.h"
#include "ops/SpaceToBatchNDLayer.h"
+#include "ops/SpaceToDepthLayer.h"
#include "ops/SplitLayer.h"
+#include "ops/SplitVLayer.h"
#include "ops/SubLayer.h"
#include "ops/TanhLayer.h"
#include "ops/TileLayer.h"
@@ -70,11 +75,14 @@
#include "ops/ZerosLikeLayer.h"
#include "ops/SquaredDiffLayer.h"
#include "ops/LogicalOrLayer.h"
+#include "ops/L2NormLayer.h"
#include "ops/MatrixBandPartLayer.h"
#include "ops/BatchMatMulLayer.h"
#include "ops/BroadcastToLayer.h"
#include "ops/FusedBatchNormLayer.h"
#include "ops/LogSoftMaxLayer.h"
+#include "ops/QuantizeLayer.h"
+#include "ops/StatelessRandomUniformLayer.h"
#include <backend/Backend.h>
#include <backend/IConfig.h>
@@ -119,9 +127,11 @@ ops::ReduceType convertReduceType(ir::operation::Reduce::ReduceType reduce_type_
KernelGenerator::KernelGenerator(
const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder)
+ const std::shared_ptr<backend::custom::IKernelBuilder> &kernel_builder,
+ const std::shared_ptr<ExternalContext> &external_context)
: _ctx(operands_ctx), _operations_ctx{operations_ctx}, _tensor_builder(tensor_builder),
- _kernel_builder(kernel_builder), _current_op_seq_layout(ir::Layout::UNKNOWN)
+ _kernel_builder(kernel_builder), _current_op_seq_layout(ir::Layout::UNKNOWN),
+ _external_context(external_context)
{
// DO NOTHING
}
@@ -184,10 +194,10 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto ker_index{node.getInputs().at(Conv2D::Input::KERNEL)};
const auto bias_index{node.getInputs().at(Conv2D::Input::BIAS)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
- auto ker_alloc = _tensor_builder->portableAt(ker_index).get();
- auto bias_alloc = _tensor_builder->portableAt(bias_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+ auto ker_tensor = _tensor_builder->portableAt(ker_index).get();
+ auto bias_tensor = _tensor_builder->portableAt(bias_index).get();
const auto stride = node.param().stride;
const auto activation = node.param().activation;
@@ -196,9 +206,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
if (_ctx.at(ifm_index).info().isDynamic() || _ctx.at(ker_index).info().isDynamic())
{
- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, param_padding.param.left,
+ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, param_padding.param.left,
param_padding.param.right, param_padding.param.top, param_padding.param.bottom,
- stride.horizontal, stride.vertical, activation, ofm_alloc);
+ stride.horizontal, stride.vertical, activation, ofm_tensor);
_return_fn = std::move(fn);
return;
@@ -213,9 +223,9 @@ void KernelGenerator::visit(const ir::operation::Conv2D &node)
const auto padding =
ir::calculatePadding(param_padding, ifm_shape, ofm_shape, stride, ker_width, ker_height);
- fn->configure(ifm_alloc, ker_alloc, bias_alloc, param_padding.type, padding.left, padding.right,
- padding.top, padding.bottom, stride.horizontal, stride.vertical, activation,
- ofm_alloc);
+ fn->configure(ifm_tensor, ker_tensor, bias_tensor, param_padding.type, padding.left,
+ padding.right, padding.top, padding.bottom, stride.horizontal, stride.vertical,
+ activation, ofm_tensor);
_return_fn = std::move(fn);
}
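Why the dynamic-shape branch above passes the raw padding parameters: explicit padding can only be computed once the input and kernel shapes are known, so the layer defers it to run time. For the static path, the usual SAME-padding arithmetic that a helper like ir::calculatePadding performs looks roughly like this (a generic sketch, not the repo's exact code):

#include <algorithm>

// Sketch, not part of the commit: total SAME padding along one spatial axis,
// later split between the top/left and bottom/right sides.
inline int samePadTotal(int in, int out, int stride, int kernel)
{
  return std::max((out - 1) * stride + kernel - in, 0);
}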
@@ -241,16 +251,16 @@ void KernelGenerator::visit(const ir::operation::DepthwiseConv2D &node)
const auto multiplier = node.param().multiplier;
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
- auto ker_alloc = _tensor_builder->portableAt(ker_index).get();
- auto bias_alloc = _tensor_builder->portableAt(bias_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
+ auto ker_tensor = _tensor_builder->portableAt(ker_index).get();
+ auto bias_tensor = _tensor_builder->portableAt(bias_index).get();
auto fn = std::make_unique<ops::DepthwiseConvolutionLayer>();
- fn->configure(ifm_alloc, ker_alloc, bias_alloc, padding.left, padding.right, padding.top,
+ fn->configure(ifm_tensor, ker_tensor, bias_tensor, padding.left, padding.right, padding.top,
padding.bottom, stride.horizontal, stride.vertical, multiplier, activation,
- ofm_alloc);
+ ofm_tensor);
_return_fn = std::move(fn);
}
@@ -270,13 +280,13 @@ void KernelGenerator::visit(const ir::operation::MaxPool2D &node)
ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::MaxPoolLayer>();
- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom,
- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc);
+ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom,
+ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -295,13 +305,13 @@ void KernelGenerator::visit(const ir::operation::AvgPool2D &node)
ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::AvgPoolLayer>();
- fn->configure(ifm_alloc, padding.left, padding.right, padding.top, padding.bottom,
- stride.horizontal, stride.vertical, kw, kh, activation, ofm_alloc);
+ fn->configure(ifm_tensor, padding.left, padding.right, padding.top, padding.bottom,
+ stride.horizontal, stride.vertical, kw, kh, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -313,7 +323,7 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
const auto rank = _ctx.at(ofm_index).shape().rank();
const auto axis = ops::getAxis(rank, node.param().axis, _current_op_seq_layout);
- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
+ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
std::vector<const IPortableTensor *> input_tensors;
for (auto &ifm_idx : node.getInputs())
@@ -321,7 +331,33 @@ void KernelGenerator::visit(const ir::operation::Concat &node)
auto fn = std::make_unique<ops::ConcatLayer>();
- fn->configure(input_tensors, axis, output_alloc);
+ fn->configure(input_tensors, axis, output_tensor);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::BatchToSpaceND &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::BatchToSpaceND::INPUT)};
+ const auto block_size_index{node.getInputs().at(ir::operation::BatchToSpaceND::BLOCK_SIZE)};
+
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto block_size_tensor = _tensor_builder->portableAt(block_size_index).get();
+
+ auto fn = std::make_unique<ops::BatchToSpaceNDLayer>();
+
+ IPortableTensor *crops_tensor = nullptr;
+ const auto NNApiInputs = 2;
+
+ if (node.getInputs().size() != NNApiInputs)
+ {
+ const auto crops_data_index{node.getInputs().at(ir::operation::BatchToSpaceND::CROPS_DATA)};
+ crops_tensor = _tensor_builder->portableAt(crops_data_index).get();
+ }
+
+ fn->configure(input_tensor, output_tensor, block_size_tensor, crops_tensor);
_return_fn = std::move(fn);
}
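The NNApiInputs check above reflects the two frontends: the NNAPI form of BATCH_TO_SPACE_ND carries only (input, block_size) and implies zero cropping, while the TFLite form adds an explicit crops tensor. A missing third input therefore leaves crops_tensor as nullptr, which the layer presumably treats as "no crop".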
@@ -332,13 +368,13 @@ void KernelGenerator::visit(const ir::operation::Fill &node)
const auto input_index{node.getInputs().at(ir::operation::Fill::Input::INPUT)};
const auto value_index{node.getInputs().at(ir::operation::Fill::Input::VALUE)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto value_alloc = _tensor_builder->portableAt(value_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto value_tensor = _tensor_builder->portableAt(value_index).get();
auto fn = std::make_unique<ops::FillLayer>();
- fn->configure(input_alloc, value_alloc, output_alloc);
+ fn->configure(input_tensor, value_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -353,15 +389,16 @@ void KernelGenerator::visit(const ir::operation::FullyConnected &node)
const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
const auto activation = node.param().activation;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto weight_alloc = _tensor_builder->portableAt(weight_index).get();
- auto bias_alloc =
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto weight_tensor = _tensor_builder->portableAt(weight_index).get();
+ auto bias_tensor =
bias_index.undefined() ? nullptr : _tensor_builder->portableAt(bias_index).get();
auto fn = std::make_unique<ops::FullyConnectedLayer>();
- fn->configure(input_alloc, weight_alloc, bias_alloc, activation, output_alloc);
+ fn->configure(input_tensor, weight_tensor, bias_tensor, activation, output_tensor,
+ _external_context);
_return_fn = std::move(fn);
}
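Note that FullyConnected is the one kernel in this excerpt that receives the shared _external_context, presumably so that its ruy-backed matrix multiply runs on the session-wide thread pool configured in ExternalContext.h rather than creating a pool of its own.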
@@ -371,21 +408,21 @@ void KernelGenerator::visit(const ir::operation::Reshape &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Reshape::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
// optional 2nd input
- IPortableTensor *shape_alloc = nullptr;
+ IPortableTensor *shape_tensor = nullptr;
if (node.getInputs().size() == 2)
{
const auto shape_index{node.getInputs().at(ir::operation::Reshape::Input::SHAPE)};
- shape_alloc = _tensor_builder->portableAt(shape_index).get();
+ shape_tensor = _tensor_builder->portableAt(shape_index).get();
}
auto fn = std::make_unique<ops::ReshapeLayer>();
- fn->configure(input_alloc, shape_alloc, output_alloc);
+ fn->configure(input_tensor, shape_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -394,13 +431,13 @@ void KernelGenerator::visit(const ir::operation::Squeeze &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Squeeze::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
// Squeeze can share the same kernel as Reshape
auto fn = std::make_unique<ops::ReshapeLayer>();
- fn->configure(input_alloc, nullptr, output_alloc);
+ fn->configure(input_tensor, nullptr, output_tensor);
_return_fn = std::move(fn);
}
@@ -412,12 +449,12 @@ void KernelGenerator::visit(const ir::operation::Softmax &node)
const auto beta = node.param().beta;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::SoftMaxLayer>();
- fn->configure(input_alloc, beta, output_alloc);
+ fn->configure(input_tensor, beta, output_tensor);
_return_fn = std::move(fn);
}
@@ -430,13 +467,13 @@ void KernelGenerator::visit(const ir::operation::Add &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::AddLayer>();
- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -447,15 +484,15 @@ void KernelGenerator::visit(const ir::operation::Comparison &node)
const auto lhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT0)};
const auto rhs_index{node.getInputs().at(ir::operation::Comparison::Input::INPUT1)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto comparison_type = node.param().comparison_type;
auto fn = std::make_unique<ops::CompareLayer>();
- fn->configure(lhs_alloc, rhs_alloc, comparison_type, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, comparison_type, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -466,11 +503,11 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
const auto input_index{node.getInputs().at(ir::operation::Gather::Input::INPUT)};
const auto indices_index{node.getInputs().at(ir::operation::Gather::Input::INDICES)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto indices_alloc = _tensor_builder->portableAt(indices_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto indices_tensor = _tensor_builder->portableAt(indices_index).get();
- const auto backend_layout = output_alloc->layout();
+ const auto backend_layout = output_tensor->layout();
UNUSED_RELEASE(backend_layout);
// NOTE The frontend layout and backend layout must be the same for this operation.
@@ -481,8 +518,8 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
// a model. For example, if a model in NHWC has this operation with output rank == 4, indices
// rank == 2 and axis == 2, this operation should work on the axes W and C, but the axes W
// and C are not adjacent in NCHW. So a backend in NCHW cannot handle this case.
- assert(backend_layout == input_alloc->layout());
- assert(backend_layout == indices_alloc->layout());
+ assert(backend_layout == input_tensor->layout());
+ assert(backend_layout == indices_tensor->layout());
const auto &input_shape = _ctx.at(input_index).shape();
UNUSED_RELEASE(input_shape);
assert(input_shape.rank() < 4 || _current_op_seq_layout == backend_layout);
@@ -492,7 +529,7 @@ void KernelGenerator::visit(const ir::operation::Gather &node)
auto fn = std::make_unique<ops::GatherLayer>();
- fn->configure(input_alloc, indices_alloc, output_alloc, axis_value);
+ fn->configure(input_tensor, indices_tensor, output_tensor, axis_value);
_return_fn = std::move(fn);
}
@@ -506,13 +543,13 @@ void KernelGenerator::visit(const ir::operation::Sub &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::SubLayer>();
- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -526,13 +563,13 @@ void KernelGenerator::visit(const ir::operation::Mul &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::MulLayer>();
- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -547,18 +584,18 @@ void KernelGenerator::visit(const ir::operation::OneHot &node)
const auto axis = node.param().axis;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto indices_alloc = _tensor_builder->portableAt(indices_index).get();
- auto depth_alloc = _tensor_builder->portableAt(depth_index).get();
- auto onvalue_alloc = _tensor_builder->portableAt(onvalue_index).get();
- auto offvalue_alloc = _tensor_builder->portableAt(offvalue_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto indices_tensor = _tensor_builder->portableAt(indices_index).get();
+ auto depth_tensor = _tensor_builder->portableAt(depth_index).get();
+ auto onvalue_tensor = _tensor_builder->portableAt(onvalue_index).get();
+ auto offvalue_tensor = _tensor_builder->portableAt(offvalue_index).get();
- assert(indices_alloc->data_type() == OperandType::INT32);
- assert(axis <= static_cast<int>(indices_alloc->num_dimensions()));
+ assert(indices_tensor->data_type() == OperandType::INT32);
+ assert(axis <= static_cast<int>(indices_tensor->num_dimensions()));
auto fn = std::make_unique<ops::OneHotLayer>();
- fn->configure(indices_alloc, depth_alloc, onvalue_alloc, offvalue_alloc, output_alloc, axis);
+ fn->configure(indices_tensor, depth_tensor, onvalue_tensor, offvalue_tensor, output_tensor, axis);
_return_fn = std::move(fn);
}
@@ -572,13 +609,13 @@ void KernelGenerator::visit(const ir::operation::Div &node)
const auto activation = node.param().activation;
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::DivLayer>();
- fn->configure(lhs_alloc, rhs_alloc, activation, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, activation, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -587,16 +624,16 @@ void KernelGenerator::visit(const ir::operation::Einsum &node)
{
const auto ofm_index{node.getOutputs().at(0)};
- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
- std::vector<const IPortableTensor *> input_allocs;
+ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
+ std::vector<const IPortableTensor *> input_tensors;
for (auto &ifm_idx : node.getInputs())
- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
+ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
const auto equation = node.param().equation;
auto fn = std::make_unique<ops::EinsumLayer>();
- fn->configure(input_allocs, equation, output_alloc);
+ fn->configure(input_tensors, equation, output_tensor);
_return_fn = std::move(fn);
}
@@ -605,14 +642,14 @@ void KernelGenerator::visit(const ir::operation::Custom &node)
{
auto fill_op_info = [&](const ir::OperandIndexSequence &opSeq,
std::vector<custom::TypeInfo> &types,
- std::vector<std::shared_ptr<IPortableTensor>> &allocs) {
+ std::vector<std::shared_ptr<IPortableTensor>> &tensors) {
for (auto &idx : opSeq)
{
const auto &operand = _ctx.at(idx);
// TODO make sure using `_current_op_seq_layout` is correct for custom operations
types.emplace_back(custom::TypeInfo{operand.shape(), operand.typeInfo().type()});
- auto in_alloc = _tensor_builder->portableAt(idx);
- allocs.emplace_back(in_alloc);
+ auto in_tensor = _tensor_builder->portableAt(idx);
+ tensors.emplace_back(in_tensor);
}
};
@@ -634,12 +671,12 @@ void KernelGenerator::visit(const ir::operation::Exp &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::ExpLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -650,13 +687,13 @@ void KernelGenerator::visit(const ir::operation::ExpandDims &node)
const auto input_index{node.getInputs().at(ir::operation::ExpandDims::Input::INPUT)};
const auto axis_index{node.getInputs().at(ir::operation::ExpandDims::Input::AXIS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto axis_alloc = _tensor_builder->portableAt(axis_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto axis_tensor = _tensor_builder->portableAt(axis_index).get();
auto fn = std::make_unique<ops::ExpandDimsLayer>();
- fn->configure(input_alloc, axis_alloc, output_alloc);
+ fn->configure(input_tensor, axis_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -666,12 +703,12 @@ void KernelGenerator::visit(const ir::operation::Logistic &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Logistic::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::LogisticLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -681,12 +718,12 @@ void KernelGenerator::visit(const ir::operation::Tanh &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Tanh::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::TanhLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -700,7 +737,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
assert(-rank <= axis && axis < rank);
- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
+ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
std::vector<const IPortableTensor *> input_tensors;
for (auto &ifm_idx : node.getInputs())
@@ -708,7 +745,7 @@ void KernelGenerator::visit(const ir::operation::Pack &node)
auto fn = std::make_unique<ops::PackLayer>();
- fn->configure(input_tensors, axis, output_alloc);
+ fn->configure(input_tensors, axis, output_tensor);
_return_fn = std::move(fn);
}
@@ -722,7 +759,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
assert(rank == 0 || (-rank <= axis && axis < rank));
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
std::vector<IPortableTensor *> output_tensors;
for (auto &output_idx : node.getOutputs())
@@ -732,7 +769,7 @@ void KernelGenerator::visit(const ir::operation::Unpack &node)
uint32_t axis_resolved = (axis < 0 ? axis + rank : axis);
- fn->configure(input_alloc, axis_resolved, node.param().num, output_tensors);
+ fn->configure(input_tensor, axis_resolved, node.param().num, output_tensors);
_return_fn = std::move(fn);
}
@@ -751,8 +788,16 @@ void KernelGenerator::visit(const ir::operation::Pad &node)
auto fn = std::make_unique<ops::PadLayer>();
- fn->configure(input, output, pad_base, pad_rank);
+ bool isPadV2 = node.getInputs().size() == 3;
+ const void *value = nullptr;
+
+ if (isPadV2)
+ {
+ const auto value_index{node.getInputs().at(ir::operation::Pad::Input::VALUE)};
+ value = reinterpret_cast<const void *>(_ctx.at(value_index).data()->base());
+ }
+ fn->configure(input, output, pad_base, pad_rank, value);
_return_fn = std::move(fn);
}
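For reference, a hedged sketch of how a kernel can consume the `value` pointer passed above: PadV2 supplies a pointer to a constant scalar, while plain Pad passes nullptr. `fillValue` is a hypothetical helper for illustration, not part of ops::PadLayer as shown in this diff.

template <typename T> T fillValue(const void *value)
{
  // PadV2: dereference the constant scalar; Pad: default to zero padding.
  return value != nullptr ? *reinterpret_cast<const T *>(value) : T{0};
}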
@@ -762,13 +807,13 @@ void KernelGenerator::visit(const ir::operation::Max &node)
const auto lhs_index{node.getInputs().at(ir::operation::Max::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Max::Input::RHS)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::MaxLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -779,13 +824,13 @@ void KernelGenerator::visit(const ir::operation::Min &node)
const auto lhs_index{node.getInputs().at(ir::operation::Min::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Min::Input::RHS)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::MinLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -795,12 +840,12 @@ void KernelGenerator::visit(const ir::operation::Cast &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Cast::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::CastLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -810,12 +855,12 @@ void KernelGenerator::visit(const ir::operation::Transpose &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Transpose::Input::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::TransposeLayer>();
- fn->configure(input_alloc, output_alloc, node.param().perm);
+ fn->configure(input_tensor, output_tensor, node.param().perm);
_return_fn = std::move(fn);
}
@@ -827,15 +872,15 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
const auto axes_index{node.getInputs().at(ir::operation::Reduce::Input::AXES)};
const auto keep_dims = node.param().keep_dims;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto axes_alloc = _tensor_builder->portableAt(axes_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto axes_tensor = _tensor_builder->portableAt(axes_index).get();
if (node.param().reduce_type == ir::operation::Reduce::ReduceType::MEAN)
{
auto fn = std::make_unique<ops::MeanLayer>();
- fn->configure(input_alloc, axes_alloc, output_alloc, keep_dims);
+ fn->configure(input_tensor, axes_tensor, output_tensor, keep_dims);
_return_fn = std::move(fn);
}
@@ -844,7 +889,7 @@ void KernelGenerator::visit(const ir::operation::Reduce &node)
auto fn = std::make_unique<ops::ReduceLayer>();
const auto reduce_type = convertReduceType(node.param().reduce_type);
- fn->configure(input_alloc, axes_alloc, output_alloc, reduce_type, keep_dims);
+ fn->configure(input_tensor, axes_tensor, output_tensor, reduce_type, keep_dims);
_return_fn = std::move(fn);
}
@@ -855,12 +900,27 @@ void KernelGenerator::visit(const ir::operation::ReLU &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(0)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::ReLULayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::ReLU6 &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(0)};
+
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::ReLU6Layer>();
+
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -872,14 +932,14 @@ void KernelGenerator::visit(const ir::operation::Select &node)
const auto true_index{node.getInputs().at(ir::operation::Select::Input::INPUT_TRUE)};
const auto false_index{node.getInputs().at(ir::operation::Select::Input::INPUT_FALSE)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto condition_alloc = _tensor_builder->portableAt(condition_index).get();
- auto true_alloc = _tensor_builder->portableAt(true_index).get();
- auto false_alloc = _tensor_builder->portableAt(false_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto condition_tensor = _tensor_builder->portableAt(condition_index).get();
+ auto true_tensor = _tensor_builder->portableAt(true_index).get();
+ auto false_tensor = _tensor_builder->portableAt(false_index).get();
auto fn = std::make_unique<ops::SelectLayer>();
- fn->configure(condition_alloc, true_alloc, false_alloc, output_alloc);
+ fn->configure(condition_tensor, true_tensor, false_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -891,14 +951,14 @@ void KernelGenerator::visit(const ir::operation::Slice &node)
const auto begins_index{node.getInputs().at(ir::operation::Slice::Input::BEGINS)};
const auto sizes_index{node.getInputs().at(ir::operation::Slice::Input::SIZES)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto begins_alloc = _tensor_builder->portableAt(begins_index).get();
- auto sizes_alloc = _tensor_builder->portableAt(sizes_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto begins_tensor = _tensor_builder->portableAt(begins_index).get();
+ auto sizes_tensor = _tensor_builder->portableAt(sizes_index).get();
auto fn = std::make_unique<ops::SliceLayer>();
- fn->configure(input_alloc, begins_alloc, sizes_alloc, output_alloc);
+ fn->configure(input_tensor, begins_tensor, sizes_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -911,11 +971,11 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
const auto ends_index{node.getInputs().at(ir::operation::StridedSlice::Input::ENDS)};
const auto strides_index{node.getInputs().at(ir::operation::StridedSlice::Input::STRIDES)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto starts_alloc = _tensor_builder->portableAt(starts_index).get();
- auto ends_alloc = _tensor_builder->portableAt(ends_index).get();
- auto strides_alloc = _tensor_builder->portableAt(strides_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto starts_tensor = _tensor_builder->portableAt(starts_index).get();
+ auto ends_tensor = _tensor_builder->portableAt(ends_index).get();
+ auto strides_tensor = _tensor_builder->portableAt(strides_index).get();
auto begin_mask = node.param().begin_mask;
auto end_mask = node.param().end_mask;
@@ -923,7 +983,7 @@ void KernelGenerator::visit(const ir::operation::StridedSlice &node)
auto fn = std::make_unique<ops::StridedSliceLayer>();
- fn->configure(input_alloc, starts_alloc, ends_alloc, strides_alloc, output_alloc, begin_mask,
+ fn->configure(input_tensor, starts_tensor, ends_tensor, strides_tensor, output_tensor, begin_mask,
end_mask, shrink_axis_mask);
_return_fn = std::move(fn);
@@ -957,12 +1017,12 @@ void KernelGenerator::visit(const ir::operation::Abs &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Abs::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::AbsLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -972,12 +1032,12 @@ void KernelGenerator::visit(const ir::operation::Sin &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Sin::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::SinLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -987,12 +1047,12 @@ void KernelGenerator::visit(const ir::operation::Cos &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Cos::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::CosLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1002,12 +1062,12 @@ void KernelGenerator::visit(const ir::operation::RSQRT &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::RSQRT::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::RsqrtLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1017,12 +1077,33 @@ void KernelGenerator::visit(const ir::operation::Shape &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Shape::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::ShapeLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::ResizeBilinear &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::INPUT)};
+
+ auto output_height = node.param().height_out;
+ auto output_width = node.param().width_out;
+ auto align_corners = node.param().align_corners;
+ auto half_pixel_centers = node.param().half_pixel_centers;
+
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::ResizeBilinearLayer>();
+
+ fn->configure(input_tensor, output_tensor, output_height, output_width, align_corners,
+ half_pixel_centers);
_return_fn = std::move(fn);
}
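As an aside, a sketch of the source-coordinate mapping these two flags usually select. This mirrors the common TensorFlow convention and is an assumption about ops::ResizeBilinearLayer's semantics, not code taken from this diff.

inline float sourceCoord(int dst, int in_size, int out_size, bool align_corners,
                         bool half_pixel_centers)
{
  // align_corners maps the corner pixels of input and output onto each other.
  const float scale = (align_corners && out_size > 1)
                          ? static_cast<float>(in_size - 1) / (out_size - 1)
                          : static_cast<float>(in_size) / out_size;
  // half_pixel_centers samples at pixel centers rather than pixel origins.
  return half_pixel_centers ? (dst + 0.5f) * scale - 0.5f : dst * scale;
}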
@@ -1033,13 +1114,13 @@ void KernelGenerator::visit(const ir::operation::Reverse &node)
const auto input_index{node.getInputs().at(ir::operation::Reverse::INPUT)};
const auto axis_index{node.getInputs().at(ir::operation::Reverse::AXIS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto axis_alloc = _tensor_builder->portableAt(axis_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto axis_tensor = _tensor_builder->portableAt(axis_index).get();
auto fn = std::make_unique<ops::ReverseLayer>();
- fn->configure(input_alloc, axis_alloc, output_alloc);
+ fn->configure(input_tensor, axis_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1049,12 +1130,12 @@ void KernelGenerator::visit(const ir::operation::Neg &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Neg::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::NegLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1066,12 +1147,12 @@ void KernelGenerator::visit(const ir::operation::ArgMax &node)
const auto axis = node.param().axis;
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::ArgMinMaxLayer>();
- fn->configure(input_alloc, output_alloc, axis, /* is_arg_max */ true);
+ fn->configure(input_tensor, output_tensor, axis, /* is_arg_max */ true);
_return_fn = std::move(fn);
}
@@ -1082,13 +1163,13 @@ void KernelGenerator::visit(const ir::operation::Pow &node)
const auto lhs_index{node.getInputs().at(ir::operation::Pow::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::Pow::RHS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::PowLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ir::Activation::NONE, output_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ir::Activation::NONE, output_tensor);
_return_fn = std::move(fn);
}
@@ -1098,12 +1179,12 @@ void KernelGenerator::visit(const ir::operation::Log &node)
const auto ofm_index{node.getOutputs().at(0)};
const auto ifm_index{node.getInputs().at(ir::operation::Log::Input::INPUT)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto ifm_alloc = _tensor_builder->portableAt(ifm_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto ifm_tensor = _tensor_builder->portableAt(ifm_index).get();
auto fn = std::make_unique<ops::LogLayer>();
- fn->configure(ifm_alloc, ofm_alloc);
+ fn->configure(ifm_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1113,12 +1194,12 @@ void KernelGenerator::visit(const ir::operation::Round &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::Round::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::RoundLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1128,12 +1209,12 @@ void KernelGenerator::visit(const ir::operation::LogicalNot &node)
const auto output_index{node.getOutputs().at(0)};
const auto input_index{node.getInputs().at(ir::operation::LogicalNot::INPUT)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::LogicalNotLayer>();
- fn->configure(input_alloc, output_alloc);
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1144,28 +1225,43 @@ void KernelGenerator::visit(const ir::operation::LogicalOr &node)
const auto lhs_index{node.getInputs().at(0)};
const auto rhs_index{node.getInputs().at(1)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::LogicalOrLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
-void KernelGenerator::visit(const ir::operation::ZerosLike &node)
+void KernelGenerator::visit(const ir::operation::L2Normalization &node)
{
const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)};
+ const auto input_index{node.getInputs().at(0)};
auto output_alloc = _tensor_builder->portableAt(output_index).get();
auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto fn = std::make_unique<ops::ZerosLikeLayer>();
+ auto fn = std::make_unique<ops::L2NormLayer>();
fn->configure(input_alloc, output_alloc);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::ZerosLike &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ZerosLike::INPUT)};
+
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+
+ auto fn = std::make_unique<ops::ZerosLikeLayer>();
+
+ fn->configure(input_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1176,14 +1272,14 @@ void KernelGenerator::visit(const ir::operation::Range &node)
const auto limit_index{node.getInputs().at(ir::operation::Range::LIMIT)};
const auto delta_index{node.getInputs().at(ir::operation::Range::DELTA)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto start_alloc = _tensor_builder->portableAt(start_index).get();
- auto limit_alloc = _tensor_builder->portableAt(limit_index).get();
- auto delta_alloc = _tensor_builder->portableAt(delta_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto start_tensor = _tensor_builder->portableAt(start_index).get();
+ auto limit_tensor = _tensor_builder->portableAt(limit_index).get();
+ auto delta_tensor = _tensor_builder->portableAt(delta_index).get();
auto fn = std::make_unique<ops::RangeLayer>();
- fn->configure(start_alloc, limit_alloc, delta_alloc, output_alloc);
+ fn->configure(start_tensor, limit_tensor, delta_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1193,13 +1289,13 @@ void KernelGenerator::visit(const ir::operation::SquaredDifference &node)
const auto lhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::SquaredDifference::Input::RHS)};
- auto ofm_alloc = _tensor_builder->portableAt(ofm_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto ofm_tensor = _tensor_builder->portableAt(ofm_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
auto fn = std::make_unique<ops::SqDiffLayer>();
- fn->configure(lhs_alloc, rhs_alloc, ofm_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, ofm_tensor);
_return_fn = std::move(fn);
}
@@ -1209,13 +1305,13 @@ void KernelGenerator::visit(const ir::operation::Tile &node)
const auto input_index{node.getInputs().at(ir::operation::Tile::INPUT)};
const auto multiples_index{node.getInputs().at(ir::operation::Tile::MULTIPLES)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto multiples_alloc = _tensor_builder->portableAt(multiples_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto multiples_tensor = _tensor_builder->portableAt(multiples_index).get();
auto fn = std::make_unique<ops::TileLayer>();
- fn->configure(input_alloc, multiples_alloc, output_alloc);
+ fn->configure(input_tensor, multiples_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1226,14 +1322,14 @@ void KernelGenerator::visit(const ir::operation::MatrixBandPart &node)
const auto num_lower_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_LOWER_DIAG)};
const auto num_upper_index{node.getInputs().at(ir::operation::MatrixBandPart::NUM_UPPER_DIAG)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto num_lower_alloc = _tensor_builder->portableAt(num_lower_index).get();
- auto num_upper_alloc = _tensor_builder->portableAt(num_upper_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto num_lower_tensor = _tensor_builder->portableAt(num_lower_index).get();
+ auto num_upper_tensor = _tensor_builder->portableAt(num_upper_index).get();
auto fn = std::make_unique<ops::MatrixBandPartLayer>();
- fn->configure(input_alloc, num_lower_alloc, num_upper_alloc, output_alloc);
+ fn->configure(input_tensor, num_lower_tensor, num_upper_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1243,16 +1339,16 @@ void KernelGenerator::visit(const ir::operation::BatchMatMul &node)
const auto lhs_index{node.getInputs().at(ir::operation::BatchMatMul::LHS)};
const auto rhs_index{node.getInputs().at(ir::operation::BatchMatMul::RHS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto lhs_alloc = _tensor_builder->portableAt(lhs_index).get();
- auto rhs_alloc = _tensor_builder->portableAt(rhs_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto lhs_tensor = _tensor_builder->portableAt(lhs_index).get();
+ auto rhs_tensor = _tensor_builder->portableAt(rhs_index).get();
const auto adj_x = node.param().adj_x;
const auto adj_y = node.param().adj_y;
auto fn = std::make_unique<ops::BatchMatMulLayer>();
- fn->configure(lhs_alloc, rhs_alloc, adj_x, adj_y, output_alloc);
+ fn->configure(lhs_tensor, rhs_tensor, adj_x, adj_y, output_tensor);
_return_fn = std::move(fn);
}
@@ -1262,13 +1358,13 @@ void KernelGenerator::visit(const ir::operation::BroadcastTo &node)
const auto input_index{node.getInputs().at(ir::operation::BroadcastTo::INPUT)};
const auto shape_index{node.getInputs().at(ir::operation::BroadcastTo::SHAPE)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto shape_alloc = _tensor_builder->portableAt(shape_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto shape_tensor = _tensor_builder->portableAt(shape_index).get();
auto fn = std::make_unique<ops::BroadcastToLayer>();
- fn->configure(input_alloc, shape_alloc, output_alloc);
+ fn->configure(input_tensor, shape_tensor, output_tensor);
_return_fn = std::move(fn);
}
@@ -1277,10 +1373,10 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node)
{
const auto ofm_index{node.getOutputs().at(0)};
- auto output_alloc = _tensor_builder->portableAt(ofm_index).get();
- std::vector<const IPortableTensor *> input_allocs;
+ auto output_tensor = _tensor_builder->portableAt(ofm_index).get();
+ std::vector<const IPortableTensor *> input_tensors;
for (auto &ifm_idx : node.getInputs())
- input_allocs.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
+ input_tensors.emplace_back(_tensor_builder->portableAt(ifm_idx).get());
const auto epsilon = node.param().epsilon;
const auto is_training = node.param().is_training;
@@ -1288,7 +1384,7 @@ void KernelGenerator::visit(const ir::operation::FusedBatchNorm &node)
auto fn = std::make_unique<ops::FusedBatchNormLayer>();
- fn->configure(input_allocs, epsilon, is_training, data_format, output_alloc);
+ fn->configure(input_tensors, epsilon, is_training, data_format, output_tensor);
_return_fn = std::move(fn);
}
@@ -1301,12 +1397,12 @@ void KernelGenerator::visit(const ir::operation::LogSoftmax &node)
const auto beta = node.param().beta;
const auto axis = node.param().axis;
- auto output_alloc = _tensor_builder->at(output_index).get();
- auto input_alloc = _tensor_builder->at(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
auto fn = std::make_unique<ops::LogSoftMaxLayer>();
- fn->configure(input_alloc, beta, axis, output_alloc);
+ fn->configure(input_tensor, beta, axis, output_tensor);
_return_fn = std::move(fn);
}
@@ -1318,14 +1414,84 @@ void KernelGenerator::visit(const ir::operation::SpaceToBatchND &node)
const auto block_shape_index{node.getInputs().at(ir::operation::SpaceToBatchND::BLOCK_SIZE)};
const auto padding_index{node.getInputs().at(ir::operation::SpaceToBatchND::PADDINGS)};
- auto output_alloc = _tensor_builder->portableAt(output_index).get();
- auto input_alloc = _tensor_builder->portableAt(input_index).get();
- auto block_shape_alloc = _tensor_builder->portableAt(block_shape_index).get();
- auto padding_alloc = _tensor_builder->portableAt(padding_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto block_shape_tensor = _tensor_builder->portableAt(block_shape_index).get();
+ auto padding_tensor = _tensor_builder->portableAt(padding_index).get();
auto fn = std::make_unique<ops::SpaceToBatchNDLayer>();
- fn->configure(input_alloc, block_shape_alloc, padding_alloc, output_alloc);
+ fn->configure(input_tensor, block_shape_tensor, padding_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::Quantize &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::Quantize::Input::INPUT)};
+ const auto output_index{node.getOutputs().at(0)};
+
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+
+ auto fn = std::make_unique<ops::QuantizeLayer>();
+
+ fn->configure(input_tensor, output_tensor);
+
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::SpaceToDepth &node)
+{
+ const auto input_index{node.getInputs().at(ir::operation::SpaceToDepth::Input::INPUT)};
+ const auto output_index{node.getOutputs().at(0)};
+ auto block_size = node.param().block_size;
+
+ auto input_tensor = _tensor_builder->portableAt(input_index).get();
+ auto output_tensor = _tensor_builder->portableAt(output_index).get();
+
+ auto fn = std::make_unique<ops::SpaceToDepthLayer>();
+
+ fn->configure(input_tensor, block_size, output_tensor);
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::StatelessRandomUniform &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto shape_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SHAPE)};
+ const auto seed_index{node.getInputs().at(ir::operation::StatelessRandomUniform::SEED)};
+
+ auto output_alloc = _tensor_builder->portableAt(output_index).get();
+ auto shape_alloc = _tensor_builder->portableAt(shape_index).get();
+ auto seed_alloc = _tensor_builder->portableAt(seed_index).get();
+
+ auto fn = std::make_unique<ops::StatelessRandomUniformLayer>();
+
+ fn->configure(shape_alloc, seed_alloc, output_alloc);
+ _return_fn = std::move(fn);
+}
+
+void KernelGenerator::visit(const ir::operation::SplitV &node)
+{
+ const auto num_splits = node.param().num_splits;
+ assert(num_splits == static_cast<int>(node.getOutputs().size()));
+
+ const auto input_idx{node.getInputs().at(ir::operation::SplitV::Input::INPUT)};
+ const auto size_splits{node.getInputs().at(ir::operation::SplitV::Input::SIZE_SPLITS)};
+ const auto split_dim{node.getInputs().at(ir::operation::SplitV::Input::SPLIT_DIM)};
+
+ auto in_tensor = _tensor_builder->portableAt(input_idx).get();
+ auto in_size_splits = _tensor_builder->portableAt(size_splits).get();
+ auto in_split_dim = _tensor_builder->portableAt(split_dim).get();
+
+ std::vector<IPortableTensor *> out_tensors;
+ for (auto &output_idx : node.getOutputs())
+ out_tensors.emplace_back(_tensor_builder->portableAt(output_idx).get());
+
+ auto fn = std::make_unique<ops::SplitVLayer>();
+
+ fn->configure(in_tensor, in_size_splits, in_split_dim, num_splits, out_tensors);
_return_fn = std::move(fn);
}
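Taken together, the KernelGenerator changes are almost entirely the mechanical `*_alloc` to `*_tensor` rename plus a handful of new visitors, all with the same shape. A minimal sketch of that shape, where `Foo` and `ops::FooLayer` are hypothetical names and the builder and layer calls are the ones shown above:

void KernelGenerator::visit(const ir::operation::Foo &node)
{
  const auto output_index{node.getOutputs().at(0)};
  const auto input_index{node.getInputs().at(0)};

  // Resolve operand indices to backend tensors, configure the layer once,
  // and hand the ready-to-run function back to the executor.
  auto output_tensor = _tensor_builder->portableAt(output_index).get();
  auto input_tensor = _tensor_builder->portableAt(input_index).get();

  auto fn = std::make_unique<ops::FooLayer>();
  fn->configure(input_tensor, output_tensor);
  _return_fn = std::move(fn);
}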
diff --git a/runtime/onert/backend/cpu/KernelGenerator.h b/runtime/onert/backend/cpu/KernelGenerator.h
index d6f4c2825..40c056a96 100644
--- a/runtime/onert/backend/cpu/KernelGenerator.h
+++ b/runtime/onert/backend/cpu/KernelGenerator.h
@@ -17,6 +17,7 @@
#ifndef __ONERT_BACKEND_CPU_KERNEL_GENERATOR_H__
#define __ONERT_BACKEND_CPU_KERNEL_GENERATOR_H__
+#include "ExternalContext.h"
#include "TensorBuilder.h"
#include "Tensor.h"
@@ -37,7 +38,8 @@ class KernelGenerator : public IKernelGenerator
public:
KernelGenerator(const ir::Operands &operands_ctx, const ir::Operations &operations_ctx,
const std::shared_ptr<TensorBuilder> &tensor_builder,
- const std::shared_ptr<custom::IKernelBuilder> &kernel_builder);
+ const std::shared_ptr<custom::IKernelBuilder> &kernel_builder,
+ const std::shared_ptr<ExternalContext> &external_context);
using IKernelGenerator::visit;
@@ -74,6 +76,7 @@ public:
void visit(const ir::operation::Transpose &) override;
void visit(const ir::operation::Reduce &) override;
void visit(const ir::operation::ReLU &) override;
+ void visit(const ir::operation::ReLU6 &) override;
void visit(const ir::operation::Select &) override;
void visit(const ir::operation::Slice &) override;
void visit(const ir::operation::StridedSlice &) override;
@@ -83,6 +86,7 @@ public:
void visit(const ir::operation::Sin &) override;
void visit(const ir::operation::RSQRT &) override;
void visit(const ir::operation::Shape &) override;
+ void visit(const ir::operation::ResizeBilinear &node) override;
void visit(const ir::operation::Reverse &) override;
void visit(const ir::operation::Neg &) override;
void visit(const ir::operation::ArgMax &) override;
@@ -94,13 +98,19 @@ public:
void visit(const ir::operation::SquaredDifference &) override;
void visit(const ir::operation::Tile &) override;
void visit(const ir::operation::LogicalOr &) override;
+ void visit(const ir::operation::L2Normalization &) override;
void visit(const ir::operation::Range &) override;
void visit(const ir::operation::MatrixBandPart &) override;
void visit(const ir::operation::BatchMatMul &) override;
+ void visit(const ir::operation::BatchToSpaceND &) override;
void visit(const ir::operation::BroadcastTo &) override;
void visit(const ir::operation::FusedBatchNorm &) override;
void visit(const ir::operation::LogSoftmax &) override;
void visit(const ir::operation::SpaceToBatchND &) override;
+ void visit(const ir::operation::Quantize &) override;
+ void visit(const ir::operation::SpaceToDepth &) override;
+ void visit(const ir::operation::StatelessRandomUniform &) override;
+ void visit(const ir::operation::SplitV &) override;
private:
const ir::Operands &_ctx;
@@ -108,6 +118,7 @@ private:
std::shared_ptr<TensorBuilder> _tensor_builder;
std::shared_ptr<backend::custom::IKernelBuilder> _kernel_builder;
ir::Layout _current_op_seq_layout;
+ const std::shared_ptr<ExternalContext> _external_context;
};
} // namespace cpu
diff --git a/runtime/onert/backend/cpu/StaticTensorManager.cc b/runtime/onert/backend/cpu/StaticTensorManager.cc
new file mode 100644
index 000000000..78c98dabf
--- /dev/null
+++ b/runtime/onert/backend/cpu/StaticTensorManager.cc
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "StaticTensorManager.h"
+#include "Tensor.h"
+
+#include <util/logging.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+
+StaticTensorManager::StaticTensorManager(const std::shared_ptr<cpu_common::TensorRegistry> &reg,
+ cpu_common::DynamicTensorManager *dynamic_tensor_manager)
+ : _nonconst_mgr{new cpu_common::MemoryManager()}, _tensors{reg},
+ _dynamic_tensor_manager{dynamic_tensor_manager}
+{
+ // DO NOTHING
+}
+
+void StaticTensorManager::allocateNonconsts(void)
+{
+ _nonconst_mgr->allocate();
+
+ for (auto &pair : _tensors->native_tensors())
+ {
+ const auto &ind = pair.first;
+ auto tensor = pair.second;
+ if (!_as_constants[ind] && !tensor->is_dynamic())
+ {
+ auto *buffer = _nonconst_mgr->getBuffer(ind);
+ tensor->setBuffer(buffer);
+
+ VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value()
+ << "): " << static_cast<void *>(buffer) << std::endl;
+ }
+ }
+}
+
+void StaticTensorManager::deallocateNonconsts(void) { _nonconst_mgr->deallocate(); }
+
+void StaticTensorManager::buildTensor(const ir::OperandIndex &ind,
+ const ir::OperandInfo &tensor_info, ir::Layout backend_layout,
+ bool as_const)
+{
+ assert(!_tensors->getITensor(ind));
+ if (as_const)
+ {
+ auto tensor = std::make_shared<ExternalTensor>(tensor_info, backend_layout);
+ _tensors->setNativeTensor(ind, tensor);
+ }
+ else
+ {
+ auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, _dynamic_tensor_manager);
+ _tensors->setNativeTensor(ind, tensor);
+ }
+ _as_constants[ind] = as_const;
+}
+
+void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
+{
+ assert(_tensors->getITensor(ind));
+
+ // This method is called only when the tensor has a proper, static shape
+ assert(!_tensors->getITensor(ind)->is_dynamic());
+
+ if (!_as_constants[ind])
+ _nonconst_mgr->claimPlan(ind, size);
+}
+
+void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
+{
+ assert(_tensors->getITensor(ind));
+
+ // This method is called only when the tensor has a proper, static shape
+ assert(!_tensors->getITensor(ind)->is_dynamic());
+
+ if (!_as_constants[ind])
+ _nonconst_mgr->releasePlan(ind);
+}
+
+void StaticTensorManager::iterate(const std::function<void(const ir::OperandIndex &)> &fn)
+{
+ for (const auto &it : _tensors->native_tensors())
+ fn(it.first);
+}
+
+} // namespace cpu
+} // namespace backend
+} // namespace onert
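A hedged sketch of the lifecycle this new manager expects, judging from the methods above; `reg`, `dyn_mgr`, `ind`, `info`, and `size` are placeholders:

StaticTensorManager mgr{reg, dyn_mgr.get()};
mgr.buildTensor(ind, info, ir::Layout::NHWC, /* as_const = */ false);
mgr.claimPlan(ind, size); // join memory planning (non-const, static shape only)
mgr.releasePlan(ind);     // after the operand's last use
mgr.allocateNonconsts();  // assigns planned buffers to non-const native tensors
// ... run the model ...
mgr.deallocateNonconsts();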
diff --git a/runtime/onert/backend/cpu/StaticTensorManager.h b/runtime/onert/backend/cpu/StaticTensorManager.h
new file mode 100644
index 000000000..2af61e4e7
--- /dev/null
+++ b/runtime/onert/backend/cpu/StaticTensorManager.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
+#define __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
+
+#include "backend/IStaticTensorManager.h"
+#include "backend/cpu_common/DynamicTensorManager.h"
+#include "backend/cpu_common/MemoryManager.h"
+#include "backend/cpu_common/TensorRegistry.h"
+#include "backend/ITensorManager.h"
+#include "ir/OperandIndexMap.h"
+#include "ir/OperandInfo.h"
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+
+class StaticTensorManager : public backend::IStaticTensorManager
+{
+public:
+ StaticTensorManager(const std::shared_ptr<cpu_common::TensorRegistry> &reg,
+ cpu_common::DynamicTensorManager *dynamic_tensor_manager);
+ virtual ~StaticTensorManager() = default;
+
+ void allocateNonconsts(void);
+ void deallocateNonconsts(void);
+
+ void buildTensor(const ir::OperandIndex &ind, const ir::OperandInfo &tensor_info,
+ ir::Layout backend_layout, bool as_const);
+
+ void claimPlan(const ir::OperandIndex &ind, uint32_t size);
+ void releasePlan(const ir::OperandIndex &ind);
+
+ void iterate(const std::function<void(const ir::OperandIndex &)> &fn);
+
+private:
+ std::unique_ptr<cpu_common::MemoryManager> _nonconst_mgr;
+ const std::shared_ptr<cpu_common::TensorRegistry> _tensors;
+ ir::OperandIndexMap<bool> _as_constants;
+ cpu_common::DynamicTensorManager *_dynamic_tensor_manager;
+};
+
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_STATICTENSOR_MANAGER_H__
diff --git a/runtime/onert/backend/cpu/Tensor.h b/runtime/onert/backend/cpu/Tensor.h
index 4dd251bd3..20e60260c 100644
--- a/runtime/onert/backend/cpu/Tensor.h
+++ b/runtime/onert/backend/cpu/Tensor.h
@@ -29,15 +29,22 @@ namespace cpu
using Tensor = cpu_common::Tensor;
-// Tensor which has data from external. To support this, assume below things
-// no padding, always NHWC layout, constant tensor and not dynamic
+/**
+ * @brief Class that uses data from external memory, instead of allocating and
+ * copying it, for tensors not managed by this backend. ExternalTensor's data
+ * pointer points to memory that is already allocated elsewhere, or to an
+ * mmapped area, which means ExternalTensor can take any kind of ir::Data.
+ * To support this, the following are assumed: no padding, always NHWC layout,
+ * constant tensor, and not dynamic.
+ */
class ExternalTensor : public Tensor
{
public:
ExternalTensor() = delete;
public:
- ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout) : Tensor(info, layout)
+ ExternalTensor(const ir::OperandInfo &info, const ir::Layout layout)
+ : Tensor(info, layout, nullptr)
{
assert(_layout == ir::Layout::NHWC);
assert(_info.isConstant());
@@ -45,6 +52,11 @@ public:
}
public:
+ /**
+ * @brief Set data shared from external memory so that this ExternalTensor is not
+ * allocated on the CPU backend
+ * @param[in] data data of Operand to be set
+ */
void setData(const std::shared_ptr<ir::Data> data)
{
assert(data != nullptr);
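A short usage sketch of the ExternalTensor path, assuming `info` describes a constant NHWC operand and `data` is a shared ir::Data backed by, e.g., an mmapped model file (both placeholders):

auto tensor = std::make_shared<ExternalTensor>(info, ir::Layout::NHWC);
tensor->setData(data); // reads directly from the external memory; no copy made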
diff --git a/runtime/onert/backend/cpu/TensorBuilder.cc b/runtime/onert/backend/cpu/TensorBuilder.cc
index 886e8d820..ab8ba5756 100644
--- a/runtime/onert/backend/cpu/TensorBuilder.cc
+++ b/runtime/onert/backend/cpu/TensorBuilder.cc
@@ -29,8 +29,8 @@ namespace cpu
TensorBuilder::TensorBuilder()
: _tensor_reg{new cpu_common::TensorRegistry()},
- _static_tensor_mgr{new cpu_common::StaticTensorManager(_tensor_reg)},
- _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)}
+ _dynamic_tensor_mgr{new cpu_common::DynamicTensorManager(_tensor_reg)},
+ _static_tensor_mgr{new StaticTensorManager(_tensor_reg, _dynamic_tensor_mgr.get())}
{
/* empty */
}
@@ -77,11 +77,7 @@ bool TensorBuilder::isRegistered(const ir::OperandIndex &ind) const
return _tensor_info_map.find(ind) != _tensor_info_map.end();
}
-void TensorBuilder::prepare(void)
-{
- _static_tensor_mgr->allocateConsts();
- _static_tensor_mgr->allocateNonconsts();
-}
+void TensorBuilder::prepare(void) { _static_tensor_mgr->allocateNonconsts(); }
void TensorBuilder::allocate()
{
@@ -99,17 +95,17 @@ std::shared_ptr<IPortableTensor> TensorBuilder::portableAt(const ir::OperandInde
return _tensor_reg->getPortableTensor(ind);
}
-bool TensorBuilder::setExternalTensor(const ir::OperandIndex &ind,
- const std::shared_ptr<IPortableTensor> &tensor)
+bool TensorBuilder::setMigrantTensor(const ir::OperandIndex &ind,
+ const std::shared_ptr<IPortableTensor> &tensor)
{
- return _tensor_reg->setExternalTensor(ind, tensor);
+ return _tensor_reg->setMigrantTensor(ind, tensor);
}
void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->iterate(fn); }
-std::shared_ptr<cpu_common::Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
+std::shared_ptr<Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
{
- return _tensor_reg->getManagedTensor(ind);
+ return _tensor_reg->getNativeTensor(ind);
}
std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void)
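As the renames suggest, the registry now distinguishes native tensors (created by this backend) from migrant tensors (owned elsewhere and only registered here). A hedged illustration with placeholder indices and tensors:

builder.setMigrantTensor(other_ind, tensor_from_other_backend); // register only
auto native = builder.at(own_ind);       // std::shared_ptr<Tensor>, native only
auto portable = builder.portableAt(ind); // resolves either kind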
diff --git a/runtime/onert/backend/cpu/TensorBuilder.h b/runtime/onert/backend/cpu/TensorBuilder.h
index ba25451ec..617136514 100644
--- a/runtime/onert/backend/cpu/TensorBuilder.h
+++ b/runtime/onert/backend/cpu/TensorBuilder.h
@@ -18,13 +18,14 @@
#define __ONERT_BACKEND_CPU_TENSOR_BUILDER_H__
#include <backend/cpu_common/DynamicTensorManager.h>
-#include <backend/cpu_common/StaticTensorManager.h>
#include <backend/cpu_common/TensorRegistry.h>
-#include <backend/cpu_common/Tensor.h>
#include <backend/ITensorBuilder.h>
#include <ir/OperandIndexMap.h>
+#include "StaticTensorManager.h"
+#include "Tensor.h"
+
#include <unordered_map>
namespace onert
@@ -80,17 +81,17 @@ public:
* If not, program will crash with assert or exception.
* @return shared_ptr<Tensor>
*/
- std::shared_ptr<cpu_common::Tensor> at(const ir::OperandIndex &ind);
+ std::shared_ptr<Tensor> at(const ir::OperandIndex &ind);
std::shared_ptr<IPortableTensor> portableAt(const ir::OperandIndex &ind);
- bool setExternalTensor(const ir::OperandIndex &ind,
- const std::shared_ptr<IPortableTensor> &tensor) override;
+ bool setMigrantTensor(const ir::OperandIndex &ind,
+ const std::shared_ptr<IPortableTensor> &tensor) override;
std::shared_ptr<ITensorRegistry> tensorRegistry() override { return _tensor_reg; }
private:
const std::shared_ptr<cpu_common::TensorRegistry> _tensor_reg;
- std::unique_ptr<cpu_common::StaticTensorManager> _static_tensor_mgr;
std::unique_ptr<cpu_common::DynamicTensorManager> _dynamic_tensor_mgr;
+ std::unique_ptr<StaticTensorManager> _static_tensor_mgr;
ir::OperandIndexMap<ir::OperandInfo> _tensor_info_map;
};
diff --git a/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc
new file mode 100644
index 000000000..f2f10eb9d
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.cc
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "BatchToSpaceNDLayer.h"
+
+#include <cker/operation/BatchToSpaceND.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+BatchToSpaceNDLayer::BatchToSpaceNDLayer()
+ : _input(nullptr), _output(nullptr), _block_shape(nullptr), _crops(nullptr)
+{
+ // DO NOTHING
+}
+
+template <typename T> void BatchToSpaceNDLayer::batchToSpaceNDGeneric()
+{
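+ // The NN API's BATCH_TO_SPACE_ND has no crops input, so a null `_crops`
+ // falls back to the all-zero crops below.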
+ const int32_t NNapiCrops[]{0, 0, 0, 0};
+ const int32_t *crops_buffer;
+
+ if (_crops == nullptr)
+ {
+ crops_buffer = NNapiCrops;
+ }
+ else
+ {
+ crops_buffer = reinterpret_cast<const int32_t *>(_crops->buffer());
+ }
+ nnfw::cker::BatchToSpaceND<T>(
+ getTensorShape(_input), reinterpret_cast<const T *>(_input->buffer()),
+ reinterpret_cast<const int32_t *>(_block_shape->buffer()), crops_buffer,
+ getTensorShape(_output), reinterpret_cast<T *>(_output->buffer()));
+}
+
+void BatchToSpaceNDLayer::configure(const IPortableTensor *input, IPortableTensor *output,
+ IPortableTensor *block_shape, IPortableTensor *crops)
+{
+ _output = output;
+ _input = input;
+ _block_shape = block_shape;
+ _crops = crops;
+}
+
+void BatchToSpaceNDLayer::run()
+{
+ if (_output->data_type() == OperandType::FLOAT32)
+ {
+ batchToSpaceNDGeneric<float>();
+ }
+ else if (_output->data_type() == OperandType::QUANT_UINT8_ASYMM)
+ {
+ batchToSpaceNDGeneric<uint8_t>();
+ }
+ else
+ {
+ throw std::runtime_error{"NYI"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
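A brief usage sketch of the new layer; `input`, `output`, `block_shape`, and `crops` are placeholder IPortableTensor pointers:

ops::BatchToSpaceNDLayer layer;
layer.configure(input, output, block_shape, crops); // crops may be nullptr
layer.run(); // dispatches on dtype: float32 or asymmetric-quantized uint8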
diff --git a/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.h b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.h
new file mode 100644
index 000000000..6e25b241b
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/BatchToSpaceNDLayer.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_BATCHTOSPACEND_LAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_BATCHTOSPACEND_LAYER_H__
+
+#include <backend/IPortableTensor.h>
+#include "OperationUtils.h"
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+class BatchToSpaceNDLayer : public ::onert::exec::IFunction
+{
+public:
+ BatchToSpaceNDLayer();
+
+public:
+ template <typename T> void batchToSpaceNDGeneric();
+
+ void configure(const IPortableTensor *input, IPortableTensor *output,
+ IPortableTensor *block_shape, IPortableTensor *crops);
+
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
+ IPortableTensor *_block_shape;
+ IPortableTensor *_crops;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_BATCHTOSPACEND_LAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/CompareLayer.cc b/runtime/onert/backend/cpu/ops/CompareLayer.cc
index f557f3ade..adf902aaf 100644
--- a/runtime/onert/backend/cpu/ops/CompareLayer.cc
+++ b/runtime/onert/backend/cpu/ops/CompareLayer.cc
@@ -17,6 +17,7 @@
#include "OperationUtils.h"
+#include <cassert>
#include <cker/operation/Comparison.h>
using namespace nnfw::cker;
namespace onert
@@ -34,6 +35,14 @@ namespace
using OpType = onert::ir::operation::Comparison::ComparisonType;
using namespace onert::backend::cpu;
+// Assumes these enum values are in exactly this order
+static_assert(static_cast<int>(OpType::Equal) == 0, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::NotEqual) == 1, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::Greater) == 2, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::GreaterEqual) == 3, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::Less) == 4, "An OpType value has changed!");
+static_assert(static_cast<int>(OpType::LessEqual) == 5, "An OpType value has changed!");
+
template <typename T>
void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPortableTensor *output,
OpType op_type)
@@ -52,95 +61,33 @@ void compareQuant8(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
&params.input2_shift);
params.is_broadcast = !HaveSameShapes(lhs, rhs);
- if (params.is_broadcast)
- {
- switch (op_type)
- {
- case OpType::Equal:
- Broadcast4DSlowEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::NotEqual:
- Broadcast4DSlowNotEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Greater:
- Broadcast4DSlowGreaterWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::GreaterEqual:
- Broadcast4DSlowGreaterEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Less:
- Broadcast4DSlowLessWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::LessEqual:
- Broadcast4DSlowLessEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- default:
- throw std::runtime_error{"Invalid OpType for CompareLayer"};
- }
- }
- else // if (requires_broadcast == false)
- {
- switch (op_type)
- {
- case OpType::Equal:
- EqualWithScaling(params, getExtendedTensorShape(lhs),
- reinterpret_cast<const T *>(lhs->buffer()), getExtendedTensorShape(rhs),
- reinterpret_cast<const T *>(rhs->buffer()), getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::NotEqual:
- NotEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Greater:
- GreaterWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::GreaterEqual:
- GreaterEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Less:
- LessWithScaling(params, getExtendedTensorShape(lhs),
- reinterpret_cast<const T *>(lhs->buffer()), getExtendedTensorShape(rhs),
- reinterpret_cast<const T *>(rhs->buffer()), getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::LessEqual:
- LessEqualWithScaling(
- params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- default:
- throw std::runtime_error{"Invalid OpType for CompareLayer"};
- }
- }
- return;
+ using CompareFunction =
+ void (*)(ComparisonParams & params, const Shape &input1_shape, const T *input1_data,
+ const Shape &input2_shape, const T *input2_data, const Shape &output_shape,
+ bool *output_data);
+
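+  // Dispatch tables indexed by static_cast<int>(op_type); the entry order must
+  // match the OpType static_asserts at the top of this file.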
+ static const CompareFunction broadcast_fns[] = {
+ Broadcast4DSlowEqualWithScaling, Broadcast4DSlowNotEqualWithScaling,
+ Broadcast4DSlowGreaterWithScaling, Broadcast4DSlowGreaterEqualWithScaling,
+ Broadcast4DSlowLessWithScaling, Broadcast4DSlowLessEqualWithScaling,
+ };
+ static const CompareFunction non_broadcast_fns[] = {
+ EqualWithScaling, NotEqualWithScaling, GreaterWithScaling,
+ GreaterEqualWithScaling, LessWithScaling, LessEqualWithScaling,
+ };
+
+ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns),
+ "Sizes of broadcast_fns and non_broadcast_fns must match!");
+
+ auto index = static_cast<int>(op_type);
+ if (index < 0 || index >= static_cast<int>(sizeof(broadcast_fns) / sizeof(broadcast_fns[0])))
+ throw std::runtime_error{"Invalid OpType for CompareLayer"};
+
+ CompareFunction fn = (params.is_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]);
+
+ fn(params, getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+ getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+ getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
}
template <typename T>
@@ -149,94 +96,33 @@ void compareScalar(const IPortableTensor *lhs, const IPortableTensor *rhs, IPort
{
bool requires_broadcast = !HaveSameShapes(lhs, rhs);
- if (requires_broadcast)
- {
- switch (op_type)
- {
- case OpType::Equal:
- Broadcast4DSlowEqual(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::NotEqual:
- Broadcast4DSlowNotEqual(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Greater:
- Broadcast4DSlowGreater(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::GreaterEqual:
- Broadcast4DSlowGreaterEqual(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Less:
- Broadcast4DSlowLess(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::LessEqual:
- Broadcast4DSlowLessEqual(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- default:
- throw std::runtime_error{"Invalid OpType for CompareLayer"};
- }
- }
- else // if (requires_broadcast == false)
- {
- switch (op_type)
- {
- case OpType::Equal:
- EqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::NotEqual:
- NotEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Greater:
- GreaterNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::GreaterEqual:
- GreaterEqualNoScaling(
- getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::Less:
- LessNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
- break;
- case OpType::LessEqual:
- LessEqualNoScaling(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
- getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
- getExtendedTensorShape(output),
- reinterpret_cast<bool *>(output->buffer()));
- break;
- default:
- throw std::runtime_error{"Invalid OpType for CompareLayer"};
- }
- }
- return;
+ using CompareFunction =
+ void (*)(const Shape &input1_shape, const T *input1_data, const Shape &input2_shape,
+ const T *input2_data, const Shape &output_shape, bool *output_data);
+
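+  // Same dispatch scheme as compareQuant8: tables indexed by the OpType value,
+  // with the order guaranteed by the static_asserts at the top of this file.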
+ static const CompareFunction broadcast_fns[] = {
+ Broadcast4DSlowEqual, Broadcast4DSlowNotEqual, Broadcast4DSlowGreater,
+ Broadcast4DSlowGreaterEqual, Broadcast4DSlowLess, Broadcast4DSlowLessEqual,
+ };
+ static const CompareFunction non_broadcast_fns[] = {
+ EqualNoScaling, NotEqualNoScaling, GreaterNoScaling,
+ GreaterEqualNoScaling, LessNoScaling, LessEqualNoScaling,
+ };
+
+ static_assert(sizeof(broadcast_fns) == sizeof(non_broadcast_fns),
+ "Sizes of broadcast_fns and non_broadcast_fns must match!");
+
+ auto index = static_cast<int>(op_type);
+ if (index < 0 || index >= static_cast<int>(sizeof(broadcast_fns) / sizeof(broadcast_fns[0])))
+ throw std::runtime_error{"Invalid OpType for CompareLayer"};
+
+ CompareFunction fn = (requires_broadcast ? broadcast_fns[index] : non_broadcast_fns[index]);
+
+ fn(getExtendedTensorShape(lhs), reinterpret_cast<const T *>(lhs->buffer()),
+ getExtendedTensorShape(rhs), reinterpret_cast<const T *>(rhs->buffer()),
+ getExtendedTensorShape(output), reinterpret_cast<bool *>(output->buffer()));
}
+
} // namespace
CompareLayer::CompareLayer()
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
index c00be64e5..05da33abf 100644
--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
+++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
@@ -18,6 +18,8 @@
#include "../Tensor.h"
#include <cker/operation/FullyConnected.h>
+#include <cker/TensorUtils.h>
+#include <misc/polymorphic_downcast.h>
namespace onert
{
@@ -31,7 +33,7 @@ namespace ops
FullyConnectedLayer::FullyConnectedLayer()
: _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
_activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()),
- _is_hybrid(false)
+ _external_context(nullptr), _is_hybrid(false)
{
// DO NOTHING
}
@@ -102,7 +104,8 @@ void FullyConnectedLayer::fullyConnectedHybrid()
op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
getTensorShape(_weights), reinterpret_cast<const int8_t *>(_weights->buffer()),
getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena);
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena,
+ _external_context->ruy_context());
#else
nnfw::cker::FullyConnectedHybrid(
op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
@@ -110,31 +113,67 @@ void FullyConnectedLayer::fullyConnectedHybrid()
(_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
: reinterpret_cast<const int8_t *>(_weights->buffer()),
getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
- getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena);
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), temp_arena,
+ _external_context->ruy_context());
-// TODO Enable calling decrease_ref
-#if 0
if (_cached_weights == nullptr || _is_weights_freed)
return;
- auto weight_tensor = dynamic_cast<const Tensor *>(_weights);
- if (weight_tensor)
+  // Reaching this point means '_cached_weights != nullptr && !_is_weights_freed',
+  // i.e. this weight shape satisfies the ruy kernel's prepack cache condition.
+  // Once we pass this point we do not come back, except in the case below where
+  // the input is a zero vector.
+
+  // If every input element is zero, the ruy kernel path is bypassed entirely,
+  // so handle that case here.
+ const int input_size = getTensorShape(_input).FlatSize();
+ if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_input->buffer()), input_size))
+ return;
+
+ auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights);
+
+  // This weight tensor could also be a const tensor used by other ops.
+  // Therefore, check its reference count as follows before releasing it.
+ auto tensor = const_cast<Tensor *>(weight_tensor);
+ if (tensor->buffer() == nullptr) // ref is already 0?
{
- auto tensor = const_cast<Tensor *>(weight_tensor);
+ _is_weights_freed = true;
+ return;
+ }
- tensor->decrease_ref();
- if (tensor->buffer() == nullptr) // ref == 0?
- {
- _is_weights_freed = true;
- }
+ tensor->decrease_ref();
+ if (tensor->buffer() == nullptr) // ref == 0?
+ {
+ _is_weights_freed = true;
}
-#endif // if 0
#endif
}
+void FullyConnectedLayer::fullyConnectedSparseWeight()
+{
+ float output_activation_min = 0, output_activation_max = 0;
+ CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);
+
+ nnfw::cker::FullyConnectedParams op_params;
+ op_params.float_activation_min = output_activation_min;
+ op_params.float_activation_max = output_activation_max;
+ op_params.activation = convertActivationType(_activation);
+
+ int w0_size = getTensorShape(_weights).Dims(0);
+ const uint16_t *w1_segments = _weights->w1_segments();
+ const uint16_t *w1_indices = _weights->w1_indices();
+
+ nnfw::cker::FullyConnectedSparseWeight(
+ op_params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+ getTensorShape(_weights), reinterpret_cast<const float *>(_weights->buffer()),
+ getTensorShape(_bias), reinterpret_cast<const float *>(_bias ? _bias->buffer() : nullptr),
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()), w0_size, w1_segments,
+ w1_indices);
+}
+
void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights,
const IPortableTensor *bias, ir::Activation activation,
- IPortableTensor *output)
+ IPortableTensor *output,
+ const std::shared_ptr<ExternalContext> &external_context)
{
_input = input;
_weights = weights;
@@ -143,6 +182,7 @@ void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortabl
_output = output;
_is_hybrid = input->data_type() == OperandType::FLOAT32 &&
weights->data_type() == OperandType::QUANT_INT8_SYMM;
+ _external_context = external_context;
}
void FullyConnectedLayer::run()
@@ -151,6 +191,10 @@ void FullyConnectedLayer::run()
{
fullyConnectedHybrid();
}
+ else if (_weights->is_sparse())
+ {
+ fullyConnectedSparseWeight();
+ }
else if (_input->data_type() == OperandType::FLOAT32)
{
fullyConnectedFloat32();
@@ -167,7 +211,16 @@ void FullyConnectedLayer::run()
void FullyConnectedLayer::prepare()
{
-#ifdef USE_RUY_GEMV
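+  // A constant bias that is entirely zero contributes nothing to the output,
+  // so drop it here and let the kernels skip the bias addition.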
+ if (_bias && _bias->is_constant())
+ {
+ const int bias_size = getTensorShape(_bias).FlatSize();
+ if (nnfw::cker::IsZeroVector(reinterpret_cast<float *>(_bias->buffer()), bias_size))
+ {
+ _bias = nullptr;
+ }
+ }
+
+#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(USE_RUY_GEMV)
// TODO This is workaround
// The only fc hybrid will use ruy kernel
if (_input->data_type() != OperandType::FLOAT32 ||
diff --git a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
index dd5ef2436..f1242677c 100644
--- a/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
+++ b/runtime/onert/backend/cpu/ops/FullyConnectedLayer.h
@@ -18,6 +18,7 @@
#define __ONERT_BACKEND_CPU_OPS_FULLYCONNECTEDLAYER_H__
#include <backend/IPortableTensor.h>
+#include "../ExternalContext.h"
#include "OperationUtils.h"
#include <exec/IFunction.h>
@@ -52,8 +53,11 @@ public:
void fullyConnectedHybrid();
+ void fullyConnectedSparseWeight();
+
void configure(const IPortableTensor *input, const IPortableTensor *weights,
- const IPortableTensor *bias, ir::Activation activation, IPortableTensor *output);
+ const IPortableTensor *bias, ir::Activation activation, IPortableTensor *output,
+ const std::shared_ptr<ExternalContext> &external_context);
void run() override;
@@ -68,10 +72,13 @@ private:
ir::Activation _activation;
std::unique_ptr<nnfw::cker::FCTempArena> _temp_arena;
+ std::shared_ptr<ExternalContext> _external_context;
+
bool _is_hybrid;
#ifdef USE_RUY_GEMV
uint8_t *_cached_weights = nullptr; // weights to be cached and a key
+  bool _is_weights_freed = false; // true once the cached weights have been released
#endif
};
diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.cc b/runtime/onert/backend/cpu/ops/L2NormLayer.cc
new file mode 100644
index 000000000..0d99b0586
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/L2NormLayer.cc
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "L2NormLayer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/L2Normalize.h>
+#include <cker/Types.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+void L2NormLayer::configure(const IPortableTensor *input, IPortableTensor *output)
+{
+ assert(input != nullptr);
+ assert(output != nullptr);
+
+ _input = input;
+ _output = output;
+}
+
+void L2NormLayer::run()
+{
+ switch (_input->data_type())
+ {
+ case OperandType::FLOAT32:
+ nnfw::cker::L2NormalizeFloat32(
+ getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ break;
+
+ case OperandType::QUANT_UINT8_ASYMM:
+ {
+ nnfw::cker::L2NormParams params;
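+      // The cker quant8 kernel assumes an input zero point of 128, hence the
+      // assert below.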
+ assert(_input->data_offset() == 128);
+ params.input_zero_point = _input->data_offset();
+ nnfw::cker::L2NormalizeQuant8(
+ params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+ }
+ break;
+
+ default:
+ throw std::runtime_error{"L2Norm: Unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/cpu/ops/L2NormLayer.h b/runtime/onert/backend/cpu/ops/L2NormLayer.h
new file mode 100644
index 000000000..63f2d1133
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/L2NormLayer.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+class L2NormLayer : public ::onert::exec::IFunction
+{
+public:
+ L2NormLayer() : _input(nullptr), _output(nullptr)
+ {
+ // Nothing
+ }
+
+public:
+  void configure(const IPortableTensor *input, IPortableTensor *output);
+
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_L2NORM_LAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
index d71e325ac..06dde4fc4 100644
--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
+++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.cc
@@ -49,8 +49,8 @@ void LogSoftMaxLayer::logsoftmaxQuant8()
// NYI
}
-void LogSoftMaxLayer::configure(const Tensor *input, const float beta, const int axis,
- Tensor *output)
+void LogSoftMaxLayer::configure(const IPortableTensor *input, const float beta, const int axis,
+ IPortableTensor *output)
{
_input = input;
_output = output;
diff --git a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
index bc145cea7..ba9deca17 100644
--- a/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
+++ b/runtime/onert/backend/cpu/ops/LogSoftMaxLayer.h
@@ -40,13 +40,14 @@ public:
void logsoftmaxQuant8();
- void configure(const Tensor *input, const float beta, const int axis, Tensor *output);
+ void configure(const IPortableTensor *input, const float beta, const int axis,
+ IPortableTensor *output);
void run();
private:
- const Tensor *_input;
- Tensor *_output;
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
float _beta;
int _axis;
diff --git a/runtime/onert/backend/cpu/ops/OperationUtils.h b/runtime/onert/backend/cpu/ops/OperationUtils.h
index 8d29374ff..98385521a 100644
--- a/runtime/onert/backend/cpu/ops/OperationUtils.h
+++ b/runtime/onert/backend/cpu/ops/OperationUtils.h
@@ -52,6 +52,17 @@ union DataPtr {
void *v;
};
+union ConstDataPtr {
+ const uint8_t *u8;
+ const int8_t *i8;
+ const uint32_t *u32;
+ const int32_t *i32;
+ const bool *b;
+ const float *f;
+ const int64_t *i64;
+ const void *v;
+};
+
uint32_t getNumberOfDimensions(const IPortableTensor *tensor);
uint32_t getNumberOfElements(const IPortableTensor *tensor);
diff --git a/runtime/onert/backend/cpu/ops/PadLayer.cc b/runtime/onert/backend/cpu/ops/PadLayer.cc
index fcfcf7b5e..6a2bf9da0 100644
--- a/runtime/onert/backend/cpu/ops/PadLayer.cc
+++ b/runtime/onert/backend/cpu/ops/PadLayer.cc
@@ -33,33 +33,40 @@ PadLayer::PadLayer()
// DO NOTHING
}
-void PadLayer::padFloat32()
+template <typename T> void PadLayer::padImpl(const T *constant_value_data)
{
- nnfw::cker::Pad(_padData, _padRank, getTensorShape(_input),
- reinterpret_cast<const float *>(_input->buffer()), getTensorShape(_output),
- reinterpret_cast<float *>(_output->buffer()), _constantValueData.f);
+ nnfw::cker::Pad<T>(_padData, _padRank, getTensorShape(_input),
+ reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output),
+ reinterpret_cast<T *>(_output->buffer()), constant_value_data);
}
-void PadLayer::padQuant8() { throw std::runtime_error("Quantized Pad isn't supported NYI"); }
void PadLayer::configure(const IPortableTensor *input, IPortableTensor *output,
- const int32_t *padData, int32_t padRank, uint8_t *constantValueData)
+ const int32_t *padData, int32_t padRank, const void *constantValueData)
{
_input = input;
_output = output;
memcpy(_padData, padData, sizeof(_padData));
_padRank = padRank;
- _constantValueData.u8 = constantValueData;
+ _constantValueData.v = constantValueData;
}
void PadLayer::run()
{
if (_input->data_type() == OperandType::FLOAT32)
{
- padFloat32();
+ padImpl<float>(_constantValueData.f);
}
else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
{
- padQuant8();
+ if (_constantValueData.u8 == nullptr)
+ {
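+      // No pad value was provided: pad with the output zero point, which
+      // represents real 0.0 under asymmetric quantization.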
+ uint8_t pad_value = static_cast<uint8_t>(_output->data_offset());
+ padImpl<uint8_t>(&pad_value);
+ }
+ else
+ {
+ padImpl<uint8_t>(_constantValueData.u8);
+ }
}
else
{
diff --git a/runtime/onert/backend/cpu/ops/PadLayer.h b/runtime/onert/backend/cpu/ops/PadLayer.h
index 85bd2e6f0..efd73d5e5 100644
--- a/runtime/onert/backend/cpu/ops/PadLayer.h
+++ b/runtime/onert/backend/cpu/ops/PadLayer.h
@@ -39,12 +39,10 @@ public:
PadLayer();
public:
- void padFloat32();
-
- void padQuant8();
+ template <typename T> void padImpl(const T *constant_value_data);
void configure(const IPortableTensor *input, IPortableTensor *output, const int32_t *padData,
- int32_t padRank, uint8_t *constantValueData = nullptr);
+ int32_t padRank, const void *constantValueData = nullptr);
void run() override;
@@ -54,7 +52,7 @@ private:
int32_t _padData[8];
int32_t _padRank;
- DataPtr _constantValueData;
+ ConstDataPtr _constantValueData;
};
} // namespace ops
diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.cc b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc
new file mode 100644
index 000000000..45fc148bf
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.cc
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "QuantizeLayer.h"
+
+#include <cker/operation/Quantize.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+QuantizeLayer::QuantizeLayer() : _input(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+template <typename InputT, typename OutputT> void QuantizeLayer::affineQuantize()
+{
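+  // Affine quantization: q = round(x / scale) + zero_point, clamped to the
+  // representable range of OutputT.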
+ nnfw::cker::Quantize(getTensorShape(_input), reinterpret_cast<const InputT *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<OutputT *>(_output->buffer()),
+ _output->data_scale(), _output->data_offset());
+}
+
+void QuantizeLayer::configure(const IPortableTensor *input, IPortableTensor *output)
+{
+ _input = input;
+ _output = output;
+}
+
+void QuantizeLayer::run()
+{
+ if (_input->data_type() == OperandType::FLOAT32)
+ {
+ affineQuantize<float, uint8_t>();
+ }
+ else
+ {
+ throw std::runtime_error{"Quantize: unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/cpu/ops/QuantizeLayer.h b/runtime/onert/backend/cpu/ops/QuantizeLayer.h
new file mode 100644
index 000000000..b4e7aca40
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/QuantizeLayer.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
+
+#include <backend/IPortableTensor.h>
+#include "OperationUtils.h"
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+class QuantizeLayer : public ::onert::exec::IFunction
+{
+public:
+ QuantizeLayer();
+
+public:
+ template <typename InputT, typename OutputT> void affineQuantize();
+
+ void configure(const IPortableTensor *input, IPortableTensor *output);
+
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_QUANTIZELAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/ReLU6Layer.cc b/runtime/onert/backend/cpu/ops/ReLU6Layer.cc
new file mode 100644
index 000000000..26eb35e0d
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/ReLU6Layer.cc
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ReLU6Layer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/ReLU6.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+ReLU6Layer::ReLU6Layer() : _input(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+void ReLU6Layer::relu6Float32()
+{
+ nnfw::cker::ReLU6(getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+ reinterpret_cast<float *>(_output->buffer()));
+}
+
+void ReLU6Layer::relu6Quant8()
+{
+ // cker quant8 relu is not implemented yet
+ throw std::runtime_error{"NYI"};
+}
+
+void ReLU6Layer::configure(const IPortableTensor *input, IPortableTensor *output)
+{
+ _input = input;
+ _output = output;
+}
+
+void ReLU6Layer::run()
+{
+ if (_input->data_type() == OperandType::FLOAT32)
+ {
+ relu6Float32();
+ }
+ else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
+ {
+ relu6Quant8();
+ }
+ else
+ {
+ throw std::runtime_error{"ReLU6: unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/cpu/ops/ReLU6Layer.h b/runtime/onert/backend/cpu/ops/ReLU6Layer.h
new file mode 100644
index 000000000..994d17a30
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/ReLU6Layer.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_RELU6LAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_RELU6LAYER_H__
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+class ReLU6Layer : public ::onert::exec::IFunction
+{
+public:
+ ReLU6Layer();
+
+public:
+ void relu6Float32();
+
+ void relu6Quant8();
+
+ void configure(const IPortableTensor *input, IPortableTensor *output);
+
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_RELU6LAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/ReduceLayer.cc b/runtime/onert/backend/cpu/ops/ReduceLayer.cc
index 1dad031aa..fe22dbed7 100644
--- a/runtime/onert/backend/cpu/ops/ReduceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/ReduceLayer.cc
@@ -116,6 +116,39 @@ void evalGeneric(const IPortableTensor *input, IPortableTensor *output,
throw std::runtime_error{"Reduce(generic): unsupported data type"};
}
}
+
+void evalSumQuantized(const IPortableTensor *input, IPortableTensor *output,
+ const std::vector<int> &axes, bool keep_dims,
+ nnfw::cker::Reduce &reduce_kernel)
+{
+ const bool same_scale = (input->data_scale() == output->data_scale() &&
+ input->data_offset() == output->data_offset());
+
+ reduce_kernel.prepare(input->num_dimensions(), axes.size());
+
+ if (!same_scale)
+ {
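+    // Input and output quantization parameters differ, so accumulate the sum
+    // in int32 and requantize to the output scale/offset on write-back.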
+ std::vector<int32_t> temp_sum(output->getShape().num_elements());
+ bool result = reduce_kernel.QuantizedMeanOrSum<uint8_t, int32_t>(
+ reinterpret_cast<const uint8_t *>(input->buffer()), input->data_offset(),
+ input->data_scale(), getTensorShape(input), reinterpret_cast<uint8_t *>(output->buffer()),
+ output->data_offset(), output->data_scale(), getTensorShape(output), axes, keep_dims,
+ temp_sum.data(), true, [](const int32_t current, const uint8_t in) -> int32_t {
+ const int32_t actual_in = static_cast<int32_t>(in);
+ return current + actual_in;
+ });
+
+ if (!result)
+ {
+ throw std::runtime_error{"Reduce: Fail to run"};
+ }
+
+ return;
+ }
+
+ evalGeneric<ReduceType::kSum>(input, output, axes, keep_dims, reduce_kernel);
+}
+
} // namespace
ReduceLayer::ReduceLayer()
@@ -143,6 +176,11 @@ void ReduceLayer::run()
switch (_reduceType)
{
case ReduceType::kSum:
+ if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
+ {
+ evalSumQuantized(_input, _output, axes, _keep_dims, *_reduce_kernel);
+ return;
+ }
evalGeneric<ReduceType::kSum>(_input, _output, axes, _keep_dims, *_reduce_kernel);
break;
case ReduceType::kProd:
diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc
new file mode 100644
index 000000000..180094bb8
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.cc
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "OperationUtils.h"
+#include "ResizeBilinearLayer.h"
+#include "cker/operation/ResizeBilinear.h"
+#include <cker/Types.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+ResizeBilinearLayer::ResizeBilinearLayer()
+ : _input(nullptr), _output(nullptr), _output_height(0), _output_width(0), _align_corners(false),
+ _half_pixel_centers(false)
+{
+ // DO NOTHING
+}
+
+void ResizeBilinearLayer::configure(const IPortableTensor *input, IPortableTensor *output,
+ int32_t output_height, int32_t output_width, bool align_corners,
+ bool half_pixel_centers)
+{
+ _input = input;
+ _output = output;
+ _output_height = output_height;
+ _output_width = output_width;
+ _align_corners = align_corners;
+ _half_pixel_centers = half_pixel_centers;
+}
+
+void ResizeBilinearLayer::run()
+{
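+  // align_corners and half_pixel_centers select how output pixel coordinates
+  // are mapped back onto the input grid.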
+ nnfw::cker::ResizeBilinearParams params;
+ params.align_corners = _align_corners;
+ params.half_pixel_centers = _half_pixel_centers;
+ params.output_height = _output_height;
+ params.output_width = _output_width;
+
+ switch (_input->data_type())
+ {
+ case OperandType::FLOAT32:
+ nnfw::cker::ResizeBilinear(
+ params, getTensorShape(_input), reinterpret_cast<const float *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+ break;
+
+ case OperandType::QUANT_UINT8_ASYMM:
+ nnfw::cker::ResizeBilinear(
+ params, getTensorShape(_input), reinterpret_cast<const uint8_t *>(_input->buffer()),
+ getTensorShape(_output), reinterpret_cast<uint8_t *>(_output->buffer()));
+ break;
+
+ case OperandType::UINT8:
+ case OperandType::BOOL8:
+ case OperandType::FLOAT16:
+ case OperandType::INT32:
+ case OperandType::INT64:
+ case OperandType::QUANT_INT8_SYMM:
+ std::runtime_error("ResizeBilinear NYI");
+ break;
+ default:
+ std::runtime_error("ResizeBilinear unsupported data type");
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h
new file mode 100644
index 000000000..fc49b348e
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/ResizeBilinearLayer.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_RESIZEBILINEAR_H__
+#define __ONERT_BACKEND_CPU_OPS_RESIZEBILINEAR_H__
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+class ResizeBilinearLayer : public ::onert::exec::IFunction
+{
+public:
+ ResizeBilinearLayer();
+
+public:
+ void configure(const IPortableTensor *input1, IPortableTensor *output, int32_t output_height,
+ int32_t output_width, bool align_corners, bool half_pixel_centers);
+
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ IPortableTensor *_output;
+ int32_t _output_height;
+ int32_t _output_width;
+ bool _align_corners;
+ bool _half_pixel_centers;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_RESIZEBILINEAR_H__
diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.cc b/runtime/onert/backend/cpu/ops/SliceLayer.cc
index a9106c1a2..449c073e6 100644
--- a/runtime/onert/backend/cpu/ops/SliceLayer.cc
+++ b/runtime/onert/backend/cpu/ops/SliceLayer.cc
@@ -46,7 +46,7 @@ void SliceLayer::GetBeginAndSizeVectors(int dimensions, const IPortableTensor *b
}
}
-void SliceLayer::sliceFloat32()
+template <typename T> void SliceLayer::sliceImpl()
{
const int kMaxDim = nnfw::cker::Shape::kMaxSmallSize;
@@ -74,14 +74,8 @@ void SliceLayer::sliceFloat32()
}
nnfw::cker::Slice(op_params, getExtendedTensorShape(_input),
- reinterpret_cast<const float *>(_input->buffer()),
- reinterpret_cast<float *>(_output->buffer()));
-}
-
-void SliceLayer::sliceQuant8()
-{
- // cker quant8 slice is not implemented yet
- throw std::runtime_error{"NYI"};
+ reinterpret_cast<const T *>(_input->buffer()),
+ reinterpret_cast<T *>(_output->buffer()));
}
void SliceLayer::configure(const IPortableTensor *input, const IPortableTensor *begin,
@@ -97,11 +91,11 @@ void SliceLayer::run()
{
if (_input->data_type() == OperandType::FLOAT32)
{
- sliceFloat32();
+ sliceImpl<float>();
}
else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
{
- sliceQuant8();
+ sliceImpl<uint8_t>();
}
else
{
diff --git a/runtime/onert/backend/cpu/ops/SliceLayer.h b/runtime/onert/backend/cpu/ops/SliceLayer.h
index 9945d7ee6..650e2c97a 100644
--- a/runtime/onert/backend/cpu/ops/SliceLayer.h
+++ b/runtime/onert/backend/cpu/ops/SliceLayer.h
@@ -42,8 +42,7 @@ public:
void run() override;
private:
- void sliceFloat32();
- void sliceQuant8();
+ template <typename T> void sliceImpl();
template <typename T>
void GetBeginAndSizeVectors(int dimensions, const IPortableTensor *begin,
diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
new file mode 100644
index 000000000..a0869aed8
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.cc
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SpaceToDepthLayer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/SpaceToDepth.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+SpaceToDepthLayer::SpaceToDepthLayer() : _input(nullptr), _block_size(0), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+template <typename T> void SpaceToDepthLayer::spaceToDepth()
+{
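+  // Rearranges spatial blocks of block_size x block_size into the channel
+  // dimension.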
+ nnfw::cker::SpaceToDepthParams params;
+ params.block_size = _block_size;
+
+ nnfw::cker::SpaceToDepth(params, getTensorShape(_input),
+ reinterpret_cast<const T *>(_input->buffer()), getTensorShape(_output),
+ reinterpret_cast<T *>(_output->buffer()));
+}
+
+void SpaceToDepthLayer::configure(const IPortableTensor *input, const int32_t block_size,
+ IPortableTensor *output)
+{
+ _input = input;
+ _block_size = block_size;
+ _output = output;
+}
+
+void SpaceToDepthLayer::run()
+{
+ if (_input->data_type() == OperandType::FLOAT32)
+ {
+ spaceToDepth<float>();
+ }
+ else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
+ {
+ spaceToDepth<uint8_t>();
+ }
+ else
+ {
+ throw std::runtime_error{"SpaceToDepth: unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h
new file mode 100644
index 000000000..c11ef2b0a
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/SpaceToDepthLayer.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+class SpaceToDepthLayer : public ::onert::exec::IFunction
+{
+public:
+ SpaceToDepthLayer();
+
+ void configure(const IPortableTensor *input, const int32_t block_size, IPortableTensor *output);
+
+ void run() override;
+
+private:
+ template <typename T> void spaceToDepth();
+
+ const IPortableTensor *_input;
+ int32_t _block_size;
+ IPortableTensor *_output;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_SPACE_TO_DEPTH_LAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/SplitVLayer.cc b/runtime/onert/backend/cpu/ops/SplitVLayer.cc
new file mode 100644
index 000000000..d6ca12442
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/SplitVLayer.cc
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "SplitVLayer.h"
+
+#include "OperationUtils.h"
+
+#include <cker/operation/SplitV.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+SplitVLayer::SplitVLayer()
+ : _input(nullptr), _size_splits(nullptr), _split_dim(nullptr), _num_splits(0), _outputs()
+{
+ // DO NOTHING
+}
+
+template <typename T> void SplitVLayer::splitV(void)
+{
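+  // Split _input along the axis read from the _split_dim tensor; each output
+  // tensor's own shape determines how many elements it receives.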
+ nnfw::cker::SplitVParams op_params;
+ op_params.axis = *(reinterpret_cast<const int32_t *>(_split_dim->buffer()));
+ op_params.num_split = _num_splits;
+
+ std::vector<T *> outputPtrs;
+ std::vector<nnfw::cker::Shape> outshape;
+
+ for (const auto output : _outputs)
+ {
+ assert(output->total_size() == sizeOfData(output->data_type(), output->getShape().dims()));
+ outputPtrs.emplace_back(reinterpret_cast<T *>(output->buffer()));
+ outshape.emplace_back(getTensorShape(output));
+ }
+
+ assert(_input->total_size() == sizeOfData(_input->data_type(), _input->getShape().dims()));
+ nnfw::cker::SplitV<T>(op_params, getTensorShape(_input), reinterpret_cast<T *>(_input->buffer()),
+ outshape, outputPtrs.data());
+}
+
+void SplitVLayer::configure(const IPortableTensor *input, const IPortableTensor *size_splits,
+ const IPortableTensor *split_dim, uint16_t num_splits,
+ std::vector<IPortableTensor *> &outputs)
+{
+ assert(input != nullptr);
+
+ _num_splits = num_splits;
+ _size_splits = size_splits;
+ _input = input;
+ _split_dim = split_dim;
+ _outputs = outputs;
+}
+
+void SplitVLayer::run()
+{
+ if (_input->data_type() == OperandType::FLOAT32)
+ {
+ splitV<float>();
+ }
+ else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
+ {
+ splitV<uint8_t>();
+ }
+ else if (_input->data_type() == OperandType::INT32)
+ {
+ splitV<int32_t>();
+ }
+ else if (_input->data_type() == OperandType::INT64)
+ {
+ splitV<int64_t>();
+ }
+ else
+ {
+ throw std::runtime_error{"SplitV: unsupported input type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/cpu/ops/SplitVLayer.h b/runtime/onert/backend/cpu/ops/SplitVLayer.h
new file mode 100644
index 000000000..98f2f4406
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/SplitVLayer.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_SPLIT_V_LAYER_H__
+#define __ONERT_BACKEND_CPU_OPS_SPLIT_V_LAYER_H__
+
+#include <backend/IPortableTensor.h>
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+class SplitVLayer : public ::onert::exec::IFunction
+{
+public:
+ SplitVLayer();
+
+public:
+ template <typename T> void splitV(void);
+
+ void configure(const IPortableTensor *input, const IPortableTensor *size_splits,
+                 const IPortableTensor *split_dim, uint16_t num_splits,
+ std::vector<IPortableTensor *> &outputs);
+
+ void run() override;
+
+private:
+ const IPortableTensor *_input;
+ const IPortableTensor *_size_splits;
+ const IPortableTensor *_split_dim;
+ uint16_t _num_splits;
+ std::vector<IPortableTensor *> _outputs;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_SPLIT_V_LAYER_H__
diff --git a/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc
new file mode 100644
index 000000000..b8dfcb4b5
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.cc
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "StatelessRandomUniformLayer.h"
+
+#include <cker/operation/StatelessRandomUniform.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+StatelessRandomUniformLayer::StatelessRandomUniformLayer()
+ : _shape(nullptr), _seed(nullptr), _output(nullptr)
+{
+ // DO NOTHING
+}
+
+void StatelessRandomUniformLayer::configure(const IPortableTensor *shape,
+ const IPortableTensor *seed, IPortableTensor *output)
+{
+ _shape = shape;
+ _seed = seed;
+ _output = output;
+}
+
+void StatelessRandomUniformLayer::StatelessRandomUniformFloat32()
+{
+ nnfw::cker::StatelessRandomUniform(
+ getTensorShape(_shape), reinterpret_cast<const int *>(_shape->buffer()),
+ getTensorShape(_seed), reinterpret_cast<const int *>(_seed->buffer()),
+ getTensorShape(_output), reinterpret_cast<float *>(_output->buffer()));
+}
+
+void StatelessRandomUniformLayer::run()
+{
+ switch (_output->data_type())
+ {
+    // TODO: Also support INT8 and UINT8 once quantization is applied.
+ case OperandType::FLOAT32:
+ StatelessRandomUniformFloat32();
+ break;
+ default:
+ throw std::runtime_error{"StatelessRandomUniformLayer: unsupported data type"};
+ }
+}
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
diff --git a/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.h b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.h
new file mode 100644
index 000000000..ef11d623d
--- /dev/null
+++ b/runtime/onert/backend/cpu/ops/StatelessRandomUniformLayer.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_CPU_OPS_STATELESS_RANDOM_UNIFORM_H__
+#define __ONERT_BACKEND_CPU_OPS_STATELESS_RANDOM_UNIFORM_H__
+
+#include <backend/IPortableTensor.h>
+#include "OperationUtils.h"
+
+#include <exec/IFunction.h>
+
+namespace onert
+{
+namespace backend
+{
+namespace cpu
+{
+namespace ops
+{
+
+class StatelessRandomUniformLayer : public ::onert::exec::IFunction
+{
+public:
+ StatelessRandomUniformLayer();
+
+public:
+ void configure(const IPortableTensor *shape, const IPortableTensor *seed,
+ IPortableTensor *output);
+
+ void StatelessRandomUniformFloat32();
+
+ void run() override;
+
+private:
+ const IPortableTensor *_shape;
+ const IPortableTensor *_seed;
+
+ IPortableTensor *_output;
+};
+
+} // namespace ops
+} // namespace cpu
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_CPU_OPS_STATELESS_RANDOM_UNIFORM_H__
diff --git a/runtime/onert/core/include/backend/BackendContext.h b/runtime/onert/core/include/backend/BackendContext.h
index c82e5b7a9..c263aef2b 100644
--- a/runtime/onert/core/include/backend/BackendContext.h
+++ b/runtime/onert/core/include/backend/BackendContext.h
@@ -56,6 +56,8 @@ public:
{
}
+ virtual ~BackendContext() = default;
+
void initialize(const std::vector<OperationInfo> &operation_list,
const std::vector<ir::OperandIndex> &operand_list);
void initConsts();
diff --git a/runtime/onert/core/include/backend/IExternalContext.h b/runtime/onert/core/include/backend/IExternalContext.h
new file mode 100644
index 000000000..88ffb502c
--- /dev/null
+++ b/runtime/onert/core/include/backend/IExternalContext.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_BACKEND_IEXTERNAL_CONTEXT_H__
+#define __ONERT_BACKEND_IEXTERNAL_CONTEXT_H__
+
+namespace onert
+{
+namespace backend
+{
+
+struct IExternalContext
+{
+ virtual ~IExternalContext() = default;
+ virtual void setMaxNumThreads(int) = 0;
+};
+
+} // namespace backend
+} // namespace onert
+
+#endif // __ONERT_BACKEND_IEXTERNAL_CONTEXT_H__
diff --git a/runtime/onert/core/include/backend/IPortableTensor.h b/runtime/onert/core/include/backend/IPortableTensor.h
index 2b2d00899..a05b39a33 100644
--- a/runtime/onert/core/include/backend/IPortableTensor.h
+++ b/runtime/onert/core/include/backend/IPortableTensor.h
@@ -37,6 +37,9 @@ class IPortableTensor : public ITensor
{
public:
virtual ~IPortableTensor() = default;
+ virtual bool is_sparse() const { return false; }
+ virtual const uint16_t *w1_segments() const { return nullptr; }
+ virtual const uint16_t *w1_indices() const { return nullptr; }
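+  // w1_segments/w1_indices expose the compressed-sparse weight layout consumed
+  // by the sparse FullyConnected kernel; dense tensors return nullptr.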
public:
bool has_padding() const final { return false; }
diff --git a/runtime/onert/core/include/backend/ITensor.h b/runtime/onert/core/include/backend/ITensor.h
index 217d9debc..12b1c5433 100644
--- a/runtime/onert/core/include/backend/ITensor.h
+++ b/runtime/onert/core/include/backend/ITensor.h
@@ -32,6 +32,8 @@ namespace onert
namespace backend
{
+struct IDynamicTensorManager;
+
class ITensor
{
public:
@@ -51,6 +53,15 @@ public:
virtual void access(const std::function<void(ITensor &tensor)> &fn) = 0;
/**
+ * @brief Return the dynamic tensor manager
+ *
+ * If dynamic tensors are not supported, it returns @c nullptr .
+ *
+ * @return IDynamicTensorManager* DynamicTensorManager
+ */
+ virtual IDynamicTensorManager *dynamic_tensor_manager() { return nullptr; }
+
+ /**
* @brief Return true if the tensor is constant
*/
virtual bool is_constant() const
diff --git a/runtime/onert/core/include/backend/ITensorBuilder.h b/runtime/onert/core/include/backend/ITensorBuilder.h
index a49525ba7..b760cda0e 100644
--- a/runtime/onert/core/include/backend/ITensorBuilder.h
+++ b/runtime/onert/core/include/backend/ITensorBuilder.h
@@ -112,12 +112,12 @@ public: // methods for static tensor allocation
virtual std::shared_ptr<ITensor> tensorAt(const ir::OperandIndex &ind) = 0;
/**
- * @brief Set the External Tensor object
+ * @brief Set the migrant tensor object
*
* @return true if succeeded
* @return false if failed or unsupported
*/
- virtual bool setExternalTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &)
+ virtual bool setMigrantTensor(const ir::OperandIndex &, const std::shared_ptr<IPortableTensor> &)
{
return false;
}
diff --git a/runtime/onert/core/include/backend/ITensorRegistry.h b/runtime/onert/core/include/backend/ITensorRegistry.h
index f5a95f49c..855513124 100644
--- a/runtime/onert/core/include/backend/ITensorRegistry.h
+++ b/runtime/onert/core/include/backend/ITensorRegistry.h
@@ -35,17 +35,22 @@ struct ITensorRegistry
virtual ~ITensorRegistry() = default;
/**
- * @brief Returns pointer of ITensor among managed and external tensors
+ * @brief Returns pointer of ITensor among native and migrant tensors
+ *
+ * A native tensor is a tensor that is managed by this backend.
+ * A migrant tensor is a tensor that is imported from another backend.
+ *
* @note The returned tensor cannot outlive the dynamic tensor manager
*/
virtual std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &) = 0;
/**
- * @brief Returns pointer of ITensor among managed tensors
+ * @brief Returns pointer of ITensor among native tensors
*
- * Unlike @c getITensor , this function only searches from managed tensors
- * @note Return tensor cannot be used longer than dynamic tensor manager
+ * Unlike @c getITensor , this function only searches from native tensors
+ *
+ * @note The returned tensor cannot outlive the dynamic tensor manager
*/
- virtual std::shared_ptr<ITensor> getManagedITensor(const ir::OperandIndex &) = 0;
+ virtual std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &) = 0;
};
} // namespace backend
@@ -73,68 +78,67 @@ public:
std::shared_ptr<ITensor> getITensor(const ir::OperandIndex &ind) override
{
static_assert(std::is_base_of<ITensor, T_Tensor>::value, "T_Tensor must derive from ITensor.");
- auto external_tensor = _external.find(ind);
- if (external_tensor != _external.end())
+ auto external_tensor = _migrant.find(ind);
+ if (external_tensor != _migrant.end())
return external_tensor->second;
- return getManagedTensor(ind);
+ return getNativeTensor(ind);
}
- std::shared_ptr<ITensor> getManagedITensor(const ir::OperandIndex &ind) override
+ std::shared_ptr<ITensor> getNativeITensor(const ir::OperandIndex &ind) override
{
- return getManagedTensor(ind);
+ return getNativeTensor(ind);
}
std::shared_ptr<IPortableTensor> getPortableTensor(const ir::OperandIndex &ind)
{
- auto external_tensor = _external.find(ind);
- if (external_tensor != _external.end())
+ auto external_tensor = _migrant.find(ind);
+ if (external_tensor != _migrant.end())
{
if (external_tensor->second)
return external_tensor->second;
}
- return getManagedTensor(ind);
+ return getNativeTensor(ind);
}
- std::shared_ptr<T_Tensor> getManagedTensor(const ir::OperandIndex &ind)
+ std::shared_ptr<T_Tensor> getNativeTensor(const ir::OperandIndex &ind)
{
- auto tensor = _managed.find(ind);
- if (tensor != _managed.end())
+ auto tensor = _native.find(ind);
+ if (tensor != _native.end())
return tensor->second;
return nullptr;
}
- bool setExternalTensor(const ir::OperandIndex &ind,
- const std::shared_ptr<IPortableTensor> &tensor)
+ bool setMigrantTensor(const ir::OperandIndex &ind, const std::shared_ptr<IPortableTensor> &tensor)
{
// TODO Uncomment this, as two tensors for one index are not allowed.
// But it is temporarily allowed as a workaround: the migrant one hides the native one.
- // auto itr = _managed.find(ind);
- // if (itr != _managed.end() && itr->second != nullptr && tensor != nullptr)
+ // auto itr = _native.find(ind);
+ // if (itr != _native.end() && itr->second != nullptr && tensor != nullptr)
// throw std::runtime_error{
- // "Tried to set an external tensor but an managed tensor already exists."};
- _external[ind] = tensor;
+ // "Tried to set an migrant tensor but an native tensor already exists."};
+ _migrant[ind] = tensor;
return true;
}
- void setManagedTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor)
+ void setNativeTensor(const ir::OperandIndex &ind, const std::shared_ptr<T_Tensor> &tensor)
{
- auto itr = _external.find(ind);
- if (itr != _external.end() && itr->second != nullptr && tensor != nullptr)
+ auto itr = _migrant.find(ind);
+ if (itr != _migrant.end() && itr->second != nullptr && tensor != nullptr)
throw std::runtime_error{
- "Tried to set a managed tensor but an external tensor already exists."};
- _managed[ind] = tensor;
+ "Tried to set a native tensor but an migrant tensor already exists."};
+ _native[ind] = tensor;
}
- const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &managed_tensors() { return _managed; }
+ const ir::OperandIndexMap<std::shared_ptr<T_Tensor>> &native_tensors() { return _native; }
- const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &external_tensors()
+ const ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> &migrant_tensors()
{
- return _external;
+ return _migrant;
}
private:
- ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _external;
- ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _managed;
+ ir::OperandIndexMap<std::shared_ptr<IPortableTensor>> _migrant;
+ ir::OperandIndexMap<std::shared_ptr<T_Tensor>> _native;
};
} // namespace backend
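To make the renamed lookup precedence concrete, a small illustrative check (the registry's concrete class name is outside this hunk, so it is taken as a template parameter here):

#include <cassert>
#include "ir/Index.h"

// getITensor() prefers a migrant tensor and falls back to a native one;
// getNativeITensor() consults native tensors only.
template <typename Registry>
void checkLookupPrecedence(Registry &reg, const onert::ir::OperandIndex &ind)
{
  auto any = reg.getITensor(ind);
  auto native = reg.getNativeITensor(ind);
  // A native hit implies the generic lookup also succeeds; the converse does
  // not hold when a migrant tensor shadows the index.
  assert(!native || any);
  (void)any;
  (void)native;
}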
diff --git a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
index 6ddacc7bc..a7e034a91 100644
--- a/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
+++ b/runtime/onert/core/include/backend/cpu_common/StaticTensorManager.h
@@ -19,7 +19,7 @@
#include "MemoryManager.h"
-#include "backend/ITensorManager.h"
+#include "backend/IStaticTensorManager.h"
#include "ir/OperandIndexMap.h"
#include "ir/OperandInfo.h"
#include "TensorRegistry.h"
@@ -31,7 +31,7 @@ namespace backend
namespace cpu_common
{
-class StaticTensorManager : public backend::ITensorManager
+class StaticTensorManager : public backend::IStaticTensorManager
{
public:
StaticTensorManager(const std::shared_ptr<TensorRegistry> &reg);
diff --git a/runtime/onert/core/include/backend/cpu_common/Tensor.h b/runtime/onert/core/include/backend/cpu_common/Tensor.h
index a0db96dc3..974501ecb 100644
--- a/runtime/onert/core/include/backend/cpu_common/Tensor.h
+++ b/runtime/onert/core/include/backend/cpu_common/Tensor.h
@@ -35,27 +35,42 @@ public:
Tensor() = delete;
public:
- Tensor(const ir::OperandInfo &info, const ir::Layout layout)
- : _info(info), _layout(layout), _buffer(nullptr), _num_references(0), _allocator(nullptr)
+ Tensor(const ir::OperandInfo &info, const ir::Layout layout,
+ IDynamicTensorManager *dynamic_tensor_manager)
+ : _info(info), _layout(layout), _buffer(nullptr), _num_references(0),
+ _dynamic_tensor_manager(dynamic_tensor_manager), _allocator(nullptr)
{
// DO NOTHING
}
public:
// Only one of the two 'setBuffer' overloads must be called, and only once
+
+ /**
+ * @brief Set the Buffer object. This method is called for static and non-const tensors
+ */
void setBuffer(uint8_t *buffer)
{
- assert(_buffer == nullptr && _allocator == nullptr);
+ assert(_buffer == nullptr);
_buffer = buffer;
}
+
+ /**
+ * @brief Set the Buffer object. This method is called for dynamic or const tensors
+ */
void setBuffer(const std::shared_ptr<Allocator> &alloc)
{
- assert(_buffer == nullptr && _allocator == nullptr);
+ assert(_buffer == nullptr);
_allocator = alloc;
+ _buffer = alloc->base();
}
// This works just like setBuffer, but it simply overwrites the existing Allocator without a nullptr check
- void overwriteBuffer(const std::shared_ptr<Allocator> &alloc) { _allocator = alloc; }
+ void overwriteBuffer(const std::shared_ptr<Allocator> &alloc)
+ {
+ _allocator = alloc;
+ _buffer = alloc->base();
+ }
/**
* @brief Mark that this tensor does not have memory.
@@ -68,13 +83,7 @@ public:
}
public:
- uint8_t *buffer() const override
- {
- if (_allocator != nullptr)
- return _allocator->base();
- else
- return _buffer;
- }
+ uint8_t *buffer() const override { return _buffer; }
/**
* @brief Get dimension by index
*
@@ -96,12 +105,16 @@ public:
bool is_constant() const override { return _info.isConstant(); }
bool is_dynamic() const override { return _info.isDynamic(); }
void set_dynamic() override { _info.setDynamic(); }
+ IDynamicTensorManager *dynamic_tensor_manager() override { return _dynamic_tensor_manager; }
+ bool is_sparse() const override { return _info.typeInfo().sparse(); }
+ virtual const uint16_t *w1_segments() const override { return _info.typeInfo().w1_segments(); }
+ virtual const uint16_t *w1_indices() const override { return _info.typeInfo().w1_indices(); }
virtual void increase_ref()
{
assert(is_dynamic() ||
// when not dynamic
- (_buffer != nullptr || _allocator != nullptr));
+ (_buffer != nullptr));
++_num_references;
}
@@ -110,12 +123,12 @@ public:
assert(_buffer != nullptr || _allocator != nullptr);
assert(_num_references > 0);
--_num_references;
- // Only constant tensor has allocator pointer
+ // Constant and dynamic tensors have an _allocator
if (_num_references == 0)
{
if (_buffer != nullptr)
_buffer = nullptr;
- else
+ if (_allocator != nullptr)
{
_allocator->release();
_allocator = nullptr;
@@ -130,8 +143,15 @@ protected:
ir::Layout _layout;
uint8_t *_buffer;
int32_t _num_references;
+ IDynamicTensorManager *_dynamic_tensor_manager;
private:
+ /**
+ * @brief Memory allocator for dynamic and const tensors
+ * Since maintaining both _allocator and _buffer is confusing,
+ * this code mainly uses _buffer (not _allocator->base()) as the memory pointer.
+ * _allocator (a shared_ptr) is kept only to guarantee that _buffer stays valid.
+ */
std::shared_ptr<Allocator> _allocator;
};
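A hedged sketch of the resulting ownership flow for an allocator-backed tensor (assuming `info` and `alloc` are created elsewhere, with the cpu_common Allocator type; passing nullptr for the dynamic tensor manager mirrors what StaticTensorManager does for static tensors later in this patch):

#include "backend/cpu_common/Tensor.h"

void allocatorBackedLifetime(const onert::ir::OperandInfo &info,
                             const std::shared_ptr<onert::backend::cpu_common::Allocator> &alloc)
{
  onert::backend::cpu_common::Tensor tensor{info, onert::ir::Layout::NHWC, nullptr};
  tensor.setBuffer(alloc);        // now also caches alloc->base() into _buffer
  tensor.increase_ref();
  uint8_t *raw = tensor.buffer(); // reads _buffer directly, no _allocator indirection
  (void)raw;
  tensor.decrease_ref();          // refcount reaches 0: the allocator is released
}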
diff --git a/runtime/onert/core/include/compiler/StaticShapeInference.h b/runtime/onert/core/include/compiler/StaticShapeInference.h
index 379143baf..bff68c9fa 100644
--- a/runtime/onert/core/include/compiler/StaticShapeInference.h
+++ b/runtime/onert/core/include/compiler/StaticShapeInference.h
@@ -99,6 +99,7 @@ private:
void visit(const ir::operation::LogicalNot &op) override;
void visit(const ir::operation::LogicalOr &op) override;
void visit(const ir::operation::Logistic &op) override;
+ void visit(const ir::operation::L2Normalization &op) override;
void visit(const ir::operation::MatrixBandPart &op) override;
void visit(const ir::operation::Max &op) override;
void visit(const ir::operation::Min &op) override;
@@ -114,6 +115,7 @@ private:
void visit(const ir::operation::Reshape &op) override;
void visit(const ir::operation::Round &op) override;
void visit(const ir::operation::RSQRT &op) override;
+ void visit(const ir::operation::ResizeBilinear &op) override;
void visit(const ir::operation::Reverse &op) override;
void visit(const ir::operation::Select &op) override;
void visit(const ir::operation::Shape &op) override;
diff --git a/runtime/onert/core/include/exec/DynamicShapeInference.h b/runtime/onert/core/include/exec/DynamicShapeInference.h
index 113c34809..bca80db09 100644
--- a/runtime/onert/core/include/exec/DynamicShapeInference.h
+++ b/runtime/onert/core/include/exec/DynamicShapeInference.h
@@ -72,6 +72,7 @@ public:
void visit(const ir::operation::LogicalNot &op) override;
void visit(const ir::operation::LogicalOr &op) override;
void visit(const ir::operation::Logistic &op) override;
+ void visit(const ir::operation::L2Normalization &op) override;
void visit(const ir::operation::MatrixBandPart &op) override;
void visit(const ir::operation::Max &op) override;
void visit(const ir::operation::Min &op) override;
@@ -88,6 +89,7 @@ public:
void visit(const ir::operation::Reshape &op) override;
void visit(const ir::operation::Round &op) override;
void visit(const ir::operation::RSQRT &op) override;
+ void visit(const ir::operation::ResizeBilinear &op) override;
void visit(const ir::operation::Reverse &op) override;
void visit(const ir::operation::Select &op) override;
void visit(const ir::operation::Shape &op) override;
@@ -127,6 +129,7 @@ private:
/**
* @brief To allocate memory for output tensor if needed
*/
+ // TODO Remove this, as it is no longer used
backend::IDynamicTensorManager *_dynamic_tensor_manager;
/**
* @brief To get tensor object and access tensor-level info, e.g., ITensor::buffer()
diff --git a/runtime/onert/core/include/ir/Operand.h b/runtime/onert/core/include/ir/Operand.h
index 53371d606..1b3a43b02 100644
--- a/runtime/onert/core/include/ir/Operand.h
+++ b/runtime/onert/core/include/ir/Operand.h
@@ -49,11 +49,11 @@ public:
size_t operandSize(void) const;
const OperationIndexSet &getUses() const { return _uses; }
- const OperationIndexSet &getDef() const { return _def; }
+ OperationIndex getDef() const { return _def; }
void insertUse(const OperationIndex &idx);
void removeUse(const OperationIndex &idx);
- void insertDef(const OperationIndex &idx);
- void removeDef(const OperationIndex &idx);
+ void setDef(const OperationIndex &idx);
+ void unsetDef();
public:
void type(const DataType type) { _info.type(type); };
@@ -107,7 +107,7 @@ private:
std::shared_ptr<Data> _data;
OperationIndexSet _uses;
- OperationIndexSet _def; // size is 0 (constant) or 1 (from def operation)
+ OperationIndex _def;
};
} // namespace ir
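A brief sketch of the single-def bookkeeping this change establishes (the helper is illustrative):

#include "ir/Operand.h"

// Each non-constant operand now has exactly one defining operation; a
// constant operand keeps an unset (invalid) OperationIndex.
void rewireDef(onert::ir::Operand &operand, const onert::ir::OperationIndex &new_def)
{
  if (operand.getDef().valid()) // replaces the old getDef().size() > 0 check
    operand.unsetDef();
  operand.setDef(new_def);
}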
diff --git a/runtime/onert/core/include/ir/Operations.Include.h b/runtime/onert/core/include/ir/Operations.Include.h
index 5fac54e26..30c4ff25a 100644
--- a/runtime/onert/core/include/ir/Operations.Include.h
+++ b/runtime/onert/core/include/ir/Operations.Include.h
@@ -79,6 +79,7 @@
#include "ir/operation/Pack.h"
#include "ir/operation/Select.h"
#include "ir/operation/Split.h"
+#include "ir/operation/SplitV.h"
#include "ir/operation/Unpack.h"
#include "ir/operation/Pad.h"
#include "ir/operation/Min.h"
@@ -103,3 +104,5 @@
#include "ir/operation/BatchMatMul.h"
#include "ir/operation/FusedBatchNorm.h"
#include "ir/operation/LogSoftmax.h"
+#include "ir/operation/Quantize.h"
+#include "ir/operation/StatelessRandomUniform.h"
diff --git a/runtime/onert/core/include/ir/Operations.lst b/runtime/onert/core/include/ir/Operations.lst
index 9d0642fba..75c6d8221 100644
--- a/runtime/onert/core/include/ir/Operations.lst
+++ b/runtime/onert/core/include/ir/Operations.lst
@@ -81,6 +81,7 @@ OP(DepthToSpace)
OP(Pack)
OP(Select)
OP(Split)
+OP(SplitV)
OP(Unpack)
OP(Pad)
OP(Custom)
@@ -106,3 +107,5 @@ OP(MatrixBandPart)
OP(BatchMatMul)
OP(FusedBatchNorm)
OP(LogSoftmax)
+OP(Quantize)
+OP(StatelessRandomUniform)
diff --git a/runtime/onert/core/include/ir/TypeInfo.h b/runtime/onert/core/include/ir/TypeInfo.h
index 07d82b6a7..3f7eab4c0 100644
--- a/runtime/onert/core/include/ir/TypeInfo.h
+++ b/runtime/onert/core/include/ir/TypeInfo.h
@@ -18,6 +18,7 @@
#define __ONERT_IR_TYPEINFO_H__
#include <cstdint>
+#include <vector>
#include "ir/DataType.h"
@@ -32,7 +33,7 @@ public:
TypeInfo() = delete;
explicit TypeInfo(DataType type, float scale = 0, int32_t offset = 0)
- : _type(type), _scale(scale), _offset(offset)
+ : _type(type), _scale(scale), _offset(offset), _sparse(false)
{
}
@@ -40,14 +41,28 @@ public:
DataType type() const { return _type; }
float scale() const { return _scale; }
int32_t offset() const { return _offset; }
+ bool sparse() const { return _sparse; }
+ const uint16_t *w1_segments() const { return _w1_segments.data(); }
+ const uint16_t *w1_indices() const { return _w1_indices.data(); }
public:
void type(const DataType type) { _type = type; }
+ void sparse2DMetadata(std::vector<uint16_t> &&w1_segments, std::vector<uint16_t> &&w1_indices)
+ {
+ _sparse = true;
+ _w1_segments = w1_segments;
+ _w1_indices = w1_indices;
+ }
private:
DataType _type;
+ // for quantization
float _scale;
int32_t _offset;
+ // for sparsity
+ bool _sparse;
+ std::vector<uint16_t> _w1_segments;
+ std::vector<uint16_t> _w1_indices;
};
bool operator==(const TypeInfo &lhs, const TypeInfo &rhs);
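A minimal sketch of populating the new sparsity metadata (values chosen for illustration; segments delimit each row's slice of the indices array, CSR style):

#include <cstdint>
#include <vector>
#include "ir/TypeInfo.h"

onert::ir::TypeInfo makeSparse2DType()
{
  onert::ir::TypeInfo type{onert::ir::DataType::FLOAT32};
  // Two rows: row 0 holds columns {1, 3}, row 1 holds column {0}.
  type.sparse2DMetadata(std::vector<uint16_t>{0, 2, 3},  // w1_segments
                        std::vector<uint16_t>{1, 3, 0}); // w1_indices
  return type;
}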
diff --git a/runtime/onert/core/include/ir/operation/BatchToSpaceND.h b/runtime/onert/core/include/ir/operation/BatchToSpaceND.h
index bb6be57d7..3e69b42c7 100644
--- a/runtime/onert/core/include/ir/operation/BatchToSpaceND.h
+++ b/runtime/onert/core/include/ir/operation/BatchToSpaceND.h
@@ -32,7 +32,8 @@ public:
enum Input
{
INPUT = 0,
- BLOCK_SIZE = 1
+ BLOCK_SIZE = 1,
+ CROPS_DATA = 2
};
public:
diff --git a/runtime/onert/core/include/ir/operation/LogSoftmax.h b/runtime/onert/core/include/ir/operation/LogSoftmax.h
index 26a92d7f8..391b4ba4a 100644
--- a/runtime/onert/core/include/ir/operation/LogSoftmax.h
+++ b/runtime/onert/core/include/ir/operation/LogSoftmax.h
@@ -48,7 +48,7 @@ public:
public:
void accept(OperationVisitor &v) const override;
- OpCode opcode() const final { return OpCode::Softmax; }
+ OpCode opcode() const final { return OpCode::LogSoftmax; }
public:
const Param &param() const { return _param; }
diff --git a/runtime/onert/core/include/ir/operation/Pad.h b/runtime/onert/core/include/ir/operation/Pad.h
index a48606196..00481cd50 100644
--- a/runtime/onert/core/include/ir/operation/Pad.h
+++ b/runtime/onert/core/include/ir/operation/Pad.h
@@ -33,7 +33,7 @@ public:
{
INPUT = 0,
PAD = 1,
- // VALUE = 2 Not allow padding value operand yet
+ VALUE = 2
};
public:
diff --git a/runtime/onert/core/include/ir/operation/Quantize.h b/runtime/onert/core/include/ir/operation/Quantize.h
new file mode 100644
index 000000000..2533ce432
--- /dev/null
+++ b/runtime/onert/core/include/ir/operation/Quantize.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_IR_OPERATION_QUANTIZE_H__
+#define __ONERT_IR_OPERATION_QUANTIZE_H__
+
+#include "ir/Operation.h"
+
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+
+class Quantize : public Operation
+{
+public:
+ enum Input
+ {
+ INPUT = 0,
+ };
+
+public:
+ Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs);
+
+public:
+ void accept(OperationVisitor &v) const override;
+ OpCode opcode() const final { return OpCode::Quantize; }
+};
+
+} // namespace operation
+} // namespace ir
+} // namespace onert
+
+#endif // __ONERT_IR_OPERATION_QUANTIZE_H__
diff --git a/runtime/onert/core/include/ir/operation/ResizeBilinear.h b/runtime/onert/core/include/ir/operation/ResizeBilinear.h
index 2887ed845..29aa496d7 100644
--- a/runtime/onert/core/include/ir/operation/ResizeBilinear.h
+++ b/runtime/onert/core/include/ir/operation/ResizeBilinear.h
@@ -33,13 +33,15 @@ class ResizeBilinear : public Operation
public:
enum Input
{
- INPUT = 0
+ INPUT = 0,
};
struct Param
{
int32_t height_out;
int32_t width_out;
+ bool align_corners;
+ bool half_pixel_centers;
};
public:
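For reference, a small sketch of filling the extended Param; per the OperationValidator change later in this patch, at most one of the two new flags may be set:

#include "ir/operation/ResizeBilinear.h"

onert::ir::operation::ResizeBilinear::Param makeResizeParam()
{
  onert::ir::operation::ResizeBilinear::Param param;
  param.height_out = 32;
  param.width_out = 32;
  param.align_corners = false;     // OP_REQUIRES(!align_corners || !half_pixel_centers)
  param.half_pixel_centers = true; // i.e. the two flags are mutually exclusive
  return param;
}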
diff --git a/runtime/onert/core/include/ir/operation/SplitV.h b/runtime/onert/core/include/ir/operation/SplitV.h
new file mode 100644
index 000000000..99a06ee7f
--- /dev/null
+++ b/runtime/onert/core/include/ir/operation/SplitV.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef __ONERT_IR_OPERATION_SPLIT_V_H__
+#define __ONERT_IR_OPERATION_SPLIT_V_H__
+
+#include "ir/Operation.h"
+
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+class SplitV : public Operation
+{
+public:
+ enum Input
+ {
+ INPUT = 0,
+ SIZE_SPLITS = 1,
+ SPLIT_DIM = 2
+ };
+
+ struct Param
+ {
+ int num_splits;
+ };
+
+public:
+ SplitV(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
+ const Param &param);
+
+public:
+ void accept(OperationVisitor &v) const override;
+ OpCode opcode() const final { return OpCode::SplitV; }
+
+public:
+ const Param &param() const { return _param; }
+
+private:
+ Param _param;
+};
+} // namespace operation
+} // namespace ir
+} // namespace onert
+#endif // __ONERT_IR_OPERATION_SPLIT_V_H__
diff --git a/runtime/onert/core/include/ir/operation/StatelessRandomUniform.h b/runtime/onert/core/include/ir/operation/StatelessRandomUniform.h
new file mode 100644
index 000000000..112a748fd
--- /dev/null
+++ b/runtime/onert/core/include/ir/operation/StatelessRandomUniform.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __ONERT_IR_OPERATION_STATELESS_RANDOM_UNIFORM_H__
+#define __ONERT_IR_OPERATION_STATELESS_RANDOM_UNIFORM_H__
+
+#include <memory>
+
+#include "ir/Operation.h"
+
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+
+class StatelessRandomUniform : public Operation
+{
+public:
+ enum Input
+ {
+ SHAPE = 0,
+ SEED = 1
+ };
+
+public:
+ StatelessRandomUniform(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs);
+
+public:
+ void accept(OperationVisitor &v) const override;
+ OpCode opcode() const final { return OpCode::StatelessRandomUniform; }
+};
+
+} // namespace operation
+} // namespace ir
+} // namespace onert
+
+#endif // __ONERT_IR_OPERATION_STATELESS_RANDOM_UNIFORM_H__
diff --git a/runtime/onert/core/include/util/ShapeInference.h b/runtime/onert/core/include/util/ShapeInference.h
index 0d4525144..a68c22b16 100644
--- a/runtime/onert/core/include/util/ShapeInference.h
+++ b/runtime/onert/core/include/util/ShapeInference.h
@@ -95,6 +95,9 @@ template <float *> ir::Shape inferRangeShape(float *start_val, float *limit_val,
template <typename T> ir::Shape inferRangeShape(T start_val, T limit_val, T delta_val);
+ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t output_height,
+ const int32_t output_width);
+
ir::Shape inferSelectShape(const ir::Shape &input_cond_shape, const ir::Shape &input_true_shape,
const ir::Shape &input_false_shape);
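The declaration above suggests the usual rank-4 spatial resize; a hedged sketch of the expected behavior (NHWC layout assumed; the actual implementation lives in ShapeInference.cc and may differ):

#include <cassert>
#include "ir/Shape.h"

onert::ir::Shape inferResizeBilinearShapeSketch(const onert::ir::Shape &in_shape,
                                                int32_t output_height, int32_t output_width)
{
  assert(in_shape.rank() == 4);
  // Keep batch and channels, replace the spatial dimensions:
  // e.g. {1, 16, 16, 3} resized to 32x32 becomes {1, 32, 32, 3}.
  return onert::ir::Shape{in_shape.dim(0), output_height, output_width, in_shape.dim(3)};
}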
diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
index 32a80412b..e538f3fd3 100644
--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
+++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.cc
@@ -36,7 +36,7 @@ DynamicTensorManager::DynamicTensorManager(const std::shared_ptr<cpu_common::Ten
void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Shape &new_shape)
{
// NOTE Handle user tensors first
- auto user_tensor = _user_tensors->getManagedTensor(ind);
+ auto user_tensor = _user_tensors->getNativeTensor(ind);
if (user_tensor)
{
// User tensors cannot be reallocated.
@@ -45,10 +45,11 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha
if (buffer_size < new_size)
throw std::runtime_error{"ExecutorBase: output buffer size is less than output tensor size"};
user_tensor->setShape(new_shape);
+ return;
}
- // NOTE Then handle managed tensors
- auto tensor = _tensors->getManagedTensor(ind);
+ // NOTE Then handle native tensors
+ auto tensor = _tensors->getNativeTensor(ind);
assert(tensor);
bool previously_dynamic = tensor->is_dynamic();
@@ -101,9 +102,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
const ir::OperandInfo &tensor_info,
ir::Layout backend_layout)
{
- assert(_tensors->getManagedTensor(ind) == nullptr);
- auto tensor = std::make_shared<cpu_common::Tensor>(tensor_info, backend_layout);
- _tensors->setManagedTensor(ind, tensor);
+ assert(_tensors->getNativeTensor(ind) == nullptr);
+ auto tensor = std::make_shared<cpu_common::Tensor>(tensor_info, backend_layout, this);
+ _tensors->setNativeTensor(ind, tensor);
}
void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind)
@@ -130,7 +131,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
auto &input_set = find->second;
for (auto input_ind : input_set)
{
- if (!_tensors->getManagedTensor(input_ind)->is_dynamic())
+ if (!_tensors->getNativeTensor(input_ind)->is_dynamic())
continue;
_dynamic_mem_mgr->deallocate(input_ind);
@@ -141,7 +142,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind)
{
- if (!_tensors->getManagedTensor(output_ind)->is_dynamic())
+ if (!_tensors->getNativeTensor(output_ind)->is_dynamic())
return;
_dynamic_mem_mgr->deallocate(output_ind);
diff --git a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h
index 300c342c7..446427d64 100644
--- a/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h
+++ b/runtime/onert/core/src/backend/controlflow/DynamicTensorManager.h
@@ -61,6 +61,7 @@ private:
* @todo DynamicMemoryManager is not optimized. Optimized one is needed
*/
std::shared_ptr<cpu_common::DynamicMemoryManager> _dynamic_mem_mgr;
+ // TODO Refactoring: Merge the two TensorRegistries into one
const std::shared_ptr<cpu_common::TensorRegistry> _tensors;
const std::shared_ptr<UserTensorRegistry> _user_tensors;
diff --git a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
index 4b683fb58..eb83b7de4 100644
--- a/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
+++ b/runtime/onert/core/src/backend/controlflow/KernelGenerator.cc
@@ -81,23 +81,23 @@ void KernelGenerator::visit(const ir::operation::If &node)
std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
for (const auto input_index : node.getInputs())
{
- auto input_alloc = getTensor(input_index);
+ auto input_tensor = getTensor(input_index);
- input_tensors.emplace_back(input_alloc);
+ input_tensors.emplace_back(input_tensor);
}
std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
exec::DynAllocInfoMap outputs_dyn_alloc_info;
for (const auto output_index : node.getOutputs())
{
- auto output_alloc = getTensor(output_index);
+ auto output_tensor = getTensor(output_index);
- output_tensors.emplace_back(output_alloc);
+ output_tensors.emplace_back(output_tensor);
const auto output_tensor_builder = getTensorBuilder(output_index);
if (output_tensor_builder->supportDynamicTensor())
{
auto output_dyn_manager = output_tensor_builder->dynamicTensorManager();
- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager};
+ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager};
}
}
@@ -146,24 +146,24 @@ void KernelGenerator::visit(const ir::operation::While &node)
std::vector<std::shared_ptr<backend::ITensor>> input_tensors;
for (const auto input_index : node.getInputs())
{
- auto input_alloc = getTensor(input_index);
+ auto input_tensor = getTensor(input_index);
- input_tensors.emplace_back(input_alloc);
+ input_tensors.emplace_back(input_tensor);
}
std::vector<std::shared_ptr<backend::ITensor>> output_tensors;
std::unordered_map<std::shared_ptr<ITensor>, exec::DynAllocInfo> outputs_dyn_alloc_info;
for (const auto output_index : node.getOutputs())
{
- auto output_alloc = getTensor(output_index);
+ auto output_tensor = getTensor(output_index);
- output_tensors.emplace_back(output_alloc);
+ output_tensors.emplace_back(output_tensor);
const auto output_tensor_builder = getTensorBuilder(output_index);
if (output_tensor_builder->supportDynamicTensor())
{
auto output_dyn_manager = output_tensor_builder->dynamicTensorManager();
- outputs_dyn_alloc_info[output_alloc] = exec::DynAllocInfo{output_index, output_dyn_manager};
+ outputs_dyn_alloc_info[output_tensor] = exec::DynAllocInfo{output_index, output_dyn_manager};
}
}
@@ -199,7 +199,7 @@ KernelGenerator::getTensorBuilder(const ir::OperandIndex &index)
for (auto tensor_builder : _tensor_builder_set)
{
auto reg = tensor_builder->tensorRegistry();
- auto tensor = reg ? reg->getManagedITensor(index) : tensor_builder->tensorAt(index);
+ auto tensor = reg ? reg->getNativeITensor(index) : tensor_builder->tensorAt(index);
if (tensor)
{
ret = tensor_builder;
diff --git a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
index 16cd3ec63..5bddb9185 100644
--- a/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
+++ b/runtime/onert/core/src/backend/controlflow/TensorBuilder.cc
@@ -92,7 +92,7 @@ void TensorBuilder::allocate()
std::shared_ptr<ITensor> TensorBuilder::tensorAt(const ir::OperandIndex &ind)
{
// NOTE Find from User Tensor Registry first
- // FIXME There may be both user tensor and managed tensor for a `ind` which is a waste
+ // FIXME There may be both user tensor and native tensor for a `ind` which is a waste
auto user_tensor = _user_tensor_reg->getITensor(ind);
auto tensor = _tensor_reg->getITensor(ind);
if (user_tensor)
@@ -107,7 +107,7 @@ void TensorBuilder::iterate(const IterateFunction &fn) { _static_tensor_mgr->ite
std::shared_ptr<cpu_common::Tensor> TensorBuilder::at(const ir::OperandIndex &ind)
{
- return _tensor_reg->getManagedTensor(ind);
+ return _tensor_reg->getNativeTensor(ind);
}
std::unique_ptr<ITensorManager> TensorBuilder::releaseStaticTensorManager(void)
@@ -123,7 +123,7 @@ std::unique_ptr<ITensorManager> TensorBuilder::releaseDynamicTensorManager(void)
void TensorBuilder::setUserTensor(const ir::OperandIndex &ind,
const std::shared_ptr<UserTensor> &tensor)
{
- _user_tensor_reg->setManagedTensor(ind, tensor);
+ _user_tensor_reg->setNativeTensor(ind, tensor);
}
} // namespace controlflow
diff --git a/runtime/onert/core/src/backend/controlflow/UserTensor.h b/runtime/onert/core/src/backend/controlflow/UserTensor.h
index ce94ea028..9be33595d 100644
--- a/runtime/onert/core/src/backend/controlflow/UserTensor.h
+++ b/runtime/onert/core/src/backend/controlflow/UserTensor.h
@@ -38,12 +38,16 @@ namespace controlflow
class UserTensor : public IPortableTensor
{
public:
- UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size)
- : _info{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false}
+ UserTensor(const ir::OperandInfo &info, ir::Layout layout, uint8_t *buffer, size_t size,
+ IDynamicTensorManager *dynamic_tensor_manager)
+ : _info{info}, _layout{layout}, _buffer{buffer}, _size{size}, _dynamic{false},
+ _dynamic_tensor_manager{dynamic_tensor_manager}
{
}
- UserTensor(const ir::OperandInfo &info, ir::Layout layout) : UserTensor{info, layout, nullptr, 0}
+ UserTensor(const ir::OperandInfo &info, ir::Layout layout,
+ IDynamicTensorManager *dynamic_tensor_manager)
+ : UserTensor{info, layout, nullptr, 0, dynamic_tensor_manager}
{
}
@@ -68,6 +72,8 @@ public:
void set_dynamic() override { _dynamic = true; }
ir::Shape getShape() const override { return _info.shape(); }
void setShape(const ir::Shape &new_shape) override { _info.shape(new_shape); }
+ bool is_constant() const override { return false; }
+ IDynamicTensorManager *dynamic_tensor_manager() override { return _dynamic_tensor_manager; }
private:
ir::OperandInfo _info;
@@ -75,6 +81,7 @@ private:
uint8_t *_buffer;
size_t _size;
bool _dynamic;
+ IDynamicTensorManager *_dynamic_tensor_manager;
};
} // namespace controlflow
diff --git a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
index 0ccf7000b..cb27d757f 100644
--- a/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
+++ b/runtime/onert/core/src/backend/cpu_common/DynamicTensorManager.cc
@@ -35,7 +35,7 @@ void DynamicTensorManager::applyShape(const ir::OperandIndex &ind, const ir::Sha
{
VERBOSE_F() << ind << std::endl;
- auto tensor = _tensors->getManagedTensor(ind);
+ auto tensor = _tensors->getNativeTensor(ind);
assert(tensor);
bool previously_dynamic = tensor->is_dynamic();
@@ -88,9 +88,9 @@ void DynamicTensorManager::buildTensor(const ir::OperandIndex &ind,
const ir::OperandInfo &tensor_info,
ir::Layout backend_layout)
{
- assert(_tensors->getManagedTensor(ind) == nullptr);
- auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout);
- _tensors->setManagedTensor(ind, tensor);
+ assert(_tensors->getNativeTensor(ind) == nullptr);
+ auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, this);
+ _tensors->setNativeTensor(ind, tensor);
}
void DynamicTensorManager::planDealloc(ir::OperationIndex op_ind, ir::OperandIndex operand_ind)
@@ -117,7 +117,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
auto &input_set = find->second;
for (auto input_ind : input_set)
{
- auto *tensor = _tensors->getManagedTensor(input_ind).get();
+ auto *tensor = _tensors->getNativeTensor(input_ind).get();
if (!tensor->is_dynamic())
continue;
@@ -131,7 +131,7 @@ void DynamicTensorManager::deallocInput(ir::OperationIndex op_ind)
void DynamicTensorManager::deallocSubgraphOutput(ir::OperandIndex output_ind)
{
- auto *tensor = _tensors->getManagedTensor(output_ind).get();
+ auto *tensor = _tensors->getNativeTensor(output_ind).get();
if (!tensor->is_dynamic())
return;
diff --git a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
index 47bea35df..820cad38a 100644
--- a/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
+++ b/runtime/onert/core/src/backend/cpu_common/StaticTensorManager.cc
@@ -16,6 +16,7 @@
#include "backend/cpu_common/StaticTensorManager.h"
+#include "backend/cpu_common/DynamicTensorManager.h"
#include <util/logging.h>
namespace onert
@@ -33,7 +34,7 @@ StaticTensorManager::StaticTensorManager(const std::shared_ptr<TensorRegistry> &
void StaticTensorManager::allocateConsts(void)
{
- for (auto &pair : _tensors->managed_tensors())
+ for (auto &pair : _tensors->native_tensors())
{
const auto &ind = pair.first;
auto tensor = pair.second;
@@ -42,9 +43,9 @@ void StaticTensorManager::allocateConsts(void)
auto mem_alloc = _const_mgr->allocate(ind, tensor->total_size());
tensor->setBuffer(mem_alloc);
auto buffer = mem_alloc->base();
- VERBOSE(CPU_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value()
- << "): " << static_cast<void *>(buffer)
- << "size : " << tensor->total_size() << std::endl;
+ VERBOSE(CPU_COMMON_StaticTensorManager) << "CONSTANT TENSOR(#" << ind.value()
+ << "): " << static_cast<void *>(buffer)
+ << "size : " << tensor->total_size() << std::endl;
}
}
}
@@ -53,7 +54,7 @@ void StaticTensorManager::allocateNonconsts(void)
{
_nonconst_mgr->allocate();
- for (auto &pair : _tensors->managed_tensors())
+ for (auto &pair : _tensors->native_tensors())
{
const auto &ind = pair.first;
auto tensor = pair.second;
@@ -62,8 +63,8 @@ void StaticTensorManager::allocateNonconsts(void)
auto *buffer = _nonconst_mgr->getBuffer(ind);
tensor->setBuffer(buffer);
- VERBOSE(CPU_StaticTensorManager) << "TENSOR(#" << ind.value()
- << "): " << static_cast<void *>(buffer) << std::endl;
+ VERBOSE(CPU_COMMON_StaticTensorManager) << "TENSOR(#" << ind.value()
+ << "): " << static_cast<void *>(buffer) << std::endl;
}
}
}
@@ -76,18 +77,18 @@ void StaticTensorManager::buildTensor(const ir::OperandIndex &ind,
const ir::OperandInfo &tensor_info, ir::Layout backend_layout,
bool as_const)
{
- assert(!_tensors->getManagedTensor(ind));
- auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout);
- _tensors->setManagedTensor(ind, tensor);
+ assert(!_tensors->getNativeTensor(ind));
+ auto tensor = std::make_shared<Tensor>(tensor_info, backend_layout, nullptr);
+ _tensors->setNativeTensor(ind, tensor);
_as_constants[ind] = as_const;
}
void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
{
- assert(_tensors->getManagedTensor(ind));
+ assert(_tensors->getNativeTensor(ind));
// This method is called only when a tensor has a proper shape
- assert(!_tensors->getManagedTensor(ind)->is_dynamic());
+ assert(!_tensors->getNativeTensor(ind)->is_dynamic());
if (!_as_constants[ind])
_nonconst_mgr->claimPlan(ind, size);
@@ -95,10 +96,10 @@ void StaticTensorManager::claimPlan(const ir::OperandIndex &ind, uint32_t size)
void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
{
- assert(_tensors->getManagedTensor(ind));
+ assert(_tensors->getNativeTensor(ind));
// This method is called only when a tensor has a proper shape
- assert(!_tensors->getManagedTensor(ind)->is_dynamic());
+ assert(!_tensors->getNativeTensor(ind)->is_dynamic());
if (!_as_constants[ind])
_nonconst_mgr->releasePlan(ind);
@@ -106,7 +107,7 @@ void StaticTensorManager::releasePlan(const ir::OperandIndex &ind)
void StaticTensorManager::iterate(const std::function<void(const ir::OperandIndex &)> &fn)
{
- for (const auto &it : _tensors->managed_tensors())
+ for (const auto &it : _tensors->native_tensors())
fn(it.first);
}
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.cc b/runtime/onert/core/src/compiler/ExecutorFactory.cc
index f3f69ad1a..82afd9e56 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.cc
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.cc
@@ -196,23 +196,41 @@ ExecutorFactory::initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
const auto &operand = lowered_graph.graph().operands().at(ind);
auto tensor = std::make_shared<backend::controlflow::UserTensor>(
operand.info(),
- ir::Layout::NHWC /* FIXME find op_seq for this operand and use frontend_layout */);
+ ir::Layout::NHWC, /* FIXME find op_seq for this operand and use frontend_layout */
+ cf_tensor_builder->dynamicTensorManager());
// Add tensor to controlflow TensorRegistry.
cf_tensor_builder->setUserTensor(ind, tensor);
ret.push_back(tensor);
-
- // Set other tensors as external tensors
- for (auto &tensor_builder : tensor_builders)
- {
- // FIXME This is a workaround registering all user tensors to all backends
- // FIXME Handle when it is failed
- tensor_builder->setExternalTensor(ind, tensor);
- }
}
return ret;
}
+void ExecutorFactory::prepareExternalTensors(ir::LoweredGraph &lowered_graph,
+ TensorBuilders &tensor_builders)
+{
+ lowered_graph.op_seqs().iterate(
+ [&](const ir::OpSequenceIndex &op_seq_index, const ir::OpSequence &op_seq) {
+ auto lower_info = lowered_graph.getLowerInfo(op_seq_index);
+ auto &backend_ctx = lowered_graph.backend_contexts().at(lower_info->backend());
+ for (auto ind : (op_seq.getInputs() + op_seq.getOutputs()) | ir::Remove::DUPLICATED |
+ ir::Remove::UNDEFINED)
+ {
+ // If an OpSequence input/output tensor does not have its own tensor object,
+ // it must be using a migrant tensor, so find the tensor in the other tensor
+ // builders and register it with this tensor builder if it is portable
+ if (!backend_ctx->tensor_builder->tensorAt(ind))
+ {
+ auto tensor = tensor_builders.getITensor(ind);
+ assert(tensor); // The tensor must have been created in one of TensorBuilders
+ auto ptensor = std::dynamic_pointer_cast<backend::IPortableTensor>(tensor);
+ if (ptensor)
+ backend_ctx->tensor_builder->setMigrantTensor(ind, ptensor);
+ }
+ }
+ });
+}
+
exec::IExecutor *
ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
const compiler::CompilerOptions &options,
@@ -265,6 +283,8 @@ ExecutorFactory::createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_
tensor_builder->prepare();
}
+ prepareExternalTensors(*lowered_graph, tensor_builders);
+
ExecutionBuilder builder;
// Generate kernels
@@ -367,6 +387,8 @@ exec::IExecutor *ExecutorFactory::createDataflowExecutor(
tensor_builder->prepare();
}
+ prepareExternalTensors(*lowered_graph, tensor_builders);
+
ExecutionBuilder builder;
// Generate kernels
diff --git a/runtime/onert/core/src/compiler/ExecutorFactory.h b/runtime/onert/core/src/compiler/ExecutorFactory.h
index 1e82b9838..418e5a764 100644
--- a/runtime/onert/core/src/compiler/ExecutorFactory.h
+++ b/runtime/onert/core/src/compiler/ExecutorFactory.h
@@ -22,6 +22,7 @@
#include "backend/ITensor.h"
#include "exec/IExecutor.h"
#include "ir/LoweredGraph.h"
+#include "TensorBuilders.h"
namespace onert
{
@@ -48,6 +49,8 @@ private:
static std::vector<std::shared_ptr<backend::ITensor>>
initializeModelIOTensors(ir::LoweredGraph &lowered_graph,
const ir::OperandIndexSequence &indices);
+ static void prepareExternalTensors(ir::LoweredGraph &lowered_graph,
+ TensorBuilders &tensor_builders);
static exec::IExecutor *
createLinearExecutor(std::unique_ptr<ir::LoweredGraph> lowered_graph,
const compiler::CompilerOptions &options,
diff --git a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc
index c68e6c388..5c4b84ec0 100644
--- a/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc
+++ b/runtime/onert/core/src/compiler/Fp32ToFp16Converter.cc
@@ -316,7 +316,7 @@ void Fp32ToFp16Converter::appendNewOpSeqForConvertFp16ToFp32(const ir::OpSequenc
// manipulate output of operation and op_seq
// - replace the last operation's output with the new operand
- // with old operand's removeDef and new operand's appendDef()
+ // using the old operand's unsetDef() and the new operand's setDef()
manipulateOutput(op_seq_ind, op_seq_output_ind, new_op_ind);
// new op
@@ -584,8 +584,9 @@ void Fp32ToFp16Converter::manipulateOutput(const ir::OpSequenceIndex &op_seq_ind
last_node.replaceOutputs(op_seq_output_ind, new_op_ind);
// op_seq_obj doesn't have uses/def
- output_obj.removeDef(last_node_ind);
- new_op_obj.insertDef(last_node_ind);
+ assert(output_obj.getDef() == last_node_ind);
+ output_obj.unsetDef();
+ new_op_obj.setDef(last_node_ind);
}
ir::OperationIndex
@@ -603,7 +604,7 @@ Fp32ToFp16Converter::newOperationConvertFp32ToFp16(const ir::OperandIndex &op_se
const auto new_node_ind = operations.push(std::move(new_node));
input_obj.insertUse(new_node_ind);
- new_op_obj.insertDef(new_node_ind);
+ new_op_obj.setDef(new_node_ind);
return new_node_ind;
}
@@ -623,7 +624,7 @@ Fp32ToFp16Converter::newOperationConvertFp16ToFp32(const ir::OperandIndex &op_se
const auto new_node_ind = operations.push(std::move(new_node));
new_op_obj.insertUse(new_node_ind);
- output_obj.insertDef(new_node_ind);
+ output_obj.setDef(new_node_ind);
return new_node_ind;
}
@@ -925,7 +926,8 @@ void Fp32ToFp16Converter::deleteContiguousOpSequences(
for (auto &ind : first_node.getOutputs() | ir::Remove::DUPLICATED | ir::Remove::UNDEFINED)
{
auto &obj = operands.at(ind);
- obj.removeDef(first_node_ind);
+ assert(obj.getDef() == first_node_ind);
+ obj.unsetDef();
VERBOSE(Fp32ToFp16Converter) << "Operand #" << ind.value() << "'s Def(Node#"
<< first_node_ind.value() << ") is removed" << std::endl;
}
diff --git a/runtime/onert/core/src/compiler/HEScheduler.cc b/runtime/onert/core/src/compiler/HEScheduler.cc
index b0e61f6d5..de9b4fbd0 100644
--- a/runtime/onert/core/src/compiler/HEScheduler.cc
+++ b/runtime/onert/core/src/compiler/HEScheduler.cc
@@ -105,7 +105,7 @@ static bool isMergeable(const ir::Graph &graph, const ir::Operation &node)
continue;
// This operand is output of operation, not weight or bias
- if (operand.getDef().size() > 0)
+ if (operand.getDef().valid())
++prev_op_cnt;
// Current node has multiple inputs as concat or at the beginning of the separated branch
@@ -599,7 +599,8 @@ int64_t HEScheduler::predMaxEFT(const backend::Backend *backend, const ir::Opera
const auto &input_operand = _graph->operands().at(input_operand_idx);
const bool quant = input_operand.typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM;
- for (const auto &input_node_idx : input_operand.getDef())
+ auto input_node_idx = input_operand.getDef();
+ if (input_node_idx.valid())
{
// Data transfer cost from parent's node backend to current node's backend:
auto parent_backend = _backend_resolver->getBackend(input_node_idx);
diff --git a/runtime/onert/core/src/compiler/HEScheduler.h b/runtime/onert/core/src/compiler/HEScheduler.h
index f5075390d..d8ceca9c8 100644
--- a/runtime/onert/core/src/compiler/HEScheduler.h
+++ b/runtime/onert/core/src/compiler/HEScheduler.h
@@ -51,16 +51,12 @@ public:
* @param[in] backend_resolver backend resolver
*/
HEScheduler(const backend::BackendContexts &backend_contexts, const CompilerOptions &options)
- : _backend_contexts{backend_contexts}, _is_supported{}, _backends_avail_time{}, _ops_eft{},
+ : _is_supported{}, _backends_avail_time{}, _ops_eft{},
_op_to_rank{std::make_shared<ir::OperationIndexMap<int64_t>>()},
_is_profiling_mode{options.he_profiling_mode},
_is_linear_exec{options.executor == "Linear"},
_is_parallel_exec{options.executor == "Parallel"}
{
- // Workaround to avoid unused-private-field warning
- // TODO use _backend_contexts and remove workaround
- (void)_backend_contexts;
-
for (auto &entry : backend_contexts)
{
_all_backends.push_back(entry.first);
@@ -165,7 +161,6 @@ private:
// whether it should assign these backends to these nodes:
// * It stores false for unsupported nodes
// * During rank calculation with enabled profiling mode it stores true for supported nodes
- const backend::BackendContexts &_backend_contexts;
std::unordered_map<const backend::Backend *, std::unordered_map<std::string, bool>> _is_supported;
// Finishing and starting time of each backend
std::unordered_map<const backend::Backend *, std::map<int64_t, int64_t>> _backends_avail_time;
@@ -175,8 +170,7 @@ private:
std::unique_ptr<compiler::BackendResolver> _backend_resolver;
std::unique_ptr<exec::ExecTime> _exec_time;
const ir::Graph *_graph{nullptr};
- std::vector<const backend::Backend *>
- _all_backends; // TODO Remove this and use _backend_contexts instead
+ std::vector<const backend::Backend *> _all_backends;
const backend::Backend *_cpu_backend{nullptr}; // TODO Change this to controlflow_backend
bool _is_profiling_mode;
bool _is_linear_exec;
diff --git a/runtime/onert/core/src/compiler/Linear.cc b/runtime/onert/core/src/compiler/Linear.cc
index b9db2f358..493ca1e43 100644
--- a/runtime/onert/core/src/compiler/Linear.cc
+++ b/runtime/onert/core/src/compiler/Linear.cc
@@ -96,7 +96,7 @@ void Linear::planTensors(const ir::LoweredGraph &lowered_graph,
}
uses_map[ind] = obj.getUses().size();
- def_map[ind] = obj.getDef().size(); // should be 1 or 0
+ def_map[ind] = obj.getDef().valid() ? 1 : 0;
bool is_const = obj.isConstant();
if (is_const)
diff --git a/runtime/onert/core/src/compiler/OperandContext.h b/runtime/onert/core/src/compiler/OperandContext.h
deleted file mode 100644
index 390b376fe..000000000
--- a/runtime/onert/core/src/compiler/OperandContext.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#ifndef __ONERT_COMPILER_OPERAND_CONTEXT_H__
-#define __ONERT_COMPILER_OPERAND_CONTEXT_H__
-
-#include "backend/ITensor.h"
-#include "ir/OperandIndexMap.h"
-#include <unordered_map>
-#include <memory>
-
-namespace onert
-{
-namespace compiler
-{
-
-class OperandContext
-{
-public:
- OperandContext &set(const ir::OperandIndex &ind, const std::shared_ptr<backend::ITensor> &tensor);
-
-public:
- bool exist(const ir::OperandIndex &ind) const { return _tensors.find(ind) != _tensors.end(); }
-
-public:
- std::shared_ptr<backend::ITensor> at(const ir::OperandIndex &ind) const
- {
- return _tensors.at(ind);
- }
-
- std::shared_ptr<backend::ITensor> &at(const ir::OperandIndex &ind) { return _tensors.at(ind); }
-
- void iterate(const std::function<void(const ir::OperandIndex &, backend::ITensor &)> &fn);
-
-private:
- ir::OperandIndexMap<std::shared_ptr<backend::ITensor>> _tensors;
-};
-
-} // namespace compiler
-} // namespace onert
-
-#endif // __ONERT_COMPILER_OPERAND_CONTEXT_H__
diff --git a/runtime/onert/core/src/compiler/OperationValidator.cc b/runtime/onert/core/src/compiler/OperationValidator.cc
index 5c545aedd..44496318f 100644
--- a/runtime/onert/core/src/compiler/OperationValidator.cc
+++ b/runtime/onert/core/src/compiler/OperationValidator.cc
@@ -41,6 +41,21 @@ OperationValidator::OperationValidator(const ir::Graph &graph)
{
}
+void OperationValidator::checkUnaryOp(const ir::Operation &node)
+{
+ const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(0)};
+
+ // Check if I/O types match
+ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
+
+ if (_ctx.at(output_index).info().isDynamic())
+ return;
+
+ // Check if I/O shapes match
+ OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+}
+
void OperationValidator::operator()()
{
// There is no reason for each subgraph to have subgraphs since compiler has subgraphs when
@@ -53,16 +68,7 @@ void OperationValidator::operator()()
[&](const ir::OperationIndex &, const ir::Operation &node) { node.accept(*this); });
}
-void OperationValidator::visit(const ir::operation::Abs &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
-
- const auto input_index{node.getInputs().at(0)};
-
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Abs &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::AvgPool2D &node)
{
@@ -292,17 +298,7 @@ void OperationValidator::visit(const ir::operation::RNN &node)
num_units == _ctx.at(hidden_state_out_index).shape().dim(1));
}
-void OperationValidator::visit(const ir::operation::Round &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(ir::operation::Round::Input::INPUT)};
-
- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
-
- if (_ctx.at(output_index).info().isDynamic())
- return;
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Round &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::SpaceToBatchND &node)
{
@@ -393,17 +389,7 @@ void OperationValidator::visit(const ir::operation::EmbeddingLookup &node)
}
}
-void OperationValidator::visit(const ir::operation::Exp &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(ir::operation::Exp::Input::INPUT)};
-
- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
-
- if (_ctx.at(output_index).info().isDynamic())
- return;
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Exp &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::ExpandDims &node)
{
@@ -419,17 +405,7 @@ void OperationValidator::visit(const ir::operation::ExpandDims &node)
OP_REQUIRES(_ctx.at(axis_index).shape().rank() <= 1);
}
-void OperationValidator::visit(const ir::operation::Floor &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- const auto input_index{node.getInputs().at(ir::operation::Floor::Input::INPUT)};
-
- OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == _ctx.at(input_index).typeInfo().type());
-
- if (_ctx.at(output_index).info().isDynamic())
- return;
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Floor &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::HashtableLookup &node)
{
@@ -789,6 +765,25 @@ void OperationValidator::visit(const ir::operation::LSTM &node)
}
}
+void OperationValidator::visit(const ir::operation::L2Normalization &node)
+{
+ const auto ofm_index{node.getOutputs().at(0)};
+ if (_ctx.at(ofm_index).info().isDynamic())
+ return;
+
+ const auto ifm_index{node.getInputs().at(ir::operation::L2Normalization::Input::INPUT)};
+
+ auto ifm_shape = _ctx.at(ifm_index).shape();
+ auto ofm_shape = _ctx.at(ofm_index).shape();
+
+ OP_REQUIRES(ifm_shape.rank() == ofm_shape.rank());
+
+ for (auto i = 0; i < ifm_shape.rank(); i++)
+ {
+ OP_REQUIRES(ifm_shape.dim(i) == ofm_shape.dim(i));
+ }
+}
+
void OperationValidator::visit(const ir::operation::Unpack &node)
{
const auto num{node.param().num};
@@ -904,45 +899,39 @@ void OperationValidator::visit(const ir::operation::Split &node)
OP_REQUIRES(_ctx.at(input_index).shape().dim(axis) % num_splits == 0);
}
-void OperationValidator::visit(const ir::operation::Cos &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
+void OperationValidator::visit(const ir::operation::Cos &node) { checkUnaryOp(node); }
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
-
-void OperationValidator::visit(const ir::operation::Sin &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
+void OperationValidator::visit(const ir::operation::Sin &node) { checkUnaryOp(node); }
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::RSQRT &node) { checkUnaryOp(node); }
-void OperationValidator::visit(const ir::operation::RSQRT &node)
+void OperationValidator::visit(const ir::operation::Shape &node)
{
const auto output_index{node.getOutputs().at(0)};
if (_ctx.at(output_index).info().isDynamic())
return;
const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
+ UNUSED_RELEASE(input_index);
+ OP_REQUIRES(_ctx.at(output_index).shape().rank() == 1);
}
-void OperationValidator::visit(const ir::operation::Shape &node)
+void OperationValidator::visit(const ir::operation::ResizeBilinear &node)
{
const auto output_index{node.getOutputs().at(0)};
+ const auto input_index{node.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
+
if (_ctx.at(output_index).info().isDynamic())
+ {
return;
+ }
+ OP_REQUIRES(_ctx.at(input_index).shape().rank() == 4);
+ OP_REQUIRES(_ctx.at(output_index).shape().rank() == 4);
- const auto input_index{node.getInputs().at(0)};
- UNUSED_RELEASE(input_index);
- OP_REQUIRES(_ctx.at(output_index).shape().rank() == 1);
+ auto align_corners = node.param().align_corners;
+ auto half_pixel_centers = node.param().half_pixel_centers;
+
+ OP_REQUIRES(!align_corners || !half_pixel_centers);
}
void OperationValidator::visit(const ir::operation::Reverse &node)
@@ -972,35 +961,11 @@ void OperationValidator::visit(const ir::operation::While &node)
// TODO Add to validate with subgraphs
}
-void OperationValidator::visit(const ir::operation::Neg &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
+void OperationValidator::visit(const ir::operation::Neg &node) { checkUnaryOp(node); }
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::Log &node) { checkUnaryOp(node); }
-void OperationValidator::visit(const ir::operation::Log &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
-
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
-
-void OperationValidator::visit(const ir::operation::LogicalNot &node)
-{
- const auto output_index{node.getOutputs().at(0)};
- if (_ctx.at(output_index).info().isDynamic())
- return;
-
- const auto input_index{node.getInputs().at(0)};
- OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
-}
+void OperationValidator::visit(const ir::operation::LogicalNot &node) { checkUnaryOp(node); }
void OperationValidator::visit(const ir::operation::SquaredDifference &node)
{
@@ -1118,5 +1083,25 @@ void OperationValidator::visit(const ir::operation::LogSoftmax &node)
OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
}
+
+void OperationValidator::visit(const ir::operation::Quantize &node)
+{
+ VERBOSE(Quantize) << "Configure Quantize operation" << std::endl;
+
+ OP_REQUIRES(node.getInputs().size() == 1);
+ OP_REQUIRES(node.getOutputs().size() == 1);
+
+ const auto input_index{node.getInputs().at(0)};
+ const auto output_index{node.getOutputs().at(0)};
+
+ OP_REQUIRES(_ctx.at(input_index).typeInfo().type() == ir::DataType::FLOAT32);
+
+ if (_ctx.at(output_index).info().isDynamic())
+ return;
+
+ OP_REQUIRES(_ctx.at(output_index).typeInfo().type() == ir::DataType::QUANT_UINT8_ASYMM);
+
+ OP_REQUIRES(_ctx.at(output_index).shape().rank() == _ctx.at(input_index).shape().rank());
+}
} // namespace compiler
} // namespace onert
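
The checkUnaryOp helper that the rewritten visit bodies above delegate to is not shown in this hunk. A minimal sketch, assuming it simply factors out the removed per-op bodies:

// Sketch only; the real definition lives elsewhere in OperationValidator.cc.
void OperationValidator::checkUnaryOp(const ir::Operation &node)
{
  const auto output_index{node.getOutputs().at(0)};

  // Shape checks are skipped for dynamic outputs; they are resolved at run time.
  if (_ctx.at(output_index).info().isDynamic())
    return;

  const auto input_index{node.getInputs().at(0)};

  // A unary elementwise op must preserve its operand shape.
  OP_REQUIRES(_ctx.at(output_index).shape() == _ctx.at(input_index).shape());
}
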
diff --git a/runtime/onert/core/src/compiler/OperationValidator.h b/runtime/onert/core/src/compiler/OperationValidator.h
index 6ceafe8b1..b27e6863c 100644
--- a/runtime/onert/core/src/compiler/OperationValidator.h
+++ b/runtime/onert/core/src/compiler/OperationValidator.h
@@ -70,6 +70,7 @@ public:
void visit(const ir::operation::DepthToSpace &node) override;
void visit(const ir::operation::Pack &node) override;
void visit(const ir::operation::LSTM &node) override;
+ void visit(const ir::operation::L2Normalization &node) override;
void visit(const ir::operation::Unpack &node) override;
void visit(const ir::operation::Pad &node) override;
void visit(const ir::operation::Min &node) override;
@@ -81,6 +82,7 @@ public:
void visit(const ir::operation::Sin &node) override;
void visit(const ir::operation::RSQRT &node) override;
void visit(const ir::operation::Shape &node) override;
+ void visit(const ir::operation::ResizeBilinear &node) override;
void visit(const ir::operation::Reverse &node) override;
void visit(const ir::operation::If &node) override;
void visit(const ir::operation::While &node) override;
@@ -93,9 +95,10 @@ public:
void visit(const ir::operation::Range &node) override;
void visit(const ir::operation::MatrixBandPart &node) override;
void visit(const ir::operation::LogSoftmax &node) override;
+ void visit(const ir::operation::Quantize &node) override;
private:
- void checkReduceOp(const ir::OperandIndex input_index, const ir::OperandIndex output_index);
+ void checkUnaryOp(const ir::Operation &node);
private:
// TODO Remove _ctx field
diff --git a/runtime/onert/core/src/compiler/StaticShapeInference.cc b/runtime/onert/core/src/compiler/StaticShapeInference.cc
index 5a58f2e9d..76c1edcbc 100644
--- a/runtime/onert/core/src/compiler/StaticShapeInference.cc
+++ b/runtime/onert/core/src/compiler/StaticShapeInference.cc
@@ -497,6 +497,11 @@ void StaticShapeInferer::visit(const ir::operation::Logistic &op)
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::Input::INPUT));
}
+void StaticShapeInferer::visit(const ir::operation::L2Normalization &op)
+{
+ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::Input::INPUT));
+}
+
void StaticShapeInferer::visit(const ir::operation::MatrixBandPart &op)
{
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::Input::INPUT));
@@ -813,6 +818,35 @@ void StaticShapeInferer::visit(const ir::operation::Reshape &op)
}
}
+void StaticShapeInferer::visit(const ir::operation::ResizeBilinear &op)
+{
+ const auto input_idx{op.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT)};
+ const auto &input = _operands.at(input_idx);
+
+ // get mutable output operand
+ const auto output_idx = op.getOutputs().at(0);
+ ir::Operand &output = _operands.at(output_idx);
+
+ // if input is dynamic, output also becomes dynamic
+ if (input.info().isDynamic())
+ {
+ output.info().setDynamic();
+ _return_has_dynamic_tensor = true;
+ return;
+ }
+
+ // Shape inference based on Params
+ ir::Shape new_shape = shape_inference::inferResizeBilinearShape(
+ input.shape(), op.param().height_out, op.param().width_out);
+
+ // if size_op comes from a Const, TFLC puts the output shape into the tensor
+ if (new_shape != output.shape())
+ {
+ // change on output shape
+ output.info().shape(new_shape);
+ }
+}
+
void StaticShapeInferer::visit(const ir::operation::Reverse &op)
{
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Reverse::Input::INPUT));
diff --git a/runtime/onert/core/src/compiler/TensorBuilders.h b/runtime/onert/core/src/compiler/TensorBuilders.h
index 4bb7413b1..c0a1ebc04 100644
--- a/runtime/onert/core/src/compiler/TensorBuilders.h
+++ b/runtime/onert/core/src/compiler/TensorBuilders.h
@@ -23,6 +23,7 @@
#include "backend/Backend.h"
#include "backend/controlflow/Config.h"
#include "backend/controlflow/TensorBuilder.h"
+#include "util/logging.h"
namespace onert
{
@@ -66,6 +67,17 @@ public:
return _cf_tensor_builder;
}
+ std::shared_ptr<backend::ITensor> getITensor(ir::OperandIndex ind)
+ {
+ for (auto &tensor_builder : _tensor_builders)
+ {
+ auto tensor = tensor_builder->tensorAt(ind);
+ if (tensor)
+ return tensor;
+ }
+ return nullptr;
+ }
+
private:
std::unordered_set<std::shared_ptr<backend::ITensorBuilder>> _tensor_builders;
std::shared_ptr<backend::controlflow::TensorBuilder> _cf_tensor_builder;
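
The new getITensor helper performs a first-match lookup across every registered tensor builder and returns nullptr when no backend owns the operand. A hypothetical call site (the resolve wrapper below is illustrative, not part of the commit):

// Sketch, assuming an already-populated compiler::TensorBuilders instance.
std::shared_ptr<backend::ITensor> resolve(compiler::TensorBuilders &builders,
                                          ir::OperandIndex ind)
{
  auto tensor = builders.getITensor(ind);
  if (!tensor)
    throw std::runtime_error{"operand is not registered with any tensor builder"};
  return tensor;
}
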
diff --git a/runtime/onert/core/src/exec/DynamicShapeInference.cc b/runtime/onert/core/src/exec/DynamicShapeInference.cc
index 1b8202978..5ec7012ee 100644
--- a/runtime/onert/core/src/exec/DynamicShapeInference.cc
+++ b/runtime/onert/core/src/exec/DynamicShapeInference.cc
@@ -16,12 +16,21 @@
#include "exec/DynamicShapeInference.h"
#include "util/ShapeInference.h"
+#include <assert.h>
namespace onert
{
namespace exec
{
+inline backend::IDynamicTensorManager *
+dynamicTensorManagerOf(const std::shared_ptr<backend::ITensor> &tensor)
+{
+ if (!tensor->dynamic_tensor_manager())
+ throw std::runtime_error{"Dynamic Tensor Manager is not available for this tensor."};
+ return tensor->dynamic_tensor_manager();
+}
+
void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op,
const ir::OperandIndex lhs_idx,
const ir::OperandIndex rhs_idx)
@@ -55,7 +64,7 @@ void DynamicShapeInferer::handleBinaryArithmeticOp(const ir::Operation &op,
ir::Shape new_shape = shape_inference::inferEltwiseShape(lhs_shape, rhs_shape);
- _dynamic_tensor_manager->applyShape(output_idx, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_idx, new_shape);
assert(output->buffer() != nullptr);
}
@@ -87,7 +96,7 @@ void DynamicShapeInferer::handleSimpleUnaryOp(const ir::Operation &op,
auto output_ind = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_ind);
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
assert(output->buffer() != nullptr);
}
@@ -121,7 +130,7 @@ void DynamicShapeInferer::visit(const ir::operation::ArgMax &op)
ir::Shape new_shape = shape_inference::inferArgMaxShape(input_shape, axis, rank);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -143,7 +152,7 @@ void DynamicShapeInferer::visit(const ir::operation::BatchMatMul &op)
// TODO
auto new_shape = shape_inference::inferBatchMatMulShape(lhs_shape, rhs_shape, op.param());
- _dynamic_tensor_manager->applyShape(output_index, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_index, new_shape);
}
void DynamicShapeInferer::visit(const ir::operation::BroadcastTo &op)
@@ -166,7 +175,7 @@ void DynamicShapeInferer::visit(const ir::operation::BroadcastTo &op)
shape->getShape(), reinterpret_cast<const int32_t *>(shape->buffer()));
// set output shape and output buffer
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
assert(output->buffer() != nullptr);
}
@@ -256,7 +265,7 @@ void DynamicShapeInferer::visit(const ir::operation::Concat &op)
auto output = _tensor_registry->getITensor(output_ind);
auto output_shape = shape_inference::inferConcatShape(in_shapes, op.param());
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
}
void DynamicShapeInferer::visit(const ir::operation::Conv2D &op)
@@ -279,7 +288,7 @@ void DynamicShapeInferer::visit(const ir::operation::Conv2D &op)
ir::Shape output_shape = shape_inference::inferConv2DShape(input_shape, ker_shape, op.param());
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
assert(output->buffer() != nullptr);
}
@@ -339,7 +348,7 @@ void DynamicShapeInferer::visit(const ir::operation::ExpandDims &op)
auto output_shape = shape_inference::inferExpandDimsShape(input_shape, axis_buf[0]);
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
assert(output->buffer() != nullptr);
}
@@ -362,7 +371,7 @@ void DynamicShapeInferer::visit(const ir::operation::Fill &op)
auto output_shape = shape_inference::inferFillShape(input_shape, input_buf);
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
assert(output->buffer() != nullptr);
}
@@ -385,7 +394,7 @@ void DynamicShapeInferer::visit(const ir::operation::FullyConnected &op)
auto output_ind = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_ind);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -417,7 +426,7 @@ void DynamicShapeInferer::visit(const ir::operation::Gather &op)
auto output_ind = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_ind);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -442,6 +451,11 @@ void DynamicShapeInferer::visit(const ir::operation::Logistic &op)
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Logistic::INPUT));
}
+void DynamicShapeInferer::visit(const ir::operation::L2Normalization &op)
+{
+ handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::L2Normalization::INPUT));
+}
+
void DynamicShapeInferer::visit(const ir::operation::MatrixBandPart &op)
{
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::MatrixBandPart::INPUT));
@@ -492,7 +506,7 @@ void DynamicShapeInferer::visit(const ir::operation::OneHot &op)
const auto axis_val = op.param().axis;
ir::Shape new_shape = shape_inference::inferOnehotShape(indices_shape, *depth_buf, axis_val);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -528,7 +542,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pack &op)
ir::Shape new_shape = shape_inference::inferPackShape(input_shape, axis, rank, num);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -555,7 +569,7 @@ void DynamicShapeInferer::visit(const ir::operation::Pad &op)
shape_inference::inferPadShape(input->getShape(), pad_buf, pad->getShape().num_elements());
// change output shape and reallocate output tensor memory
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
assert(output->buffer() != nullptr);
}
@@ -607,7 +621,7 @@ void DynamicShapeInferer::visit(const ir::operation::Range &op)
*reinterpret_cast<int32_t *>(limit_tensor->buffer()),
*reinterpret_cast<int32_t *>(delta_tensor->buffer()));
}
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -651,7 +665,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reduce &op)
ir::Shape new_shape = shape_inference::inferReduceShape(input_shape, axes_vec, keep_dims);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -705,7 +719,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op)
if (output_shape != output->getShape() || output->buffer() == nullptr)
{
// change on output shape
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
}
assert(output->buffer() != nullptr);
}
@@ -721,7 +735,7 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op)
if (output_shape != output->getShape() || output->buffer() == nullptr)
{
// change on output shape
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
}
assert(output->buffer() != nullptr);
}
@@ -732,6 +746,31 @@ void DynamicShapeInferer::visit(const ir::operation::Reshape &op)
}
}
+void DynamicShapeInferer::visit(const ir::operation::ResizeBilinear &op)
+{
+ // nothing to do unless the input or output is dynamic
+ auto output_ind = op.getOutputs().at(0);
+ auto output = _tensor_registry->getITensor(output_ind);
+
+ auto input_ind = op.getInputs().at(ir::operation::ResizeBilinear::Input::INPUT);
+ auto input = _tensor_registry->getITensor(input_ind);
+
+ if ((!input->is_dynamic()) && (!output->is_dynamic()))
+ return;
+
+ // getting output shape from input shape and Params
+ auto output_shape = shape_inference::inferResizeBilinearShape(
+ input->getShape(), op.param().height_out, op.param().width_out);
+
+ // if shape is changed, change output shape and reallocate output tensor memory
+ if (output_shape != output->getShape() || output->buffer() == nullptr)
+ {
+ // change on output shape
+ _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ }
+ assert(output->buffer() != nullptr);
+}
+
void DynamicShapeInferer::visit(const ir::operation::Reverse &op)
{
handleSimpleUnaryOp(op, op.getInputs().at(ir::operation::Reverse::INPUT));
@@ -774,7 +813,7 @@ void DynamicShapeInferer::visit(const ir::operation::Select &op)
auto output_ind = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_ind);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -793,7 +832,7 @@ void DynamicShapeInferer::visit(const ir::operation::Shape &op)
ir::Shape output_shape;
output_shape.append(input_shape.rank());
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
assert(output->buffer() != nullptr);
}
@@ -824,7 +863,7 @@ void DynamicShapeInferer::visit(const ir::operation::Slice &op)
ir::Shape new_shape = shape_inference::inferSliceShape(input_shape, begins_buf, sizes_buf);
- _dynamic_tensor_manager->applyShape(output_index, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_index, new_shape);
assert(output->buffer() != nullptr);
}
@@ -861,7 +900,7 @@ void DynamicShapeInferer::visit(const ir::operation::SpaceToBatchND &op)
ir::Shape new_shape = shape_inference::inferSpaceToBatchNDShape(
input_shape, block_shape_shape, padding_shape, block_shape_data, padding_data);
- _dynamic_tensor_manager->applyShape(output_idx, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_idx, new_shape);
assert(output->buffer() != nullptr);
}
@@ -890,7 +929,7 @@ void DynamicShapeInferer::visit(const ir::operation::Split &op)
auto output_ind = op.getOutputs().at(out_tensor_idx);
auto output = _tensor_registry->getITensor(output_ind);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
}
@@ -919,7 +958,7 @@ void DynamicShapeInferer::visit(const ir::operation::Squeeze &op)
auto output_ind = op.getOutputs().at(0);
auto output = _tensor_registry->getITensor(output_ind);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -960,7 +999,7 @@ void DynamicShapeInferer::visit(const ir::operation::StridedSlice &op)
ir::Shape output_shape =
onert::shape_inference::inferStridedSliceShape(input_shape, op_params, rank);
- _dynamic_tensor_manager->applyShape(output_index, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_index, output_shape);
assert(output->buffer() != nullptr);
}
@@ -996,7 +1035,7 @@ void DynamicShapeInferer::visit(const ir::operation::Tile &op)
auto output_shape = shape_inference::inferTileShape(input_shape, multiplier_buffer);
// set output shape and output buffer
- _dynamic_tensor_manager->applyShape(output_ind, output_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, output_shape);
assert(output->buffer() != nullptr);
}
@@ -1018,7 +1057,7 @@ void DynamicShapeInferer::visit(const ir::operation::Transpose &op)
// set output shape, based on input and params
ir::Shape new_shape = shape_inference::inferTransposeShape(input_shape, perm);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
@@ -1046,7 +1085,7 @@ void DynamicShapeInferer::visit(const ir::operation::Unpack &op)
auto output_ind = op.getOutputs().at(out_tensor_idx);
auto output = _tensor_registry->getITensor(output_ind);
- _dynamic_tensor_manager->applyShape(output_ind, new_shape);
+ dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
assert(output->buffer() != nullptr);
}
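
Every visit method in this file now follows the same pattern: compute the new shape, then reallocate through the manager of the output tensor itself rather than the single _dynamic_tensor_manager member. In isolation (names as in the hunks above):

// Sketch of the per-tensor dispatch pattern used throughout DynamicShapeInferer.
void applyInferredShape(const std::shared_ptr<backend::ITensor> &output,
                        const ir::OperandIndex &output_ind, const ir::Shape &new_shape)
{
  // dynamicTensorManagerOf() throws when the tensor's backend cannot handle
  // dynamic tensors, instead of silently falling back to a global manager.
  dynamicTensorManagerOf(output)->applyShape(output_ind, new_shape);
  assert(output->buffer() != nullptr);
}
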
diff --git a/runtime/onert/core/src/exec/ExecutorBase.cc b/runtime/onert/core/src/exec/ExecutorBase.cc
index a7409b90c..864ccb31a 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.cc
+++ b/runtime/onert/core/src/exec/ExecutorBase.cc
@@ -46,7 +46,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr<ir::LoweredGraph> &&lowered_graph,
{
auto tensor_registry = tensor_builder->tensorRegistry();
assert(tensor_registry);
- tensor = tensor_registry->getManagedITensor(ind);
+ tensor = tensor_registry->getNativeITensor(ind);
if (tensor != nullptr)
{
if (tensor_builder->supportDynamicTensor())
@@ -71,7 +71,7 @@ ExecutorBase::ExecutorBase(std::unique_ptr<ir::LoweredGraph> &&lowered_graph,
{
auto tensor_registry = tensor_builder->tensorRegistry();
assert(tensor_registry);
- tensor = tensor_registry->getManagedITensor(ind);
+ tensor = tensor_registry->getNativeITensor(ind);
if (tensor != nullptr)
{
if (tensor_builder->supportDynamicTensor())
diff --git a/runtime/onert/core/src/exec/ExecutorBase.h b/runtime/onert/core/src/exec/ExecutorBase.h
index b82d0e818..080c9bbdd 100644
--- a/runtime/onert/core/src/exec/ExecutorBase.h
+++ b/runtime/onert/core/src/exec/ExecutorBase.h
@@ -29,7 +29,6 @@
#include "ir/LowerInfoMap.h"
#include "backend/IConfig.h"
#include "backend/Backend.h"
-#include "compiler/OperandContext.h"
#include "exec/ExecTime.h"
#include "exec/IFunction.h"
#include "backend/IDynamicTensorManager.h"
diff --git a/runtime/onert/core/src/interp/Tensor.h b/runtime/onert/core/src/interp/Tensor.h
index 8b72d537d..008a4b9d4 100644
--- a/runtime/onert/core/src/interp/Tensor.h
+++ b/runtime/onert/core/src/interp/Tensor.h
@@ -171,6 +171,7 @@ public:
int32_t data_offset() const override { return _info.typeInfo().offset(); }
const ir::OperandInfo &tensorInfo() const override { return _info; }
uint64_t num_elements() const override { return _info.shape().num_elements(); };
+ backend::IDynamicTensorManager *dynamic_tensor_manager() override { return nullptr; }
private:
const ir::OperandInfo _info;
diff --git a/runtime/onert/core/src/interp/operations/Pad.cc b/runtime/onert/core/src/interp/operations/Pad.cc
index d2e3627b4..c8dce698d 100644
--- a/runtime/onert/core/src/interp/operations/Pad.cc
+++ b/runtime/onert/core/src/interp/operations/Pad.cc
@@ -69,8 +69,8 @@ void invoke(const ITensor *input_tensor, const ITensor *pad_tensor, const ITenso
const int32_t *pad_ptr = reinterpret_cast<const int32_t *>(pad_buffer);
float *output_ptr = reinterpret_cast<float *>(output_buffer);
- nnfw::cker::Pad(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape, output_ptr,
- nullptr);
+ nnfw::cker::Pad<float>(pad_ptr, pad_rank, cker_input_shape, input_ptr, cker_output_shape,
+ output_ptr, nullptr);
}
void invokePad(const ExecEnv *env, const ir::Operation &node)
diff --git a/runtime/onert/core/src/ir/Graph.cc b/runtime/onert/core/src/ir/Graph.cc
index 262863061..0db9b6133 100644
--- a/runtime/onert/core/src/ir/Graph.cc
+++ b/runtime/onert/core/src/ir/Graph.cc
@@ -89,7 +89,7 @@ void Graph::initializeUseDef()
auto outputs = node.getOutputs();
for (auto output : outputs)
{
- operands().at(output).insertDef(index);
+ operands().at(output).setDef(index);
}
for (auto input : node.getInputs() | ir::Remove::UNDEFINED)
diff --git a/runtime/onert/core/src/ir/LoweredGraph.cc b/runtime/onert/core/src/ir/LoweredGraph.cc
index 6e93a23e9..8aedfbdf0 100644
--- a/runtime/onert/core/src/ir/LoweredGraph.cc
+++ b/runtime/onert/core/src/ir/LoweredGraph.cc
@@ -23,6 +23,7 @@
#include "pass/ConstantLoweringPass.h"
#include "pass/PermutationOperationPass.h"
#include "pass/PermutationInsertionPass.h"
+#include "pass/PermutationEliminationPass.h"
#include "ir/GraphIterator.h"
#include "verifier/Verifier.h"
#include "backend/Backend.h"
@@ -122,9 +123,9 @@ LoweredGraph::LoweredGraph(const Graph &graph, const compiler::CompilerOptions &
pass::PermutationInsertionPass pi_pass(*this);
pi_pass.run();
- // Implemented code no longer works.
- // pass::PermutationEliminationPass pe_pass(*this);
- // pe_pass.run();
+
+ pass::PermutationEliminationPass pe_pass(*this);
+ pe_pass.run();
_op_seqs.dump("merged and sorted operations with permutation", _graph.operations());
}
@@ -414,7 +415,8 @@ void LoweredGraph::dumpLowerInfo()
const auto lower_info = getLowerInfo(index);
const auto &shape = object.shape();
- std::string def_ops = operation_index_to_string(object.getDef());
+ std::string def_ops =
+ object.getDef().valid() ? std::to_string(object.getDef().value()) : "N/A";
std::string use_ops = operation_index_to_string(object.getUses());
std::string def_layouts = factors_to_string(lower_info->def_factors());
std::string use_layouts = factors_to_string(lower_info->use_factors());
@@ -474,7 +476,8 @@ bool LoweredGraph::mergeable(const OpSequenceIndex &op_seq_index, const Operatio
for (const auto &input : op_seq.getInputs() | Remove::DUPLICATED | ir::Remove::UNDEFINED)
{
const auto &input_obj = _graph.operands().at(input);
- for (const auto &def : input_obj.getDef())
+ auto def = input_obj.getDef();
+ if (def.valid())
{
branched_set.insert(def);
if (branched_set.size() > 1)
@@ -488,6 +491,12 @@ bool LoweredGraph::mergeable(const OpSequenceIndex &op_seq_index, const Operatio
// Check for branching down
for (const auto &output : node.getOutputs() | Remove::DUPLICATED)
{
+ // TODO Fix this workaround for the case of model outputs that are used by another operation
+ // This is needed since branching is decided per operation, but for model outputs
+ // there is a controlflow backend (the use backend) while no actual use operation exists
+ if (_graph.getOutputs().contains(output))
+ return false;
+
const auto &output_obj = _graph.operands().at(output);
for (const auto &use : output_obj.getUses())
{
diff --git a/runtime/onert/core/src/ir/Operand.cc b/runtime/onert/core/src/ir/Operand.cc
index cde7fb7bc..e29c7a6ec 100644
--- a/runtime/onert/core/src/ir/Operand.cc
+++ b/runtime/onert/core/src/ir/Operand.cc
@@ -42,20 +42,9 @@ void Operand::insertUse(const OperationIndex &idx) { _uses.insert(idx); }
void Operand::removeUse(const OperationIndex &idx) { _uses.remove(idx); }
-void Operand::insertDef(const OperationIndex &idx)
-{
- assert(!isConstant());
- assert(_def.size() == 0);
-
- _def.insert(idx);
-}
+void Operand::setDef(const OperationIndex &idx) { _def = idx; }
-void Operand::removeDef(const OperationIndex &idx)
-{
- assert(_def.contains(idx));
-
- _def.remove(idx);
-}
+void Operand::unsetDef() { _def = OperationIndex{}; }
} // namespace ir
} // namespace onert
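
Operand now records at most one defining operation: setDef overwrites the stored index and unsetDef resets it to a default-constructed (invalid) OperationIndex, which is exactly what the getDef().valid() checks elsewhere in this commit test for. A small sketch of the invariant:

// Sketch, assuming a default-constructed ir::OperationIndex compares as invalid.
void def_example(ir::Operand &operand, ir::OperationIndex producer)
{
  operand.setDef(producer);
  assert(operand.getDef().valid()); // exactly one def is recorded

  operand.unsetDef();
  assert(!operand.getDef().valid()); // "no def", e.g. for constants and model inputs
}
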
diff --git a/runtime/onert/core/src/ir/OperationDumper.cc b/runtime/onert/core/src/ir/OperationDumper.cc
index c4b61f3d4..e3cbce57a 100644
--- a/runtime/onert/core/src/ir/OperationDumper.cc
+++ b/runtime/onert/core/src/ir/OperationDumper.cc
@@ -613,6 +613,15 @@ void OperationDumper::visit(const SquaredDifference &node)
VERBOSE(LIR) << " - Output : Output(" << node.getOutputs().at(0) << ")" << std::endl;
}
+void OperationDumper::visit(const StatelessRandomUniform &node)
+{
+ VERBOSE(LIR) << "* StatelessRandomUniform" << std::endl;
+ VERBOSE(LIR) << " - Inputs : Input(" << node.getInputs().at(StatelessRandomUniform::Input::SHAPE)
+ << ", " << node.getInputs().at(StatelessRandomUniform::Input::SEED) << ")"
+ << std::endl;
+ VERBOSE(LIR) << " - Output : Output(" << node.getOutputs().at(0) << ")" << std::endl;
+}
+
void OperationDumper::visit(const Squeeze &node)
{
VERBOSE(LIR) << "* Squeeze" << std::endl;
diff --git a/runtime/onert/core/src/ir/OperationDumper.h b/runtime/onert/core/src/ir/OperationDumper.h
index 27cfffc46..d83f1493f 100644
--- a/runtime/onert/core/src/ir/OperationDumper.h
+++ b/runtime/onert/core/src/ir/OperationDumper.h
@@ -96,6 +96,7 @@ public:
void visit(const operation::Squeeze &) override;
void visit(const operation::Slice &) override;
void visit(const operation::StridedSlice &) override;
+ void visit(const operation::StatelessRandomUniform &) override;
void visit(const operation::Sub &) override;
void visit(const operation::Tanh &) override;
void visit(const operation::Tile &) override;
diff --git a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
index 0b3955c5c..9ef2b125f 100644
--- a/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
+++ b/runtime/onert/core/src/ir/operation/BatchToSpaceND.cc
@@ -31,7 +31,7 @@ void BatchToSpaceND::accept(OperationVisitor &v) const { v.visit(*this); }
BatchToSpaceND::BatchToSpaceND(const OperandIndexSequence &inputs,
const OperandIndexSequence &outputs)
- : Operation{OperandConstraint::createExact(2u), inputs, outputs}
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}
{
}
diff --git a/runtime/onert/core/src/ir/operation/Quantize.cc b/runtime/onert/core/src/ir/operation/Quantize.cc
new file mode 100644
index 000000000..0e3d5b69b
--- /dev/null
+++ b/runtime/onert/core/src/ir/operation/Quantize.cc
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "ir/operation/Quantize.h"
+
+#include "ir/OperationVisitor.h"
+
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+
+void Quantize::accept(OperationVisitor &v) const { v.visit(*this); }
+
+Quantize::Quantize(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs)
+ : Operation{OperandConstraint::createExact(1u), inputs, outputs}
+{
+}
+
+} // namespace operation
+} // namespace ir
+} // namespace onert
diff --git a/runtime/onert/core/src/ir/operation/SplitV.cc b/runtime/onert/core/src/ir/operation/SplitV.cc
new file mode 100644
index 000000000..e638c9ac9
--- /dev/null
+++ b/runtime/onert/core/src/ir/operation/SplitV.cc
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "ir/operation/SplitV.h"
+#include <cassert>
+#include "ir/OperationVisitor.h"
+namespace onert
+{
+namespace ir
+{
+namespace operation
+{
+void SplitV::accept(OperationVisitor &v) const { v.visit(*this); }
+SplitV::SplitV(const OperandIndexSequence &inputs, const OperandIndexSequence &outputs,
+ const Param &param)
+ : Operation{OperandConstraint::createExact(3u), inputs, outputs}, _param{param}
+{
+}
+} // namespace operation
+} // namespace ir
+} // namespace onert
diff --git a/runtime/onert/core/src/compiler/OperandContext.cc b/runtime/onert/core/src/ir/operation/StatelessRandomUniform.cc
index cce555e33..cbb0ff251 100644
--- a/runtime/onert/core/src/compiler/OperandContext.cc
+++ b/runtime/onert/core/src/ir/operation/StatelessRandomUniform.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -14,32 +14,26 @@
* limitations under the License.
*/
-#include "OperandContext.h"
+#include "ir/operation/StatelessRandomUniform.h"
#include <cassert>
+#include "ir/OperationVisitor.h"
+
namespace onert
{
-namespace compiler
+namespace ir
{
-
-OperandContext &OperandContext::set(const ir::OperandIndex &id,
- const std::shared_ptr<backend::ITensor> &tensor)
+namespace operation
{
- // Only one tensor for an id
- assert(_tensors.find(id) == _tensors.end());
- _tensors[id] = tensor;
- return (*this);
-}
+void StatelessRandomUniform::accept(OperationVisitor &v) const { v.visit(*this); }
-void OperandContext::iterate(
- const std::function<void(const ir::OperandIndex &, backend::ITensor &)> &fn)
+StatelessRandomUniform::StatelessRandomUniform(const OperandIndexSequence &inputs,
+ const OperandIndexSequence &outputs)
+ : Operation{OperandConstraint::createExact(2u), inputs, outputs}
{
- for (auto &e : _tensors)
- {
- fn(e.first, *e.second);
- }
}
-} // namespace compiler
+} // namespace operation
+} // namespace ir
} // namespace onert
diff --git a/runtime/onert/core/src/ir/pass/ConstantInsertionPass.cc b/runtime/onert/core/src/ir/pass/ConstantInsertionPass.cc
index 29275f1c9..1742a0dd5 100644
--- a/runtime/onert/core/src/ir/pass/ConstantInsertionPass.cc
+++ b/runtime/onert/core/src/ir/pass/ConstantInsertionPass.cc
@@ -45,8 +45,8 @@ void ConstantInsertionPass::callback(const OperationIndex &node_index, Operation
if (_replace_operands_map.count(key) == 0)
{
auto new_object = object;
+ new_object.unsetDef();
// TODO Remove const_case
- const_cast<OperationIndexSet &>(new_object.getDef()).clear();
const_cast<OperationIndexSet &>(new_object.getUses()).clear();
const auto new_index = _graph.operands().emplace(new_object);
_replace_operands_map[key] = new_index;
@@ -71,7 +71,7 @@ void ConstantInsertionPass::callback(const OperationIndex &node_index, Operation
// Remove this node from uses of origin operand
// Constant operand has no def.
- assert(object.getDef().size() == 0);
+ assert(!object.getDef().valid());
object.removeUse(node_index);
// Remove origin operand
diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc
index 9e0291ef9..2deccd40b 100644
--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc
+++ b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.cc
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -15,11 +15,8 @@
*/
#include "PermutationEliminationPass.h"
+#include "backend/controlflow/Config.h"
-#include "ir/Operand.h"
-#include "ir/operand/LowerInfo.h"
-#include "ir/Graph.h"
-#include "backend/IConfig.h"
#include "util/logging.h"
namespace onert
@@ -28,166 +25,129 @@ namespace ir
{
namespace pass
{
-void PermutationEliminationPass::callback(const OperandIndex &inp_index, Operand &object)
-{
- if (_graph.getInputs().contains(inp_index))
- {
- eliminateInput(inp_index, object);
- }
- else if (_graph.getOutputs().contains(inp_index))
- {
- eliminateOutput(inp_index, object);
- }
-}
-void PermutationEliminationPass::eliminateInput(const OperandIndex &inp_index, Operand &object)
+void PermutationEliminationPass::callback(const OperationIndex &ind, Operation &node)
{
- auto &model_inputs = _graph.getInputs();
-
- // get uses of the model's given input
- auto uses = object.getUses();
+ _op_ind = ind;
+ node.accept(*this);
+};
- // input must be used just by permutation
- if (uses.size() != 1)
- {
- return;
- }
+void PermutationEliminationPass::visit(const operation::Permute &node)
+{
+ auto in_operand = node.getInputs().at(0);
+ auto out_operand = node.getOutputs().at(0);
- for (auto input_use : uses)
+ // Check if two tensors are both portable
+ // TODO Make this general; this is just a workaround to check whether two tensors are portable
{
- auto &perm_operation = _graph.operations().at(input_use);
- auto perm_inputs = perm_operation.getInputs();
+ auto in_def_factor = _lowered_graph.getLowerInfo(in_operand)->def_factors().getOnlyElement();
+ auto out_def_factor = _lowered_graph.getLowerInfo(out_operand)->def_factors().getOnlyElement();
- auto perm_outputs = perm_operation.getOutputs();
+ auto in_backend_id = in_def_factor.backend()->config()->id();
+ auto out_backend_id = out_def_factor.backend()->config()->id();
- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, true))
- {
+ // TODO Fix this workaround that removes only Permutes between the cpu and controlflow backends.
+ // This should be generalized.
+ if (!((in_backend_id == backend::controlflow::Config::ID && out_backend_id == "cpu") ||
+ (in_backend_id == "cpu" && out_backend_id == backend::controlflow::Config::ID)))
return;
- }
-
- assert(perm_inputs.at(0) == inp_index);
-
- VERBOSE(PermutationEliminationPass::EliminateInput) << "remove NHWC_TO_NCHW permutation\n";
-
- // set model's new input, which was output of permutation
- model_inputs.replace(inp_index, perm_outputs.at(0));
-
- // remove model's input, which is also input of permutation
- _graph.removeOperand(inp_index);
-
- // remove permutation operation
- assert(_lowered_graph.op_seqs().containsOperation(input_use));
- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(input_use);
- _lowered_graph.op_seqs().remove(op_seq_idx);
- _graph.operations().remove(input_use);
-
- VERBOSE(PermutationEliminationPass::EliminateInput)
- << inp_index.value() << " is model's input and is removed. New input is "
- << perm_outputs.at(0).value() << "\n"
- << input_use.value() << " is removed permutation operation\n";
- }
-}
-
-void PermutationEliminationPass::eliminateOutput(const OperandIndex &out_index, Operand &object)
-{
- auto &model_outputs = _graph.getOutputs();
-
- // get defs of the model's given output
- auto defs = object.getDef();
-
- // output must use just permutation
- if (defs.size() != 1)
- {
- return;
}
- for (auto output_def : defs)
+ if (_graph.getOutputs().contains(out_operand))
{
- auto &perm_operation = _graph.operations().at(output_def);
- auto perm_outputs = perm_operation.getOutputs();
-
- auto perm_inputs = perm_operation.getInputs();
- if (!isPermuteLayerToEliminate(perm_inputs, perm_outputs, false))
+ // Exceptional case : When the output operand is a model output
+ // In this case we keep the output and remove the input
+
+ auto &out_operand_obj = _graph.operands().at(out_operand);
+ assert(out_operand_obj.getDef() == _op_ind);
+ out_operand_obj.unsetDef();
+ _lowered_graph.op_seqs().iterate([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
+ if (!op_seq.getOutputs().contains(in_operand))
+ return;
+
+ // Update OpSequence/Operation edges and Operand edges
+ op_seq.replaceOutputs(in_operand, out_operand);
+ for (auto op : op_seq.operations())
+ {
+ auto &operation_obj = _graph.operations().at(op);
+ if (operation_obj.getOutputs().contains(in_operand))
+ {
+ operation_obj.replaceOutputs(in_operand, out_operand);
+ out_operand_obj.setDef(op);
+ }
+ }
+ });
+
+ // Remove Permute operation, enclosing OpSequence and the operand
{
- return;
- }
-
- assert(perm_outputs.at(0) == out_index);
+ _graph.removeOperand(in_operand);
- VERBOSE(PermutationEliminationPass::EliminateOutput) << "remove NCHW_TO_NHWC permutation\n";
-
- // Update operations' output that is used by permute operand
- for (auto perm_input_index : perm_inputs)
- {
- auto &perm_input_operand = _graph.operands().at(perm_input_index);
- perm_input_operand.removeUse(output_def);
+ auto op_seq_ind = _lowered_graph.op_seqs().getOperation(_op_ind);
+ // Assumes the enclosing OpSequence contains just this Permute operation
+ assert(_lowered_graph.op_seqs().at(op_seq_ind).size() == 1);
+ _lowered_graph.op_seqs().remove(op_seq_ind);
+ _graph.operations().remove(_op_ind);
}
- // set model's new output, which was input of permutation
- model_outputs.replace(out_index, perm_inputs.at(0));
-
- // remove model's output, which is also output of permutation
- _graph.removeOperand(out_index);
-
- // remove permutation operation
- assert(_lowered_graph.op_seqs().containsOperation(output_def));
- auto op_seq_idx = _lowered_graph.op_seqs().getOperation(output_def);
- _lowered_graph.op_seqs().remove(op_seq_idx);
- _graph.operations().remove(output_def);
-
- VERBOSE(PermutationEliminationPass::EliminateOutput)
- << out_index.value() << " is model's output and is removed. New output is "
- << perm_inputs.at(0).value() << "\n"
- << output_def.value() << " is removed permutation operation\n";
+ _lowered_graph.op_seqs().iterate([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
+ if (!op_seq.getInputs().contains(in_operand))
+ return;
+
+ op_seq.replaceInputs(in_operand, out_operand);
+ for (auto op : op_seq.operations())
+ {
+ auto &operation_obj = _graph.operations().at(op);
+ if (operation_obj.getInputs().contains(in_operand))
+ {
+ operation_obj.replaceInputs(in_operand, out_operand);
+ out_operand_obj.insertUse(op);
+ }
+ }
+ });
+
+ VERBOSE(removePermute) << "Permute Op removed, node index : " << _op_ind << std::endl;
+ VERBOSE(removePermute) << " - Input (removed) Operand : " << in_operand << std::endl;
+ VERBOSE(removePermute) << " - Output(kept) Operand : " << out_operand << std::endl;
}
-}
-
-bool PermutationEliminationPass::isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes,
- const OperandIndexSequence &out_indexes,
- bool is_for_model_input)
-{
- auto input_def_factors = _lowered_graph.getLowerInfo(inp_indexes.at(0))->def_factors();
- auto output_def_factors = _lowered_graph.getLowerInfo(out_indexes.at(0))->def_factors();
-
- auto input_layout = input_def_factors.getOnlyElement().layout();
- auto output_layout = output_def_factors.getOnlyElement().layout();
-
- if (input_def_factors.size() != 1 || output_def_factors.size() != 1)
- {
- return false;
- }
-
- // all operands' factor must be the same
- for (auto index : inp_indexes)
- {
- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors();
- if (op_factor_set.size() != 1 ||
- input_layout != _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout())
- {
- return false;
- }
- }
- // all operands' factor must be the same
- for (auto index : out_indexes)
+ else
{
- auto op_factor_set = _lowered_graph.getLowerInfo(index)->def_factors();
- if (op_factor_set.size() != 1 ||
- output_layout !=
- _lowered_graph.getLowerInfo(index)->def_factors().getOnlyElement().layout())
+ // Otherwise keep the input and remove the output
+
+ auto &in_operand_obj = _graph.operands().at(in_operand);
+ in_operand_obj.removeUse(_op_ind);
+
+ // Make OpSequences(that use the output) use the input
+ _lowered_graph.op_seqs().iterate([&](const ir::OpSequenceIndex &, ir::OpSequence &op_seq) {
+ if (!op_seq.getInputs().contains(out_operand))
+ return;
+
+ op_seq.replaceInputs(out_operand, in_operand);
+ for (auto op : op_seq.operations())
+ {
+ auto &operation_obj = _graph.operations().at(op);
+ if (operation_obj.getInputs().contains(out_operand))
+ {
+ operation_obj.replaceInputs(out_operand, in_operand);
+ in_operand_obj.insertUse(op);
+ }
+ }
+ });
+
+ // Remove Permute operation, enclosing OpSequence and the operand
{
- return false;
+ _graph.removeOperand(out_operand);
+
+ auto op_seq_ind = _lowered_graph.op_seqs().getOperation(_op_ind);
+ // Assumes the enclosing OpSequence contains just this Permute operation
+ assert(_lowered_graph.op_seqs().at(op_seq_ind).size() == 1);
+ _lowered_graph.op_seqs().remove(op_seq_ind);
+ _graph.operations().remove(_op_ind);
}
- }
- if (is_for_model_input)
- {
- // check if this is NHWC_TO_NCHW permutation: must have single input, which is model's input
- return (inp_indexes.size() == 1 && input_layout == Layout::NHWC &&
- output_layout == Layout::NCHW);
+ VERBOSE(removePermute) << "Permute Op removed, node index : " << _op_ind << std::endl;
+ VERBOSE(removePermute) << " - Input (kept) Operand : " << in_operand << std::endl;
+ VERBOSE(removePermute) << " - Output(removed) Operand : " << out_operand << std::endl;
}
-
- // check if this is NCHW_TO_NHWC permutation: must have single output, which is model's output
- return (out_indexes.size() == 1 && input_layout == Layout::NCHW && output_layout == Layout::NHWC);
}
} // namespace pass
diff --git a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h
index 1c8430062..614e44cd2 100644
--- a/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h
+++ b/runtime/onert/core/src/ir/pass/PermutationEliminationPass.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,9 +17,8 @@
#ifndef __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__
#define __ONERT_GRAPH_PASS_PERMUTATION_ELIMINATION_PASS_H__
-#include "LoweredOperandPass.h"
-#include "ir/Operand.h"
-#include "ir/OperandIndexSequence.h"
+#include "ir/OperationVisitor.h"
+#include "LoweredOperationPass.h"
namespace onert
{
@@ -28,55 +27,35 @@ namespace ir
namespace pass
{
-class PermutationEliminationPass : public LoweredOperandPass
+/**
+ * @brief An optimization pass that removes Permute operations if possible
+ *
+ * There may be some Permute operations that are inserted by PermutationInsertionPass or other
+ * passes. This pass checks all Permute operations and eliminates them if Permute in/out tensors
+ * are compatible and layouts match.
+ *
+ * The Permute input tensor is kept and the output is removed in all cases except model outputs.
+ * Model output tensors must stay on the controlflow backend, so for them the output is kept.
+ *
+ * @note This is an optimization pass which means that everything should work fine even if this pass
+ * was skipped.
+ */
+class PermutationEliminationPass : public LoweredOperationPass, public OperationVisitor
{
public:
- using LoweredOperandPass::LoweredOperandPass;
+ using LoweredOperationPass::LoweredOperationPass;
public:
- std::string id() override { return "PermutationEliminationPass"; }
+ std::string id() final { return "PermutationEliminationPass"; }
- void callback(const OperandIndex &index, Operand &object) override;
+public:
+ void callback(const OperationIndex &i, Operation &n) final;
private:
- /**
- * @brief Remove Permute operation that permutates input
- *
- * Note: This function aslo removes model's input and
- * sets output of permutation as model's new input
- *
- * @param inp_index is the target operand index for the elimination
- * @param object is the target operand object for the elimination
- *
- * @return
- */
- void eliminateInput(const OperandIndex &inp_index, Operand &object);
-
- /**
- * @brief Remove Permute operation that permutates output of a model
- *
- * Note: This function aslo removes model's output and
- * sets input of permutation as model's new output
- *
- * @param out_index is the target operand index for the elimination
- * @param object is the target operand object for the elimination
- *
- * @return
- */
- void eliminateOutput(const OperandIndex &out_index, Operand &object);
+ void visit(const operation::Permute &) final;
- /**
- * @brief Determine if passed operands are permute layer's input and output, that must be
- * eliminated
- *
- * @param inp_index indexes of the input operand to operation
- * @param out_index indexes of the output operand to operation
- * @param is_for_model_input checking for model's input or output
- *
- * @return if it is permutation layer
- */
- bool isPermuteLayerToEliminate(const OperandIndexSequence &inp_indexes,
- const OperandIndexSequence &out_indexes, bool is_for_model_input);
+private:
+ ir::OperationIndex _op_ind;
};
} // namespace pass
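
Schematically, the rewritten pass performs one of two graph surgeries per eligible Permute:

// Model-output case: the output operand is kept, the input is removed.
//   before:  opA --> t_in --> [Permute] --> t_out (model output)
//   after:   opA --> t_out (model output)
//
// Ordinary case: the input operand is kept, the output is removed.
//   before:  opA --> t_in --> [Permute] --> t_out --> opB
//   after:   opA --> t_in --> opB
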
diff --git a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
index 7c3da52a2..3578af813 100644
--- a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
+++ b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.cc
@@ -60,36 +60,8 @@ void PermutationInsertionPass::callback(const OperandIndex &index, Operand &obje
}
auto insert_set = operand_li->use_factors() - operand_li->def_factors();
- auto def_factor = operand_li->def_factors().getOnlyElement();
-
- auto compatible_backends = [](auto /* backend1 */, auto /* backend2 */) {
- // TODO If other issues for Permute elimination are resolved, enable this
- return false;
- /*
- // TODO This is a workaround for not inserting Permute between cpu and controlflow.
- // To be general, we need another way of checking they are compatible.
- const auto cf = backend::controlflow::Config::ID;
- const auto cpu = "cpu";
- const auto id1 = backend1->config()->id();
- const auto id2 = backend2->config()->id();
- return (id1 == cpu && id2 == cf) // Allows no-Permute for Model inputs
- || (id1 == cf && id2 == cpu); // Allows no-Permute for Model outputs
- */
- };
-
for (auto factor : insert_set)
{
- if (factor.layout() == def_factor.layout() &&
- compatible_backends(factor.backend(), def_factor.backend()))
- {
- // For this factor we can just reuse existing operand - Permute is not added.
- VERBOSE(PermutationInsertionPass) << "Permutation Insertion is skipped for operand "
- << index << " / as the tensor is compatible with backend "
- << factor.backend()->config()->id() << std::endl;
- factor_to_index.emplace(factor, index);
- continue;
- }
-
const auto permute_operation_index = insertPermute(index, factor);
permute_indexes.push_back(permute_operation_index);
const auto &permute_operation = _graph.operations().at(permute_operation_index);
@@ -235,7 +207,7 @@ OperationIndex PermutationInsertionPass::insertPermute(const OperandIndex &opera
// Update Use/Def info
{
_graph.operands().at(operand_index).insertUse(node_index);
- _graph.operands().at(out_operand_index).insertDef(node_index);
+ _graph.operands().at(out_operand_index).setDef(node_index);
}
return node_index;
}
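
With the compatible_backends short-circuit removed above, the division of labor becomes: this pass always inserts a Permute for every factor mismatch, and the new PermutationEliminationPass removes the ones that turn out to be redundant. The ordering is wired in the LoweredGraph.cc hunk earlier in this commit:

pass::PermutationInsertionPass pi_pass(*this);
pi_pass.run(); // insert a Permute for every use/def factor mismatch

pass::PermutationEliminationPass pe_pass(*this);
pe_pass.run(); // strip the Permutes that are actually unnecessary
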
diff --git a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.h b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.h
index 314a54c95..6c30c6f12 100644
--- a/runtime/onert/core/src/ir/pass/PermutationInsertionPass.h
+++ b/runtime/onert/core/src/ir/pass/PermutationInsertionPass.h
@@ -38,6 +38,7 @@ public:
std::string id() override { return "PermutationInsertionPass"; }
void callback(const OperandIndex &index, Operand &object) override;
+private:
/**
* @brief Insert Permute operation that has given operand as input
*
@@ -48,8 +49,6 @@ public:
*/
OperationIndex insertPermute(const OperandIndex &operand_index,
const operand::PermuteFactor &factor);
-
-private:
};
} // namespace pass
diff --git a/runtime/onert/core/src/ir/pass/PermutationOperationPass.cc b/runtime/onert/core/src/ir/pass/PermutationOperationPass.cc
index 1d77b48b4..6eb412cf1 100644
--- a/runtime/onert/core/src/ir/pass/PermutationOperationPass.cc
+++ b/runtime/onert/core/src/ir/pass/PermutationOperationPass.cc
@@ -39,8 +39,8 @@ void PermutationOperationPass::applyExpandRanks(const Operation &node)
const auto &output_ind = node.getOutputs().at(0);
const auto &output = _graph.operands().at(output_ind);
- assert(output.getDef().size() == 1);
- const auto &node_index = *output.getDef().begin();
+ assert(output.getDef().valid());
+ const auto node_index = output.getDef();
const auto &op_seq_index = _lowered_graph.op_seqs().getOperation(node_index);
const auto frontend_layout = _lowered_graph.op_seqs().at(op_seq_index).getLayout();
const auto backend_layout = _lowered_graph.getLowerInfo(op_seq_index)->layout();
@@ -80,8 +80,8 @@ void PermutationOperationPass::changeToKeepLayout(const Operation &node)
const auto &output_ind = node.getOutputs().at(0);
const auto &output_obj = _graph.operands().at(output_ind);
- assert(output_obj.getDef().size() == 1);
- const auto &node_index = *output_obj.getDef().begin();
+ assert(output_obj.getDef().valid());
+ const auto node_index = output_obj.getDef();
const auto &op_seq_index = _lowered_graph.op_seqs().getOperation(node_index);
const auto frontend_layout = _lowered_graph.op_seqs().at(op_seq_index).getLayout();
@@ -200,7 +200,7 @@ void PermutationOperationPass::changeToKeepLayout(const Operation &node)
lower_info->addUsePermuteFactor(new_factor);
// Whether if node's input is an input of model or a constant
- if (_graph.operands().at(input).getDef().size() == 0 &&
+ if (!_graph.operands().at(input).getDef().valid() &&
(lower_info->def_factors().size() == 1 &&
lower_info->def_factors().getOnlyElement() == removed_factor))
{
diff --git a/runtime/onert/core/src/ir/verifier/Verifier.cc b/runtime/onert/core/src/ir/verifier/Verifier.cc
index 9b8388765..09cbdcf2f 100644
--- a/runtime/onert/core/src/ir/verifier/Verifier.cc
+++ b/runtime/onert/core/src/ir/verifier/Verifier.cc
@@ -32,7 +32,7 @@ namespace verifier
// DAGChecker
//
-bool DAGChecker::verify(const Graph &graph) const
+bool DAGChecker::verify(const Graph &graph) const noexcept
{
auto &operations = graph.operations();
bool cyclic = false;
@@ -72,23 +72,59 @@ bool DAGChecker::verify(const Graph &graph) const
// EdgeConsistencyVerifier
//
-bool EdgeConsistencyChecker::verify(const Graph &graph) const
+bool EdgeConsistencyChecker::verify(const Graph &graph) const noexcept
{
auto &operations = graph.operations();
- uint32_t mismatches = 0;
+ uint32_t errors = 0;
operations.iterate([&](const OperationIndex &index, const Operation &node) {
for (auto operand_index : node.getInputs() | ir::Remove::UNDEFINED)
{
- auto &operand = graph.operands().at(operand_index);
- mismatches += (operand.getUses().contains(index) ? 0 : 1);
+ try
+ {
+ auto &operand = graph.operands().at(operand_index);
+ bool operand_has_use = operand.getUses().contains(index);
+ if (!operand_has_use)
+ {
+ VERBOSE(EdgeConsistencyChecker) << "[ERROR] EDGE MISMATCH : Missing USE edge - Operand "
+ << operand_index << " to Operation " << index
+ << std::endl;
+ errors += 1;
+ }
+ }
+ catch (const std::out_of_range &e)
+ {
+ VERBOSE(EdgeConsistencyChecker)
+ << "[ERROR] OPEARAND NOT FOUND : Operation " << index << " has Operand "
+ << operand_index << ", but the operand object is not present in the graph" << std::endl;
+ errors += 1;
+ }
}
for (auto operand_index : node.getOutputs())
{
- auto &operand = graph.operands().at(operand_index);
- mismatches += (operand.getDef().contains(index) ? 0 : 1);
+ try
+ {
+ auto &operand = graph.operands().at(operand_index);
+ if (operand.getDef() != index)
+ {
+ VERBOSE(EdgeConsistencyChecker) << "[ERROR] EDGE MISMATCH : Missing DEF edge - Operand"
+ << operand_index << " to Operation " << index
+ << std::endl;
+ errors += 1;
+ }
+ }
+ catch (const std::out_of_range &e)
+ {
+ VERBOSE(EdgeConsistencyChecker)
+ << "[ERROR] OPEARAND NOT FOUND : Operation " << index << " has Operand "
+ << operand_index << ", but the operand object is not present in the graph" << std::endl;
+ errors += 1;
+ }
}
});
- return mismatches == 0;
+
+ VERBOSE(EdgeConsistencyChecker) << "Total Number of errors : " << errors << std::endl;
+
+ return errors == 0;
}
} // namespace verifier
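
Both checkers are consumed through the IVerifier interface; with verify() now noexcept, a hypothetical caller can rely purely on the boolean result and the VERBOSE diagnostics:

// Illustrative caller, assuming a fully constructed ir::Graph.
bool verifyGraph(const ir::Graph &graph)
{
  ir::verifier::DAGChecker dag;
  ir::verifier::EdgeConsistencyChecker edges;

  // Malformed graphs yield false (with VERBOSE error logs); nothing is thrown.
  return dag.verify(graph) && edges.verify(graph);
}
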
diff --git a/runtime/onert/core/src/ir/verifier/Verifier.h b/runtime/onert/core/src/ir/verifier/Verifier.h
index 0bc22bc47..0c7b57b04 100644
--- a/runtime/onert/core/src/ir/verifier/Verifier.h
+++ b/runtime/onert/core/src/ir/verifier/Verifier.h
@@ -35,7 +35,7 @@ namespace verifier
struct IVerifier
{
virtual ~IVerifier() = default;
- virtual bool verify(const Graph &graph) const = 0;
+ virtual bool verify(const Graph &graph) const noexcept = 0;
};
} // namespace verifier
@@ -52,13 +52,13 @@ namespace verifier
class DAGChecker : public IVerifier
{
public:
- bool verify(const Graph &graph) const override;
+ bool verify(const Graph &graph) const noexcept override;
};
class EdgeConsistencyChecker : public IVerifier
{
public:
- bool verify(const Graph &graph) const override;
+ bool verify(const Graph &graph) const noexcept override;
};
} // namespace verifier
diff --git a/runtime/onert/core/src/util/EventCollector.cc b/runtime/onert/core/src/util/EventCollector.cc
index 9ecc7e3b8..de37276bf 100644
--- a/runtime/onert/core/src/util/EventCollector.cc
+++ b/runtime/onert/core/src/util/EventCollector.cc
@@ -54,7 +54,8 @@ private:
std::string _ts;
};
-void emit_rusage(EventRecorder *rec, const std::string &ts)
+#ifdef DEBUG
+inline void emit_rusage(EventRecorder *rec, const std::string &ts)
{
struct rusage ru;
@@ -81,6 +82,7 @@ void emit_rusage(EventRecorder *rec, const std::string &ts)
rec->emit(evt);
}
}
+#endif
} // namespace
@@ -99,6 +101,9 @@ void EventCollector::onEvent(const Event &event)
break;
}
- // Trace resource usage per each event notification
+// TODO: Add resource measurement (e.g. RSS)
+// when it can be done with low overhead in release builds
+#ifdef DEBUG
emit_rusage(_rec, ts);
+#endif
}
diff --git a/runtime/onert/core/src/util/EventRecorder.h b/runtime/onert/core/src/util/EventRecorder.h
index 6ba0bc061..6eea06986 100644
--- a/runtime/onert/core/src/util/EventRecorder.h
+++ b/runtime/onert/core/src/util/EventRecorder.h
@@ -74,7 +74,8 @@ private:
private:
std::mutex _mu;
- WriteFormat _write_format{WriteFormat::CHROME_TRACING};
+ // TODO: Allow user to control write_format
+ WriteFormat _write_format{WriteFormat::SNPE_BENCHMARK};
std::vector<DurationEvent> _duration_events;
std::vector<CounterEvent> _counter_events;
};
diff --git a/runtime/onert/core/src/util/ShapeInference.cc b/runtime/onert/core/src/util/ShapeInference.cc
index ac795bb99..9a24f8c1a 100644
--- a/runtime/onert/core/src/util/ShapeInference.cc
+++ b/runtime/onert/core/src/util/ShapeInference.cc
@@ -486,6 +486,20 @@ ir::Shape inferPadShape(const ir::Shape &in_shape, const int32_t *pad_buf, const
return ret;
}
+ir::Shape inferResizeBilinearShape(const ir::Shape &in_shape, const int32_t output_height,
+ const int32_t output_width)
+{
+ assert(in_shape.rank() == 4);
+ ir::Shape ret(in_shape.rank());
+
+ ret.dim(0) = in_shape.dim(0);
+ ret.dim(1) = output_height;
+ ret.dim(2) = output_width;
+ ret.dim(3) = in_shape.dim(3);
+
+ return ret;
+}
+
template <typename T> ir::Shape inferRangeShape(T start_val, T limit_val, T delta_val)
{
ir::Shape out_shape(static_cast<int>(1));
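
A quick worked example of inferResizeBilinearShape above: only the two spatial dimensions change, batch and channels pass through unchanged.

// Sketch, assuming ir::Shape's initializer-list constructor.
ir::Shape in_shape{1, 8, 8, 3}; // NHWC
ir::Shape resized = shape_inference::inferResizeBilinearShape(in_shape, 16, 16);
// resized is {1, 16, 16, 3}
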
diff --git a/runtime/onert/frontend/base_loader/include/base_loader.h b/runtime/onert/frontend/base_loader/include/base_loader.h
index f5687ad1e..0f6a2a5d0 100644
--- a/runtime/onert/frontend/base_loader/include/base_loader.h
+++ b/runtime/onert/frontend/base_loader/include/base_loader.h
@@ -53,6 +53,8 @@ protected:
using SubGraph = typename LoaderDomain::SubGraph;
using Tensor = typename LoaderDomain::Tensor;
using TensorType = typename LoaderDomain::TensorType;
+ using DimensionType = typename LoaderDomain::DimensionType;
+ using SparseIndexVector = typename LoaderDomain::SparseIndexVector;
protected:
bool isOptionalInputTensor(std::int32_t idx) { return idx == -1; }
@@ -75,6 +77,13 @@ public:
* @param file_path
*/
void loadFromFile(const char *file_path);
+ /**
+ * @brief Load a model from a buffer
+ *
+ * @param buffer buffer pointer
+ * @param size buffer size
+ */
+ void loadFromBuffer(uint8_t *buffer, size_t size);
protected:
~BaseLoader() = default;
@@ -107,7 +116,6 @@ protected:
void loadSoftmax(const Operator *op, ir::Graph &subg);
void loadMaxPool2D(const Operator *op, ir::Graph &subg);
void loadConcatenation(const Operator *op, ir::Graph &subg);
- void loadInstanceNorm(const Operator *op, ir::Graph &subg);
void loadFill(const Operator *op, ir::Graph &subg);
void loadFC(const Operator *op, ir::Graph &subg);
void loadAdd(const Operator *op, ir::Graph &subg);
@@ -140,6 +148,7 @@ protected:
void loadSqueeze(const Operator *op, ir::Graph &subg);
void loadPrelu(const Operator *op, ir::Graph &subg);
void loadSplit(const Operator *op, ir::Graph &subg);
+ void loadSplitV(const Operator *op, ir::Graph &subg);
void loadSlice(const Operator *op, ir::Graph &subg);
void loadStridedSlice(const Operator *op, ir::Graph &subg);
void loadUnpack(const Operator *op, ir::Graph &subg);
@@ -165,12 +174,13 @@ protected:
void loadTile(const Operator *op, ir::Graph &subg);
void loadLogicalOr(const Operator *op, ir::Graph &subg);
void loadRange(const Operator *op, ir::Graph &subg);
- void loadBCQFullyConnected(const Operator *op, ir::Graph &subg);
- void loadBCQGather(const Operator *op, ir::Graph &subg);
void loadMatrixBandPart(const Operator *op, ir::Graph &subg);
void loadBroadcastTo(const Operator *op, ir::Graph &subg);
void loadFusedBatchNorm(const Operator *op, ir::Graph &subg);
void loadLogSoftmax(const Operator *op, ir::Graph &subg);
+ void loadQuantize(const Operator *op, ir::Graph &subg);
+ void loadSpaceToDepth(const Operator *op, ir::Graph &subg);
+ void loadStatelessRandomUniform(const Operator *op, ir::Graph &subg);
protected:
// Base address for mapped region for loading (if needed)
@@ -216,12 +226,20 @@ void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromFile(const ch
_verifier = std::make_unique<Verifier>(reinterpret_cast<const std::uint8_t *>(_base), size);
loadModel();
- munmap(_base, size);
close(_fd);
}
template <typename LoaderDomain, typename SpecificLoader>
+void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::loadFromBuffer(uint8_t *buffer,
+ size_t size)
+{
+ _base = buffer;
+ _verifier = std::make_unique<Verifier>(reinterpret_cast<const std::uint8_t *>(_base), size);
+ loadModel();
+}
+
+template <typename LoaderDomain, typename SpecificLoader>
ir::Activation BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::convertActivation(
const ActivationFunctionType type)
{
@@ -299,6 +317,23 @@ void BaseLoader<LoaderDomain, SpecificLoader>::BaseLoader::deallocateMmappedArea
}
}
+/* Copied from TensorFlow Lite; the original copyright notice still needs to be appended. */
+template <typename T> bool Copy(const T *data_ptr, std::vector<uint16_t> &arr)
+{
+ if (data_ptr->values() == nullptr)
+ {
+ return false;
+ }
+
+ int size = data_ptr->values()->size();
+ arr.reserve(size);
+ for (int i = 0; i < size; i++)
+ {
+ arr.emplace_back(static_cast<uint16_t>(data_ptr->values()->Get(i)));
+ }
+ return true;
+}
+
template <typename LoaderDomain, typename SpecificLoader>
ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Tensor *tensor,
ir::Graph &subg)
@@ -355,6 +390,61 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten
}
// Create TypeInfo
ir::TypeInfo type_info(data_type, scale, zero_point);
+ // Sparsity
+ auto src_sparsity = tensor->sparsity();
+ if (src_sparsity != nullptr)
+ {
+ std::vector<uint16_t> w1_segments;
+ std::vector<uint16_t> w1_indices;
+ // ignore traversal_order, block_map
+ // load metadata
+ const size_t dim_metadata_size = src_sparsity->dim_metadata()->size();
+ if (dim_metadata_size != 2)
+ throw std::runtime_error("sparse tensor is supported only for 2D");
+ const auto *src_metadata = src_sparsity->dim_metadata()->Get(0);
+ if (src_metadata->format() != DimensionType::DimensionType_DENSE)
+ throw std::runtime_error("sparse tensor dim[0] is not DENSE");
+ src_metadata = src_sparsity->dim_metadata()->Get(1);
+ if (src_metadata->format() != DimensionType::DimensionType_SPARSE_CSR)
+ throw std::runtime_error("sparse tensor dim[0] is not SPARSE_CSR");
+
+ auto ParseSparseIndexVector = [src_metadata, &w1_segments, &w1_indices]() {
+ if (src_metadata->array_segments() == nullptr || src_metadata->array_indices() == nullptr)
+ return false;
+ bool status = true;
+ switch (src_metadata->array_segments_type())
+ {
+ case SparseIndexVector::SparseIndexVector_Int32Vector:
+ status = Copy(src_metadata->array_segments_as_Int32Vector(), w1_segments);
+ break;
+ case SparseIndexVector::SparseIndexVector_Uint16Vector:
+ status = Copy(src_metadata->array_segments_as_Uint16Vector(), w1_segments);
+ break;
+ case SparseIndexVector::SparseIndexVector_Uint8Vector:
+ status = Copy(src_metadata->array_segments_as_Uint8Vector(), w1_segments);
+ break;
+ default:
+ return false;
+ }
+ if (status != true)
+ return false;
+ switch (src_metadata->array_indices_type())
+ {
+ case SparseIndexVector::SparseIndexVector_Int32Vector:
+ return Copy(src_metadata->array_indices_as_Int32Vector(), w1_indices);
+ case SparseIndexVector::SparseIndexVector_Uint16Vector:
+ return Copy(src_metadata->array_indices_as_Uint16Vector(), w1_indices);
+ case SparseIndexVector::SparseIndexVector_Uint8Vector:
+ return Copy(src_metadata->array_indices_as_Uint8Vector(), w1_indices);
+ default:
+ break;
+ }
+ return false;
+ };
+ if (ParseSparseIndexVector() == false)
+ throw std::runtime_error("Error during parsing sparsity index information");
+ type_info.sparse2DMetadata(std::move(w1_segments), std::move(w1_indices));
+ }
// Create operand
const auto operand_index = subg.addOperand(shape, type_info);
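
For intuition, a worked CSR example matching the layout parsed above: dim[0] stays DENSE (plain rows) while dim[1] is SPARSE_CSR, so array_segments act as row pointers and array_indices as column positions. Stand-alone sketch, not onert code.

    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main()
    {
      // 4x4 matrix, rows: [5 0 0 0], [0 0 7 0], [0 0 0 0], [1 0 0 2]
      std::vector<uint16_t> segments{0, 1, 2, 2, 4}; // row i owns entries [segments[i], segments[i+1])
      std::vector<uint16_t> indices{0, 2, 0, 3};     // column of each stored value
      std::vector<float> values{5, 7, 1, 2};
      for (size_t row = 0; row + 1 < segments.size(); ++row)
        for (uint16_t k = segments[row]; k < segments[row + 1]; ++k)
          std::cout << "(" << row << ", " << indices[k] << ") = " << values[k] << std::endl;
      return 0;
    }
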
@@ -363,18 +453,17 @@ ir::OperandIndex BaseLoader<LoaderDomain, SpecificLoader>::loadOperand(const Ten
if (data != nullptr)
{
using std::ptrdiff_t;
- size_t data_size = data->size();
- ptrdiff_t unaligned_offset_start = data->data() - _base;
- ptrdiff_t offset_end = unaligned_offset_start + data_size;
-
- // Calculated aligned offset from base address of mapped region
- // munmap accepts memory address which is a multiple of the pagesize
- ptrdiff_t aligned_offset_start = (unaligned_offset_start / _pagesize) * _pagesize;
- size_t mmap_size = offset_end - aligned_offset_start;
-
- auto ptr = std::make_unique<ir::MMapedData>(_fd, aligned_offset_start, mmap_size,
- unaligned_offset_start, data_size);
- subg.setOperandValue(operand_index, std::move(ptr));
+ std::unique_ptr<ir::Data> data_obj;
+ if (_fd == -1) // Model is from memory
+ {
+ data_obj = std::make_unique<ir::ExternalData>(data->data(), data->size());
+ }
+ else // Model is loaded(mmap'd) from a file
+ {
+ data_obj = std::make_unique<ir::CachedData>(data->data(), data->size());
+ deallocateMmappedArea(const_cast<uint8_t *>(data->data()), data->size());
+ }
+ subg.setOperandValue(operand_index, std::move(data_obj));
}
// Name unused
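
The ownership split above can be summarized with stand-in types (the real classes are onert's ir::ExternalData and ir::CachedData; this sketch only mirrors their assumed contract): a buffer-loaded model merely references caller-owned bytes, while a file-loaded model copies them so the mmap'd region can be released early.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    struct Data
    {
      virtual ~Data() = default;
      virtual const uint8_t *base() const = 0;
    };

    // Buffer path (_fd == -1): reference only; the caller must keep the buffer alive.
    struct ExternalData : Data
    {
      ExternalData(const uint8_t *p, size_t n) : _ptr{p}, _size{n} {}
      const uint8_t *base() const override { return _ptr; }
      const uint8_t *_ptr;
      size_t _size;
    };

    // File path: take a private copy, so the original mapping can be unmapped at once.
    struct CachedData : Data
    {
      CachedData(const uint8_t *p, size_t n) : _buf(p, p + n) {}
      const uint8_t *base() const override { return _buf.data(); }
      std::vector<uint8_t> _buf;
    };

    int main()
    {
      const uint8_t bytes[4] = {1, 2, 3, 4};
      ExternalData ext{bytes, 4};  // aliases `bytes`
      CachedData cached{bytes, 4}; // independent of `bytes`
      return (ext.base() == bytes && cached.base() != bytes) ? 0 : 1;
    }
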
@@ -592,25 +681,6 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadConcatenation(const Operator
}
template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadInstanceNorm(const Operator *op, ir::Graph &subg)
-{
- ir::OperandIndexSequence inputs;
- ir::OperandIndexSequence outputs;
-
- loadOperationIO(op, inputs, outputs);
-
- ir::operation::InstanceNorm::Param param;
- const auto *options = op->builtin_options_as_InstanceNormOptions();
-
- param.activation = convertActivation(options->fused_activation_function());
- // Use default value 1e-5 if value of epsilon is zero
- param.epsilon = options->epsilon() == 0.f ? 1e-5 : options->epsilon();
-
- std::unique_ptr<ir::Operation> new_op(new ir::operation::InstanceNorm(inputs, outputs, param));
- subg.addOperation(std::move(new_op));
-}
-
-template <typename LoaderDomain, typename SpecificLoader>
void BaseLoader<LoaderDomain, SpecificLoader>::loadFill(const Operator *op, ir::Graph &subg)
{
ir::OperandIndexSequence inputs;
@@ -778,6 +848,8 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadResizeBilinear(const Operator
ir::operation::ResizeBilinear::Param param;
param.height_out = size_v[0];
param.width_out = size_v[1];
+ param.align_corners = op->builtin_options_as_ResizeBilinearOptions()->align_corners();
+ param.half_pixel_centers = op->builtin_options_as_ResizeBilinearOptions()->half_pixel_centers();
std::unique_ptr<ir::Operation> new_op(new ir::operation::ResizeBilinear({input}, outputs, param));
subg.addOperation(std::move(new_op));
@@ -1046,81 +1118,61 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadBatchToSpaceND(const Operator
ir::OperandIndexSequence outputs;
loadOperationIO(op, inputs, outputs);
- auto input = inputs.at(0);
- auto block_shape = inputs.at(1);
- auto crops = inputs.at(2);
-
- if (!subg.operands().at(crops).isConstant())
- throw std::runtime_error("BatchToSpaceND: non-constant 'crops' is not supported.");
- std::vector<std::int32_t> crops_v = subg.operands().at(crops).template asVector<std::int32_t>();
- assert(crops_v.size() == 4);
- if (crops_v != std::vector<std::int32_t>{0, 0, 0, 0})
- throw std::runtime_error("BatchToSpaceND: 'crops' other than {0, 0, 0, 0} is not supported.");
-
- std::unique_ptr<ir::Operation> new_op{
- new ir::operation::BatchToSpaceND{{input, block_shape}, outputs}};
+ std::unique_ptr<ir::Operation> new_op{new ir::operation::BatchToSpaceND{inputs, outputs}};
subg.addOperation(std::move(new_op));
}
template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadBCQGather(const Operator *op, ir::Graph &subg)
+void BaseLoader<LoaderDomain, SpecificLoader>::loadMatrixBandPart(const Operator *op,
+ ir::Graph &subg)
{
ir::OperandIndexSequence inputs;
ir::OperandIndexSequence outputs;
loadOperationIO(op, inputs, outputs);
- ir::operation::BCQGather::Param param;
- const auto *options = op->builtin_options_as_BCQGatherOptions();
- param.input_hidden_size = options->input_hidden_size();
- param.axis = options->axis();
-
- std::unique_ptr<ir::Operation> new_op(new ir::operation::BCQGather(inputs, outputs, param));
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::MatrixBandPart(inputs, outputs));
subg.addOperation(std::move(new_op));
}
template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadBCQFullyConnected(const Operator *op,
- ir::Graph &subg)
+void BaseLoader<LoaderDomain, SpecificLoader>::loadBroadcastTo(const Operator *op, ir::Graph &subg)
{
ir::OperandIndexSequence inputs;
ir::OperandIndexSequence outputs;
loadOperationIO(op, inputs, outputs);
- ir::operation::BCQFullyConnected::Param param;
- const auto *options = op->builtin_options_as_BCQFullyConnectedOptions();
- param.weights_hidden_size = options->weights_hidden_size();
- param.activation = convertActivation(options->fused_activation_function());
-
- std::unique_ptr<ir::Operation> new_op(
- new ir::operation::BCQFullyConnected(inputs, outputs, param));
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::BroadcastTo(inputs, outputs));
subg.addOperation(std::move(new_op));
}
-
template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadMatrixBandPart(const Operator *op,
- ir::Graph &subg)
+void BaseLoader<LoaderDomain, SpecificLoader>::loadSpaceToDepth(const Operator *op, ir::Graph &subg)
{
ir::OperandIndexSequence inputs;
ir::OperandIndexSequence outputs;
+ ir::operation::SpaceToDepth::Param param;
+
+ const auto *options = op->builtin_options_as_SpaceToDepthOptions();
+
+ param.block_size = options->block_size();
loadOperationIO(op, inputs, outputs);
- std::unique_ptr<ir::Operation> new_op(new ir::operation::MatrixBandPart(inputs, outputs));
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::SpaceToDepth(inputs, outputs, param));
subg.addOperation(std::move(new_op));
}
template <typename LoaderDomain, typename SpecificLoader>
-void BaseLoader<LoaderDomain, SpecificLoader>::loadBroadcastTo(const Operator *op, ir::Graph &subg)
+void BaseLoader<LoaderDomain, SpecificLoader>::loadStatelessRandomUniform(const Operator *op,
+ ir::Graph &subg)
{
ir::OperandIndexSequence inputs;
ir::OperandIndexSequence outputs;
-
loadOperationIO(op, inputs, outputs);
- std::unique_ptr<ir::Operation> new_op(new ir::operation::BroadcastTo(inputs, outputs));
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::StatelessRandomUniform(inputs, outputs));
subg.addOperation(std::move(new_op));
}
@@ -1144,7 +1196,8 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir
BatchMatMul,
Einsum,
BroadcastTo,
- FusedBatchNorm
+ FusedBatchNorm,
+ StatelessRandomUniform
};
// Mapping from custom op name string to BuiltinOP enum
@@ -1156,6 +1209,7 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir
{"Einsum", BuiltinOP::Einsum},
{"FusedBatchNormV3", BuiltinOP::FusedBatchNorm},
{"BroadcastTo", BuiltinOP::BroadcastTo},
+ {"StatelessRandomUniform", BuiltinOP::StatelessRandomUniform},
};
try
@@ -1185,6 +1239,9 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadCustom(const Operator *op, ir
case BuiltinOP::FusedBatchNorm:
loadFusedBatchNorm(op, subg);
break;
+ case BuiltinOP::StatelessRandomUniform:
+ loadStatelessRandomUniform(op, subg);
+ break;
default:
throw std::runtime_error{
"Loader: Custom OP map is defined but operation loader function is not defined"};
@@ -1274,6 +1331,23 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadSplit(const Operator *op, ir:
}
template <typename LoaderDomain, typename SpecificLoader>
+void BaseLoader<LoaderDomain, SpecificLoader>::loadSplitV(const Operator *op, ir::Graph &subg)
+{
+ ir::OperandIndexSequence inputs;
+ ir::OperandIndexSequence outputs;
+
+ loadOperationIO(op, inputs, outputs);
+
+ ir::operation::SplitV::Param param{};
+
+ const auto *options = op->builtin_options_as_SplitVOptions();
+ param.num_splits = options->num_splits();
+
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::SplitV(inputs, outputs, param));
+ subg.addOperation(std::move(new_op));
+}
+
+template <typename LoaderDomain, typename SpecificLoader>
void BaseLoader<LoaderDomain, SpecificLoader>::loadSlice(const Operator *op, ir::Graph &subg)
{
ir::OperandIndexSequence inputs;
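
SplitV differs from Split in that the pieces along the split axis have explicitly given sizes instead of being equal. A shape-arithmetic sketch of the invariant, based on the TFLite SPLIT_V semantics rather than onert code:

    #include <cassert>
    #include <numeric>
    #include <vector>

    int main()
    {
      const int axis_len = 10;                     // length of the split axis
      const std::vector<int> size_splits{4, 3, 3}; // second input of SPLIT_V
      const int num_splits = 3;                    // must match size_splits.size()
      assert(static_cast<int>(size_splits.size()) == num_splits);
      assert(std::accumulate(size_splits.begin(), size_splits.end(), 0) == axis_len);
      return 0;
    }
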
@@ -1743,6 +1817,18 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadLogSoftmax(const Operator *op
}
template <typename LoaderDomain, typename SpecificLoader>
+void BaseLoader<LoaderDomain, SpecificLoader>::loadQuantize(const Operator *op, ir::Graph &subg)
+{
+ ir::OperandIndexSequence inputs;
+ ir::OperandIndexSequence outputs;
+
+ loadOperationIO(op, inputs, outputs);
+
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::Quantize(inputs, outputs));
+ subg.addOperation(std::move(new_op));
+}
+
+template <typename LoaderDomain, typename SpecificLoader>
void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op, ir::Graph &subg)
{
const auto builtin_op = _model->operator_codes()->Get(op->opcode_index())->builtin_code();
@@ -1870,6 +1956,9 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
case BuiltinOperator::BuiltinOperator_SPLIT:
loadSplit(op, subg);
return;
+ case BuiltinOperator::BuiltinOperator_SPLIT_V:
+ loadSplitV(op, subg);
+ return;
case BuiltinOperator::BuiltinOperator_SLICE:
loadSlice(op, subg);
return;
@@ -1959,6 +2048,12 @@ void BaseLoader<LoaderDomain, SpecificLoader>::loadOperation(const Operator *op,
case BuiltinOperator::BuiltinOperator_LOG_SOFTMAX:
loadLogSoftmax(op, subg);
return;
+ case BuiltinOperator::BuiltinOperator_QUANTIZE:
+ loadQuantize(op, subg);
+ return;
+ case BuiltinOperator::BuiltinOperator_SPACE_TO_DEPTH:
+ loadSpaceToDepth(op, subg);
+ return;
default:
throw std::runtime_error(
std::string("Unsupported operation: ").append(EnumNameBuiltinOperator(builtin_op)));
diff --git a/runtime/onert/frontend/circle/CMakeLists.txt b/runtime/onert/frontend/circle/CMakeLists.txt
index a112def32..8bcf85dd3 100644
--- a/runtime/onert/frontend/circle/CMakeLists.txt
+++ b/runtime/onert/frontend/circle/CMakeLists.txt
@@ -10,5 +10,6 @@ target_include_directories(circle_loader PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/incl
target_link_libraries(circle_loader PUBLIC onert_core)
target_link_libraries(circle_loader PRIVATE base_loader nnfw_common nnfw_coverage)
+target_link_libraries(circle_loader PRIVATE circle_schema)
install(TARGETS circle_loader DESTINATION lib)
diff --git a/runtime/onert/frontend/circle/include/circle_loader.h b/runtime/onert/frontend/circle/include/circle_loader.h
index 8ed4b0c10..675a5b3e7 100644
--- a/runtime/onert/frontend/circle/include/circle_loader.h
+++ b/runtime/onert/frontend/circle/include/circle_loader.h
@@ -26,6 +26,7 @@ namespace onert
namespace circle_loader
{
std::unique_ptr<ir::Subgraphs> loadModel(const char *filename);
+std::unique_ptr<ir::Subgraphs> loadModel(uint8_t *buffer, size_t size);
} // namespace circle_loader
} // namespace onert
diff --git a/runtime/onert/frontend/circle/src/circle_loader.cc b/runtime/onert/frontend/circle/src/circle_loader.cc
index 94222e9d7..96dd4698a 100644
--- a/runtime/onert/frontend/circle/src/circle_loader.cc
+++ b/runtime/onert/frontend/circle/src/circle_loader.cc
@@ -53,6 +53,8 @@ struct LoaderDomain
using Tensor = circle::Tensor;
using TensorType = circle::TensorType;
using SubGraph = circle::SubGraph;
+ using DimensionType = circle::DimensionType;
+ using SparseIndexVector = circle::SparseIndexVector;
static const char *EnumNameBuiltinOperator(BuiltinOperator e)
{
@@ -69,6 +71,11 @@ struct LoaderDomain
class CircleLoader final : public base_loader::BaseLoader<LoaderDomain, CircleLoader>
{
+protected:
+ void loadInstanceNorm(const Operator *op, ir::Graph &subg);
+ void loadBCQFullyConnected(const Operator *op, ir::Graph &subg);
+ void loadBCQGather(const Operator *op, ir::Graph &subg);
+
public:
using BaseLoader::BaseLoader;
@@ -138,6 +145,57 @@ public:
}
};
+void CircleLoader::loadInstanceNorm(const Operator *op, ir::Graph &subg)
+{
+ ir::OperandIndexSequence inputs;
+ ir::OperandIndexSequence outputs;
+
+ loadOperationIO(op, inputs, outputs);
+
+ ir::operation::InstanceNorm::Param param;
+ const auto *options = op->builtin_options_as_InstanceNormOptions();
+
+ param.activation = convertActivation(options->fused_activation_function());
+ // Use default value 1e-5 if value of epsilon is zero
+ param.epsilon = options->epsilon() == 0.f ? 1e-5 : options->epsilon();
+
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::InstanceNorm(inputs, outputs, param));
+ subg.addOperation(std::move(new_op));
+}
+
+void CircleLoader::loadBCQGather(const Operator *op, ir::Graph &subg)
+{
+ ir::OperandIndexSequence inputs;
+ ir::OperandIndexSequence outputs;
+
+ loadOperationIO(op, inputs, outputs);
+
+ ir::operation::BCQGather::Param param;
+ const auto *options = op->builtin_options_as_BCQGatherOptions();
+ param.input_hidden_size = options->input_hidden_size();
+ param.axis = options->axis();
+
+ std::unique_ptr<ir::Operation> new_op(new ir::operation::BCQGather(inputs, outputs, param));
+ subg.addOperation(std::move(new_op));
+}
+
+void CircleLoader::loadBCQFullyConnected(const Operator *op, ir::Graph &subg)
+{
+ ir::OperandIndexSequence inputs;
+ ir::OperandIndexSequence outputs;
+
+ loadOperationIO(op, inputs, outputs);
+
+ ir::operation::BCQFullyConnected::Param param;
+ const auto *options = op->builtin_options_as_BCQFullyConnectedOptions();
+ param.weights_hidden_size = options->weights_hidden_size();
+ param.activation = convertActivation(options->fused_activation_function());
+
+ std::unique_ptr<ir::Operation> new_op(
+ new ir::operation::BCQFullyConnected(inputs, outputs, param));
+ subg.addOperation(std::move(new_op));
+}
+
} // namespace
std::unique_ptr<ir::Subgraphs> loadModel(const char *filename)
@@ -148,5 +206,13 @@ std::unique_ptr<ir::Subgraphs> loadModel(const char *filename)
return subgraphs;
}
+std::unique_ptr<ir::Subgraphs> loadModel(uint8_t *buffer, size_t size)
+{
+ auto subgraphs = std::make_unique<ir::Subgraphs>();
+ CircleLoader loader(subgraphs);
+ loader.loadFromBuffer(buffer, size);
+ return subgraphs;
+}
+
} // namespace circle_loader
} // namespace onert
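
A hedged usage sketch for the new buffer overload, assuming only the declarations shown in circle_loader.h. Note that on the buffer path the caller keeps ownership, so the buffer must outlive the loaded model (its operands reference it rather than copy it):

    #include <cstdint>
    #include <fstream>
    #include <iterator>
    #include <vector>
    // #include "circle_loader.h" // provides onert::circle_loader::loadModel

    int main(int argc, char **argv)
    {
      if (argc < 2)
        return 1;
      std::ifstream f(argv[1], std::ios::binary);
      std::vector<uint8_t> buf((std::istreambuf_iterator<char>(f)),
                               std::istreambuf_iterator<char>());
      // auto subgraphs = onert::circle_loader::loadModel(buf.data(), buf.size());
      // ... use subgraphs only while `buf` is still alive ...
      return buf.empty() ? 1 : 0;
    }
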
diff --git a/runtime/onert/frontend/circle_schema/CMakeLists.txt b/runtime/onert/frontend/circle_schema/CMakeLists.txt
new file mode 100644
index 000000000..208103f1c
--- /dev/null
+++ b/runtime/onert/frontend/circle_schema/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_library(circle_schema INTERFACE)
+
+nnfw_find_package(FlatBuffers REQUIRED)
+
+target_link_libraries(circle_schema INTERFACE flatbuffers::flatbuffers)
+
+target_include_directories(circle_schema INTERFACE include)
diff --git a/runtime/onert/frontend/circle/src/circle_schema_generated.h b/runtime/onert/frontend/circle_schema/include/circle_schema_generated.h
index 190c84d98..190c84d98 100644
--- a/runtime/onert/frontend/circle/src/circle_schema_generated.h
+++ b/runtime/onert/frontend/circle_schema/include/circle_schema_generated.h
diff --git a/runtime/onert/frontend/nnapi/model.cc b/runtime/onert/frontend/nnapi/model.cc
index 337bc3aa4..8c7bd1789 100644
--- a/runtime/onert/frontend/nnapi/model.cc
+++ b/runtime/onert/frontend/nnapi/model.cc
@@ -294,7 +294,7 @@ int ANeuralNetworksModel_addOperationEx(ANeuralNetworksModel *model,
}
const ANeuralNetworksOperationTypeEx FIRST_OPERATION = ANEURALNETWORKS_CAST_EX;
- const ANeuralNetworksOperationTypeEx LAST_OPERATION = ANEURALNETWORKS_ADDV2_EX;
+ const ANeuralNetworksOperationTypeEx LAST_OPERATION = ANEURALNETWORKS_SPLIT_V_EX;
if ((type < FIRST_OPERATION) || (type > LAST_OPERATION))
{
VERBOSE(NNAPI::Model) << "addOperation: Invalid operation type" << std::endl;
diff --git a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
index 94791f8e6..8ff6cbbfd 100644
--- a/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
+++ b/runtime/onert/frontend/nnapi/wrapper/OperationFactory.cc
@@ -106,30 +106,122 @@ getReduceGenerator(const onert::ir::operation::Reduce::ReduceType reduce_type)
};
}
-} // namespace
+template <typename T>
+Operation *CreateSimpleUnaryOp(const OperationFactory::Param &init_param, Operands &)
+{
+ assert(init_param.input_count == 1 && init_param.output_count == 1);
-OperationFactory &OperationFactory::get()
+ OperandIndexSequence outputs{init_param.outputs[0]};
+
+ // Each input should be interpreted as follows:
+ //
+ // 0 -> Input Tensor Index
+ OperandIndexSequence inputs{init_param.inputs[0]};
+
+ return new T{inputs, outputs};
+}
+
+// A generator function for binary ops with no params
+template <typename T>
+Operation *createSimpleBinaryOp(const OperationFactory::Param &init_param, Operands &)
{
- static OperationFactory factory;
- return factory;
+ assert(init_param.input_count == 2 && init_param.output_count == 1);
+
+ OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+ OperandIndexSequence outputs{init_param.outputs[0]};
+
+ return new T{inputs, outputs};
}
-OperationFactory::OperationFactory()
+// A generator function for Pool2D ops (implicit or explicit padding)
+template <typename T>
+Operation *createPool2DOp(const OperationFactory::Param &init_param, Operands &operands)
{
- _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = [](const OperationFactory::Param &init_param,
- Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
+ assert(init_param.input_count == 7 || init_param.input_count == 10);
+ assert(init_param.output_count == 1);
- OperandIndexSequence outputs{init_param.outputs[0]};
+ // In common
+ // 0 -> IFM Tensor Index
+ OperandIndexSequence inputs{init_param.inputs[0]};
+ OperandIndexSequence outputs{init_param.outputs[0]};
+ typename T::Param param;
+ if (init_param.input_count == 7) // support implicit padding
+ {
// Each input should be interpreted as follows:
//
- // 0 -> Input Tensor Index
- // 1 -> Block size Index
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+ // 1 -> Padding Code (ANEURALNETWORKS_PADDING_SAME or ANEURALNETWORKS_PADDING_VALID) Index
+ // 2 -> Horizontal (over width) Stride Index
+ // 3 -> Vertial (over height) Stride Index
+ // 4 -> Filter Width Index
+ // 5 -> Filter Height Index
+ // 6 -> FuseCode (activation) Index
- return new operation::BatchToSpaceND{inputs, outputs};
- };
+ const auto padding_index = OperandIndex{init_param.inputs[1]};
+ const auto hstride_index = OperandIndex{init_param.inputs[2]};
+ const auto vstride_index = OperandIndex{init_param.inputs[3]};
+ const auto kw_index = OperandIndex{init_param.inputs[4]};
+ const auto kh_index = OperandIndex{init_param.inputs[5]};
+ const auto activation_index = OperandIndex{init_param.inputs[6]};
+
+ param.padding.type =
+ NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar<PaddingCode>());
+ param.stride = makeStride(operands, hstride_index, vstride_index);
+ param.kw = getUint32Scalar(operands, kw_index);
+ param.kh = getUint32Scalar(operands, kh_index);
+ param.activation =
+ NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar<FuseCode>());
+ }
+ else // support explicit padding
+ {
+ // Each input should be interpreted as follows:
+ //
+ // 1 -> Padding_left index
+ // 2 -> Padding_right index
+ // 3 -> Padding_top index
+ // 4 -> Padding_bottom index
+ // 5 -> Horizontal (over width) Stride Index
+ // 6 -> Vertial (over height) Stride Index
+ // 7 -> Filter Width Index
+ // 8 -> Filter Height Index
+ // 9 -> FuseCode (activation) Index
+
+ const auto padding_left_index = OperandIndex{init_param.inputs[1]};
+ const auto padding_right_index = OperandIndex{init_param.inputs[2]};
+ const auto padding_top_index = OperandIndex{init_param.inputs[3]};
+ const auto padding_bottom_index = OperandIndex{init_param.inputs[4]};
+ const auto hstride_index = OperandIndex{init_param.inputs[5]};
+ const auto vstride_index = OperandIndex{init_param.inputs[6]};
+ const auto kw_index = OperandIndex{init_param.inputs[7]};
+ const auto kh_index = OperandIndex{init_param.inputs[8]};
+ const auto activation_index = OperandIndex{init_param.inputs[9]};
+
+ param.padding.type = PaddingType::EXPLICIT;
+ param.padding.param = makeExplicitPadding(operands, padding_left_index, padding_right_index,
+ padding_top_index, padding_bottom_index);
+ param.stride = makeStride(operands, hstride_index, vstride_index);
+ param.kw = getUint32Scalar(operands, kw_index);
+ param.kh = getUint32Scalar(operands, kh_index);
+ param.activation =
+ NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar<FuseCode>());
+ }
+ return new T{inputs, outputs, param};
+}
+
+} // namespace
+
+OperationFactory &OperationFactory::get()
+{
+ static OperationFactory factory;
+ return factory;
+}
+
+OperationFactory::OperationFactory()
+{
+ // Each input should be interpreted as follows:
+ // 0 -> Input Tensor Index
+ // 1 -> Block size Index
+ _map[ANEURALNETWORKS_BATCH_TO_SPACE_ND] = createSimpleBinaryOp<operation::BatchToSpaceND>;
_map[ANEURALNETWORKS_DEPTHWISE_CONV_2D] = [](const OperationFactory::Param &init_param,
Operands &operands) {
@@ -203,153 +295,9 @@ OperationFactory::OperationFactory()
return new operation::DepthwiseConv2D{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_MAX_POOL_2D] = [](const OperationFactory::Param &init_param,
- Operands &operands) {
- assert(init_param.input_count == 7 || init_param.input_count == 10);
- assert(init_param.output_count == 1);
+ _map[ANEURALNETWORKS_MAX_POOL_2D] = createPool2DOp<operation::MaxPool2D>;
- // In common
- // 0 -> IFM Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- operation::MaxPool2D::Param param;
- if (init_param.input_count == 7) // support implicit padding
- {
- // Each input should be interpreted as follows:
- //
- // 1 -> Padding Code (ANEURALNETWORKS_PADDING_SAME or ANEURALNETWORKS_PADDING_VALID) Index
- // 2 -> Horizontal (over width) Stride Index
- // 3 -> Vertial (over height) Stride Index
- // 4 -> Filter Width Index
- // 5 -> Filter Height Index
- // 6 -> FuseCode (activation) Index
-
- const auto padding_index = OperandIndex{init_param.inputs[1]};
- const auto hstride_index = OperandIndex{init_param.inputs[2]};
- const auto vstride_index = OperandIndex{init_param.inputs[3]};
- const auto kw_index = OperandIndex{init_param.inputs[4]};
- const auto kh_index = OperandIndex{init_param.inputs[5]};
- const auto activation_index = OperandIndex{init_param.inputs[6]};
-
- param.padding.type =
- NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar<PaddingCode>());
- param.stride = makeStride(operands, hstride_index, vstride_index);
- param.kw = getUint32Scalar(operands, kw_index);
- param.kh = operands.at(kh_index).asScalar<uint32_t>();
- param.activation =
- NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar<FuseCode>());
- }
- else if (init_param.input_count == 10) // support explicit padding
- {
- // Each input should be interpreted as follows:
- //
- // 1 -> Padding_left index
- // 2 -> Padding_right index
- // 3 -> Padding_top index
- // 4 -> Padding_bottom index
- // 5 -> Horizontal (over width) Stride Index
- // 6 -> Vertial (over height) Stride Index
- // 7 -> Filter Width Index
- // 8 -> Filter Height Index
- // 9 -> FuseCode (activation) Index
-
- const auto padding_left_index = OperandIndex{init_param.inputs[1]};
- const auto padding_right_index = OperandIndex{init_param.inputs[2]};
- const auto padding_top_index = OperandIndex{init_param.inputs[3]};
- const auto padding_bottom_index = OperandIndex{init_param.inputs[4]};
- const auto hstride_index = OperandIndex{init_param.inputs[5]};
- const auto vstride_index = OperandIndex{init_param.inputs[6]};
- const auto kw_index = OperandIndex{init_param.inputs[7]};
- const auto kh_index = OperandIndex{init_param.inputs[8]};
- const auto activation_index = OperandIndex{init_param.inputs[9]};
-
- param.padding.type = PaddingType::EXPLICIT;
- param.padding.param = makeExplicitPadding(operands, padding_left_index, padding_right_index,
- padding_top_index, padding_bottom_index);
- param.stride = makeStride(operands, hstride_index, vstride_index);
- param.kw = getUint32Scalar(operands, kw_index);
- param.kh = getUint32Scalar(operands, kh_index);
- param.activation =
- NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar<FuseCode>());
- }
- return new operation::MaxPool2D{inputs, outputs, param};
- };
-
- _map[ANEURALNETWORKS_AVERAGE_POOL_2D] = [](const OperationFactory::Param &init_param,
- Operands &operands) {
- // TODO We may reuse code here for MAX_POOL_2D. Seems like these two are identical
- assert(init_param.input_count == 7 || init_param.input_count == 10);
- assert(init_param.output_count == 1);
-
- // In common
- // 0 -> IFM Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- operation::AvgPool2D::Param param;
- if (init_param.input_count == 7) // support implicit padding
- {
- // Each input should be interpreted as follows:
- //
- // 1 -> Padding Code (ANEURALNETWORKS_PADDING_SAME or ANEURALNETWORKS_PADDING_VALID) Index
- // 2 -> Horizontal (over width) Stride Index
- // 3 -> Vertial (over height) Stride Index
- // 4 -> Filter Width Index
- // 5 -> Filter Height Index
- // 6 -> FuseCode (activation) Index
-
- const auto padding_index = OperandIndex{init_param.inputs[1]};
- const auto hstride_index = OperandIndex{init_param.inputs[2]};
- const auto vstride_index = OperandIndex{init_param.inputs[3]};
- const auto kw_index = OperandIndex{init_param.inputs[4]};
- const auto kh_index = OperandIndex{init_param.inputs[5]};
- const auto activation_index = OperandIndex{init_param.inputs[6]};
-
- param.padding.type =
- NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar<PaddingCode>());
- param.stride = makeStride(operands, hstride_index, vstride_index);
- param.kw = getUint32Scalar(operands, kw_index);
- param.kh = getUint32Scalar(operands, kh_index);
- param.activation =
- NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar<FuseCode>());
- }
- else if (init_param.input_count == 10) // support explicit padding
- {
- // Each input should be interpreted as follows:
- //
- // 1 -> Padding_left index
- // 2 -> Padding_right index
- // 3 -> Padding_top index
- // 4 -> Padding_bottom index
- // 5 -> Horizontal (over width) Stride Index
- // 6 -> Vertial (over height) Stride Index
- // 7 -> Filter Width Index
- // 8 -> Filter Height Index
- // 9 -> FuseCode (activation) Index
-
- const auto padding_left_index = OperandIndex{init_param.inputs[1]};
- const auto padding_right_index = OperandIndex{init_param.inputs[2]};
- const auto padding_top_index = OperandIndex{init_param.inputs[3]};
- const auto padding_bottom_index = OperandIndex{init_param.inputs[4]};
- const auto hstride_index = OperandIndex{init_param.inputs[5]};
- const auto vstride_index = OperandIndex{init_param.inputs[6]};
- const auto kw_index = OperandIndex{init_param.inputs[7]};
- const auto kh_index = OperandIndex{init_param.inputs[8]};
- const auto activation_index = OperandIndex{init_param.inputs[9]};
-
- param.padding.type = PaddingType::EXPLICIT;
- param.padding.param = makeExplicitPadding(operands, padding_left_index, padding_right_index,
- padding_top_index, padding_bottom_index);
- param.stride = makeStride(operands, hstride_index, vstride_index);
- param.kw = getUint32Scalar(operands, kw_index);
- param.kh = getUint32Scalar(operands, kh_index);
- param.activation =
- NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar<FuseCode>());
- }
-
- return new operation::AvgPool2D{inputs, outputs, param};
- };
+ _map[ANEURALNETWORKS_AVERAGE_POOL_2D] = createPool2DOp<operation::AvgPool2D>;
_map[ANEURALNETWORKS_CONCATENATION] = [](const OperationFactory::Param &init_param,
Operands &operands) {
@@ -724,44 +672,11 @@ OperationFactory::OperationFactory()
return new operation::Squeeze{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_TANH] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Tanh{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_LOG] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Log{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_LOGISTIC] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
+ _map[ANEURALNETWORKS_TANH] = createSimpleUnaryOp<operation::Tanh>;
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
+ _map[ANEURALNETWORKS_LOG] = createSimpleUnaryOp<operation::Log>;
- return new operation::Logistic{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_LOGISTIC] = createSimpleUnaryOp<operation::Logistic>;
_map[ANEURALNETWORKS_DIV] = [](const OperationFactory::Param &init_param, Operands &operands) {
assert(init_param.input_count == 3 && init_param.output_count == 1);
@@ -784,36 +699,16 @@ OperationFactory::OperationFactory()
return new operation::Div{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_EXP] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Exp{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_EXP] = createSimpleUnaryOp<operation::Exp>;
// ANEURALNETWORKS_EXP_EX is deprecated
// TODO Remove ANEURALNETWORKS_EXP_EX
_map[ANEURALNETWORKS_EXP_EX] = _map[ANEURALNETWORKS_EXP];
- _map[ANEURALNETWORKS_EXPAND_DIMS] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- // 1 -> Axis Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::ExpandDims{inputs, outputs};
- };
+ // Each input should be interpreted as follows:
+ // 0 -> Input Tensor Index
+ // 1 -> Axis Tensor Index
+ _map[ANEURALNETWORKS_EXPAND_DIMS] = createSimpleBinaryOp<operation::ExpandDims>;
_map[ANEURALNETWORKS_GREATER] = [](const OperationFactory::Param &init_param, Operands &) {
assert(init_param.input_count == 2 && init_param.output_count == 1);
@@ -982,19 +877,7 @@ OperationFactory::OperationFactory()
return new operation::Comparison{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_LOGICAL_AND] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> input0 Tensor Index
- // 1 -> input1 Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::LogicalAnd{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_LOGICAL_AND] = createSimpleBinaryOp<operation::LogicalAnd>;
// ANEURALNETWORKS_LOGICAL_AND_EX is deprecated
// TODO Remove ANEURALNETWORKS_LOGICAL_AND_EX
@@ -1018,18 +901,7 @@ OperationFactory::OperationFactory()
return new operation::LogicalAnd{inputs, outputs};
};
- _map[ANEURALNETWORKS_RSQRT] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::RSQRT{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_RSQRT] = createSimpleUnaryOp<operation::RSQRT>;
_map[ANEURALNETWORKS_SELECT] = [](const OperationFactory::Param &init_param, Operands &) {
assert(init_param.input_count == 3 && init_param.output_count == 1);
@@ -1065,18 +937,7 @@ OperationFactory::OperationFactory()
// TODO Remove ANEURALNETWORKS_RSQRT_EX
_map[ANEURALNETWORKS_RSQRT_EX] = _map[ANEURALNETWORKS_RSQRT];
- _map[ANEURALNETWORKS_RELU] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::ReLU{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_RELU] = createSimpleUnaryOp<operation::ReLU>;
_map[ANEURALNETWORKS_RESIZE_BILINEAR] = [](const OperationFactory::Param &init_param,
Operands &operands) {
@@ -1094,35 +955,14 @@ OperationFactory::OperationFactory()
operation::ResizeBilinear::Param param;
param.height_out = operands.at(OperandIndex{init_param.inputs[1]}).asScalar<int32_t>();
param.width_out = operands.at(OperandIndex{init_param.inputs[2]}).asScalar<int32_t>();
-
+ param.align_corners = false;
+ param.half_pixel_centers = false;
return new operation::ResizeBilinear{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_RELU1] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::ReLU1{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_RELU6] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
+ _map[ANEURALNETWORKS_RELU1] = createSimpleUnaryOp<operation::ReLU1>;
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::ReLU6{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_RELU6] = createSimpleUnaryOp<operation::ReLU6>;
_map[ANEURALNETWORKS_REVERSE_EX] = [](const OperationFactory::Param &init_param, Operands &) {
assert(init_param.input_count == 2 && init_param.output_count == 1);
@@ -1219,76 +1059,7 @@ OperationFactory::OperationFactory()
return new operation::SpaceToDepth{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_L2_POOL_2D] = [](const OperationFactory::Param &init_param,
- Operands &operands) {
- assert(init_param.input_count == 10 || init_param.input_count == 7);
- assert(init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> IFM Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- operation::L2Pool2D::Param param;
-
- if (init_param.input_count == 7) // Imlicit Padding case
- {
- // 1 -> Padding Code (ANEURALNETWORKS_PADDING_SAME or ANEURALNETWORKS_PADDING_VALID) Index
- // 2 -> Horizontal (over width) Stride Index
- // 3 -> Vertial (over height) Stride Index
- // 4 -> Filter Width Index
- // 5 -> Filter Height Index
- // 6 -> FuseCode (activation) Index
- const auto padding_index = OperandIndex{init_param.inputs[1]};
- const auto hstride_index = OperandIndex{init_param.inputs[2]};
- const auto vstride_index = OperandIndex{init_param.inputs[3]};
- const auto kw_index = OperandIndex{init_param.inputs[4]};
- const auto kh_index = OperandIndex{init_param.inputs[5]};
- const auto activation_index = OperandIndex{init_param.inputs[6]};
-
- param.padding.type =
- NNAPIConvert::getPaddingType(operands.at(padding_index).asScalar<PaddingCode>());
- param.stride = makeStride(operands, hstride_index, vstride_index);
- param.kw = getUint32Scalar(operands, kw_index);
- param.kh = getUint32Scalar(operands, kh_index);
- param.activation =
- NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar<FuseCode>());
- }
- else // Explicit Padding case
- {
- // 1 -> Padding_left index
- // 2 -> Padding_right index
- // 3 -> Padding_top index
- // 4 -> Padding_bottom index
- // 5 -> Horizontal (over width) Stride Index
- // 6 -> Vertial (over height) Stride Index
- // 7 -> Filter Width Index
- // 8 -> Filter Height Index
- // 9 -> FuseCode (activation) Index
- const auto padding_left_index = OperandIndex{init_param.inputs[1]};
- const auto padding_right_index = OperandIndex{init_param.inputs[2]};
- const auto padding_top_index = OperandIndex{init_param.inputs[3]};
- const auto padding_bottom_index = OperandIndex{init_param.inputs[4]};
- const auto hstride_index = OperandIndex{init_param.inputs[5]};
- const auto vstride_index = OperandIndex{init_param.inputs[6]};
- const auto kw_index = OperandIndex{init_param.inputs[7]};
- const auto kh_index = OperandIndex{init_param.inputs[8]};
- const auto activation_index = OperandIndex{init_param.inputs[9]};
-
- param.padding.type = PaddingType::EXPLICIT;
- param.padding.param = makeExplicitPadding(operands, padding_left_index, padding_right_index,
- padding_top_index, padding_bottom_index);
- param.stride = makeStride(operands, hstride_index, vstride_index);
- param.kw = getUint32Scalar(operands, kw_index);
- param.kh = getUint32Scalar(operands, kh_index);
- param.activation =
- NNAPIConvert::getFusedActivation(operands.at(activation_index).asScalar<FuseCode>());
- }
-
- return new operation::L2Pool2D{inputs, outputs, param};
- };
+ _map[ANEURALNETWORKS_L2_POOL_2D] = createPool2DOp<operation::L2Pool2D>;
_map[ANEURALNETWORKS_EMBEDDING_LOOKUP] = [](const OperationFactory::Param &init_param,
Operands &) {
@@ -1438,18 +1209,7 @@ OperationFactory::OperationFactory()
return new operation::LogicalOr{inputs, outputs};
};
- _map[ANEURALNETWORKS_LOGICAL_NOT] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::LogicalNot{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_LOGICAL_NOT] = createSimpleUnaryOp<operation::LogicalNot>;
// ANEURALNETWORKS_LOGICAL_NOT_EX is deprecated
// TODO Remove ANEURALNETWORKS_LOGICAL_NOT_EX
@@ -1649,35 +1409,13 @@ OperationFactory::OperationFactory()
// TODO Remove ANEURALNETWORKS_GATHER_EX
_map[ANEURALNETWORKS_GATHER_EX] = _map[ANEURALNETWORKS_GATHER];
- _map[ANEURALNETWORKS_NEG] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Neg{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_NEG] = createSimpleUnaryOp<operation::Neg>;
// ANEURALNETWORKS_NEG_EX is deprecated
// TODO Remove ANEURALNETWORKS_NEG_EX
_map[ANEURALNETWORKS_NEG_EX] = _map[ANEURALNETWORKS_NEG];
- _map[ANEURALNETWORKS_ABS] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Abs{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_ABS] = createSimpleUnaryOp<operation::Abs>;
// ANEURALNETWORKS_ABS_EX is deprecated
// TODO Remove ANEURALNETWORKS_ABS_EX
@@ -1704,18 +1442,7 @@ OperationFactory::OperationFactory()
// TODO Remove ANEURALNETWORKS_ARGMAX_EX
_map[ANEURALNETWORKS_ARGMAX_EX] = _map[ANEURALNETWORKS_ARGMAX];
- _map[ANEURALNETWORKS_DEQUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 1 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- OperandIndexSequence inputs{init_param.inputs[0]};
-
- return new operation::Dequantize{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_DEQUANTIZE] = createSimpleUnaryOp<operation::Dequantize>;
_map[ANEURALNETWORKS_MEAN] = [](const OperationFactory::Param &init_param, Operands &operands) {
assert(init_param.input_count == 3 && init_param.output_count == 1);
@@ -1816,6 +1543,23 @@ OperationFactory::OperationFactory()
return new operation::Split{inputs, outputs, param};
};
+ _map[ANEURALNETWORKS_SPLIT_V_EX] = [](const OperationFactory::Param &init_param,
+ Operands &operands) {
+ assert(init_param.input_count == 4);
+ assert(init_param.output_count >= 1); // At least one output tensor and axis
+
+ OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1], init_param.inputs[2]};
+ OperandIndexSequence outputs;
+ for (uint32_t n = 0; n < init_param.output_count; ++n)
+ {
+ outputs.append(OperandIndex{init_param.outputs[n]});
+ }
+
+ operation::SplitV::Param param;
+ param.num_splits = operands.at(OperandIndex{init_param.inputs[3]}).asScalar<std::int32_t>();
+ return new operation::SplitV{inputs, outputs, param};
+ };
+
// ANEURALNETWORKS_SPLIT_EX is deprecated
// TODO Remove ANEURALNETWORKS_SPLIT_EX
_map[ANEURALNETWORKS_SPLIT_EX] = _map[ANEURALNETWORKS_SPLIT];
@@ -1841,31 +1585,24 @@ OperationFactory::OperationFactory()
};
_map[ANEURALNETWORKS_PAD] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count >= 1);
+ assert(init_param.input_count >= 2 && init_param.input_count <= 3 &&
+ init_param.output_count >= 1);
OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
+ if (init_param.input_count == 3)
+ {
+ inputs.append(OperandIndex{init_param.inputs[2]});
+ }
OperandIndexSequence outputs{init_param.outputs[0]};
return new operation::Pad{inputs, outputs};
};
- _map[ANEURALNETWORKS_MINIMUM] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
- OperandIndexSequence outputs{init_param.outputs[0]};
+ _map[ANEURALNETWORKS_PAD_V2] = _map[ANEURALNETWORKS_PAD];
- return new operation::Min{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_MAXIMUM] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
- OperandIndexSequence outputs{init_param.outputs[0]};
+ _map[ANEURALNETWORKS_MINIMUM] = createSimpleBinaryOp<operation::Min>;
- return new operation::Max{inputs, outputs};
- };
+ _map[ANEURALNETWORKS_MAXIMUM] = createSimpleBinaryOp<operation::Max>;
_map[ANEURALNETWORKS_ONE_HOT_EX] = [](const OperationFactory::Param &init_param,
Operands &operands) {
@@ -1948,34 +1685,15 @@ OperationFactory::OperationFactory()
return new operation::Range{inputs, outputs};
};
- _map[ANEURALNETWORKS_POW] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> LHS Tensor Index
- // 1 -> RHS Tensor Index
+ // Each input should be interpreted as follows:
+ // 0 -> LHS Tensor Index
+ // 1 -> RHS Tensor Index
+ _map[ANEURALNETWORKS_POW] = createSimpleBinaryOp<operation::Pow>;
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::Pow{inputs, outputs};
- };
-
- _map[ANEURALNETWORKS_FILL_EX] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- // Each input should be interpreted as follows:
- //
- // 0 -> A tensor, specifying the input.
- // 1 -> A 1-D tensor, specifying the value
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- return new operation::Fill{inputs, outputs};
- };
+ // Each input should be interpreted as follows:
+ // 0 -> A tensor, specifying the input.
+ // 1 -> A 1-D tensor, specifying the value
+ _map[ANEURALNETWORKS_FILL_EX] = createSimpleBinaryOp<operation::Fill>;
_map[ANEURALNETWORKS_ZEROS_LIKE_EX] = [](const OperationFactory::Param &init_param, Operands &) {
assert(init_param.input_count == 1 && init_param.output_count == 1);
@@ -1989,20 +1707,10 @@ OperationFactory::OperationFactory()
return new operation::ZerosLike{inputs, outputs};
};
- _map[ANEURALNETWORKS_TILE] = [](const OperationFactory::Param &init_param, Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
-
- OperandIndexSequence outputs{init_param.outputs[0]};
-
- // Each input should be interpreted as follows:
- //
- // 0 -> Input Tensor Index
- // 1 -> Multiple Tensor Index
-
- OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
-
- return new operation::Tile{inputs, outputs};
- };
+ // Each input should be interpreted as follows:
+ // 0 -> Input Tensor Index
+ // 1 -> Multiple Tensor Index
+ _map[ANEURALNETWORKS_TILE] = createSimpleBinaryOp<operation::Tile>;
_map[ANEURALNETWORKS_MATRIX_BAND_PART_EX] = [](const OperationFactory::Param &init_param,
Operands &) {
@@ -2064,20 +1772,23 @@ OperationFactory::OperationFactory()
return new operation::Einsum{inputs, outputs, param};
};
- _map[ANEURALNETWORKS_BROADCAST_TO_EX] = [](const OperationFactory::Param &init_param,
- Operands &) {
- assert(init_param.input_count == 2 && init_param.output_count == 1);
+ // 0 -> Input Tensor Index
+ // 1 -> int32, int64, An 1-D int tensor Index
+ _map[ANEURALNETWORKS_BROADCAST_TO_EX] = createSimpleBinaryOp<operation::BroadcastTo>;
+ _map[ANEURALNETWORKS_STATELESS_RANDOM_UNIFORM_EX] = [](const OperationFactory::Param &init_param,
+ Operands &) {
+ assert(init_param.input_count == 2 && init_param.output_count == 1);
OperandIndexSequence outputs{init_param.outputs[0]};
// Each input should be interpreted as follows:
//
- // 0 -> Input Tensor Index
+ // 0 -> Shape Tensor Index
// 1 -> int32, int64, An 1-D int tensor Index
OperandIndexSequence inputs{init_param.inputs[0], init_param.inputs[1]};
- return new operation::BroadcastTo{inputs, outputs};
+ return new operation::StatelessRandomUniform{inputs, outputs};
};
_map[ANEURALNETWORKS_FUSED_BATCH_NORM_V3_EX] = [](const OperationFactory::Param &init_param,
@@ -2133,6 +1844,15 @@ OperationFactory::OperationFactory()
return new operation::LogSoftmax{inputs, outputs, param};
};
+
+ _map[ANEURALNETWORKS_QUANTIZE] = [](const OperationFactory::Param &init_param, Operands &) {
+ assert(init_param.input_count == 1 && init_param.output_count == 1);
+
+ OperandIndexSequence inputs{init_param.inputs[0]};
+ OperandIndexSequence outputs{init_param.outputs[0]};
+
+ return new operation::Quantize{inputs, outputs};
+ };
}
Operation *OperationFactory::create(ANeuralNetworksOperationType type,
diff --git a/runtime/onert/frontend/tflite/src/tflite_loader.cc b/runtime/onert/frontend/tflite/src/tflite_loader.cc
index 649ce9bd9..86c2c6bc7 100644
--- a/runtime/onert/frontend/tflite/src/tflite_loader.cc
+++ b/runtime/onert/frontend/tflite/src/tflite_loader.cc
@@ -40,6 +40,8 @@ struct LoaderDomain
using Tensor = onert_tflite::Tensor;
using TensorType = onert_tflite::TensorType;
using SubGraph = onert_tflite::SubGraph;
+ using DimensionType = onert_tflite::DimensionType;
+ using SparseIndexVector = onert_tflite::SparseIndexVector;
static const char *EnumNameBuiltinOperator(BuiltinOperator e)
{
diff --git a/runtime/onert/sample/minimal/CMakeLists.txt b/runtime/onert/sample/minimal/CMakeLists.txt
index 6f4b02761..e54223e3b 100644
--- a/runtime/onert/sample/minimal/CMakeLists.txt
+++ b/runtime/onert/sample/minimal/CMakeLists.txt
@@ -4,7 +4,7 @@ endif(NOT BUILD_MINIMAL_SAMPLE)
list(APPEND MINIMAL_SRCS "src/minimal.cc")
-add_executable(minimal ${MINIMAL_SRCS})
-target_link_libraries(minimal nnfw-dev pthread dl)
+add_executable(onert-minimal-app ${MINIMAL_SRCS})
+target_link_libraries(onert-minimal-app nnfw-dev pthread dl)
-install(TARGETS minimal DESTINATION bin)
+install(TARGETS onert-minimal-app DESTINATION bin)
diff --git a/runtime/onert/sample/minimal/src/minimal.cc b/runtime/onert/sample/minimal/src/minimal.cc
index d55569ba2..0436b9368 100644
--- a/runtime/onert/sample/minimal/src/minimal.cc
+++ b/runtime/onert/sample/minimal/src/minimal.cc
@@ -16,6 +16,7 @@
#include "nnfw.h"
#include <vector>
+#include <iostream>
uint64_t num_elems(const nnfw_tensorinfo *ti)
{
@@ -65,5 +66,6 @@ int main(const int argc, char **argv)
nnfw_close_session(session);
+ std::cout << "nnpackage " << argv[1] << " runs successfully." << std::endl;
return 0;
}
diff --git a/runtime/onert/test/core/exec/ExecInstance.cc b/runtime/onert/test/core/exec/ExecInstance.cc
index cc0434764..0fcf372c3 100644
--- a/runtime/onert/test/core/exec/ExecInstance.cc
+++ b/runtime/onert/test/core/exec/ExecInstance.cc
@@ -73,9 +73,8 @@ public:
// Compile
auto subgs = std::make_shared<onert::ir::Subgraphs>();
subgs->push(onert::ir::SubgraphIndex{0}, graph);
- auto compiler = new onert::compiler::Compiler{subgs};
- executors = compiler->compile();
- delete compiler;
+ onert::compiler::Compiler compiler{subgs};
+ executors = compiler.compile();
}
public:
@@ -98,19 +97,17 @@ TEST(ExecInstance, simple)
float output_buffer[4] = {};
const float output_expected[4] = {5, -2, 0, -1};
- auto execution = new onert::exec::Execution(executors);
+ onert::exec::Execution execution{executors};
- execution->setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
- execution->setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
- execution->setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
- execution->execute();
+ execution.setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
+ execution.setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
+ execution.setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
+ execution.execute();
for (auto i = 0; i < 4; i++)
{
EXPECT_EQ(output_buffer[i], output_expected[i]);
}
-
- delete execution;
}
TEST(ExecInstance, twoCompile)
@@ -118,7 +115,7 @@ TEST(ExecInstance, twoCompile)
auto mockup = CompiledMockUpModel();
auto graph = mockup.graph;
auto executors1 = mockup.executors;
- auto execution1 = new onert::exec::Execution(executors1);
+ onert::exec::Execution execution1{executors1};
auto input1 = IOIndex{0};
auto input2 = IOIndex{1};
@@ -129,38 +126,34 @@ TEST(ExecInstance, twoCompile)
float exe1_output_buffer[4] = {};
const float exe1_output_expected[4] = {5, -2, 0, -1};
- execution1->setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
- execution1->setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
- execution1->setOutput(output, reinterpret_cast<void *>(exe1_output_buffer), 16);
+ execution1.setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
+ execution1.setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
+ execution1.setOutput(output, reinterpret_cast<void *>(exe1_output_buffer), 16);
// Make new executor: compile again
auto subgs = std::make_shared<onert::ir::Subgraphs>();
subgs->push(onert::ir::SubgraphIndex{0}, graph);
- auto compiler = new onert::compiler::Compiler{subgs};
- std::shared_ptr<onert::exec::ExecutorMap> executors2 = compiler->compile();
- auto execution2 = new onert::exec::Execution(executors2);
+ onert::compiler::Compiler compiler{subgs};
+ std::shared_ptr<onert::exec::ExecutorMap> executors2 = compiler.compile();
+ onert::exec::Execution execution2{executors2};
const float exe2_input1_buffer[4] = {2, 1, -2, 0};
const float exe2_input2_buffer[4] = {-3, 3, 1, 2};
float exe2_output_buffer[4] = {};
const float exe2_output_expected[4] = {2, 5, -2, 7};
- execution2->setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
- execution2->setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
- execution2->setOutput(output, reinterpret_cast<void *>(exe2_output_buffer), 16);
+ execution2.setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
+ execution2.setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
+ execution2.setOutput(output, reinterpret_cast<void *>(exe2_output_buffer), 16);
- execution1->execute();
- execution2->execute();
+ execution1.execute();
+ execution2.execute();
for (auto i = 0; i < 4; i++)
{
EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]);
EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]);
}
-
- delete compiler;
- delete execution1;
- delete execution2;
}
// Support two initialized execution instances, then ordered execution
@@ -178,32 +171,29 @@ TEST(ExecInstance, twoExecution)
const float exe1_output_expected[4] = {5, -2, 0, -1};
const float exe2_output_expected[4] = {2, 5, -2, 7};
- auto execution1 = new onert::exec::Execution(executors);
- execution1->setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
- execution1->setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
- execution1->setOutput(output1, reinterpret_cast<void *>(exe1_output_buffer), 16);
+ onert::exec::Execution execution1{executors};
+ execution1.setInput(input1, reinterpret_cast<const void *>(exe1_input1_buffer), 16);
+ execution1.setInput(input2, reinterpret_cast<const void *>(exe1_input2_buffer), 16);
+ execution1.setOutput(output1, reinterpret_cast<void *>(exe1_output_buffer), 16);
const float exe2_input1_buffer[4] = {2, 1, -2, 0};
const float exe2_input2_buffer[4] = {-3, 3, 1, 2};
float exe2_output_buffer[4] = {};
// Make new execution
- auto execution2 = new onert::exec::Execution(executors);
- execution2->setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
- execution2->setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
- execution2->setOutput(output1, reinterpret_cast<void *>(exe2_output_buffer), 16);
+ onert::exec::Execution execution2{executors};
+ execution2.setInput(input1, reinterpret_cast<const void *>(exe2_input1_buffer), 16);
+ execution2.setInput(input2, reinterpret_cast<const void *>(exe2_input2_buffer), 16);
+ execution2.setOutput(output1, reinterpret_cast<void *>(exe2_output_buffer), 16);
- execution1->execute();
- execution2->execute();
+ execution1.execute();
+ execution2.execute();
for (auto i = 0; i < 4; i++)
{
EXPECT_EQ(exe1_output_buffer[i], exe1_output_expected[i]);
EXPECT_EQ(exe2_output_buffer[i], exe2_output_expected[i]);
}
-
- delete execution1;
- delete execution2;
}
class Inference
@@ -222,14 +212,12 @@ public:
auto input2 = IOIndex{1};
auto output1 = IOIndex{0};
- auto execution = new onert::exec::Execution(_executors);
- execution->setInput(input1, reinterpret_cast<const void *>(_input1), 16);
- execution->setInput(input2, reinterpret_cast<const void *>(_input2), 16);
- execution->setOutput(output1, reinterpret_cast<void *>(_output), 16);
+ onert::exec::Execution execution{_executors};
+ execution.setInput(input1, reinterpret_cast<const void *>(_input1), 16);
+ execution.setInput(input2, reinterpret_cast<const void *>(_input2), 16);
+ execution.setOutput(output1, reinterpret_cast<void *>(_output), 16);
- execution->execute();
-
- delete execution;
+ execution.execute();
}
private:
@@ -288,20 +276,18 @@ TEST(ExecInstance, async)
float output_buffer[4] = {};
const float output_expected[4] = {5, -2, 0, -1};
- auto execution = new onert::exec::Execution(executors);
+ onert::exec::Execution execution{executors};
- execution->setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
- execution->setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
- execution->setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
- execution->startExecute();
- execution->waitFinish();
+ execution.setInput(input1, reinterpret_cast<const void *>(input1_buffer), 16);
+ execution.setInput(input2, reinterpret_cast<const void *>(input2_buffer), 16);
+ execution.setOutput(output, reinterpret_cast<void *>(output_buffer), 16);
+ execution.startExecute();
+ execution.waitFinish();
for (auto i = 0; i < 4; i++)
{
EXPECT_EQ(output_buffer[i], output_expected[i]);
}
-
- delete execution;
}
} // namespace
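The edits above all apply one mechanical transformation: `Compiler` and `Execution` objects managed with `new`/`delete` become stack objects, so cleanup is automatic even when a failing assertion returns from the test early. Schematically:

    // Before: must remember to delete; leaks on early return
    auto execution = new onert::exec::Execution(executors);
    execution->execute();
    delete execution;

    // After: destroyed automatically at scope exit (RAII)
    onert::exec::Execution execution{executors};
    execution.execute();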
diff --git a/runtime/onert/test/graph/operand/UseDef.cc b/runtime/onert/test/graph/operand/UseDef.cc
index 3e8b14b8b..cd2cdb739 100644
--- a/runtime/onert/test/graph/operand/UseDef.cc
+++ b/runtime/onert/test/graph/operand/UseDef.cc
@@ -65,12 +65,12 @@ TEST(graph_operand_usedef, usedef_test)
ASSERT_EQ(verifier.verify(graph), true);
// Check def
- ASSERT_EQ(graph.operands().at(operand_index1).getDef().contains(mocknode_index1), true);
- ASSERT_EQ(graph.operands().at(operand_index2).getDef().contains(mocknode_index2), true);
- ASSERT_EQ(graph.operands().at(output_operand).getDef().contains(multiinput_index), true);
+ ASSERT_EQ(graph.operands().at(operand_index1).getDef(), mocknode_index1);
+ ASSERT_EQ(graph.operands().at(operand_index2).getDef(), mocknode_index2);
+ ASSERT_EQ(graph.operands().at(output_operand).getDef(), multiinput_index);
- ASSERT_EQ(graph.operands().at(operand_index1).getDef().contains(mocknode_index2), false);
- ASSERT_EQ(graph.operands().at(operand_index1).getDef().contains(multiinput_index), false);
+ ASSERT_NE(graph.operands().at(operand_index1).getDef(), mocknode_index2);
+ ASSERT_NE(graph.operands().at(operand_index1).getDef(), multiinput_index);
// Check use
ASSERT_EQ(graph.operands().at(input_operand).getUses().contains(mocknode_index1), true);
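These assertion changes reflect an interface change: `Operand::getDef()` now returns the single defining `OperationIndex` directly instead of a set, since an operand has at most one defining operation. In short:

    // Before: the def was modeled as a set of operation indices
    ASSERT_EQ(operand.getDef().contains(def_index), true);

    // After: exactly one defining operation, compared directly
    ASSERT_EQ(operand.getDef(), def_index);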
diff --git a/tests/custom_op/FillFrom/CMakeLists.txt b/tests/custom_op/FillFrom/CMakeLists.txt
index ba03e3056..91349de7a 100644
--- a/tests/custom_op/FillFrom/CMakeLists.txt
+++ b/tests/custom_op/FillFrom/CMakeLists.txt
@@ -2,6 +2,6 @@ add_nnfw_custom_op_kernel(FillFrom ON kernels/FillFromKernel.cc)
add_nnfw_custom_op_app(FillFrom_runner
SOURCES FillFrom_runner.cc
KERNELS FillFrom)
-install(TARGETS FillFrom_runner DESTINATION tests)
-install(DIRECTORY nnpkgs/FillFrom DESTINATION tests/nnpkgs)
-install_nnfw_custom_op_kernel(FillFrom tests/nnpkgs/FillFrom)
+install(TARGETS FillFrom_runner DESTINATION test)
+install(DIRECTORY nnpkgs/FillFrom DESTINATION test/nnpkgs)
+install_nnfw_custom_op_kernel(FillFrom test/nnpkgs/FillFrom)
diff --git a/tests/custom_op/FillFrom/FillFrom_runner.cc b/tests/custom_op/FillFrom/FillFrom_runner.cc
index 82e25fa34..731308638 100644
--- a/tests/custom_op/FillFrom/FillFrom_runner.cc
+++ b/tests/custom_op/FillFrom/FillFrom_runner.cc
@@ -15,7 +15,7 @@
*/
#include "nnfw.h"
-#include "nnfw_dev.h"
+#include "nnfw_experimental.h"
#include <cassert>
#include <iostream>
diff --git a/tests/custom_op/FillFrom/kernels/FillFromKernel.cc b/tests/custom_op/FillFrom/kernels/FillFromKernel.cc
index 6771e6852..6015b3b86 100644
--- a/tests/custom_op/FillFrom/kernels/FillFromKernel.cc
+++ b/tests/custom_op/FillFrom/kernels/FillFromKernel.cc
@@ -14,7 +14,7 @@
* limitations under the License.
*/
-#include "nnfw_dev.h"
+#include "nnfw_experimental.h"
#include "flatbuffers/flexbuffers.h"
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
index e50b94118..3a6b40d6b 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_cl
@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8
GeneratedTests.cast_float16_to_quant8_overflow
GeneratedTests.cast_float32_to_float16
GeneratedTests.cast_float32_to_float16_relaxed
+GeneratedTests.cast_float32_to_int32_nnfw
GeneratedTests.cast_int32_to_float16
-GeneratedTests.cast_int32_to_quant8_overflow
GeneratedTests.cast_quant8_to_float16
GeneratedTests.concat_dynamic_nnfw
GeneratedTests.conv_dynamic_nnfw
@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7
GeneratedTests.gather_float16_8
GeneratedTests.greater_dynamic_float_nnfw
GeneratedTests.greater_equal_dynamic_float_nnfw
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.less_dynamic_float_nnfw
GeneratedTests.less_equal_dynamic_float_nnfw
GeneratedTests.log_4D_float_nnfw
@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
GeneratedTests.one_hot_ex_dynamic_nnfw
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
GeneratedTests.pow_broadcast_float_nnfw_3
GeneratedTests.pow_dynamic_nnfw
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
@@ -184,9 +200,21 @@ GeneratedTests.slice_zero_sized_quant8
GeneratedTests.softmax_dynamic_nnfw
GeneratedTests.space_to_batch_dynamic_float_nnfw
GeneratedTests.split_dynamic_float_nnfw
+GeneratedTests.split_v_ex_1D_float_1_nnfw
+GeneratedTests.split_v_ex_1D_float_2_nnfw
+GeneratedTests.split_v_ex_1D_int32_nnfw
+GeneratedTests.split_v_ex_4D_float_1_nnfw
+GeneratedTests.split_v_ex_4D_float_2_nnfw
+GeneratedTests.split_v_ex_4D_float_3_nnfw
+GeneratedTests.split_v_ex_4D_float_4_nnfw
+GeneratedTests.split_v_ex_4D_int32_1_nnfw
+GeneratedTests.split_v_ex_4D_int32_2_nnfw
+GeneratedTests.split_v_ex_4D_int32_3_nnfw
+GeneratedTests.split_v_ex_4D_int32_4_nnfw
GeneratedTests.sqrt_
GeneratedTests.squared_difference_ex_dynamic_nnfw
GeneratedTests.squeeze_dynamic_float_nnfw
+GeneratedTests.stateless_random_uniform_ex_nnfw
GeneratedTests.strided_slice_dynamic_nnfw
GeneratedTests.sub_dynamic_nnfw
GeneratedTests.sub_v1_2_zero_sized
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
index c9edee585..f4bd48be5 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.acl_neon
@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8
GeneratedTests.cast_float16_to_quant8_overflow
GeneratedTests.cast_float32_to_float16
GeneratedTests.cast_float32_to_float16_relaxed
-GeneratedTests.cast_float32_to_quant8_overflow
-GeneratedTests.cast_float32_to_quant8_overflow_relaxed
GeneratedTests.cast_int32_to_float16
-GeneratedTests.cast_int32_to_quant8_overflow
GeneratedTests.cast_quant8_to_float16
GeneratedTests.concat_dynamic_nnfw
GeneratedTests.conv_dynamic_nnfw
@@ -73,6 +70,7 @@ GeneratedTests.gather_float16_8
GeneratedTests.greater_dynamic_float_nnfw
GeneratedTests.greater_equal_boolean
GeneratedTests.greater_equal_dynamic_float_nnfw
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.less_boolean
GeneratedTests.less_dynamic_float_nnfw
GeneratedTests.less_equal_dynamic_float_nnfw
@@ -112,11 +110,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
GeneratedTests.one_hot_ex_dynamic_nnfw
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
GeneratedTests.pow_broadcast_float_nnfw_3
GeneratedTests.pow_dynamic_nnfw
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
@@ -199,9 +212,21 @@ GeneratedTests.space_to_batch_quant8_2
GeneratedTests.space_to_batch_quant8_2_nnfw
GeneratedTests.space_to_batch_quant8_3
GeneratedTests.split_dynamic_float_nnfw
+GeneratedTests.split_v_ex_1D_float_1_nnfw
+GeneratedTests.split_v_ex_1D_float_2_nnfw
+GeneratedTests.split_v_ex_1D_int32_nnfw
+GeneratedTests.split_v_ex_4D_float_1_nnfw
+GeneratedTests.split_v_ex_4D_float_2_nnfw
+GeneratedTests.split_v_ex_4D_float_3_nnfw
+GeneratedTests.split_v_ex_4D_float_4_nnfw
+GeneratedTests.split_v_ex_4D_int32_1_nnfw
+GeneratedTests.split_v_ex_4D_int32_2_nnfw
+GeneratedTests.split_v_ex_4D_int32_3_nnfw
+GeneratedTests.split_v_ex_4D_int32_4_nnfw
GeneratedTests.sqrt_
GeneratedTests.squared_difference_ex_dynamic_nnfw
GeneratedTests.squeeze_dynamic_float_nnfw
+GeneratedTests.stateless_random_uniform_ex_nnfw
GeneratedTests.strided_slice_dynamic_nnfw
GeneratedTests.sub_dynamic_nnfw
GeneratedTests.sub_v1_2_zero_sized
diff --git a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
index 3cce4f3e3..e98007e08 100644
--- a/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.aarch64-linux.cpu
@@ -1,7 +1,4 @@
GeneratedTests.abs_
-GeneratedTests.batch_to_space
-GeneratedTests.batch_to_space_float_1
-GeneratedTests.batch_to_space_quant8_1
GeneratedTests.cast_float16_to_float16
GeneratedTests.cast_float16_to_float32
GeneratedTests.cast_float16_to_float32_relaxed
@@ -38,9 +35,6 @@ GeneratedTests.gather_float16_8
GeneratedTests.hashtable_lookup_float
GeneratedTests.hashtable_lookup_float_4D_nnfw
GeneratedTests.hashtable_lookup_quant8
-GeneratedTests.l2_normalization
-GeneratedTests.l2_normalization_2
-GeneratedTests.l2_normalization_large
GeneratedTests.l2_pool_float
GeneratedTests.l2_pool_float_2
GeneratedTests.l2_pool_float_large
@@ -79,7 +73,6 @@ GeneratedTests.minimum_simple_quant8
GeneratedTests.neg
GeneratedTests.neg_3D_int_nnfw
GeneratedTests.neg_4D_int_nnfw
-GeneratedTests.pad_quant8_nnfw
GeneratedTests.prelu
GeneratedTests.prelu_broadcast_float_1_nnfw
GeneratedTests.prelu_broadcast_quant8_1_nnfw
@@ -94,6 +87,11 @@ GeneratedTests.prelu_weight_as_input_quant8
GeneratedTests.prelu_weight_as_input_quant8_2
GeneratedTests.prelu_weight_as_input_quant8_3
GeneratedTests.prelu_weight_as_input_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.reduce_max_quant8
GeneratedTests.reduce_max_quant8_1_nnfw
GeneratedTests.reduce_max_quant8_2
@@ -108,14 +106,10 @@ GeneratedTests.relu1_float_1
GeneratedTests.relu1_float_2
GeneratedTests.relu1_quant8_1
GeneratedTests.relu1_quant8_2
-GeneratedTests.relu6_float_1
-GeneratedTests.relu6_float_2
GeneratedTests.relu6_quant8_1
GeneratedTests.relu6_quant8_2
GeneratedTests.relu_quant8_1
GeneratedTests.relu_quant8_2
-GeneratedTests.resize_bilinear
-GeneratedTests.resize_bilinear_2
GeneratedTests.rnn
GeneratedTests.rnn_state
GeneratedTests.rsqrt
@@ -125,15 +119,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
GeneratedTests.select_v1_2_two_dim_quant8
GeneratedTests.slice_5
GeneratedTests.slice_6
-GeneratedTests.slice_7
GeneratedTests.slice_8
GeneratedTests.slice_zero_sized
GeneratedTests.slice_zero_sized_quant8
-GeneratedTests.space_to_depth_float_1
-GeneratedTests.space_to_depth_float_2
-GeneratedTests.space_to_depth_float_3
-GeneratedTests.space_to_depth_quant8_1
-GeneratedTests.space_to_depth_quant8_2
GeneratedTests.sqrt_
GeneratedTests.sqrt_1D_float_nnfw
GeneratedTests.sqrt_2D_float_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
index e50b94118..3a6b40d6b 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_cl
@@ -23,8 +23,8 @@ GeneratedTests.cast_float16_to_quant8
GeneratedTests.cast_float16_to_quant8_overflow
GeneratedTests.cast_float32_to_float16
GeneratedTests.cast_float32_to_float16_relaxed
+GeneratedTests.cast_float32_to_int32_nnfw
GeneratedTests.cast_int32_to_float16
-GeneratedTests.cast_int32_to_quant8_overflow
GeneratedTests.cast_quant8_to_float16
GeneratedTests.concat_dynamic_nnfw
GeneratedTests.conv_dynamic_nnfw
@@ -68,6 +68,7 @@ GeneratedTests.gather_float16_7
GeneratedTests.gather_float16_8
GeneratedTests.greater_dynamic_float_nnfw
GeneratedTests.greater_equal_dynamic_float_nnfw
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.less_dynamic_float_nnfw
GeneratedTests.less_equal_dynamic_float_nnfw
GeneratedTests.log_4D_float_nnfw
@@ -106,11 +107,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
GeneratedTests.one_hot_ex_dynamic_nnfw
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
GeneratedTests.pow_broadcast_float_nnfw_3
GeneratedTests.pow_dynamic_nnfw
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
@@ -184,9 +200,21 @@ GeneratedTests.slice_zero_sized_quant8
GeneratedTests.softmax_dynamic_nnfw
GeneratedTests.space_to_batch_dynamic_float_nnfw
GeneratedTests.split_dynamic_float_nnfw
+GeneratedTests.split_v_ex_1D_float_1_nnfw
+GeneratedTests.split_v_ex_1D_float_2_nnfw
+GeneratedTests.split_v_ex_1D_int32_nnfw
+GeneratedTests.split_v_ex_4D_float_1_nnfw
+GeneratedTests.split_v_ex_4D_float_2_nnfw
+GeneratedTests.split_v_ex_4D_float_3_nnfw
+GeneratedTests.split_v_ex_4D_float_4_nnfw
+GeneratedTests.split_v_ex_4D_int32_1_nnfw
+GeneratedTests.split_v_ex_4D_int32_2_nnfw
+GeneratedTests.split_v_ex_4D_int32_3_nnfw
+GeneratedTests.split_v_ex_4D_int32_4_nnfw
GeneratedTests.sqrt_
GeneratedTests.squared_difference_ex_dynamic_nnfw
GeneratedTests.squeeze_dynamic_float_nnfw
+GeneratedTests.stateless_random_uniform_ex_nnfw
GeneratedTests.strided_slice_dynamic_nnfw
GeneratedTests.sub_dynamic_nnfw
GeneratedTests.sub_v1_2_zero_sized
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
index 55cfe398f..fcd8b3e36 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.acl_neon
@@ -23,10 +23,7 @@ GeneratedTests.cast_float16_to_quant8
GeneratedTests.cast_float16_to_quant8_overflow
GeneratedTests.cast_float32_to_float16
GeneratedTests.cast_float32_to_float16_relaxed
-GeneratedTests.cast_float32_to_quant8_overflow
-GeneratedTests.cast_float32_to_quant8_overflow_relaxed
GeneratedTests.cast_int32_to_float16
-GeneratedTests.cast_int32_to_quant8_overflow
GeneratedTests.cast_quant8_to_float16
GeneratedTests.concat_dynamic_nnfw
GeneratedTests.conv_dynamic_nnfw
@@ -73,6 +70,7 @@ GeneratedTests.greater_dynamic_float_nnfw
GeneratedTests.greater_equal_boolean
GeneratedTests.greater_equal_dynamic_float_nnfw
GeneratedTests.less_boolean
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.less_dynamic_float_nnfw
GeneratedTests.less_equal_dynamic_float_nnfw
GeneratedTests.log_4D_float_nnfw
@@ -111,11 +109,26 @@ GeneratedTests.not_equal_dynamic_float_nnfw
GeneratedTests.one_hot_ex_dynamic_nnfw
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
GeneratedTests.pow_broadcast_float_nnfw_3
GeneratedTests.pow_dynamic_nnfw
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
@@ -191,9 +204,21 @@ GeneratedTests.slice_zero_sized_quant8
GeneratedTests.softmax_dynamic_nnfw
GeneratedTests.space_to_batch_dynamic_float_nnfw
GeneratedTests.split_dynamic_float_nnfw
+GeneratedTests.split_v_ex_1D_float_1_nnfw
+GeneratedTests.split_v_ex_1D_float_2_nnfw
+GeneratedTests.split_v_ex_1D_int32_nnfw
+GeneratedTests.split_v_ex_4D_float_1_nnfw
+GeneratedTests.split_v_ex_4D_float_2_nnfw
+GeneratedTests.split_v_ex_4D_float_3_nnfw
+GeneratedTests.split_v_ex_4D_float_4_nnfw
+GeneratedTests.split_v_ex_4D_int32_1_nnfw
+GeneratedTests.split_v_ex_4D_int32_2_nnfw
+GeneratedTests.split_v_ex_4D_int32_3_nnfw
+GeneratedTests.split_v_ex_4D_int32_4_nnfw
GeneratedTests.sqrt_
GeneratedTests.squared_difference_ex_dynamic_nnfw
GeneratedTests.squeeze_dynamic_float_nnfw
+GeneratedTests.stateless_random_uniform_ex_nnfw
GeneratedTests.strided_slice_dynamic_nnfw
GeneratedTests.sub_dynamic_nnfw
GeneratedTests.sub_v1_2_zero_sized
diff --git a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
index 3cce4f3e3..e98007e08 100644
--- a/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.armv7l-linux.cpu
@@ -1,7 +1,4 @@
GeneratedTests.abs_
-GeneratedTests.batch_to_space
-GeneratedTests.batch_to_space_float_1
-GeneratedTests.batch_to_space_quant8_1
GeneratedTests.cast_float16_to_float16
GeneratedTests.cast_float16_to_float32
GeneratedTests.cast_float16_to_float32_relaxed
@@ -38,9 +35,6 @@ GeneratedTests.gather_float16_8
GeneratedTests.hashtable_lookup_float
GeneratedTests.hashtable_lookup_float_4D_nnfw
GeneratedTests.hashtable_lookup_quant8
-GeneratedTests.l2_normalization
-GeneratedTests.l2_normalization_2
-GeneratedTests.l2_normalization_large
GeneratedTests.l2_pool_float
GeneratedTests.l2_pool_float_2
GeneratedTests.l2_pool_float_large
@@ -79,7 +73,6 @@ GeneratedTests.minimum_simple_quant8
GeneratedTests.neg
GeneratedTests.neg_3D_int_nnfw
GeneratedTests.neg_4D_int_nnfw
-GeneratedTests.pad_quant8_nnfw
GeneratedTests.prelu
GeneratedTests.prelu_broadcast_float_1_nnfw
GeneratedTests.prelu_broadcast_quant8_1_nnfw
@@ -94,6 +87,11 @@ GeneratedTests.prelu_weight_as_input_quant8
GeneratedTests.prelu_weight_as_input_quant8_2
GeneratedTests.prelu_weight_as_input_quant8_3
GeneratedTests.prelu_weight_as_input_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.reduce_max_quant8
GeneratedTests.reduce_max_quant8_1_nnfw
GeneratedTests.reduce_max_quant8_2
@@ -108,14 +106,10 @@ GeneratedTests.relu1_float_1
GeneratedTests.relu1_float_2
GeneratedTests.relu1_quant8_1
GeneratedTests.relu1_quant8_2
-GeneratedTests.relu6_float_1
-GeneratedTests.relu6_float_2
GeneratedTests.relu6_quant8_1
GeneratedTests.relu6_quant8_2
GeneratedTests.relu_quant8_1
GeneratedTests.relu_quant8_2
-GeneratedTests.resize_bilinear
-GeneratedTests.resize_bilinear_2
GeneratedTests.rnn
GeneratedTests.rnn_state
GeneratedTests.rsqrt
@@ -125,15 +119,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
GeneratedTests.select_v1_2_two_dim_quant8
GeneratedTests.slice_5
GeneratedTests.slice_6
-GeneratedTests.slice_7
GeneratedTests.slice_8
GeneratedTests.slice_zero_sized
GeneratedTests.slice_zero_sized_quant8
-GeneratedTests.space_to_depth_float_1
-GeneratedTests.space_to_depth_float_2
-GeneratedTests.space_to_depth_float_3
-GeneratedTests.space_to_depth_quant8_1
-GeneratedTests.space_to_depth_quant8_2
GeneratedTests.sqrt_
GeneratedTests.sqrt_1D_float_nnfw
GeneratedTests.sqrt_2D_float_nnfw
diff --git a/tests/nnapi/nnapi_gtest.skip.noarch.interp b/tests/nnapi/nnapi_gtest.skip.noarch.interp
index 08118cac1..a0ae9d3fe 100644
--- a/tests/nnapi/nnapi_gtest.skip.noarch.interp
+++ b/tests/nnapi/nnapi_gtest.skip.noarch.interp
@@ -188,6 +188,7 @@ GeneratedTests.hashtable_lookup_quant8
GeneratedTests.l2_normalization
GeneratedTests.l2_normalization_2
GeneratedTests.l2_normalization_large
+GeneratedTests.l2_normalization_quant8_nnfw
GeneratedTests.l2_pool_float
GeneratedTests.l2_pool_float_2
GeneratedTests.l2_pool_float_large
@@ -312,6 +313,12 @@ GeneratedTests.pack_ex_2D_int_2
GeneratedTests.pack_ex_dynamic_nnfw
GeneratedTests.pad_dynamic_nnfw
GeneratedTests.pad_quant8_nnfw
+GeneratedTests.pad_v2_1_float
+GeneratedTests.pad_v2_1_quant8
+GeneratedTests.pad_v2_all_dims
+GeneratedTests.pad_v2_all_dims_quant8
+GeneratedTests.pad_v2_low_rank
+GeneratedTests.pad_v2_low_rank_quant8
GeneratedTests.pow_2D_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw
GeneratedTests.pow_broadcast_float_nnfw_2
@@ -331,6 +338,15 @@ GeneratedTests.prelu_weight_as_input_quant8
GeneratedTests.prelu_weight_as_input_quant8_2
GeneratedTests.prelu_weight_as_input_quant8_3
GeneratedTests.prelu_weight_as_input_quant8_4
+GeneratedTests.quantize_quant8
+GeneratedTests.quantize_quant8_2
+GeneratedTests.quantize_quant8_3
+GeneratedTests.quantize_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.range_ex_float_1
GeneratedTests.range_ex_float_1_all_constant_inputs
GeneratedTests.range_ex_float_1_dynamic_nnfw
@@ -407,6 +423,7 @@ GeneratedTests.relu_quant8_2
GeneratedTests.reshape_dynamic_nnfw
GeneratedTests.resize_bilinear
GeneratedTests.resize_bilinear_2
+GeneratedTests.resize_bilinear_quant8_nnfw
GeneratedTests.reverse_ex_1d
GeneratedTests.reverse_ex_3d
GeneratedTests.reverse_ex_dynamic_1D
@@ -499,6 +516,17 @@ GeneratedTests.split_quant8_2
GeneratedTests.split_quant8_2_relaxed
GeneratedTests.split_quant8_3
GeneratedTests.split_quant8_4
+GeneratedTests.split_v_ex_1D_float_1_nnfw
+GeneratedTests.split_v_ex_1D_float_2_nnfw
+GeneratedTests.split_v_ex_1D_int32_nnfw
+GeneratedTests.split_v_ex_4D_float_1_nnfw
+GeneratedTests.split_v_ex_4D_float_2_nnfw
+GeneratedTests.split_v_ex_4D_float_3_nnfw
+GeneratedTests.split_v_ex_4D_float_4_nnfw
+GeneratedTests.split_v_ex_4D_int32_1_nnfw
+GeneratedTests.split_v_ex_4D_int32_2_nnfw
+GeneratedTests.split_v_ex_4D_int32_3_nnfw
+GeneratedTests.split_v_ex_4D_int32_4_nnfw
GeneratedTests.sqrt_
GeneratedTests.sqrt_1D_float_nnfw
GeneratedTests.sqrt_2D_float_nnfw
@@ -518,6 +546,7 @@ GeneratedTests.squeeze_float_1
GeneratedTests.squeeze_float_1_relaxed
GeneratedTests.squeeze_quant8_1
GeneratedTests.squeeze_relaxed
+GeneratedTests.stateless_random_uniform_ex_nnfw
GeneratedTests.strided_slice
GeneratedTests.strided_slice_dynamic_nnfw
GeneratedTests.strided_slice_float_1
diff --git a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
index 3cce4f3e3..e98007e08 100644
--- a/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
+++ b/tests/nnapi/nnapi_gtest.skip.x86_64-linux.cpu
@@ -1,7 +1,4 @@
GeneratedTests.abs_
-GeneratedTests.batch_to_space
-GeneratedTests.batch_to_space_float_1
-GeneratedTests.batch_to_space_quant8_1
GeneratedTests.cast_float16_to_float16
GeneratedTests.cast_float16_to_float32
GeneratedTests.cast_float16_to_float32_relaxed
@@ -38,9 +35,6 @@ GeneratedTests.gather_float16_8
GeneratedTests.hashtable_lookup_float
GeneratedTests.hashtable_lookup_float_4D_nnfw
GeneratedTests.hashtable_lookup_quant8
-GeneratedTests.l2_normalization
-GeneratedTests.l2_normalization_2
-GeneratedTests.l2_normalization_large
GeneratedTests.l2_pool_float
GeneratedTests.l2_pool_float_2
GeneratedTests.l2_pool_float_large
@@ -79,7 +73,6 @@ GeneratedTests.minimum_simple_quant8
GeneratedTests.neg
GeneratedTests.neg_3D_int_nnfw
GeneratedTests.neg_4D_int_nnfw
-GeneratedTests.pad_quant8_nnfw
GeneratedTests.prelu
GeneratedTests.prelu_broadcast_float_1_nnfw
GeneratedTests.prelu_broadcast_quant8_1_nnfw
@@ -94,6 +87,11 @@ GeneratedTests.prelu_weight_as_input_quant8
GeneratedTests.prelu_weight_as_input_quant8_2
GeneratedTests.prelu_weight_as_input_quant8_3
GeneratedTests.prelu_weight_as_input_quant8_4
+GeneratedTests.quantize_quant8_5
+GeneratedTests.quantize_quant8_6
+GeneratedTests.quantize_quant8_7
+GeneratedTests.quantize_quant8_8
+GeneratedTests.quantize_zero_sized
GeneratedTests.reduce_max_quant8
GeneratedTests.reduce_max_quant8_1_nnfw
GeneratedTests.reduce_max_quant8_2
@@ -108,14 +106,10 @@ GeneratedTests.relu1_float_1
GeneratedTests.relu1_float_2
GeneratedTests.relu1_quant8_1
GeneratedTests.relu1_quant8_2
-GeneratedTests.relu6_float_1
-GeneratedTests.relu6_float_2
GeneratedTests.relu6_quant8_1
GeneratedTests.relu6_quant8_2
GeneratedTests.relu_quant8_1
GeneratedTests.relu_quant8_2
-GeneratedTests.resize_bilinear
-GeneratedTests.resize_bilinear_2
GeneratedTests.rnn
GeneratedTests.rnn_state
GeneratedTests.rsqrt
@@ -125,15 +119,9 @@ GeneratedTests.select_v1_2_one_dim_quant8
GeneratedTests.select_v1_2_two_dim_quant8
GeneratedTests.slice_5
GeneratedTests.slice_6
-GeneratedTests.slice_7
GeneratedTests.slice_8
GeneratedTests.slice_zero_sized
GeneratedTests.slice_zero_sized_quant8
-GeneratedTests.space_to_depth_float_1
-GeneratedTests.space_to_depth_float_2
-GeneratedTests.space_to_depth_float_3
-GeneratedTests.space_to_depth_quant8_1
-GeneratedTests.space_to_depth_quant8_2
GeneratedTests.sqrt_
GeneratedTests.sqrt_1D_float_nnfw
GeneratedTests.sqrt_2D_float_nnfw
diff --git a/tests/nnapi/specs/Ex/split_v_ex_1D_float_1_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_1D_float_1_nnfw.mod.py
new file mode 100644
index 000000000..6a2b716db
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_1D_float_1_nnfw.mod.py
@@ -0,0 +1,47 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_FLOAT32", "{8}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{8}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 8)
+
+i2 = Output("op2", "TENSOR_FLOAT32", "{1}")
+i3 = Output("op3", "TENSOR_FLOAT32", "{1}")
+i4 = Output("op4", "TENSOR_FLOAT32", "{1}")
+i5 = Output("op5", "TENSOR_FLOAT32", "{1}")
+i6 = Output("op6", "TENSOR_FLOAT32", "{1}")
+i7 = Output("op7", "TENSOR_FLOAT32", "{1}")
+i8 = Output("op8", "TENSOR_FLOAT32", "{1}")
+i9 = Output("op9", "TENSOR_FLOAT32", "{1}")
+
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3, i4, i5, i6, i7, i8, i9])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+ size_splits:
+ [1, 1, 1, 1, 1, 1, 1, 1],
+ split_dim:
+ [0]
+ }
+
+output0 = {
+ i2: # output 0
+ [1.0],
+ i3: # output 1
+ [2.0],
+ i4: # output 2
+ [3.0],
+ i5: # output 3
+ [4.0],
+ i6: # output 4
+ [5.0],
+ i7: # output 5
+ [6.0],
+ i8: # output 6
+ [7.0],
+ i9: # output 7
+ [8.0]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_1D_float_2_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_1D_float_2_nnfw.mod.py
new file mode 100644
index 000000000..6224852ff
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_1D_float_2_nnfw.mod.py
@@ -0,0 +1,25 @@
+# model
+input0 = Input("input0", "TENSOR_FLOAT32", "{12}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{3}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 3)
+
+output0 = Output("output0", "TENSOR_FLOAT32", "{3}")
+output1 = Output("output1", "TENSOR_FLOAT32", "{5}")
+output2 = Output("output2", "TENSOR_FLOAT32", "{4}")
+
+model = Model().Operation("SPLIT_V_EX", input0, size_splits, split_dim, num_splits).To((output0, output1, output2))
+
+# Example 1.
+input_dict = {
+ input0: [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0],
+ size_splits: [3, 5, 4],
+ split_dim: [0]
+}
+output_dict = {
+ output0: [1.0, 2.0, 3.0],
+ output1: [4.0, 5.0, 6.0, 7.0, 8.0],
+ output2: [9.0, 10.0, 11.0, 12.0]
+}
+
+Example((input_dict, output_dict))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_1D_int32_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_1D_int32_nnfw.mod.py
new file mode 100644
index 000000000..2dea4d613
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_1D_int32_nnfw.mod.py
@@ -0,0 +1,47 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_INT32", "{8}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{8}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 8)
+
+i2 = Output("op2", "TENSOR_INT32", "{1}")
+i3 = Output("op3", "TENSOR_INT32", "{1}")
+i4 = Output("op4", "TENSOR_INT32", "{1}")
+i5 = Output("op5", "TENSOR_INT32", "{1}")
+i6 = Output("op6", "TENSOR_INT32", "{1}")
+i7 = Output("op7", "TENSOR_INT32", "{1}")
+i8 = Output("op8", "TENSOR_INT32", "{1}")
+i9 = Output("op9", "TENSOR_INT32", "{1}")
+
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3, i4, i5, i6, i7, i8, i9])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1, 2, 3, 4, 5, 6, 7, 8],
+ size_splits:
+ [1, 1, 1, 1, 1, 1, 1, 1],
+ split_dim:
+ [0]
+ }
+
+output0 = {
+ i2: # output 0
+ [1],
+ i3: # output 1
+ [2],
+ i4: # output 2
+ [3],
+ i5: # output 3
+ [4],
+ i6: # output 4
+ [5],
+ i7: # output 5
+ [6],
+ i8: # output 6
+ [7],
+ i9: # output 7
+ [8]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_4D_float_1_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_4D_float_1_nnfw.mod.py
new file mode 100644
index 000000000..c53ae1c0b
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_4D_float_1_nnfw.mod.py
@@ -0,0 +1,28 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_FLOAT32", "{2,2,2,2}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{2}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 2)
+
+i2 = Output("op2", "TENSOR_FLOAT32", "{1,2,2,2}")
+i3 = Output("op3", "TENSOR_FLOAT32", "{1,2,2,2}")
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
+ size_splits:
+ [8, 8],
+ split_dim:
+ [0]
+ }
+
+output0 = {
+ i2: # output 0
+ [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+ i3: # output 1
+ [9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_4D_float_2_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_4D_float_2_nnfw.mod.py
new file mode 100644
index 000000000..593412d08
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_4D_float_2_nnfw.mod.py
@@ -0,0 +1,27 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_FLOAT32", "{2,2,2,2}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{2}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 2)
+
+i2 = Output("op2", "TENSOR_FLOAT32", "{2,2,2,1}")
+i3 = Output("op3", "TENSOR_FLOAT32", "{2,2,2,1}")
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
+ size_splits:
+ [8, 8],
+ split_dim:
+ [3]}
+
+output0 = {
+ i2: # output 0
+ [1.0, 3.0, 5.0, 7.0, 9.0, 11.0, 13.0, 15.0],
+ i3: # output 1
+ [2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_4D_float_3_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_4D_float_3_nnfw.mod.py
new file mode 100644
index 000000000..ef77536c6
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_4D_float_3_nnfw.mod.py
@@ -0,0 +1,28 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_FLOAT32", "{2,2,2,2}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{2}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 2)
+
+i2 = Output("op2", "TENSOR_FLOAT32", "{1,2,2,2}")
+i3 = Output("op3", "TENSOR_FLOAT32", "{1,2,2,2}")
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0],
+ size_splits:
+ [8, 8],
+ split_dim:
+ [-4]
+ }
+
+output0 = {
+ i2: # output 0
+ [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0],
+ i3: # output 1
+ [9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_4D_float_4_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_4D_float_4_nnfw.mod.py
new file mode 100644
index 000000000..b995f9e03
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_4D_float_4_nnfw.mod.py
@@ -0,0 +1,32 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_FLOAT32", "{4,1,1,8}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{3}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 3)
+
+i2 = Output("op2", "TENSOR_FLOAT32", "{4,1,1,2}")
+i3 = Output("op3", "TENSOR_FLOAT32", "{4,1,1,4}")
+i4 = Output("op4", "TENSOR_FLOAT32", "{4,1,1,2}")
+
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3, i4])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0,
+ 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0],
+ size_splits:
+ [2,4,2],
+ split_dim:
+ [3]
+ }
+
+output0 = {
+ i2: # output 0
+ [1.0, 2.0, 9.0, 10.0, 17.0, 18.0, 25.0, 26.0],
+ i3: # output 1
+ [3.0, 4.0, 5.0, 6.0, 11.0, 12.0, 13.0, 14.0, 19.0, 20.0, 21.0, 22.0, 27.0, 28.0, 29.0, 30.0],
+ i4: [7.0, 8.0, 15.0, 16.0, 23.0, 24.0, 31.0, 32.0]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_4D_int32_1_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_4D_int32_1_nnfw.mod.py
new file mode 100644
index 000000000..f544d0afa
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_4D_int32_1_nnfw.mod.py
@@ -0,0 +1,27 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_INT32", "{2,2,2,2}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{2}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 2)
+
+i2 = Output("op2", "TENSOR_INT32", "{1,2,2,2}")
+i3 = Output("op3", "TENSOR_INT32", "{1,2,2,2}")
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ size_splits:
+ [8, 8],
+ split_dim:
+ [0]}
+
+output0 = {
+ i2: # output 0
+ [1, 2, 3, 4, 5, 6, 7, 8],
+ i3: # output 1
+ [9, 10, 11, 12, 13, 14, 15, 16]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_4D_int32_2_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_4D_int32_2_nnfw.mod.py
new file mode 100644
index 000000000..5ed016542
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_4D_int32_2_nnfw.mod.py
@@ -0,0 +1,28 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_INT32", "{2,2,2,2}")
+size_splits = Input("size_splits", "TENSOR_INT32", "{2}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 2)
+
+i2 = Output("op2", "TENSOR_INT32", "{2,1,2,2}")
+i3 = Output("op3", "TENSOR_INT32", "{2,1,2,2}")
+
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ size_splits:
+ [8, 8],
+ split_dim:
+ [1]}
+
+output0 = {
+ i2: # output 0
+ [1, 2, 3, 4, 9, 10, 11, 12],
+ i3: # output 1
+ [5, 6, 7, 8, 13, 14, 15, 16]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_4D_int32_3_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_4D_int32_3_nnfw.mod.py
new file mode 100644
index 000000000..99f3b4f6d
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_4D_int32_3_nnfw.mod.py
@@ -0,0 +1,28 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_INT32", "{2,2,2,2}")
+
+size_splits = Input("size_splits", "TENSOR_INT32", "{2}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 2)
+
+i2 = Output("op2", "TENSOR_INT32", "{2,2,1,2}")
+i3 = Output("op3", "TENSOR_INT32", "{2,2,1,2}")
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ size_splits:
+ [8, 8],
+ split_dim:
+ [2]}
+
+output0 = {
+ i2: # output 0
+ [1, 2, 5, 6, 9, 10, 13, 14],
+ i3: # output 1
+ [3, 4, 7, 8, 11, 12, 15, 16]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/split_v_ex_4D_int32_4_nnfw.mod.py b/tests/nnapi/specs/Ex/split_v_ex_4D_int32_4_nnfw.mod.py
new file mode 100644
index 000000000..38ae4dad2
--- /dev/null
+++ b/tests/nnapi/specs/Ex/split_v_ex_4D_int32_4_nnfw.mod.py
@@ -0,0 +1,28 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_INT32", "{2,2,2,2}")
+
+size_splits = Input("size_splits", "TENSOR_INT32", "{2}")
+split_dim = Input("split_dim", "TENSOR_INT32", "{1}")
+num_splits = Int32Scalar("num_splits", 2)
+
+i2 = Output("op2", "TENSOR_INT32", "{2,2,2,1}")
+i3 = Output("op3", "TENSOR_INT32", "{2,2,2,1}")
+model = model.Operation("SPLIT_V_EX", i1, size_splits, split_dim, num_splits).To([i2, i3])
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+ size_splits:
+ [8, 8],
+ split_dim:
+ [3]}
+
+output0 = {
+ i2: # output 0
+ [1, 3, 5, 7, 9, 11, 13, 15],
+ i3: # output 1
+ [2, 4, 6, 8, 10, 12, 14, 16]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/Ex/stateless_random_uniform_ex_nnfw.mod.py b/tests/nnapi/specs/Ex/stateless_random_uniform_ex_nnfw.mod.py
new file mode 100644
index 000000000..9c2955503
--- /dev/null
+++ b/tests/nnapi/specs/Ex/stateless_random_uniform_ex_nnfw.mod.py
@@ -0,0 +1,40 @@
+#
+# Copyright (C) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+model = Model()
+
+i1 = Input("input1", "TENSOR_INT32", "{1}")
+i2 = Input("input2", "TENSOR_INT32", "{2}")
+
+o1 = Output("output0", "TENSOR_FLOAT32", "{10}")
+
+model = model.Operation("STATELESS_RANDOM_UNIFORM_EX", i1, i2).To(o1)
+
+# Example.
+input0 = {
+ i1 : [10], #input1
+ i2 : [1, 1] #input2
+}
+
+output0 = {
+ o1: [0.09827709, 0.14063823, 0.4553436,
+ 0.10658443, 0.2075988, 0.30841374,
+ 0.7489233, 0.90613365, 0.63342273,
+ 0.37854457]
+}
+
+Example((input0, output0))
diff --git a/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py
new file mode 100644
index 000000000..ca3770cb0
--- /dev/null
+++ b/tests/nnapi/specs/V1_0/l2_normalization_quant8_nnfw.mod.py
@@ -0,0 +1,30 @@
+#
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+# Copyright (C) 2017 The Android Open Source Project
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+model = Model()
+in0 = Input("op1", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128")
+out0 = Output("op2", "TENSOR_QUANT8_ASYMM", "{1, 1, 1, 3}, 2e-7, 128")
+model = model.Operation("L2_NORMALIZATION", in0).To(out0)
+
+# Example 1. Input in operand 0,
+input0 = {in0: # input 0
+ [0, 5, 12]}
+output0 = {out0: # output 0
+ [51, 54, 58]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/V1_0/resize_bilinear_quant8_nnfw.mod.py b/tests/nnapi/specs/V1_0/resize_bilinear_quant8_nnfw.mod.py
new file mode 100644
index 000000000..182e046b0
--- /dev/null
+++ b/tests/nnapi/specs/V1_0/resize_bilinear_quant8_nnfw.mod.py
@@ -0,0 +1,18 @@
+# model
+model = Model()
+i1 = Input("op1", "TENSOR_QUANT8_ASYMM", "{1, 2, 2, 1}, 0.8, 5")
+i2 = Output("op2", "TENSOR_QUANT8_ASYMM", "{1, 3, 3, 1}, 0.8, 5")
+w = Int32Scalar("width", 3)
+h = Int32Scalar("height", 3)
+model = model.Operation("RESIZE_BILINEAR", i1, w, h).To(i2)
+
+# Example 1. Input in operand 0,
+input0 = {i1: # input 0
+ [1, 1, 2, 2]}
+output0 = {i2: # output 0
+ [1, 1, 1,
+ 2, 2, 2,
+ 2, 2, 2]}
+
+# Instantiate an example
+Example((input0, output0))
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py
index c500741c2..c500741c2 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_float.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_1_float.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py
index 3dfaff64b..3dfaff64b 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_1_quant8.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_1_quant8.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py
index 5b27f4963..5b27f4963 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py
index 5ee4b06d7..5ee4b06d7 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_all_dims_quant8.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_all_dims_quant8.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
index 391d5cfb6..391d5cfb6 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_low_rank.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py b/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py
index b67c2b834..b67c2b834 100644
--- a/tests/nnapi/specs/skip/V1_2/pad_v2_low_rank_quant8.mod.py
+++ b/tests/nnapi/specs/V1_2/pad_v2_low_rank_quant8.mod.py
diff --git a/tests/nnapi/specs/skip/V1_2/quantize.mod.py b/tests/nnapi/specs/V1_2/quantize.mod.py
index a42624dce..a42624dce 100644
--- a/tests/nnapi/specs/skip/V1_2/quantize.mod.py
+++ b/tests/nnapi/specs/V1_2/quantize.mod.py
diff --git a/tests/nnfw_api/CMakeLists.txt b/tests/nnfw_api/CMakeLists.txt
index e8d46cbae..6e0696d34 100644
--- a/tests/nnfw_api/CMakeLists.txt
+++ b/tests/nnfw_api/CMakeLists.txt
@@ -21,5 +21,6 @@ target_include_directories(${RUNTIME_NNFW_API_TEST} PRIVATE ${RUNTIME_NNFW_API_T
target_link_libraries(${RUNTIME_NNFW_API_TEST} nnfw-dev)
target_link_libraries(${RUNTIME_NNFW_API_TEST} gtest gmock)
target_link_libraries(${RUNTIME_NNFW_API_TEST} ${LIB_PTHREAD} dl)
+target_link_libraries(${RUNTIME_NNFW_API_TEST} circle_schema)
install(TARGETS ${RUNTIME_NNFW_API_TEST} DESTINATION unittest_standalone)
diff --git a/tests/nnfw_api/README.md b/tests/nnfw_api/README.md
index 25fbc6e13..7e14fc445 100644
--- a/tests/nnfw_api/README.md
+++ b/tests/nnfw_api/README.md
@@ -6,6 +6,8 @@ This test framework consists of 3 kinds of tests:
- Validation Tests (fixture format `ValidationTest???`)
- Basic positive/negative tests with simple nnpackages
+- Generated Model Tests (fixture format `GenModelTest`)
+  - One-time inference tests with a variety of generated models
- Regression Tests (fixture format `RegressionTest`, test format `GitHub###`)
- When you see bugs/crashes while using those APIs
- Must refer to a GitHub issue
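A `GenModelTest` body (the fixture and its `CircleGen` helper are added later in this patch) only needs to build a model in memory and register reference inputs and outputs; `SetUp` and `TearDown` handle the API calls. A rough sketch of such a test, with the tensor values and reference-data member types assumed for illustration:

    TEST_F(GenModelTest, OneOp_Add)
    {
      CircleGen cgen;
      int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
      int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
      int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
      cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
      cgen.setInputsAndOutputs({lhs, rhs}, {out});

      _cbuf = cgen.finish();
      _ref_inputs = {{1, 2, 3, 4}, {5, 6, 7, 8}}; // assumed input data
      _ref_outputs = {{6, 8, 10, 12}};            // assumed expected output
    }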
diff --git a/tests/nnfw_api/src/CircleGen.h b/tests/nnfw_api/src/CircleGen.h
new file mode 100644
index 000000000..899c800b8
--- /dev/null
+++ b/tests/nnfw_api/src/CircleGen.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __NNFW_API_TEST_CIRCLE_GEN_H__
+#define __NNFW_API_TEST_CIRCLE_GEN_H__
+
+#include <circle_schema_generated.h>
+
+#include <vector>
+
+/**
+ * @brief Class for storing flatbuffer buffer
+ *
+ * This is a simple wrapper for a finished FlatBufferBuilder. It owns the buffer and a user can
+ * get the buffer pointer and size.
+ */
+class CircleBuffer
+{
+public:
+ CircleBuffer() = default;
+ explicit CircleBuffer(flatbuffers::FlatBufferBuilder &&fbb) : _fbb{std::move(fbb)}
+ {
+ _fbb.Finished(); // The build must have been finished, so check that here
+ }
+
+ uint8_t *buffer() { return _fbb.GetBufferPointer(); }
+ size_t size() { return _fbb.GetSize(); }
+
+private:
+ flatbuffers::FlatBufferBuilder _fbb;
+};
+
+/**
+ * @brief Circle flatbuffer file generator
+ *
+ * This is a helper class for generating a circle file.
+ *
+ */
+class CircleGen
+{
+public:
+ struct TensorParams
+ {
+ std::vector<int32_t> shape;
+ circle::TensorType tensor_type = circle::TensorType::TensorType_FLOAT32;
+ uint32_t buffer = 0;
+ std::string name;
+ };
+
+ struct OperatorParams
+ {
+ std::vector<int32_t> inputs;
+ std::vector<int32_t> outputs;
+ int version = 1;
+ };
+
+public:
+ CircleGen()
+ {
+ // 0th buffer is always the empty buffer for non-const tensors
+ addBuffer(nullptr, 0);
+ }
+
+ template <typename T> uint32_t addBuffer(const std::vector<T> &buf_vec)
+ {
+ auto buf = reinterpret_cast<const uint8_t *>(buf_vec.data());
+ auto size = buf_vec.size() * sizeof(T);
+ return addBuffer(buf, size);
+ }
+
+ uint32_t addBuffer(const uint8_t *buf, size_t size)
+ {
+ uint32_t ind = _buffers.size();
+ _buffers.emplace_back(buildBuffer(buf, size));
+ return ind;
+ }
+
+ uint32_t addTensor(const TensorParams &params)
+ {
+    uint32_t ind = _tensors.size();
+ _tensors.emplace_back(buildTensor(params));
+ return ind;
+ }
+
+  void setInputsAndOutputs(const std::vector<int> &inputs, const std::vector<int> &outputs)
+ {
+ _inputs = inputs;
+ _outputs = outputs;
+ }
+
+ CircleBuffer finish()
+ {
+    // TODO Support multiple subgraphs; for now, only a single-subgraph model is supported.
+ std::vector<flatbuffers::Offset<circle::SubGraph>> subgraphs{buildSubGraph()};
+ auto model =
+ circle::CreateModelDirect(_fbb, 3, &_opcodes, &subgraphs, "CircleGen generated", &_buffers);
+ _fbb.Finish(model);
+ return CircleBuffer{std::move(_fbb)};
+ }
+
+ // ===== Add Operator methods begin =====
+
+ uint32_t addOperatorAdd(const OperatorParams &params, circle::ActivationFunctionType actfn)
+ {
+ auto options = circle::CreateAddOptions(_fbb, actfn).Union();
+ return addOperatorWithOptions(params, circle::BuiltinOperator_ADD,
+ circle::BuiltinOptions_AddOptions, options);
+ }
+
+ uint32_t addOperatorAveragePool2D(const OperatorParams &params, circle::Padding padding,
+ int stride_w, int stride_h, int filter_w, int filter_h,
+ circle::ActivationFunctionType actfn)
+ {
+ auto options =
+ circle::CreatePool2DOptions(_fbb, padding, stride_w, stride_h, filter_w, filter_h, actfn)
+ .Union();
+ return addOperatorWithOptions(params, circle::BuiltinOperator_AVERAGE_POOL_2D,
+ circle::BuiltinOptions_Pool2DOptions, options);
+ }
+
+  // NOTE Please add addOperator functions ABOVE this line
+  //
+  // % How to add a new addOperatorXXX function
+  // 0. Copy code from one of the existing addOperatorXXX functions
+  // 1. Change the function signature (need BuiltinOperator params)
+  // 2. Change enum BuiltinOperator
+  // 3. Change enum BuiltinOptions
+  // 4. Change CreateXXXOptions accordingly (see the commented sketch below)
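+  //
+  // For illustration only (not part of this header): following steps 0-4 for
+  // CONCATENATION would yield something like:
+  //
+  //   uint32_t addOperatorConcatenation(const OperatorParams &params, int axis,
+  //                                     circle::ActivationFunctionType actfn)
+  //   {
+  //     auto options = circle::CreateConcatenationOptions(_fbb, axis, actfn).Union();
+  //     return addOperatorWithOptions(params, circle::BuiltinOperator_CONCATENATION,
+  //                                   circle::BuiltinOptions_ConcatenationOptions, options);
+  //   }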
+
+ // ===== Add Operator methods end =====
+
+private:
+ uint32_t addOperatorWithOptions(const OperatorParams &params, circle::BuiltinOperator opcode,
+ circle::BuiltinOptions options_type,
+ flatbuffers::Offset<void> options)
+ {
+ uint32_t opcode_ind = addOperatorCode(opcode);
+ auto op = circle::CreateOperatorDirect(_fbb, opcode_ind, &params.inputs, &params.outputs,
+ options_type, options);
+
+ uint32_t ind = _operators.size();
+ _operators.emplace_back(op);
+ return ind;
+ }
+
+ uint32_t addOperatorCode(circle::BuiltinOperator opcode)
+ {
+ // TODO If the same OperatorCode is registered already, just return it
+ uint32_t ind = _opcodes.size();
+ _opcodes.emplace_back(circle::CreateOperatorCode(_fbb, opcode));
+ return ind;
+ }
+
+ flatbuffers::Offset<circle::Buffer> buildBuffer(const uint8_t *buf, size_t size)
+ {
+ if (buf == nullptr && size == 0)
+ return circle::CreateBuffer(_fbb);
+ auto buffer = _fbb.CreateVector(buf, size);
+ return circle::CreateBuffer(_fbb, buffer);
+ }
+
+ flatbuffers::Offset<circle::Tensor> buildTensor(const TensorParams &params)
+ {
+ auto shape = _fbb.CreateVector(params.shape);
+ auto name = _fbb.CreateString(params.name);
+ return circle::CreateTensor(_fbb, shape, params.tensor_type, params.buffer, name,
+ 0 /* QuantParam */, false /* is_variable */, 0 /* sparsity */,
+ 0 /* shape_signature */);
+ }
+
+ flatbuffers::Offset<circle::SubGraph> buildSubGraph()
+ {
+ return circle::CreateSubGraphDirect(_fbb, &_tensors, &_inputs, &_outputs, &_operators, nullptr);
+ }
+
+private:
+ flatbuffers::FlatBufferBuilder _fbb{1024};
+ std::vector<flatbuffers::Offset<circle::Buffer>> _buffers;
+ std::vector<flatbuffers::Offset<circle::OperatorCode>> _opcodes;
+
+ // per-subgraph
+ std::vector<int> _inputs;
+ std::vector<int> _outputs;
+ std::vector<flatbuffers::Offset<circle::Tensor>> _tensors;
+ std::vector<flatbuffers::Offset<circle::Operator>> _operators;
+};
+
+#endif // __NNFW_API_TEST_CIRCLE_GEN_H__
diff --git a/tests/nnfw_api/src/GenModelTests.cc b/tests/nnfw_api/src/GenModelTests.cc
new file mode 100644
index 000000000..2bd839a78
--- /dev/null
+++ b/tests/nnfw_api/src/GenModelTests.cc
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <nnfw_internal.h>
+
+#include <fstream>
+
+#include "CircleGen.h"
+#include "fixtures.h"
+
+/**
+ * @brief Generated Model test fixture for a one time inference
+ *
+ * This fixture is for one-time inference tests with a variety of generated models.
+ * It is the user's responsibility to create @c _cbuf , @c _ref_inputs and @c _ref_outputs in the
+ * test body, which are the generated circle buffer, model input data and output data respectively.
+ * The rest (calling API functions for execution) is done by @c SetUp and @c TearDown .
+ *
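+ * A minimal usage sketch (mirroring the tests below):
+ * @code
+ *   CircleGen cgen;
+ *   int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ *   int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ *   // ... add operators, then ...
+ *   cgen.setInputsAndOutputs({in}, {out});
+ *   _cbuf = cgen.finish();
+ *   _ref_inputs = {{1, 3, 2, 4}};
+ *   _ref_outputs = {{/* expected output values */}};
+ * @endcode
+ *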
+ */
+class GenModelTest : public ::testing::Test
+{
+protected:
+ void SetUp() override { NNFW_ENSURE_SUCCESS(nnfw_create_session(&_so.session)); }
+
+ void TearDown() override
+ {
+ NNFW_ENSURE_SUCCESS(nnfw_load_circle_from_buffer(_so.session, _cbuf.buffer(), _cbuf.size()));
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_so.session));
+
+ // In/Out buffer settings
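+    // NOTE This fixture assumes float32 I/O tensors, matching the element type
+    //      of _ref_inputs and _ref_outputs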
+ {
+ uint32_t num_inputs;
+ NNFW_ENSURE_SUCCESS(nnfw_input_size(_so.session, &num_inputs));
+ _so.inputs.resize(num_inputs);
+ for (uint32_t ind = 0; ind < _so.inputs.size(); ind++)
+ {
+ nnfw_tensorinfo ti;
+ NNFW_ENSURE_SUCCESS(nnfw_input_tensorinfo(_so.session, ind, &ti));
+ uint64_t input_elements = num_elems(&ti);
+ _so.inputs[ind].resize(input_elements);
+
+ ASSERT_EQ(nnfw_set_input(_so.session, ind, ti.dtype, _so.inputs[ind].data(),
+ sizeof(float) * input_elements),
+ NNFW_STATUS_NO_ERROR);
+ }
+
+ uint32_t num_outputs;
+ NNFW_ENSURE_SUCCESS(nnfw_output_size(_so.session, &num_outputs));
+ _so.outputs.resize(num_outputs);
+ for (uint32_t ind = 0; ind < _so.outputs.size(); ind++)
+ {
+ nnfw_tensorinfo ti;
+ NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_so.session, ind, &ti));
+ uint64_t output_elements = num_elems(&ti);
+ _so.outputs[ind].resize(output_elements);
+ ASSERT_EQ(nnfw_set_output(_so.session, ind, ti.dtype, _so.outputs[ind].data(),
+ sizeof(float) * output_elements),
+ NNFW_STATUS_NO_ERROR);
+ }
+ }
+
+ // Set input values, run, and check output values
+ {
+ ASSERT_EQ(_so.inputs.size(), _ref_inputs.size());
+ for (uint32_t i = 0; i < _so.inputs.size(); i++)
+ {
+ // Fill the values
+ ASSERT_EQ(_so.inputs[i].size(), _ref_inputs[i].size());
+ memcpy(_so.inputs[i].data(), _ref_inputs[i].data(), _so.inputs[i].size() * sizeof(float));
+ }
+
+ NNFW_ENSURE_SUCCESS(nnfw_run(_so.session));
+
+ ASSERT_EQ(_so.outputs.size(), _ref_outputs.size());
+ for (uint32_t i = 0; i < _so.outputs.size(); i++)
+ {
+ // Check output tensor values
+ auto &ref_output = _ref_outputs[i];
+ auto &output = _so.outputs[i];
+ ASSERT_EQ(output.size(), ref_output.size());
+ for (uint32_t e = 0; e < ref_output.size(); e++)
+ ASSERT_FLOAT_EQ(ref_output[e], output[e]);
+ }
+ }
+
+ NNFW_ENSURE_SUCCESS(nnfw_close_session(_so.session));
+ }
+
+protected:
+ SessionObject _so;
+ CircleBuffer _cbuf;
+ std::vector<std::vector<float>> _ref_inputs;
+ std::vector<std::vector<float>> _ref_outputs;
+};
+
+TEST_F(GenModelTest, OneOp_Add_VarToConst)
+{
+ CircleGen cgen;
+ std::vector<float> rhs_data{5, 4, 7, 4};
+ uint32_t rhs_buf = cgen.addBuffer(rhs_data);
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32, rhs_buf});
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs}, {out});
+ _cbuf = cgen.finish();
+
+ _ref_inputs = {{1, 3, 2, 4}};
+ _ref_outputs = {{6, 7, 9, 8}};
+}
+
+TEST_F(GenModelTest, OneOp_Add_VarToVar)
+{
+ CircleGen cgen;
+ int lhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int rhs = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorAdd({{lhs, rhs}, {out}}, circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({lhs, rhs}, {out});
+ _cbuf = cgen.finish();
+
+ _ref_inputs = {{1, 3, 2, 4}, {5, 4, 7, 4}};
+ _ref_outputs = {{6, 7, 9, 8}};
+}
+
+TEST_F(GenModelTest, OneOp_AvgPool2D)
+{
+ CircleGen cgen;
+ int in = cgen.addTensor({{1, 2, 2, 1}, circle::TensorType::TensorType_FLOAT32});
+ int out = cgen.addTensor({{1, 1, 1, 1}, circle::TensorType::TensorType_FLOAT32});
+ cgen.addOperatorAveragePool2D({{in}, {out}}, circle::Padding_SAME, 2, 2, 2, 2,
+ circle::ActivationFunctionType_NONE);
+ cgen.setInputsAndOutputs({in}, {out});
+ _cbuf = cgen.finish();
+
+ _ref_inputs = {{1, 3, 2, 4}};
+ _ref_outputs = {{2.5}};
+}
diff --git a/tests/nnfw_api/src/ModelTestDynamicTensor.cc b/tests/nnfw_api/src/ModelTestDynamicTensor.cc
index 2f9ef318c..c1f4369d6 100644
--- a/tests/nnfw_api/src/ModelTestDynamicTensor.cc
+++ b/tests/nnfw_api/src/ModelTestDynamicTensor.cc
@@ -15,7 +15,7 @@
*/
#include <gtest/gtest.h>
-#include <nnfw_debug.h>
+#include <nnfw_internal.h>
#include "common.h"
#include "fixtures.h"
@@ -67,22 +67,22 @@ protected:
{
NNFW_STATUS res = nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_INT32, new_shape.data(),
sizeof(int) * new_shape.size());
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
res = nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, actual_output->data(),
sizeof(float) * actual_output_size);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
}
void prepare_and_set_input_output(const std::vector<int> &new_shape, int actual_output_size,
std::vector<float> *actual_output)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
NNFW_STATUS res = NNFW_STATUS_ERROR;
res = nnfw_prepare(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
set_input_output(new_shape, actual_output_size, actual_output);
// real test case should start from calling nnfw_run()
@@ -102,11 +102,11 @@ protected:
if (no_run_error)
{
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
// output shape check
nnfw_tensorinfo info;
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &info), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_session, 0, &info));
ASSERT_EQ(info.rank, new_shape.size());
for (uint32_t d = 0; d < info.rank; ++d)
ASSERT_EQ(info.dims[d], new_shape[d]);
@@ -137,7 +137,7 @@ TEST_F(TestDynamicTensorReshapeModelLoaded, reshape_to_3x2)
// Do inference
NNFW_STATUS res = nnfw_run(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
// output value check
for (int i = 0; i < expected.size(); ++i)
@@ -163,10 +163,10 @@ TEST_F(TestDynamicTensorReshapeModelLoaded, neg_reshape_to_wrong_3x3)
TEST_F(TestDynamicTensorReshapeModelLoaded, reshape_multiple_executions)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
NNFW_STATUS res = nnfw_prepare(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
std::vector<int> new_shape;
std::vector<float> expected = {-1.5, -1.0, -0.5, 0.5, 1.0, 1.5};
@@ -184,10 +184,10 @@ TEST_F(TestDynamicTensorReshapeModelLoaded, reshape_multiple_executions)
TEST_F(TestDynamicTensorReshapeModelLoaded, neg_reshape_multiple_executions)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
NNFW_STATUS res = nnfw_prepare(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
std::vector<int> new_shape;
std::vector<float> expected = {-1.5, -1.0, -0.5, 0.5, 1.0, 1.5};
@@ -217,8 +217,8 @@ protected:
const std::vector<float> &input1,
std::vector<float> *actual_output, nnfw_tensorinfo input0_ti)
{
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_set_input_tensorinfo(_session, 0, &input0_ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session));
+ NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(_session, 0, &input0_ti));
ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, input0.data(),
sizeof(float) * input0.size()),
@@ -250,7 +250,7 @@ protected:
*/
TEST_F(TestInputUnknownDimInputConcatModelLoaded, concat_input0_to_2x3)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
const std::vector<float> input0 = {1, 2, 3}; // of shape [1, 3]
const std::vector<float> input1 = {4, 5, 6, 7, 8, 9}; // of shape [2, 3]
@@ -260,14 +260,14 @@ TEST_F(TestInputUnknownDimInputConcatModelLoaded, concat_input0_to_2x3)
// input reshaping to [1, 3]
nnfw_tensorinfo ti = {NNFW_TYPE_TENSOR_FLOAT32, 2, {1, 3}};
- ASSERT_EQ(nnfw_set_input_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(_session, 0, &ti));
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session));
set_input_output(_session, input0, input1, actual_output);
// Do inference
NNFW_STATUS res = nnfw_run(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
// output value check
for (int i = 0; i < expected.size(); ++i)
@@ -291,7 +291,7 @@ TEST_F(TestInputUnknownDimInputConcatModelLoaded, concat_input0_to_2x3)
*/
TEST_F(TestInputUnknownDimInputConcatModelLoaded, neg_concat_input0_to_wrong_shape)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
const std::vector<float> input0 = {1, 2, 3}; // of shape [3, 1], wrong shape
const std::vector<float> input1 = {4, 5, 6, 7, 8, 9}; // of shape [2, 3]
@@ -300,7 +300,7 @@ TEST_F(TestInputUnknownDimInputConcatModelLoaded, neg_concat_input0_to_wrong_sha
// input reshaping to [3, 1]
nnfw_tensorinfo ti = {NNFW_TYPE_TENSOR_FLOAT32, 2, {3, 1}};
- ASSERT_EQ(nnfw_set_input_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(_session, 0, &ti));
ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
}
@@ -330,7 +330,7 @@ using TestDynamicTensorApplyTensorInfoBinaryOp =
TEST_F(TestDynamicTensorApplyTensorInfoBinaryOp, set_input_tensorinfo_after_compilation_add)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
// input reshaping to [2, 2, 3]
nnfw_tensorinfo input0_ti = {NNFW_TYPE_TENSOR_FLOAT32, 3, {2, 2, 3}};
@@ -341,15 +341,15 @@ TEST_F(TestDynamicTensorApplyTensorInfoBinaryOp, set_input_tensorinfo_after_comp
std::vector<float> expected_output = {1.1 * 2, 2.1 * 2, 3.1 * 2, 4.1 * 2, 5.1 * 2, 6.1 * 2,
7.1 * 2, 8.1 * 2, 9.1 * 2, 10.1 * 2, 11.1 * 2, 12.1 * 2};
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session));
- ASSERT_EQ(nnfw_set_input_tensorinfo(_session, 0, &input0_ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(_session, 0, &input0_ti));
set_input_output(_session, input0, input1, actual_output);
// Do inference
NNFW_STATUS res = nnfw_run(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
// output value check
for (int i = 0; i < expected_output.size(); ++i)
@@ -374,7 +374,7 @@ using TestDynamicTensorApplyTensorInfoUnaryOp = ValidationTestModelLoaded<NNPack
TEST_F(TestDynamicTensorApplyTensorInfoUnaryOp, set_input_tensorinfo_after_compilation_neg)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
nnfw_tensorinfo input0_ti_original = {NNFW_TYPE_TENSOR_FLOAT32, 2, {4, 4}};
@@ -397,21 +397,21 @@ TEST_F(TestDynamicTensorApplyTensorInfoUnaryOp, set_input_tensorinfo_after_compi
expected_output[i] = -1 * input0[i];
}
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session));
// input shape check
{
nnfw_tensorinfo ti = {};
- ASSERT_EQ(nnfw_input_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_input_tensorinfo(_session, 0, &ti));
ASSERT_TRUE(tensorInfoEqual(input0_ti_original, ti));
}
- ASSERT_EQ(nnfw_set_input_tensorinfo(_session, 0, &input0_ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(_session, 0, &input0_ti));
// input shape check
{
nnfw_tensorinfo ti = {};
- ASSERT_EQ(nnfw_input_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_input_tensorinfo(_session, 0, &ti));
ASSERT_TRUE(tensorInfoEqual(input0_ti, ti));
}
@@ -419,7 +419,7 @@ TEST_F(TestDynamicTensorApplyTensorInfoUnaryOp, set_input_tensorinfo_after_compi
// Do inference
NNFW_STATUS res = nnfw_run(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
// output value check
for (int i = 0; i < expected_output.size(); ++i)
@@ -435,20 +435,20 @@ const static std::vector<float> while_dynamic_output0{ 0.0388205424, 0.042615629
TEST_F(TestWhileDynamicModelLoaded, run_verify)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session));
std::vector<float> actual_output0(10);
nnfw_tensorinfo ti = {NNFW_TYPE_TENSOR_FLOAT32, 3, {1, 28, 28}};
- ASSERT_EQ(nnfw_set_input_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(_session, 0, &ti));
set_input_output(_session, while_dynamic_input0, actual_output0);
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run(_session));
nnfw_tensorinfo ti_output0_expected = {NNFW_TYPE_TENSOR_FLOAT32, 2, {1, 10}};
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_session, 0, &ti));
ASSERT_TRUE(tensorInfoEqual(ti, ti_output0_expected));
// output value check
@@ -458,11 +458,11 @@ TEST_F(TestWhileDynamicModelLoaded, run_verify)
TEST_F(TestWhileDynamicModelLoaded, neg_run_verify)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session));
nnfw_tensorinfo ti = {NNFW_TYPE_TENSOR_FLOAT32, 3, {1, 28, 28}};
- ASSERT_EQ(nnfw_set_input_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_input_tensorinfo(_session, 0, &ti));
// Insufficient size of output (10 or more is sufficient)
std::vector<float> actual_output0(9);
@@ -482,27 +482,27 @@ const static std::vector<float> if_dynamic_output0{ 0.0444660522, 0.0271649156,
TEST_F(TestIfDynamicModelLoaded, run_verify)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(_session));
nnfw_tensorinfo ti_output0_expected = {NNFW_TYPE_TENSOR_FLOAT32, 2, {1, 10}};
  // Output tensor sizes are inferred after `nnfw_prepare`
{
nnfw_tensorinfo ti;
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_session, 0, &ti));
ASSERT_TRUE(tensorInfoEqual(ti, ti_output0_expected));
}
std::vector<float> actual_output0(10);
set_input_output(_session, if_dynamic_input0, actual_output0);
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run(_session));
// Check output tensor sizes again
{
nnfw_tensorinfo ti;
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &ti), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_session, 0, &ti));
ASSERT_TRUE(tensorInfoEqual(ti, ti_output0_expected));
}
diff --git a/tests/nnfw_api/src/ModelTestInputReshaping.cc b/tests/nnfw_api/src/ModelTestInputReshaping.cc
index f5053e33b..bfe347fe7 100644
--- a/tests/nnfw_api/src/ModelTestInputReshaping.cc
+++ b/tests/nnfw_api/src/ModelTestInputReshaping.cc
@@ -15,7 +15,7 @@
*/
#include <gtest/gtest.h>
-#include <nnfw_debug.h>
+#include <nnfw_internal.h>
#include "fixtures.h"
#include "NNPackages.h"
@@ -35,8 +35,8 @@ TEST_F(TestInputReshapingAddModelLoaded, reshaping_2x2_to_4x2)
{
NNFW_STATUS res = NNFW_STATUS_ERROR;
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_set_config(_session, "EXECUTOR", "Linear"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
+ NNFW_ENSURE_SUCCESS(nnfw_set_config(_session, "EXECUTOR", "Linear"));
// input and output values
const std::vector<float> input1 = {0, 1, 2, 3, 4, 5, 6, 7}; // of changed shape [4, 2]
@@ -56,7 +56,7 @@ TEST_F(TestInputReshapingAddModelLoaded, reshaping_2x2_to_4x2)
res = nnfw_set_input_tensorinfo(_session, 0, &ti);
res = nnfw_prepare(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
nnfw_tensorinfo ti_input = {}; // Static inference result will be stored
nnfw_input_tensorinfo(_session, 0, &ti_input);
@@ -68,21 +68,21 @@ TEST_F(TestInputReshapingAddModelLoaded, reshaping_2x2_to_4x2)
res = nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, input1.data(),
sizeof(float) * input1.size());
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
res = nnfw_set_input(_session, 1, NNFW_TYPE_TENSOR_FLOAT32, input2.data(),
sizeof(float) * input2.size());
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
uint64_t output_num_elements = tensorInfoNumElements(ti_output);
ASSERT_EQ(output_num_elements, expected.size());
std::vector<float> actual_output(output_num_elements);
res = nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, actual_output.data(),
sizeof(float) * actual_output.size());
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
// Do inference
res = nnfw_run(_session);
- ASSERT_EQ(res, NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(res);
// compare
for (int i = 0; i < expected.size(); ++i)
diff --git a/tests/nnfw_api/src/RegressionTests.cc b/tests/nnfw_api/src/RegressionTests.cc
index 54ebc96b6..e4dfa9118 100644
--- a/tests/nnfw_api/src/RegressionTests.cc
+++ b/tests/nnfw_api/src/RegressionTests.cc
@@ -22,17 +22,17 @@ TEST_F(RegressionTest, github_1535)
auto package_path = NNPackages::get().getModelAbsolutePath(NNPackages::ADD);
nnfw_session *session1 = nullptr;
- ASSERT_EQ(nnfw_create_session(&session1), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_load_model_from_file(session1, package_path.c_str()), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_set_available_backends(session1, "cpu;acl_cl;acl_neon"), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_prepare(session1), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_create_session(&session1));
+ NNFW_ENSURE_SUCCESS(nnfw_load_model_from_file(session1, package_path.c_str()));
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(session1, "cpu;acl_cl;acl_neon"));
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(session1));
nnfw_session *session2 = nullptr;
- ASSERT_EQ(nnfw_create_session(&session2), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_load_model_from_file(session2, package_path.c_str()), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_set_available_backends(session2, "cpu"), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_prepare(session2), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_create_session(&session2));
+ NNFW_ENSURE_SUCCESS(nnfw_load_model_from_file(session2, package_path.c_str()));
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(session2, "cpu"));
+ NNFW_ENSURE_SUCCESS(nnfw_prepare(session2));
- ASSERT_EQ(nnfw_close_session(session1), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_close_session(session2), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_close_session(session1));
+ NNFW_ENSURE_SUCCESS(nnfw_close_session(session2));
}
diff --git a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
index 67f246728..11c603494 100644
--- a/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
+++ b/tests/nnfw_api/src/ValidationTestAddModelLoaded.cc
@@ -19,61 +19,63 @@
using ValidationTestAddModelLoaded = ValidationTestModelLoaded<NNPackages::ADD>;
-TEST_F(ValidationTestAddModelLoaded, prepare_001)
-{
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_NO_ERROR);
-}
+TEST_F(ValidationTestAddModelLoaded, prepare_001) { NNFW_ENSURE_SUCCESS(nnfw_prepare(_session)); }
TEST_F(ValidationTestAddModelLoaded, set_available_backends_001)
{
- ASSERT_EQ(nnfw_set_available_backends(_session, "cpu"), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_set_available_backends(_session, "cpu"));
}
TEST_F(ValidationTestAddModelLoaded, get_input_size)
{
uint32_t size = 0;
- ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_input_size(_session, &size));
ASSERT_EQ(size, 1);
}
TEST_F(ValidationTestAddModelLoaded, get_output_size)
{
uint32_t size = 0;
- ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_output_size(_session, &size));
ASSERT_EQ(size, 1);
}
TEST_F(ValidationTestAddModelLoaded, output_tensorinfo)
{
nnfw_tensorinfo tensor_info;
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_session, 0, &tensor_info));
ASSERT_EQ(tensor_info.rank, 1);
ASSERT_EQ(tensor_info.dims[0], 1);
}
-TEST_F(ValidationTestAddModelLoaded, neg_run_001)
+TEST_F(ValidationTestAddModelLoaded, neg_run)
{
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
+ // nnfw_prepare is not called
+ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
}
-TEST_F(ValidationTestAddModelLoaded, neg_set_input_001)
+TEST_F(ValidationTestAddModelLoaded, neg_set_input)
{
- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
+ // nnfw_prepare is not called
+ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
+ NNFW_STATUS_INVALID_STATE);
}
-TEST_F(ValidationTestAddModelLoaded, neg_set_output_001)
+TEST_F(ValidationTestAddModelLoaded, neg_set_output)
{
- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
+ // nnfw_prepare is not called
+ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestAddModelLoaded, neg_get_input_size)
{
- ASSERT_EQ(nnfw_input_size(_session, nullptr), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_input_size(_session, nullptr), NNFW_STATUS_UNEXPECTED_NULL);
}
TEST_F(ValidationTestAddModelLoaded, neg_get_output_size)
{
- ASSERT_EQ(nnfw_output_size(_session, nullptr), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_output_size(_session, nullptr), NNFW_STATUS_UNEXPECTED_NULL);
}
TEST_F(ValidationTestAddModelLoaded, neg_load_model)
@@ -81,11 +83,11 @@ TEST_F(ValidationTestAddModelLoaded, neg_load_model)
// load model twice
ASSERT_EQ(nnfw_load_model_from_file(
_session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()),
- NNFW_STATUS_ERROR);
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestAddModelLoaded, neg_output_tensorinfo)
{
// tensor_info is null
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_UNEXPECTED_NULL);
}
diff --git a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
index 1bb418231..f19bb782c 100644
--- a/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
+++ b/tests/nnfw_api/src/ValidationTestAddSessionPrepared.cc
@@ -23,7 +23,7 @@ TEST_F(ValidationTestAddSessionPrepared, run)
{
SetInOutBuffers();
_input[0] = 3.0;
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run(_session));
ASSERT_FLOAT_EQ(_output[0], 5.0);
}
@@ -31,11 +31,11 @@ TEST_F(ValidationTestAddSessionPrepared, run_twice)
{
SetInOutBuffers();
_input[0] = 4.0;
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run(_session));
ASSERT_FLOAT_EQ(_output[0], 6.0);
_input[0] = 5.0f;
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run(_session));
ASSERT_FLOAT_EQ(_output[0], 7.0);
}
@@ -43,8 +43,8 @@ TEST_F(ValidationTestAddSessionPrepared, run_async)
{
SetInOutBuffers();
_input[0] = 3.0;
- ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run_async(_session));
+ NNFW_ENSURE_SUCCESS(nnfw_await(_session));
ASSERT_FLOAT_EQ(_output[0], 5.0);
}
@@ -58,21 +58,21 @@ TEST_F(ValidationTestAddSessionPrepared, set_input_001)
TEST_F(ValidationTestAddSessionPrepared, get_input_size)
{
uint32_t size = 0;
- ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_input_size(_session, &size));
ASSERT_EQ(size, 1);
}
TEST_F(ValidationTestAddSessionPrepared, get_output_size)
{
uint32_t size = 0;
- ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_output_size(_session, &size));
ASSERT_EQ(size, 1);
}
TEST_F(ValidationTestAddSessionPrepared, output_tensorinfo)
{
nnfw_tensorinfo tensor_info;
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_output_tensorinfo(_session, 0, &tensor_info));
ASSERT_EQ(tensor_info.rank, 1);
ASSERT_EQ(tensor_info.dims[0], 1);
}
@@ -86,24 +86,24 @@ TEST_F(ValidationTestAddSessionPrepared, neg_await_without_async_run)
TEST_F(ValidationTestAddSessionPrepared, neg_await_after_sync_run)
{
SetInOutBuffers();
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run(_session));
ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_ERROR);
}
TEST_F(ValidationTestAddSessionPrepared, neg_await_twice)
{
SetInOutBuffers();
- ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run_async(_session));
+ NNFW_ENSURE_SUCCESS(nnfw_await(_session));
ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_ERROR);
}
TEST_F(ValidationTestAddSessionPrepared, neg_run_during_async_run)
{
SetInOutBuffers();
- ASSERT_EQ(nnfw_run_async(_session), NNFW_STATUS_NO_ERROR);
- EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
- ASSERT_EQ(nnfw_await(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run_async(_session));
+ EXPECT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
+ NNFW_ENSURE_SUCCESS(nnfw_await(_session));
}
TEST_F(ValidationTestAddSessionPrepared, neg_set_input_001)
@@ -139,12 +139,12 @@ TEST_F(ValidationTestAddSessionPrepared, neg_set_output_002)
TEST_F(ValidationTestAddSessionPrepared, neg_get_input_size)
{
- ASSERT_EQ(nnfw_input_size(_session, nullptr), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_input_size(_session, nullptr), NNFW_STATUS_UNEXPECTED_NULL);
}
TEST_F(ValidationTestAddSessionPrepared, neg_get_output_size)
{
- ASSERT_EQ(nnfw_output_size(_session, nullptr), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_output_size(_session, nullptr), NNFW_STATUS_UNEXPECTED_NULL);
}
TEST_F(ValidationTestAddSessionPrepared, neg_load_model)
@@ -152,13 +152,13 @@ TEST_F(ValidationTestAddSessionPrepared, neg_load_model)
// Load model twice
ASSERT_EQ(nnfw_load_model_from_file(
_session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD).c_str()),
- NNFW_STATUS_ERROR);
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestAddSessionPrepared, neg_prepare)
{
// Call Prepare twice
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
}
// TODO Validation check when "nnfw_run" is called without input & output tensor setting
diff --git a/tests/nnfw_api/src/ValidationTestFourAddModelsSetInput.cc b/tests/nnfw_api/src/ValidationTestFourAddModelsSetInput.cc
index b3fb9c65c..4e2a9055a 100644
--- a/tests/nnfw_api/src/ValidationTestFourAddModelsSetInput.cc
+++ b/tests/nnfw_api/src/ValidationTestFourAddModelsSetInput.cc
@@ -21,8 +21,8 @@ using ValidationTestFourAddModelsSetInput = ValidationTestFourModelsSetInput<NNP
TEST_F(ValidationTestFourAddModelsSetInput, run_001)
{
- ASSERT_EQ(nnfw_run(_objects[0].session), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_run(_objects[1].session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run(_objects[0].session));
+ NNFW_ENSURE_SUCCESS(nnfw_run(_objects[1].session));
}
TEST_F(ValidationTestFourAddModelsSetInput, run_002)
@@ -31,14 +31,14 @@ TEST_F(ValidationTestFourAddModelsSetInput, run_002)
while (rep--)
{
for (auto obj : _objects)
- ASSERT_EQ(nnfw_run(obj.session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run(obj.session));
}
}
TEST_F(ValidationTestFourAddModelsSetInput, run_async)
{
for (auto obj : _objects)
- ASSERT_EQ(nnfw_run_async(obj.session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_run_async(obj.session));
for (auto obj : _objects)
- ASSERT_EQ(nnfw_await(obj.session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_await(obj.session));
}
diff --git a/tests/nnfw_api/src/ValidationTestSessionCreated.cc b/tests/nnfw_api/src/ValidationTestSessionCreated.cc
index 2675aa758..dafcd369f 100644
--- a/tests/nnfw_api/src/ValidationTestSessionCreated.cc
+++ b/tests/nnfw_api/src/ValidationTestSessionCreated.cc
@@ -27,8 +27,8 @@ TEST_F(ValidationTestSessionCreated, load_session_001)
TEST_F(ValidationTestSessionCreated, close_and_create_again)
{
- ASSERT_EQ(nnfw_close_session(_session), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_create_session(&_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_close_session(_session));
+ NNFW_ENSURE_SUCCESS(nnfw_create_session(&_session));
}
TEST_F(ValidationTestSessionCreated, neg_load_session_1)
@@ -40,7 +40,7 @@ TEST_F(ValidationTestSessionCreated, neg_load_session_1)
TEST_F(ValidationTestSessionCreated, neg_load_session_2)
{
- ASSERT_EQ(nnfw_load_model_from_file(_session, nullptr), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_load_model_from_file(_session, nullptr), NNFW_STATUS_UNEXPECTED_NULL);
}
TEST_F(ValidationTestSessionCreated, neg_load_session_3)
@@ -58,7 +58,7 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_1)
nnfw_load_model_from_file(
_session, NNPackages::get().getModelAbsolutePath(NNPackages::ADD_NO_MANIFEST).c_str()),
NNFW_STATUS_ERROR);
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2)
@@ -67,52 +67,52 @@ TEST_F(ValidationTestSessionCreated, neg_load_invalid_package_2)
_session,
NNPackages::get().getModelAbsolutePath(NNPackages::ADD_INVALID_MANIFEST).c_str()),
NNFW_STATUS_ERROR);
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_prepare_001)
{
// nnfw_load_model_from_file was not called
- ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_prepare(_session), NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_run_001)
{
  // nnfw_load_model_from_file and nnfw_prepare were not called
- ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_run(_session), NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_set_input_001)
{
- // Invalid state
- ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_set_input(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_set_output_001)
{
- // Invalid state
- ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_set_output(_session, 0, NNFW_TYPE_TENSOR_FLOAT32, nullptr, 0),
+ NNFW_STATUS_INVALID_STATE);
}
TEST_F(ValidationTestSessionCreated, neg_get_input_size)
{
uint32_t size = 10000;
- ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_ERROR);
- ASSERT_EQ(size, 10000);
+ ASSERT_EQ(nnfw_input_size(_session, &size), NNFW_STATUS_INVALID_STATE);
+  ASSERT_EQ(size, 10000); // Remains unchanged
}
TEST_F(ValidationTestSessionCreated, neg_get_output_size)
{
uint32_t size = 10000;
- ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_ERROR);
- ASSERT_EQ(size, 10000);
+ ASSERT_EQ(nnfw_output_size(_session, &size), NNFW_STATUS_INVALID_STATE);
+  ASSERT_EQ(size, 10000); // Remains unchanged
}
TEST_F(ValidationTestSessionCreated, neg_output_tensorinfo)
{
nnfw_tensorinfo tensor_info;
// model is not loaded
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, &tensor_info), NNFW_STATUS_INVALID_STATE);
// model is not loaded and tensor_info is null
- ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_ERROR);
+ ASSERT_EQ(nnfw_output_tensorinfo(_session, 0, nullptr), NNFW_STATUS_INVALID_STATE);
}
diff --git a/tests/nnfw_api/src/ValidationTestSingleSession.cc b/tests/nnfw_api/src/ValidationTestSingleSession.cc
index 2241e8134..5e6027f91 100644
--- a/tests/nnfw_api/src/ValidationTestSingleSession.cc
+++ b/tests/nnfw_api/src/ValidationTestSingleSession.cc
@@ -19,14 +19,14 @@
TEST_F(ValidationTestSingleSession, create_001)
{
- ASSERT_EQ(nnfw_create_session(&_session), NNFW_STATUS_NO_ERROR);
- ASSERT_EQ(nnfw_close_session(_session), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_create_session(&_session));
+ NNFW_ENSURE_SUCCESS(nnfw_close_session(_session));
}
TEST_F(ValidationTestSingleSession, query_info_u32)
{
uint32_t val = 0;
- ASSERT_EQ(nnfw_query_info_u32(nullptr, NNFW_INFO_ID_VERSION, &val), NNFW_STATUS_NO_ERROR);
+ NNFW_ENSURE_SUCCESS(nnfw_query_info_u32(nullptr, NNFW_INFO_ID_VERSION, &val));
}
TEST_F(ValidationTestSingleSession, neg_create_001)
diff --git a/tests/nnfw_api/src/fixtures.h b/tests/nnfw_api/src/fixtures.h
index f5bcf405c..8fe5c1667 100644
--- a/tests/nnfw_api/src/fixtures.h
+++ b/tests/nnfw_api/src/fixtures.h
@@ -23,6 +23,8 @@
#include "NNPackages.h"
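+
+// Assert that an nnfw API call returned NNFW_STATUS_NO_ERROR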
+#define NNFW_ENSURE_SUCCESS(EXPR) ASSERT_EQ((EXPR), NNFW_STATUS_NO_ERROR)
+
inline uint64_t num_elems(const nnfw_tensorinfo *ti)
{
uint64_t n = 1;
diff --git a/tests/scripts/CMakeLists.txt b/tests/scripts/CMakeLists.txt
index f4c9c6b58..82235d9f6 100644
--- a/tests/scripts/CMakeLists.txt
+++ b/tests/scripts/CMakeLists.txt
@@ -2,18 +2,25 @@ if(NOT INSTALL_TEST_SCRIPTS)
return()
endif(NOT INSTALL_TEST_SCRIPTS)
-# Install test scripts
-file(GLOB TEST_SCRIPTS "*.sh")
-install(PROGRAMS ${TEST_SCRIPTS} DESTINATION tests/scripts)
+# Install test driver
+file(GLOB TEST_DRIVER_SCRIPT onert-test)
+install(PROGRAMS ${TEST_DRIVER_SCRIPT} DESTINATION test)
-# Install test list
-file(GLOB TEST_LISTS "list/*.txt")
-install(FILES ${TEST_LISTS} DESTINATION tests/scripts/list)
+# Command scripts themselves do not have execute permission
+install(DIRECTORY command DESTINATION test)
+
+# Install models test script
+file(GLOB MODEL_TEST_SCRIPT "models/run_test.sh")
+install(PROGRAMS ${MODEL_TEST_SCRIPT} DESTINATION test/models)
-# Install framework test script
-file(GLOB FRAMEWORKTEST_SCRIPT "framework/run_test.sh")
-install(PROGRAMS ${FRAMEWORKTEST_SCRIPT} DESTINATION tests/scripts/framework)
+# Install models test config directory
+file(GLOB MODEL_TEST_DIR models/config)
+install(DIRECTORY ${MODEL_TEST_DIR} DESTINATION test/models)
-# Install framework test list file
-file(GLOB FRAMEWORKTEST_DIR framework/tests)
-install(DIRECTORY ${FRAMEWORKTEST_DIR} DESTINATION tests/scripts/framework)
+# Install nnpackage test config
+file(GLOB MODEL_TEST_DIR LIST_DIRECTORIES true nnfw_api_gtest/models/*)
+install(DIRECTORY ${MODEL_TEST_DIR} DESTINATION test/models/nnpackage)
+
+# Install test list
+file(GLOB TEST_LIST_DIR list)
+install(DIRECTORY ${TEST_LIST_DIR} DESTINATION test)
diff --git a/tests/scripts/benchmark_nnapi.sh b/tests/scripts/benchmark_nnapi.sh
index c7f44c52a..af797287f 100755
--- a/tests/scripts/benchmark_nnapi.sh
+++ b/tests/scripts/benchmark_nnapi.sh
@@ -18,7 +18,6 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
source $MY_PATH/common.sh
-BENCHMARK_RUN_TEST_SH=
BENCHMARK_DRIVER_BIN=
BENCHMARK_REPORT_DIR=
BENCHMARK_MODELS_FILE=
@@ -30,7 +29,7 @@ EXECUTORS="Linear Parallel" #TODO: accept this list as argument
function Usage()
{
- echo "Usage: ./$0 --reportdir=. --runtestsh=tests/scripts/framework/run_test.sh --driverbin=Product/out/bin/tflite_run"
+ echo "Usage: ./$0 --reportdir=. --driverbin=Product/out/bin/tflite_run"
}
for i in "$@"
@@ -43,9 +42,6 @@ do
--test_op)
TEST_OP="true"
;;
- --runtestsh=*)
- BENCHMARK_RUN_TEST_SH=${i#*=}
- ;;
--driverbin=*)
BENCHMARK_DRIVER_BIN=${i#*=}
;;
@@ -147,9 +143,8 @@ function run_onert_with_all_config()
local REPORT_MODEL_DIR=$2
local PAUSE_TIME_IN_SEC=$3
local BENCHMARK_DRIVER_BIN=$4
- local BENCHMARK_RUN_TEST_SH=$5
- local EXECUTORS=$6
- local BACKEND_LIST=$7
+ local EXECUTORS=$5
+ local BACKEND_LIST=$6
export USE_NNAPI=1
@@ -163,18 +158,18 @@ function run_onert_with_all_config()
done
export BACKENDS=$BACKENDS_TO_USE
if [ "$TEST_OP" == "false" ]; then
- profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT
+ profile_for_he_shed $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $PROFILING_RUN_CNT
fi
for executor in $EXECUTORS; do
export EXECUTOR=$executor
if [ "$TEST_OP" == "false" ]; then
- run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_RUN_TEST_SH $BENCHMARK_DRIVER_BIN $MODEL $executor
+ run_with_he_scheduler $REPORT_MODEL_DIR $BENCHMARK_DRIVER_BIN $MODEL $executor
fi
for backend in $BACKEND_LIST; do
export OP_BACKEND_ALLOPS=$backend
run_benchmark_and_print "tflite_onert_"$executor"_executor_$backend" "TFLite onert $executor Executor $backend"\
- $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH
+ $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN
done
done
unset USE_NNAPI EXECUTOR OP_BACKEND_ALLOPS BACKENDS
@@ -215,14 +210,14 @@ function run_benchmark_test()
# TFLite+CPU
unset USE_NNAPI
- run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH
+ run_benchmark_and_print "tflite_cpu" "TFLite CPU" $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN
# run onert
if [ "$TEST_OP" == "true" ]; then
    # Operation tests don't need to test each scheduler
- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "Linear" "$BACKEND_LIST"
+ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "Linear" "$BACKEND_LIST"
else
- run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN $BENCHMARK_RUN_TEST_SH "$EXECUTORS" "$BACKEND_LIST"
+ run_onert_with_all_config $MODEL $REPORT_MODEL_DIR 0 $BENCHMARK_DRIVER_BIN "$EXECUTORS" "$BACKEND_LIST"
fi
if [[ $i -ne $(echo $BENCHMARK_MODEL_LIST | wc -w)-1 ]]; then
diff --git a/tests/scripts/nnpkg_test.sh b/tests/scripts/command/nnpkg-test
index cd3e92368..a1176d153 100755..100644
--- a/tests/scripts/nnpkg_test.sh
+++ b/tests/scripts/command/nnpkg-test
@@ -9,12 +9,12 @@ command_exists() {
progname=$(basename "${BASH_SOURCE[0]}")
indir="."
outdir="."
-nnpkg_run=${nnpkg_run:-"Product/out/bin/nnpackage_run"}
+nnpkg_run=${nnpkg_run:-"nnpackage_run"}
difftool=${difftool:-"h5diff"}
delete_dumped_on_failure=0
usage() {
- echo "Usage: $progname [options] nnpackage_test"
+ echo "Usage: $0 $progname [options] nnpackage_test"
echo "Run an nnpackage testcase"
echo ""
echo "Returns"
@@ -29,12 +29,12 @@ usage() {
echo " (dumped file are always deleted on success) (default=$delete_dumped_on_failure)"
echo ""
echo "Environment variables:"
- echo " nnpackage_run path to nnpackage_run (default=Product/out/bin/nnpackage_run)"
+ echo " nnpackage_run path to nnpackage_run (default=nnpackage_run)"
echo " difftool path to i5diff or h5diff (default=h5diff)"
echo ""
echo "Examples:"
- echo " $progname Add_000 => run $indir/Add_000 and check output"
- echo " $progname -i nnpkg-tcs Add_000 => run nnpkg-tcs/Add_000 and check output"
+ echo " $0 $progname Add_000 => run $indir/Add_000 and check output"
+ echo " $0 $progname -i nnpkg-tcs Add_000 => run nnpkg-tcs/Add_000 and check output"
exit 1
}
@@ -61,11 +61,6 @@ if [ $# -ne 1 ]; then
exit 1
fi
-if [ ! -e Product ]; then
- echo "error: please make sure to run this script in nnfw home."
- exit 1
-fi
-
tcname=$(basename "$1")
nnpkg="$indir/$tcname"
@@ -78,6 +73,7 @@ fi
if ! command_exists $nnpkg_run; then
echo "error: runner "$nnpkg_run" does not exist."
+    echo "       if $nnpkg_run exists, please add its directory to PATH"
exit 1
fi
diff --git a/tests/scripts/command/prepare-model b/tests/scripts/command/prepare-model
new file mode 100644
index 000000000..feb658c3c
--- /dev/null
+++ b/tests/scripts/command/prepare-model
@@ -0,0 +1,64 @@
+#!/bin/bash
+#
+# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+COMMAND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INSTALL_DIR="$(dirname $(dirname $COMMAND_DIR))"
+
+MD5_CHECK="on"
+DOWNLOAD_MODEL="all"
+
+function Usage()
+{
+ echo "Usage: $0 $(basename ${BASH_SOURCE[0]}) [OPTIONS]"
+ echo ""
+ echo "Options:"
+  echo "  --ignoremd5                      Ignore MD5 check when downloading model files"
+ echo " --model=(all|nnpackage|tflite) Download test model (default=all)"
+}
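+
+# Example (assuming this command is dispatched through the installed onert-test driver):
+#   onert-test prepare-model --model=tflite --ignoremd5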
+
+for i in "$@"
+do
+ case $i in
+ -h|--help|help)
+ Usage
+ exit 1
+ ;;
+ --ignoremd5)
+ MD5_CHECK="off"
+ ;;
+ --model=*)
+ DOWNLOAD_MODEL=${i#*=}
+ ;;
+ *)
+ echo "Unknown option: $i"
+ exit 1
+ ;;
+ esac
+ shift
+done
+
+if [[ $DOWNLOAD_MODEL == "all" ]] || [[ $DOWNLOAD_MODEL == "tflite" ]]; then
+ # Download tflite models
+ $INSTALL_DIR/test/models/run_test.sh --download=on --run=off --md5=$MD5_CHECK
+fi
+
+if [[ $DOWNLOAD_MODEL == "all" ]] || [[ $DOWNLOAD_MODEL == "nnpackage" ]]; then
+ # Download nnpackage model
+ NNPACKAGE_CONFIG_DIR=$INSTALL_DIR/test/models/nnpackage/
+ NNPACKAGE_CACHE_DIR=$INSTALL_DIR/unittest_standalone/nnfw_api_gtest_models/
+ $INSTALL_DIR/test/models/run_test.sh --download=on --run=off --md5=$MD5_CHECK \
+ --configdir=$NNPACKAGE_CONFIG_DIR --cachedir=$NNPACKAGE_CACHE_DIR
+fi
diff --git a/tests/scripts/unittest.sh b/tests/scripts/command/unittest
index 717779d58..135ebea7b 100755..100644
--- a/tests/scripts/unittest.sh
+++ b/tests/scripts/command/unittest
@@ -14,22 +14,31 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+COMMAND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INSTALL_DIR="$(dirname $(dirname $COMMAND_DIR))"
UNITTEST_REPORT_DIR=
-UNITTEST_TEST_DIR=
+UNITTEST_TEST_DIR=$INSTALL_DIR/unittest
UNITTEST_RESULT=0
UNITTEST_RUN_ALL=""
function Usage()
{
# TODO: Fill this
- echo "Usage: LD_LIBRARY_PATH=Product/out/lib ./$0 --reportdir=report --unittestdir=Product/out/unittest"
+ echo "Usage: $0 $(basename ${BASH_SOURCE[0]}) [OPTIONS]"
+ echo ""
+ echo "Options:"
+ echo " --reportdir=PATH Path to write unittest report"
+  echo "  --unittestdir=PATH  Path to run unittest (default: $UNITTEST_TEST_DIR)"
}
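+
+# Example (assuming dispatch through the installed onert-test driver):
+#   onert-test unittest --reportdir=report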
-get_gtest_option()
+function get_gtest_option()
{
local UNITTEST_REPORT_FILE=$(basename $TEST_BIN)
- local output_option="--gtest_output=xml:$UNITTEST_REPORT_DIR/$UNITTEST_REPORT_FILE.xml"
+ local output_option
local filter_option
+ if [ -n "$UNITTEST_REPORT_DIR" ]; then
+ output_option="--gtest_output=xml:$UNITTEST_REPORT_DIR/$UNITTEST_REPORT_FILE.xml"
+ fi
if [ -r "$TEST_BIN.skip" ]; then
filter_option="--gtest_filter=-$(grep -v '#' "$TEST_BIN.skip" | tr '\n' ':')"
fi
@@ -49,15 +58,15 @@ do
--unittestdir=*)
UNITTEST_TEST_DIR=${i#*=}
;;
- --runall)
- UNITTEST_RUN_ALL="true"
+ *)
+ echo "Unknown option: $i"
+ exit 1
+ ;;
esac
shift
done
-# TODO: handle exceptions for params
-
-if [ ! -e "$UNITTEST_REPORT_DIR" ]; then
+if [ -n "$UNITTEST_REPORT_DIR" ] && [ ! -e "$UNITTEST_REPORT_DIR" ]; then
mkdir -p $UNITTEST_REPORT_DIR
fi
@@ -73,21 +82,9 @@ for TEST_BIN in `find $UNITTEST_TEST_DIR -maxdepth 1 -type f -executable`; do
echo "============================================"
echo "Starting set $num_unittest: $TEST_BIN..."
echo "============================================"
- TEMP_UNITTEST_RESULT=0
- if [ "$UNITTEST_RUN_ALL" == "true" ]; then
- for TEST_LIST_VERBOSE_LINE in $($TEST_BIN --gtest_list_tests); do
- if [[ $TEST_LIST_VERBOSE_LINE == *\. ]]; then
- TEST_LIST_CATEGORY=$TEST_LIST_VERBOSE_LINE
- else
- TEST_LIST_ITEM="$TEST_LIST_CATEGORY""$TEST_LIST_VERBOSE_LINE"
- $TEST_BIN --gtest_filter=$TEST_LIST_ITEM --gtest_output="xml:$UNITTEST_REPORT_DIR/$TEST_LIST_ITEM.xml"
- fi
- done
- else
- $TEST_BIN $(get_gtest_option)
- TEMP_UNITTEST_RESULT=$?
- fi
+ $TEST_BIN $(get_gtest_option)
+ TEMP_UNITTEST_RESULT=$?
if [[ $TEMP_UNITTEST_RESULT -ne 0 ]]; then
UNITTEST_RESULT=$TEMP_UNITTEST_RESULT
diff --git a/tests/scripts/command/verify-tflite b/tests/scripts/command/verify-tflite
new file mode 100644
index 000000000..48863ff12
--- /dev/null
+++ b/tests/scripts/command/verify-tflite
@@ -0,0 +1,106 @@
+#!/bin/bash
+#
+# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+COMMAND_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INSTALL_DIR="$(dirname $(dirname $COMMAND_DIR))"
+
+MD5_CHECK="on"
+TFLITE_LOADER="nnapi"
+REPORT_DIR="report"
+TEST_LIST_FILE=
+
+function Usage()
+{
+ echo "Usage: $0 $(basename ${BASH_SOURCE[0]}) [OPTIONS]"
+ echo ""
+ echo "Options:"
+  echo "  --ignoremd5           Ignore MD5 check when downloading model files"
+ echo " --api=(nnapi|loader) TFLite model file loading API (default=$TFLITE_LOADER)"
+ echo " --reportdir=PATH Path to write report (default=$REPORT_DIR)"
+  echo "  --list=FILE           File listing the models to test; test all models if not given"
+}
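+
+# Example (assuming this command is dispatched through the installed onert-test driver):
+#   onert-test verify-tflite --api=loader --reportdir=report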
+
+for i in "$@"
+do
+ case $i in
+ -h|--help|help)
+ Usage
+ exit 1
+ ;;
+ --ignoremd5)
+ MD5_CHECK="off"
+ ;;
+ --api=*)
+ TFLITE_LOADER=${i#*=}
+ ;;
+ --reportdir=*)
+ REPORT_DIR=${i#*=}
+ ;;
+ --list=*)
+ TEST_LIST_FILE=${i#*=}
+ ;;
+ *)
+ echo "Unknown option: $i"
+ exit 1
+ ;;
+ esac
+ shift
+done
+
+if [ ! -z "$TEST_LIST_FILE" ]; then
+ MODELLIST=$(cat "${TEST_LIST_FILE}")
+fi
+
+if [ ! -e "$REPORT_DIR" ]; then
+ mkdir -p $REPORT_DIR
+fi
+
+TEST_RESULT=0
+TAP_NAME=verification_test.tap
+TEST_NAME="Verification"
+TEST_DRIVER=
+
+if [[ $TFLITE_LOADER == "nnapi" ]]; then
+ TEST_NAME="NNAPI Verification"
+ TEST_DRIVER=nnapi_test
+elif [[ $TFLITE_LOADER == "loader" ]]; then
+ TEST_NAME="Loader Verification"
+ TEST_DRIVER=tflite_loader_test_tool
+else
+ Usage
+ exit 1
+fi
+
+$INSTALL_DIR/test/models/run_test.sh --driverbin=$TEST_DRIVER \
+ --reportdir=$REPORT_DIR \
+ --tapname=$TAP_NAME \
+ ${MODELLIST:-} > $REPORT_DIR/verification_test.log 2>&1
+TEST_RESULT=$?
+
+if [[ $TEST_RESULT -ne 0 ]]; then
+ echo ""
+ cat $REPORT_DIR/$TAP_NAME
+ echo ""
+ echo "$TEST_NAME failed... exit code: $TEST_RESULT"
+ echo "============================================"
+ echo ""
+ exit $TEST_RESULT
+fi
+
+echo ""
+cat $REPORT_DIR/$TAP_NAME
+echo "============================================"
+echo ""
diff --git a/tests/scripts/common.sh b/tests/scripts/common.sh
index 88002909c..87aec86b3 100755
--- a/tests/scripts/common.sh
+++ b/tests/scripts/common.sh
@@ -18,13 +18,12 @@ MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
function get_result_of_benchmark_test()
{
- local RUN_TEST_SH=$1
- local DRIVER_BIN=$2
- local MODEL=$3
- local LOG_FILE=$4
+ local DRIVER_BIN=$1
+ local MODEL=$2
+ local LOG_FILE=$3
local RET=0
- $RUN_TEST_SH --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
+ $MY_PATH/models/run_test.sh --driverbin="$DRIVER_BIN -r 5 -w 3" $MODEL > $LOG_FILE 2>&1
RET=$?
if [[ $RET -ne 0 ]]; then
echo "Testing $MODEL aborted... exit code: $RET"
@@ -68,7 +67,7 @@ function run_benchmark_and_print()
LOG_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.txt
RESULT_FILE=$REPORT_MODEL_DIR/$WRITE_FILE_NAME.result
print_with_dots $MSG
- RESULT=$(get_result_of_benchmark_test $BENCHMARK_RUN_TEST_SH $DRIVER_BIN $MODEL $LOG_FILE)
+ RESULT=$(get_result_of_benchmark_test $DRIVER_BIN $MODEL $LOG_FILE)
echo "$RESULT ms"
print_result_of_benchmark_test "$MSG" "$RESULT" $RESULT_FILE
sleep $PAUSE_TIME_IN_SEC
diff --git a/tests/scripts/list/frameworktest_list.aarch64.acl_cl.txt b/tests/scripts/list/frameworktest_list.aarch64.acl_cl.txt
index fc7e71ab8..dd8d3b710 100644
--- a/tests/scripts/list/frameworktest_list.aarch64.acl_cl.txt
+++ b/tests/scripts/list/frameworktest_list.aarch64.acl_cl.txt
@@ -27,7 +27,7 @@ pack
pad
reduce_max
reduce_mean
-reduce_sum
+reduce_sum/float
relu
relu6
reshape
diff --git a/tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt b/tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt
index fc7e71ab8..dd8d3b710 100644
--- a/tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt
+++ b/tests/scripts/list/frameworktest_list.armv7l.acl_cl.txt
@@ -27,7 +27,7 @@ pack
pad
reduce_max
reduce_mean
-reduce_sum
+reduce_sum/float
relu
relu6
reshape
diff --git a/tests/scripts/list/tflite_loader_list.aarch64.txt b/tests/scripts/list/tflite_loader_list.aarch64.txt
index aca8f16ee..e04d89d3f 100644
--- a/tests/scripts/list/tflite_loader_list.aarch64.txt
+++ b/tests/scripts/list/tflite_loader_list.aarch64.txt
@@ -18,7 +18,7 @@ mul
pack
pad
reduce_max
-reduce_sum
+reduce_sum/float
relu
relu6
reshape/reshape1
diff --git a/tests/scripts/list/tflite_loader_list.armv7l.txt b/tests/scripts/list/tflite_loader_list.armv7l.txt
index aca8f16ee..e04d89d3f 100644
--- a/tests/scripts/list/tflite_loader_list.armv7l.txt
+++ b/tests/scripts/list/tflite_loader_list.armv7l.txt
@@ -18,7 +18,7 @@ mul
pack
pad
reduce_max
-reduce_sum
+reduce_sum/float
relu
relu6
reshape/reshape1
diff --git a/tests/scripts/framework/tests/MODELS/inception_module/config.sh b/tests/scripts/models/config/MODELS/inception_module/config.sh
index 3f6eae9ee..3f6eae9ee 100755
--- a/tests/scripts/framework/tests/MODELS/inception_module/config.sh
+++ b/tests/scripts/models/config/MODELS/inception_module/config.sh
diff --git a/tests/scripts/framework/tests/MODELS/inception_nonslim/config.sh b/tests/scripts/models/config/MODELS/inception_nonslim/config.sh
index 39f5d772d..39f5d772d 100755
--- a/tests/scripts/framework/tests/MODELS/inception_nonslim/config.sh
+++ b/tests/scripts/models/config/MODELS/inception_nonslim/config.sh
diff --git a/tests/scripts/framework/tests/MODELS/inception_slim/config.sh b/tests/scripts/models/config/MODELS/inception_slim/config.sh
index 1c0cf3ef5..1c0cf3ef5 100755
--- a/tests/scripts/framework/tests/MODELS/inception_slim/config.sh
+++ b/tests/scripts/models/config/MODELS/inception_slim/config.sh
diff --git a/tests/scripts/framework/tests/MODELS/mobilenet/config.sh b/tests/scripts/models/config/MODELS/mobilenet/config.sh
index b23d687cd..b23d687cd 100755
--- a/tests/scripts/framework/tests/MODELS/mobilenet/config.sh
+++ b/tests/scripts/models/config/MODELS/mobilenet/config.sh
diff --git a/tests/scripts/framework/tests/MODELS/mobilenet_quant8/config.sh b/tests/scripts/models/config/MODELS/mobilenet_quant8/config.sh
index 2e304df92..2e304df92 100755
--- a/tests/scripts/framework/tests/MODELS/mobilenet_quant8/config.sh
+++ b/tests/scripts/models/config/MODELS/mobilenet_quant8/config.sh
diff --git a/tests/scripts/framework/tests/abs/config.sh b/tests/scripts/models/config/abs/config.sh
index 7acdefa5a..7acdefa5a 100755
--- a/tests/scripts/framework/tests/abs/config.sh
+++ b/tests/scripts/models/config/abs/config.sh
diff --git a/tests/scripts/framework/tests/add/1D/config.sh b/tests/scripts/models/config/add/1D/config.sh
index ca6fafe26..ca6fafe26 100755
--- a/tests/scripts/framework/tests/add/1D/config.sh
+++ b/tests/scripts/models/config/add/1D/config.sh
diff --git a/tests/scripts/framework/tests/add/4D/config.sh b/tests/scripts/models/config/add/4D/config.sh
index d9e93d17b..d9e93d17b 100755
--- a/tests/scripts/framework/tests/add/4D/config.sh
+++ b/tests/scripts/models/config/add/4D/config.sh
diff --git a/tests/scripts/framework/tests/average_pool_2d/aligned/config.sh b/tests/scripts/models/config/average_pool_2d/aligned/config.sh
index cdefb077e..cdefb077e 100755
--- a/tests/scripts/framework/tests/average_pool_2d/aligned/config.sh
+++ b/tests/scripts/models/config/average_pool_2d/aligned/config.sh
diff --git a/tests/scripts/framework/tests/average_pool_2d/avgpool1/config.sh b/tests/scripts/models/config/average_pool_2d/avgpool1/config.sh
index 7e0130088..7e0130088 100755
--- a/tests/scripts/framework/tests/average_pool_2d/avgpool1/config.sh
+++ b/tests/scripts/models/config/average_pool_2d/avgpool1/config.sh
diff --git a/tests/scripts/framework/tests/average_pool_2d/avgpool2/config.sh b/tests/scripts/models/config/average_pool_2d/avgpool2/config.sh
index 1ef0b17ca..1ef0b17ca 100755
--- a/tests/scripts/framework/tests/average_pool_2d/avgpool2/config.sh
+++ b/tests/scripts/models/config/average_pool_2d/avgpool2/config.sh
diff --git a/tests/scripts/framework/tests/batch_to_space_nd2/config.sh b/tests/scripts/models/config/batch_to_space_nd2/config.sh
index 1dbcb7681..1dbcb7681 100755
--- a/tests/scripts/framework/tests/batch_to_space_nd2/config.sh
+++ b/tests/scripts/models/config/batch_to_space_nd2/config.sh
diff --git a/tests/scripts/framework/tests/cast/config.sh b/tests/scripts/models/config/cast/config.sh
index 0e123e3be..0e123e3be 100755
--- a/tests/scripts/framework/tests/cast/config.sh
+++ b/tests/scripts/models/config/cast/config.sh
diff --git a/tests/scripts/framework/tests/concat/2D/config.sh b/tests/scripts/models/config/concat/2D/config.sh
index fd22e708c..fd22e708c 100755
--- a/tests/scripts/framework/tests/concat/2D/config.sh
+++ b/tests/scripts/models/config/concat/2D/config.sh
diff --git a/tests/scripts/framework/tests/concat/concat1/config.sh b/tests/scripts/models/config/concat/concat1/config.sh
index 4543b163a..4543b163a 100755
--- a/tests/scripts/framework/tests/concat/concat1/config.sh
+++ b/tests/scripts/models/config/concat/concat1/config.sh
diff --git a/tests/scripts/framework/tests/concat/concat2/config.sh b/tests/scripts/models/config/concat/concat2/config.sh
index f4404e471..f4404e471 100755
--- a/tests/scripts/framework/tests/concat/concat2/config.sh
+++ b/tests/scripts/models/config/concat/concat2/config.sh
diff --git a/tests/scripts/framework/tests/conv_2d/convolution1/config.sh b/tests/scripts/models/config/conv_2d/convolution1/config.sh
index 46a205fdb..46a205fdb 100755
--- a/tests/scripts/framework/tests/conv_2d/convolution1/config.sh
+++ b/tests/scripts/models/config/conv_2d/convolution1/config.sh
diff --git a/tests/scripts/framework/tests/conv_2d/convolution2/config.sh b/tests/scripts/models/config/conv_2d/convolution2/config.sh
index eca5b3374..eca5b3374 100755
--- a/tests/scripts/framework/tests/conv_2d/convolution2/config.sh
+++ b/tests/scripts/models/config/conv_2d/convolution2/config.sh
diff --git a/tests/scripts/framework/tests/custom/squared_difference/config.sh b/tests/scripts/models/config/custom/squared_difference/config.sh
index 745a84447..745a84447 100755
--- a/tests/scripts/framework/tests/custom/squared_difference/config.sh
+++ b/tests/scripts/models/config/custom/squared_difference/config.sh
diff --git a/tests/scripts/framework/tests/depthwise_conv_2d/depthconv1/config.sh b/tests/scripts/models/config/depthwise_conv_2d/depthconv1/config.sh
index 39aa4a45f..39aa4a45f 100755
--- a/tests/scripts/framework/tests/depthwise_conv_2d/depthconv1/config.sh
+++ b/tests/scripts/models/config/depthwise_conv_2d/depthconv1/config.sh
diff --git a/tests/scripts/framework/tests/depthwise_conv_2d/depthconv2/config.sh b/tests/scripts/models/config/depthwise_conv_2d/depthconv2/config.sh
index 3dd7c50b3..3dd7c50b3 100755
--- a/tests/scripts/framework/tests/depthwise_conv_2d/depthconv2/config.sh
+++ b/tests/scripts/models/config/depthwise_conv_2d/depthconv2/config.sh
diff --git a/tests/scripts/framework/tests/depthwise_conv_2d_no_fuse/config.sh b/tests/scripts/models/config/depthwise_conv_2d_no_fuse/config.sh
index 13fb264f0..13fb264f0 100755
--- a/tests/scripts/framework/tests/depthwise_conv_2d_no_fuse/config.sh
+++ b/tests/scripts/models/config/depthwise_conv_2d_no_fuse/config.sh
diff --git a/tests/scripts/framework/tests/div/broadcast/config.sh b/tests/scripts/models/config/div/broadcast/config.sh
index 7c5e985fa..7c5e985fa 100755
--- a/tests/scripts/framework/tests/div/broadcast/config.sh
+++ b/tests/scripts/models/config/div/broadcast/config.sh
diff --git a/tests/scripts/framework/tests/embedding_lookup/config.sh b/tests/scripts/models/config/embedding_lookup/config.sh
index 5e5e4ad92..5e5e4ad92 100755
--- a/tests/scripts/framework/tests/embedding_lookup/config.sh
+++ b/tests/scripts/models/config/embedding_lookup/config.sh
diff --git a/tests/scripts/framework/tests/equal/config.sh b/tests/scripts/models/config/equal/config.sh
index a43fd73f7..a43fd73f7 100755
--- a/tests/scripts/framework/tests/equal/config.sh
+++ b/tests/scripts/models/config/equal/config.sh
diff --git a/tests/scripts/framework/tests/exp/config.sh b/tests/scripts/models/config/exp/config.sh
index 944f0bbce..944f0bbce 100755
--- a/tests/scripts/framework/tests/exp/config.sh
+++ b/tests/scripts/models/config/exp/config.sh
diff --git a/tests/scripts/framework/tests/floor/floor1/config.sh b/tests/scripts/models/config/floor/floor1/config.sh
index 4952e4a54..4952e4a54 100755
--- a/tests/scripts/framework/tests/floor/floor1/config.sh
+++ b/tests/scripts/models/config/floor/floor1/config.sh
diff --git a/tests/scripts/framework/tests/floor/floor2/config.sh b/tests/scripts/models/config/floor/floor2/config.sh
index 24581dc33..24581dc33 100755
--- a/tests/scripts/framework/tests/floor/floor2/config.sh
+++ b/tests/scripts/models/config/floor/floor2/config.sh
diff --git a/tests/scripts/framework/tests/fullyconnected/fc1/config.sh b/tests/scripts/models/config/fullyconnected/fc1/config.sh
index 013361583..013361583 100755
--- a/tests/scripts/framework/tests/fullyconnected/fc1/config.sh
+++ b/tests/scripts/models/config/fullyconnected/fc1/config.sh
diff --git a/tests/scripts/framework/tests/fullyconnected/hybrid/config.sh b/tests/scripts/models/config/fullyconnected/hybrid/config.sh
index b2d8ffe86..b2d8ffe86 100755
--- a/tests/scripts/framework/tests/fullyconnected/hybrid/config.sh
+++ b/tests/scripts/models/config/fullyconnected/hybrid/config.sh
diff --git a/tests/scripts/framework/tests/fullyconnected/matmul2x2/config.sh b/tests/scripts/models/config/fullyconnected/matmul2x2/config.sh
index 91fd2ffce..91fd2ffce 100755
--- a/tests/scripts/framework/tests/fullyconnected/matmul2x2/config.sh
+++ b/tests/scripts/models/config/fullyconnected/matmul2x2/config.sh
diff --git a/tests/scripts/framework/tests/fullyconnected/weights_as_input/config.sh b/tests/scripts/models/config/fullyconnected/weights_as_input/config.sh
index 1c218c5f4..1c218c5f4 100755
--- a/tests/scripts/framework/tests/fullyconnected/weights_as_input/config.sh
+++ b/tests/scripts/models/config/fullyconnected/weights_as_input/config.sh
diff --git a/tests/scripts/framework/tests/gather/config.sh b/tests/scripts/models/config/gather/config.sh
index 0f100a823..0f100a823 100755
--- a/tests/scripts/framework/tests/gather/config.sh
+++ b/tests/scripts/models/config/gather/config.sh
diff --git a/tests/scripts/framework/tests/greater/config.sh b/tests/scripts/models/config/greater/config.sh
index aba3d4a3f..aba3d4a3f 100755
--- a/tests/scripts/framework/tests/greater/config.sh
+++ b/tests/scripts/models/config/greater/config.sh
diff --git a/tests/scripts/framework/tests/greater_equal/config.sh b/tests/scripts/models/config/greater_equal/config.sh
index 72beaa81f..72beaa81f 100755
--- a/tests/scripts/framework/tests/greater_equal/config.sh
+++ b/tests/scripts/models/config/greater_equal/config.sh
diff --git a/tests/scripts/framework/tests/hashtable_lookup/config.sh b/tests/scripts/models/config/hashtable_lookup/config.sh
index 3222ee4d2..3222ee4d2 100755
--- a/tests/scripts/framework/tests/hashtable_lookup/config.sh
+++ b/tests/scripts/models/config/hashtable_lookup/config.sh
diff --git a/tests/scripts/framework/tests/l2_normalization/config.sh b/tests/scripts/models/config/l2_normalization/config.sh
index 47801240f..47801240f 100755
--- a/tests/scripts/framework/tests/l2_normalization/config.sh
+++ b/tests/scripts/models/config/l2_normalization/config.sh
diff --git a/tests/scripts/framework/tests/l2_pool_2d/config.sh b/tests/scripts/models/config/l2_pool_2d/config.sh
index a77aa66cf..a77aa66cf 100755
--- a/tests/scripts/framework/tests/l2_pool_2d/config.sh
+++ b/tests/scripts/models/config/l2_pool_2d/config.sh
diff --git a/tests/scripts/framework/tests/less/config.sh b/tests/scripts/models/config/less/config.sh
index 7488dde0f..7488dde0f 100755
--- a/tests/scripts/framework/tests/less/config.sh
+++ b/tests/scripts/models/config/less/config.sh
diff --git a/tests/scripts/framework/tests/less_equal/config.sh b/tests/scripts/models/config/less_equal/config.sh
index 2b53700f6..2b53700f6 100755
--- a/tests/scripts/framework/tests/less_equal/config.sh
+++ b/tests/scripts/models/config/less_equal/config.sh
diff --git a/tests/scripts/framework/tests/logistic/config.sh b/tests/scripts/models/config/logistic/config.sh
index 456773aa9..456773aa9 100755
--- a/tests/scripts/framework/tests/logistic/config.sh
+++ b/tests/scripts/models/config/logistic/config.sh
diff --git a/tests/scripts/framework/tests/max/config.sh b/tests/scripts/models/config/max/config.sh
index 479ca7fd0..479ca7fd0 100755
--- a/tests/scripts/framework/tests/max/config.sh
+++ b/tests/scripts/models/config/max/config.sh
diff --git a/tests/scripts/framework/tests/max_pool_2d/maxpool1/config.sh b/tests/scripts/models/config/max_pool_2d/maxpool1/config.sh
index 19a602eb5..19a602eb5 100755
--- a/tests/scripts/framework/tests/max_pool_2d/maxpool1/config.sh
+++ b/tests/scripts/models/config/max_pool_2d/maxpool1/config.sh
diff --git a/tests/scripts/framework/tests/max_pool_2d/maxpool2/config.sh b/tests/scripts/models/config/max_pool_2d/maxpool2/config.sh
index dc71599da..dc71599da 100755
--- a/tests/scripts/framework/tests/max_pool_2d/maxpool2/config.sh
+++ b/tests/scripts/models/config/max_pool_2d/maxpool2/config.sh
diff --git a/tests/scripts/framework/tests/mean/config.sh b/tests/scripts/models/config/mean/config.sh
index 0853a87fc..0853a87fc 100755
--- a/tests/scripts/framework/tests/mean/config.sh
+++ b/tests/scripts/models/config/mean/config.sh
diff --git a/tests/scripts/framework/tests/min/config.sh b/tests/scripts/models/config/min/config.sh
index 8148471a5..8148471a5 100755
--- a/tests/scripts/framework/tests/min/config.sh
+++ b/tests/scripts/models/config/min/config.sh
diff --git a/tests/scripts/framework/tests/mul/broadcast/config.sh b/tests/scripts/models/config/mul/broadcast/config.sh
index 5522ac877..5522ac877 100755
--- a/tests/scripts/framework/tests/mul/broadcast/config.sh
+++ b/tests/scripts/models/config/mul/broadcast/config.sh
diff --git a/tests/scripts/framework/tests/neg/config.sh b/tests/scripts/models/config/neg/config.sh
index 000f7c811..000f7c811 100755
--- a/tests/scripts/framework/tests/neg/config.sh
+++ b/tests/scripts/models/config/neg/config.sh
diff --git a/tests/scripts/framework/tests/not_equal/config.sh b/tests/scripts/models/config/not_equal/config.sh
index e2234197e..e2234197e 100755
--- a/tests/scripts/framework/tests/not_equal/config.sh
+++ b/tests/scripts/models/config/not_equal/config.sh
diff --git a/tests/scripts/framework/tests/one_hot/config.sh b/tests/scripts/models/config/one_hot/config.sh
index 7e3823486..7e3823486 100755
--- a/tests/scripts/framework/tests/one_hot/config.sh
+++ b/tests/scripts/models/config/one_hot/config.sh
diff --git a/tests/scripts/framework/tests/pack/config.sh b/tests/scripts/models/config/pack/config.sh
index 162ec9d9d..162ec9d9d 100755
--- a/tests/scripts/framework/tests/pack/config.sh
+++ b/tests/scripts/models/config/pack/config.sh
diff --git a/tests/scripts/framework/tests/pad/4D_2D/config.sh b/tests/scripts/models/config/pad/4D_2D/config.sh
index 9e0de2244..9e0de2244 100755
--- a/tests/scripts/framework/tests/pad/4D_2D/config.sh
+++ b/tests/scripts/models/config/pad/4D_2D/config.sh
diff --git a/tests/scripts/framework/tests/pad/pad1/config.sh b/tests/scripts/models/config/pad/pad1/config.sh
index 088cd8962..088cd8962 100755
--- a/tests/scripts/framework/tests/pad/pad1/config.sh
+++ b/tests/scripts/models/config/pad/pad1/config.sh
diff --git a/tests/scripts/framework/tests/pad/pad2/config.sh b/tests/scripts/models/config/pad/pad2/config.sh
index 1683f5350..1683f5350 100755
--- a/tests/scripts/framework/tests/pad/pad2/config.sh
+++ b/tests/scripts/models/config/pad/pad2/config.sh
diff --git a/tests/scripts/framework/tests/reduce_max/config.sh b/tests/scripts/models/config/reduce_max/config.sh
index d636b8bd3..d636b8bd3 100755
--- a/tests/scripts/framework/tests/reduce_max/config.sh
+++ b/tests/scripts/models/config/reduce_max/config.sh
diff --git a/tests/scripts/framework/tests/reduce_mean/test1/config.sh b/tests/scripts/models/config/reduce_mean/test1/config.sh
index 2f370ea4e..2f370ea4e 100755
--- a/tests/scripts/framework/tests/reduce_mean/test1/config.sh
+++ b/tests/scripts/models/config/reduce_mean/test1/config.sh
diff --git a/tests/scripts/framework/tests/reduce_mean/test2/config.sh b/tests/scripts/models/config/reduce_mean/test2/config.sh
index 6c54779a9..6c54779a9 100755
--- a/tests/scripts/framework/tests/reduce_mean/test2/config.sh
+++ b/tests/scripts/models/config/reduce_mean/test2/config.sh
diff --git a/tests/scripts/framework/tests/reduce_sum/config.sh b/tests/scripts/models/config/reduce_sum/float/config.sh
index 31b185397..31b185397 100755
--- a/tests/scripts/framework/tests/reduce_sum/config.sh
+++ b/tests/scripts/models/config/reduce_sum/float/config.sh
diff --git a/tests/scripts/models/config/reduce_sum/uint8/config.sh b/tests/scripts/models/config/reduce_sum/uint8/config.sh
new file mode 100755
index 000000000..d7d9f73f6
--- /dev/null
+++ b/tests/scripts/models/config/reduce_sum/uint8/config.sh
@@ -0,0 +1 @@
+MODELFILE_NAME="reduce_sum_uint8.tflite"
diff --git a/tests/scripts/framework/tests/relu/config.sh b/tests/scripts/models/config/relu/config.sh
index bca59ef04..bca59ef04 100755
--- a/tests/scripts/framework/tests/relu/config.sh
+++ b/tests/scripts/models/config/relu/config.sh
diff --git a/tests/scripts/framework/tests/relu6/config.sh b/tests/scripts/models/config/relu6/config.sh
index 662cc4f33..662cc4f33 100755
--- a/tests/scripts/framework/tests/relu6/config.sh
+++ b/tests/scripts/models/config/relu6/config.sh
diff --git a/tests/scripts/framework/tests/reshape/3D/config.sh b/tests/scripts/models/config/reshape/3D/config.sh
index 3f7ec31ea..3f7ec31ea 100755
--- a/tests/scripts/framework/tests/reshape/3D/config.sh
+++ b/tests/scripts/models/config/reshape/3D/config.sh
diff --git a/tests/scripts/framework/tests/reshape/reshape1/config.sh b/tests/scripts/models/config/reshape/reshape1/config.sh
index 7bdef06ba..7bdef06ba 100755
--- a/tests/scripts/framework/tests/reshape/reshape1/config.sh
+++ b/tests/scripts/models/config/reshape/reshape1/config.sh
diff --git a/tests/scripts/framework/tests/reshape/reshape2/config.sh b/tests/scripts/models/config/reshape/reshape2/config.sh
index b040f0081..b040f0081 100755
--- a/tests/scripts/framework/tests/reshape/reshape2/config.sh
+++ b/tests/scripts/models/config/reshape/reshape2/config.sh
diff --git a/tests/scripts/framework/tests/resize_bilinear/config.sh b/tests/scripts/models/config/resize_bilinear/config.sh
index 8f612cf6d..8f612cf6d 100755
--- a/tests/scripts/framework/tests/resize_bilinear/config.sh
+++ b/tests/scripts/models/config/resize_bilinear/config.sh
diff --git a/tests/scripts/framework/tests/rnn/config.sh b/tests/scripts/models/config/rnn/config.sh
index 997d6c138..997d6c138 100755
--- a/tests/scripts/framework/tests/rnn/config.sh
+++ b/tests/scripts/models/config/rnn/config.sh
diff --git a/tests/scripts/framework/tests/rsqrt/config.sh b/tests/scripts/models/config/rsqrt/config.sh
index 87aa85277..87aa85277 100755
--- a/tests/scripts/framework/tests/rsqrt/config.sh
+++ b/tests/scripts/models/config/rsqrt/config.sh
diff --git a/tests/scripts/framework/tests/select/config.sh b/tests/scripts/models/config/select/config.sh
index 95e49e0dc..95e49e0dc 100755
--- a/tests/scripts/framework/tests/select/config.sh
+++ b/tests/scripts/models/config/select/config.sh
diff --git a/tests/scripts/framework/tests/shape/config.sh b/tests/scripts/models/config/shape/config.sh
index 468f38687..468f38687 100644
--- a/tests/scripts/framework/tests/shape/config.sh
+++ b/tests/scripts/models/config/shape/config.sh
diff --git a/tests/scripts/framework/tests/sin/config.sh b/tests/scripts/models/config/sin/config.sh
index dcf1959d8..dcf1959d8 100755
--- a/tests/scripts/framework/tests/sin/config.sh
+++ b/tests/scripts/models/config/sin/config.sh
diff --git a/tests/scripts/framework/tests/slice/config.sh b/tests/scripts/models/config/slice/config.sh
index 12d06e977..12d06e977 100755
--- a/tests/scripts/framework/tests/slice/config.sh
+++ b/tests/scripts/models/config/slice/config.sh
diff --git a/tests/scripts/framework/tests/softmax/config.sh b/tests/scripts/models/config/softmax/config.sh
index fa6300d7e..fa6300d7e 100755
--- a/tests/scripts/framework/tests/softmax/config.sh
+++ b/tests/scripts/models/config/softmax/config.sh
diff --git a/tests/scripts/framework/tests/space_to_batch_nd2/config.sh b/tests/scripts/models/config/space_to_batch_nd2/config.sh
index 81933709e..81933709e 100755
--- a/tests/scripts/framework/tests/space_to_batch_nd2/config.sh
+++ b/tests/scripts/models/config/space_to_batch_nd2/config.sh
diff --git a/tests/scripts/framework/tests/space_to_depth/config.sh b/tests/scripts/models/config/space_to_depth/config.sh
index ed103b826..ed103b826 100755
--- a/tests/scripts/framework/tests/space_to_depth/config.sh
+++ b/tests/scripts/models/config/space_to_depth/config.sh
diff --git a/tests/scripts/framework/tests/sqrt/config.sh b/tests/scripts/models/config/sqrt/config.sh
index 220147238..220147238 100755
--- a/tests/scripts/framework/tests/sqrt/config.sh
+++ b/tests/scripts/models/config/sqrt/config.sh
diff --git a/tests/scripts/framework/tests/squeeze/config.sh b/tests/scripts/models/config/squeeze/config.sh
index 5bcc67716..5bcc67716 100755
--- a/tests/scripts/framework/tests/squeeze/config.sh
+++ b/tests/scripts/models/config/squeeze/config.sh
diff --git a/tests/scripts/framework/tests/strided_slice/config.sh b/tests/scripts/models/config/strided_slice/config.sh
index 4c41a1a39..4c41a1a39 100755
--- a/tests/scripts/framework/tests/strided_slice/config.sh
+++ b/tests/scripts/models/config/strided_slice/config.sh
diff --git a/tests/scripts/framework/tests/sub/broadcast/config.sh b/tests/scripts/models/config/sub/broadcast/config.sh
index 2b1add0e5..2b1add0e5 100755
--- a/tests/scripts/framework/tests/sub/broadcast/config.sh
+++ b/tests/scripts/models/config/sub/broadcast/config.sh
diff --git a/tests/scripts/framework/tests/tanh/config.sh b/tests/scripts/models/config/tanh/config.sh
index a9dde4923..a9dde4923 100755
--- a/tests/scripts/framework/tests/tanh/config.sh
+++ b/tests/scripts/models/config/tanh/config.sh
diff --git a/tests/scripts/framework/tests/tile/config.sh b/tests/scripts/models/config/tile/config.sh
index 33fda3e1a..33fda3e1a 100644
--- a/tests/scripts/framework/tests/tile/config.sh
+++ b/tests/scripts/models/config/tile/config.sh
diff --git a/tests/scripts/framework/tests/topk_v2/config.sh b/tests/scripts/models/config/topk_v2/config.sh
index 1a460266f..1a460266f 100755
--- a/tests/scripts/framework/tests/topk_v2/config.sh
+++ b/tests/scripts/models/config/topk_v2/config.sh
diff --git a/tests/scripts/framework/tests/transpose/config.sh b/tests/scripts/models/config/transpose/config.sh
index 9adb85e70..9adb85e70 100755
--- a/tests/scripts/framework/tests/transpose/config.sh
+++ b/tests/scripts/models/config/transpose/config.sh
diff --git a/tests/scripts/framework/tests/transpose_conv/same/config.sh b/tests/scripts/models/config/transpose_conv/same/config.sh
index 2cca86e03..2cca86e03 100755
--- a/tests/scripts/framework/tests/transpose_conv/same/config.sh
+++ b/tests/scripts/models/config/transpose_conv/same/config.sh
diff --git a/tests/scripts/framework/tests/transpose_conv/valid/config.sh b/tests/scripts/models/config/transpose_conv/valid/config.sh
index d162331a3..d162331a3 100755
--- a/tests/scripts/framework/tests/transpose_conv/valid/config.sh
+++ b/tests/scripts/models/config/transpose_conv/valid/config.sh
diff --git a/tests/scripts/framework/tests/zeros_like/config.sh b/tests/scripts/models/config/zeros_like/config.sh
index cadeeb961..cadeeb961 100755
--- a/tests/scripts/framework/tests/zeros_like/config.sh
+++ b/tests/scripts/models/config/zeros_like/config.sh
diff --git a/tests/scripts/framework/run_test.sh b/tests/scripts/models/run_test.sh
index 44b714974..0aa363f49 100755
--- a/tests/scripts/framework/run_test.sh
+++ b/tests/scripts/models/run_test.sh
@@ -18,20 +18,28 @@
MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
NNFW_HOME="$(dirname $(dirname $(dirname ${MY_PATH})))"
CACHE_ROOT_PATH=$MY_PATH/"cache"
-TEST_ROOT_PATH=$MY_PATH/"tests"
+TEST_ROOT_PATH=$MY_PATH/"config"
REPORT_DIR="report"
RUN_DISABLED="true"
+function command_exists() {
+ command -v "$@" > /dev/null 2>&1
+}
+
function Usage()
{
echo "Usage: ./$0 --driverbin={such as tflite_run} {tests to test or empty for all of tests}"
echo "Usage: ./$0 --driverbin=Product/out/bin/tflite_run --reportdir=report --tapname=verification.tap avgpool1 avgpool2"
echo ""
- echo "--download - (default=off) Download model files. Other options is ignored"
- echo "--driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests"
- echo "--reportdir - (default=report) directory to place tap files"
- echo "--tapname - (default=framework_test.tap) file name to be written for tap"
+ echo "--download - (default=on) Download model files"
+ echo "--run - (default=on) Test model files"
+    echo "--driverbin - (default=../../Product/out/bin/tflite_run) Runner for running model tests"
+ echo "--reportdir - (default=report) Directory to place tap files"
+ echo "--tapname - (default=framework_test.tap) File name to be written for tap"
+    echo "--md5 - (default=on) Check MD5 when downloading model files"
+    echo "--configdir - (default=$TEST_ROOT_PATH) Directory containing model configs to download and test"
+    echo "--cachedir - (default=$CACHE_ROOT_PATH) Directory to cache downloaded models"
echo ""
}
@@ -43,9 +51,13 @@ function need_download()
return 0;
fi
# Ignore checking md5 in cache
+ # TODO Use "--md5" option only and remove IGNORE_MD5 environment variable
if [ ! -z $IGNORE_MD5 ] && [ "$IGNORE_MD5" == "1" ]; then
return 1
fi
+ if [ "$MD5_CHECK" = "off" ]; then
+ return 1
+ fi
LOCAL_HASH=$(md5sum $LOCAL_PATH | awk '{ print $1 }')
REMOTE_HASH=$(curl -ss $REMOTE_URL | md5sum | awk '{ print $1 }')
@@ -60,7 +72,9 @@ function need_download()
DRIVER_BIN=""
TAP_NAME="framework_test.tap"
TEST_LIST=()
-DOWNLOAD_MODE="off"
+DOWNLOAD_MODEL="on"
+RUN_TEST="on"
+MD5_CHECK="on"
# Support environment variable setting for mirror server
FIXED_MODELFILE_SERVER="${MODELFILE_SERVER:-}"
@@ -84,6 +98,18 @@ do
--download=*)
DOWNLOAD_MODE=${i#*=}
;;
+ --md5=*)
+ MD5_CHECK=${i#*=}
+ ;;
+ --run=*)
+ RUN_TEST=${i#*=}
+ ;;
+ --configdir=*)
+ TEST_ROOT_PATH=${i#*=}
+ ;;
+ --cachedir=*)
+ CACHE_ROOT_PATH=${i#*=}
+ ;;
*)
TEST_LIST+=( $i )
;;
@@ -99,8 +125,13 @@ if [ ! -n "$DRIVER_BIN" ]; then
DRIVER_BIN="$NNFW_HOME/Product/out/bin/tflite_run"
fi
+if [ ! -d "$TEST_ROOT_PATH" ]; then
+ echo "Cannot find config directory for test: please set proper configdir"
+ exit 1
+fi
+
# Check test driver setting
-if [ ! -e $DRIVER_BIN ] && [ "$DOWNLOAD_MODE" != "on" ]; then
+if ! command_exists $DRIVER_BIN && [ "$RUN_TEST" = "on" ]; then
echo "Cannot find test driver" $DRIVER_BIN ": please set proper DRIVER_BIN"
exit 1
fi
@@ -139,33 +170,9 @@ run_tests()
TEST_CACHE_PATH=$CACHE_ROOT_PATH/$TEST_NAME
MODELFILE=$TEST_CACHE_PATH/$MODELFILE_NAME
- MODELFILE_URL="$MODELFILE_SERVER_PATH/$MODELFILE_NAME"
- if [ -n "$FIXED_MODELFILE_SERVER" ]; then
- MODELFILE_URL="$FIXED_MODELFILE_SERVER/$MODELFILE_NAME"
- fi
-
- # Download model file
- if [ ! -e $TEST_CACHE_PATH ]; then
- mkdir -p $TEST_CACHE_PATH
- fi
-
- # Download unless we have it in cache (Also check md5sum)
- if need_download "$MODELFILE" "$MODELFILE_URL"; then
- echo ""
- echo "Download test file for $TEST_NAME"
- echo "======================"
-
- rm -f $MODELFILE # Remove invalid file if exists
- pushd $TEST_CACHE_PATH
- wget -nv $MODELFILE_URL
- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then
- unzip -o $MODELFILE_NAME
- fi
- popd
- fi
# Find model file for downloaded by zip
- if [ "${MODELFILE_NAME##*.}" == "zip" ]; then
+ if [ "${MODELFILE_NAME##*.}" = "zip" ]; then
pushd $TEST_CACHE_PATH
MODELFILE=$TEST_CACHE_PATH/$(ls *.tflite)
popd
@@ -178,7 +185,6 @@ run_tests()
# Run driver to test framework
$DRIVER_BIN $MODELFILE
- #$DRIVER_BIN $MODELFILE
if [[ $? -eq 0 ]]; then
echo "ok $i - $TEST_NAME" >> $REPORT_DIR/$TAP_NAME
else
@@ -268,10 +274,11 @@ find_tests()
mkdir -p $REPORT_DIR
TESTS_TO_RUN=$(find_tests ${TEST_LIST[@]})
-if [[ "$DOWNLOAD_MODE" == "on" ]]; then
+if [ "$DOWNLOAD_MODEL" = "on" ]; then
download_tests $TESTS_TO_RUN
- exit 0;
fi
-run_tests $TESTS_TO_RUN
+if [ "$RUN_TEST" = "on" ]; then
+ run_tests $TESTS_TO_RUN
+fi
exit $?
diff --git a/tests/scripts/onert-test b/tests/scripts/onert-test
new file mode 100644
index 000000000..99c107c52
--- /dev/null
+++ b/tests/scripts/onert-test
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[[ "${BASH_SOURCE[0]}" != "${0}" ]] && echo "Please don't source ${BASH_SOURCE[0]}, execute it" && return
+
+DRIVER_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+INSTALL_PATH="$(dirname $DRIVER_PATH)"
+COMMAND_PATH=$INSTALL_PATH/test/command
+BIN_PATH=$INSTALL_PATH/bin
+
+export PATH=$BIN_PATH:$PATH
+
+function Usage()
+{
+ echo "Usage: $0 [COMMAND] ..."
+ echo "Command:"
+ for file in $COMMAND_PATH/*;
+ do
+ echo " $(basename "$file")"
+ done
+ exit 255
+}
+
+COMMAND=$1; shift
+if [[ -z $COMMAND ]] || [[ $COMMAND == "--help" ]]; then
+ Usage
+ exit 255
+fi
+
+COMMAND_FILE=$COMMAND_PATH/$COMMAND
+if [[ ! -f $COMMAND_FILE ]]; then
+ echo "ERROR: '$COMMAND' is not supported"
+ exit 255
+fi
+
+source $COMMAND_FILE "$@"
diff --git a/tests/scripts/test-driver.sh b/tests/scripts/test-driver.sh
index 615fc2c13..aa97d95b0 100755
--- a/tests/scripts/test-driver.sh
+++ b/tests/scripts/test-driver.sh
@@ -27,7 +27,6 @@ function Usage()
echo "--artifactpath - (default={test-driver.sh's path}/../../) it should contain tests/ and Product/"
echo ""
echo "Following options are needed when you want to tests of specific types. If you don't pass any one, unittest and verification will be run"
- echo "--unittest - (default=on) run unit test"
echo "--frameworktest - (default=off) run framework test"
echo "--verification - (default=on) run verification"
echo "--frameworktest_list_file - filepath of model list for test"
@@ -38,8 +37,6 @@ function Usage()
echo "etc."
echo "--framework_driverbin - (default=../../Product/out/bin/tflite_run) runner for runnning framework tests"
echo "--verification_driverbin - (default=../../Product/out/bin/nnapi_test) runner for runnning verification tests"
- echo "--runtestsh - (default=\$ARTIFACT_PATH/tests/scripts/framework/run_test.sh) run_test.sh with path where it is for framework test and verification"
- echo "--unittestdir - (default=\$ARTIFACT_PATH/Product/out/unittest) directory that has unittest binaries for unit test"
echo ""
echo "--reportdir - (default=\$ARTIFACT_PATH/report) directory to save report"
echo ""
@@ -49,10 +46,7 @@ TEST_DRIVER_DIR="$( cd "$( dirname "${BASH_SOURCE}" )" && pwd )"
ARTIFACT_PATH="$TEST_DRIVER_DIR/../../"
FRAMEWORK_DRIVER_BIN=""
VERIFICATION_DRIVER_BIN=""
-RUN_TEST_SH=""
-UNIT_TEST_DIR=""
ALLTEST_ON="true"
-UNITTEST_ON="false"
FRAMEWORKTEST_ON="false"
VERIFICATION_ON="false"
BENCHMARK_ONERT_OP_ON="false"
@@ -74,16 +68,6 @@ do
--verification_driverbin=*)
VERIFICATION_DRIVER_BIN=${i#*=}
;;
- --runtestsh=*)
- RUN_TEST_SH=${i#*=}
- ;;
- --unittestdir=*)
- UNIT_TEST_DIR=${i#*=}
- ;;
- --unittest)
- ALLTEST_ON="false"
- UNITTEST_ON="true"
- ;;
--frameworktest)
ALLTEST_ON="false"
FRAMEWORKTEST_ON="true"
@@ -116,15 +100,6 @@ done
ARTIFACT_PATH="$(readlink -f $ARTIFACT_PATH)"
-if [ -z "$RUN_TEST_SH" ]; then
- RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/framework/run_test.sh
-fi
-
-if [ ! -e "$RUN_TEST_SH" ]; then
- echo "Cannot find $RUN_TEST_SH"
- exit 1
-fi
-
if [ -z "$UNIT_TEST_DIR" ]; then
UNIT_TEST_DIR=$ARTIFACT_PATH/Product/out/unittest
fi
@@ -135,13 +110,6 @@ fi
source $TEST_DRIVER_DIR/common.sh
-# Run unittest in each part such as Runtime
-if [ "$ALLTEST_ON" == "true" ] || [ "$UNITTEST_ON" == "true" ]; then
- $TEST_DRIVER_DIR/unittest.sh \
- --reportdir=$REPORT_DIR \
- --unittestdir=$UNIT_TEST_DIR
-fi
-
# Run tflite_run with various tflite models
if [ "$FRAMEWORKTEST_ON" == "true" ]; then
if [ -z "$FRAMEWORK_DRIVER_BIN" ]; then
@@ -149,7 +117,6 @@ if [ "$FRAMEWORKTEST_ON" == "true" ]; then
fi
$TEST_DRIVER_DIR/test_framework.sh \
- --runtestsh=$RUN_TEST_SH \
--driverbin=$FRAMEWORK_DRIVER_BIN \
--reportdir=$REPORT_DIR \
--tapname=framework_test.tap \
@@ -166,7 +133,6 @@ if [ "$ALLTEST_ON" == "true" ] || [ "$VERIFICATION_ON" == "true" ]; then
# verification uses the same script as frameworktest does
$TEST_DRIVER_DIR/test_framework.sh \
- --runtestsh=$RUN_TEST_SH \
--driverbin=$VERIFICATION_DRIVER_BIN \
--reportdir=$REPORT_DIR \
--tapname=verification_test.tap \
@@ -180,10 +146,9 @@ if [ "$BENCHMARK_ONERT_OP_ON" == "true" ]; then
$TEST_DRIVER_DIR/benchmark_nnapi.sh \
--test_op \
- --runtestsh=$RUN_TEST_SH \
--driverbin=$DRIVER_BIN \
--reportdir=$REPORT_DIR/benchmark_op \
- --modelfilepath=$ARTIFACT_PATH/tests/scripts/framework
+ --modelfilepath=$ARTIFACT_PATH/tests/scripts/models
fi
# Make json file. Actually, this process is only needed on CI. That's why it is in test-driver.sh.
diff --git a/tests/scripts/test_framework.sh b/tests/scripts/test_framework.sh
index 1d9751562..6bf9c89c5 100755
--- a/tests/scripts/test_framework.sh
+++ b/tests/scripts/test_framework.sh
@@ -14,7 +14,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-FWTEST_RUN_TEST_SH=
+MY_PATH="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+
FWTEST_DRIVER_BIN=
FWTEST_REPORT_DIR=
FWTEST_TAP_NAME=
@@ -25,7 +26,6 @@ function Usage()
{
echo "Usage Example:"
echo "./$0 \\"
- echo " --runtestsh=tests/scripts/framework/run_test.sh \\ # Test runner script path"
echo " --driverbin=Product/out/bin/tflite_run \\ # Test driver path"
echo " --frameworktest_list_file=tests/scripts/list/frameworktest_list.armv7l.cpu.txt \\"
echo " --reportdir=report \\ # Directory for the report files will be saved"
@@ -42,9 +42,6 @@ do
-h|--help|help)
Usage
;;
- --runtestsh=*)
- FWTEST_RUN_TEST_SH=${i#*=}
- ;;
--driverbin=*)
FWTEST_DRIVER_BIN=${i#*=}
;;
@@ -67,7 +64,6 @@ do
shift
done
-[ ! -z "$FWTEST_RUN_TEST_SH" ] || Usage
[ ! -z "$FWTEST_DRIVER_BIN" ] || Usage
[ ! -z "$FWTEST_REPORT_DIR" ] || Usage
[ ! -z "$FWTEST_TAP_NAME" ] || Usage
@@ -86,7 +82,7 @@ if [ ! -z "$FRAMEWORKTEST_LIST_FILE" ]; then
MODELLIST=$(cat "${FRAMEWORKTEST_LIST_FILE}")
fi
-$FWTEST_RUN_TEST_SH --driverbin=$FWTEST_DRIVER_BIN \
+$MY_PATH/models/run_test.sh --driverbin=$FWTEST_DRIVER_BIN \
--reportdir=$FWTEST_REPORT_DIR \
--tapname=$FWTEST_TAP_NAME \
${MODELLIST:-} \
diff --git a/tests/scripts/test_scheduler_with_profiling.sh b/tests/scripts/test_scheduler_with_profiling.sh
index 8b2f2d281..5c24572d8 100755
--- a/tests/scripts/test_scheduler_with_profiling.sh
+++ b/tests/scripts/test_scheduler_with_profiling.sh
@@ -11,7 +11,7 @@ TEST_DRIVER_DIR="$( cd "$( dirname "${BASH_SOURCE}" )" && pwd )"
ARTIFACT_PATH="$TEST_DRIVER_DIR/../.."
BENCHMARK_DRIVER_BIN=$ARTIFACT_PATH/Product/out/bin/tflite_run
REPORT_DIR=$ARTIFACT_PATH/report
-RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/framework/run_test.sh
+RUN_TEST_SH=$ARTIFACT_PATH/tests/scripts/models/run_test.sh
BENCHMARK_MODEL_LIST="MODELS/inception_nonslim MODELS/inception_slim MODELS/mobilenet"
if [ ! -e "$RUN_TEST_SH" ]; then
diff --git a/tests/tools/nnpackage_run/CMakeLists.txt b/tests/tools/nnpackage_run/CMakeLists.txt
index 0e333a010..ec45db4f6 100644
--- a/tests/tools/nnpackage_run/CMakeLists.txt
+++ b/tests/tools/nnpackage_run/CMakeLists.txt
@@ -33,7 +33,7 @@ target_include_directories(nnpackage_run PRIVATE src)
target_include_directories(nnpackage_run PRIVATE ${Boost_INCLUDE_DIRS})
target_link_libraries(nnpackage_run onert_core onert tflite_loader)
-target_link_libraries(nnpackage_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite jsoncpp)
+target_link_libraries(nnpackage_run nnfw_lib_tflite jsoncpp)
target_link_libraries(nnpackage_run nnfw-dev)
target_link_libraries(nnpackage_run ${Boost_PROGRAM_OPTIONS_LIBRARY})
target_link_libraries(nnpackage_run nnfw_lib_benchmark)
diff --git a/tests/tools/nnpackage_run/src/args.cc b/tests/tools/nnpackage_run/src/args.cc
index 0dbcafc33..cb4a7dbaa 100644
--- a/tests/tools/nnpackage_run/src/args.cc
+++ b/tests/tools/nnpackage_run/src/args.cc
@@ -16,6 +16,7 @@
#include "args.h"
+#include <functional>
#include <iostream>
#include <json/json.h>
@@ -105,6 +106,75 @@ Args::Args(const int argc, char **argv)
void Args::Initialize(void)
{
+ auto process_nnpackage = [&](const std::string &package_filename) {
+ _package_filename = package_filename;
+
+ std::cerr << "Package Filename " << _package_filename << std::endl;
+ if (_package_filename.empty())
+ {
+ // TODO Print usage instead of the below message
+ std::cerr << "Please specify nnpackage file. Run with `--help` for usage."
+ << "\n";
+
+ exit(1);
+ }
+ else
+ {
+ if (access(_package_filename.c_str(), F_OK) == -1)
+ {
+ std::cerr << "nnpackage not found: " << _package_filename << "\n";
+ }
+ }
+ };
+
+ auto process_output_sizes = [&](const std::string &output_sizes_json_str) {
+ Json::Value root;
+ Json::Reader reader;
+ if (!reader.parse(output_sizes_json_str, root, false))
+ {
+ std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n";
+ exit(1);
+ }
+
+ auto arg_map = argArrayToMap(root);
+ for (auto &pair : arg_map)
+ {
+ uint32_t key = pair.first;
+ Json::Value &val_json = pair.second;
+ if (!val_json.isUInt())
+ {
+ std::cerr << "All the values in `output_sizes` must be unsigned integers\n";
+ exit(1);
+ }
+ uint32_t val = val_json.asUInt();
+ _output_sizes[key] = val;
+ }
+ };
+
+ auto process_shape_prepare = [&](const std::string &shape_str) {
+ try
+ {
+ handleShapeParam(_shape_prepare, shape_str);
+ }
+ catch (const std::exception &e)
+ {
+ std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl;
+ exit(1);
+ }
+ };
+
+ auto process_shape_run = [&](const std::string &shape_str) {
+ try
+ {
+ handleShapeParam(_shape_run, shape_str);
+ }
+ catch (const std::exception &e)
+ {
+ std::cerr << "error with '--shape_run' option: " << shape_str << std::endl;
+ exit(1);
+ }
+ };
+
// General options
po::options_description general("General options", 100);
@@ -112,32 +182,33 @@ void Args::Initialize(void)
general.add_options()
("help,h", "Print available options")
("version", "Print version and exit immediately")
- ("nnpackage", po::value<std::string>()->required())
+ ("nnpackage", po::value<std::string>()->required()->notifier(process_nnpackage))
#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
- ("dump,d", po::value<std::string>()->default_value(""), "Output filename")
- ("load,l", po::value<std::string>()->default_value(""), "Input filename")
+ ("dump,d", po::value<std::string>()->default_value("")->notifier([&](const auto &v) { _dump_filename = v; }), "Output filename")
+ ("load,l", po::value<std::string>()->default_value("")->notifier([&](const auto &v) { _load_filename = v; }), "Input filename")
#endif
- ("output_sizes", po::value<std::string>(),
+ ("output_sizes", po::value<std::string>()->notifier(process_output_sizes),
"The output buffer size in JSON 1D array\n"
"If not given, the model's output sizes are used\n"
"e.g. '[0, 40, 2, 80]' to set 0th tensor to 40 and 2nd tensor to 80.\n")
- ("num_runs,r", po::value<int>()->default_value(1), "The number of runs")
- ("warmup_runs,w", po::value<int>()->default_value(0), "The number of warmup runs")
- ("run_delay,t", po::value<int>()->default_value(-1), "Delay time(ms) between runs (as default no delay")
- ("gpumem_poll,g", po::value<bool>()->default_value(false), "Check gpu memory polling separately")
- ("mem_poll,m", po::value<bool>()->default_value(false), "Check memory polling")
- ("write_report,p", po::value<bool>()->default_value(false),
+ ("num_runs,r", po::value<int>()->default_value(1)->notifier([&](const auto &v) { _num_runs = v; }), "The number of runs")
+ ("warmup_runs,w", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _warmup_runs = v; }), "The number of warmup runs")
+    ("run_delay,t", po::value<int>()->default_value(-1)->notifier([&](const auto &v) { _run_delay = v; }), "Delay time(ms) between runs (as default no delay)")
+ ("gpumem_poll,g", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _gpumem_poll = v; }), "Check gpu memory polling separately")
+ ("mem_poll,m", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _mem_poll = v; }), "Check memory polling")
+ ("write_report,p", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _write_report = v; }),
"Write report\n"
"{exec}-{nnpkg}-{backend}.csv will be generated.\n"
"e.g. nnpackage_run-UNIT_Add_000-acl_cl.csv.\n"
"{nnpkg} name may be changed to realpath if you use symbolic-link.")
- ("shape_prepare", po::value<std::string>()->default_value("[]"),
+ ("shape_prepare", po::value<std::string>()->default_value("[]")->notifier(process_shape_prepare),
"set shape of specified tensor before compilation\n"
"e.g. '[0, [1, 2], 2, []]' to set 0th tensor to [1, 2] and 2nd tensor to [].\n")
- ("shape_run", po::value<std::string>()->default_value("[]"),
+ ("shape_run", po::value<std::string>()->default_value("[]")->notifier(process_shape_run),
"set shape of specified tensor right before running\n"
"e.g. '[1, [1, 2]]` to set 1st tensor to [1, 2].\n")
- ("verbose_level,v", po::value<int>()->default_value(0), "Verbose level\n"
+ ("verbose_level,v", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }),
+ "Verbose level\n"
"0: prints the only result. Messages btw run don't print\n"
"1: prints result and message btw run\n"
"2: prints all of messages to print\n")
@@ -180,158 +251,23 @@ void Args::Parse(const int argc, char **argv)
return;
}
- po::notify(vm);
try
{
-#if defined(ONERT_HAVE_HDF5) && ONERT_HAVE_HDF5 == 1
- if (vm.count("dump"))
- {
- _dump_filename = vm["dump"].as<std::string>();
- }
-
- if (vm.count("load"))
- {
- _load_filename = vm["load"].as<std::string>();
- }
-#endif
-
- if (vm.count("nnpackage"))
- {
- _package_filename = vm["nnpackage"].as<std::string>();
-
- if (_package_filename.empty())
- {
- // TODO Print usage instead of the below message
- std::cerr << "Please specify nnpackage file. Run with `--help` for usage."
- << "\n";
-
- exit(1);
- }
- else
- {
- if (access(_package_filename.c_str(), F_OK) == -1)
- {
- std::cerr << "nnpackage not found: " << _package_filename << "\n";
- }
- }
- }
-
- if (vm.count("output_sizes"))
- {
- auto output_sizes_json_str = vm["output_sizes"].as<std::string>();
-
- Json::Value root;
- Json::Reader reader;
- if (!reader.parse(output_sizes_json_str, root, false))
- {
- std::cerr << "Invalid JSON format for output_sizes \"" << output_sizes_json_str << "\"\n";
- exit(1);
- }
-
- auto arg_map = argArrayToMap(root);
- for (auto &pair : arg_map)
- {
- uint32_t key = pair.first;
- Json::Value &val_json = pair.second;
- if (!val_json.isUInt())
- {
- std::cerr << "All the values in `output_sizes` must be unsigned integers\n";
- exit(1);
- }
- uint32_t val = val_json.asUInt();
- _output_sizes[key] = val;
- }
- }
-
- if (vm.count("num_runs"))
- {
- _num_runs = vm["num_runs"].as<int>();
- }
-
- if (vm.count("warmup_runs"))
- {
- _warmup_runs = vm["warmup_runs"].as<int>();
- }
-
- if (vm.count("run_delay"))
- {
- _run_delay = vm["run_delay"].as<int>();
- }
-
- if (vm.count("gpumem_poll"))
- {
- _gpumem_poll = vm["gpumem_poll"].as<bool>();
- }
-
- if (vm.count("mem_poll"))
- {
- _mem_poll = vm["mem_poll"].as<bool>();
- // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP
- if (_mem_poll && _warmup_runs == 0)
- {
- _warmup_runs = 1;
- }
- }
-
- if (vm.count("write_report"))
- {
- _write_report = vm["write_report"].as<bool>();
- }
-
- if (vm.count("verbose_level"))
- {
- _verbose_level = vm["verbose_level"].as<int>();
- }
+ po::notify(vm);
}
catch (const std::bad_cast &e)
{
- std::cerr << "error by bad cast" << e.what() << '\n';
+ std::cerr << "Bad cast error - " << e.what() << '\n';
exit(1);
}
- if (vm.count("shape_prepare"))
- {
- std::string shape_str;
- try
- {
- shape_str = vm["shape_prepare"].as<std::string>();
- }
- catch (const std::bad_cast &e)
- {
- std::cerr << "error by bad cast with '--shape_prepare' option" << e.what() << '\n';
- exit(1);
- }
- try
- {
- handleShapeParam(_shape_prepare, shape_str);
- }
- catch (const std::exception &e)
- {
- std::cerr << "error with '--shape_prepare' option: " << shape_str << std::endl;
- exit(1);
- }
- }
-
- if (vm.count("shape_run"))
+  // This must run after `notify` because `_warmup_runs` must already be set.
+ if (vm.count("mem_poll"))
{
- std::string shape_str;
- try
- {
- shape_str = vm["shape_run"].as<std::string>();
- }
- catch (const std::bad_cast &e)
+ // Instead of EXECUTE to avoid overhead, memory polling runs on WARMUP
+ if (_mem_poll && _warmup_runs == 0)
{
- std::cerr << "error by bad cast with '--shape_run' option" << e.what() << '\n';
- exit(1);
- }
- try
- {
- handleShapeParam(_shape_run, shape_str);
- }
- catch (const std::exception &e)
- {
- std::cerr << "error with '--shape_run' option: " << shape_str << std::endl;
- exit(1);
+ _warmup_runs = 1;
}
}
}
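
The refactor above moves every per-option `vm.count()` block into a `notifier` callback that boost::program_options fires during `po::notify(vm)`; only the `mem_poll` fix-up stays in `Parse`, since it depends on `_warmup_runs` already being set. A rough Python analogue of the pattern (hedged sketch, not part of the patch; `argparse` stands in for program_options, and all names are hypothetical):

```python
# Rough analogue of the notifier pattern (not from the patch): each option
# registers a callback that fires after parsing, instead of the caller
# probing every option by hand afterwards.
import argparse

settings = {"num_runs": 1, "warmup_runs": 0, "mem_poll": False}
notifiers = []

def add_option(parser, flag, key, **kwargs):
    # Register the option plus a callback that copies the parsed value into
    # `settings` -- the analogue of po::value<T>()->notifier(...).
    parser.add_argument(flag, dest=key, default=settings[key], **kwargs)
    notifiers.append(lambda args: settings.__setitem__(key, getattr(args, key)))

parser = argparse.ArgumentParser()
add_option(parser, "--num_runs", "num_runs", type=int)
add_option(parser, "--warmup_runs", "warmup_runs", type=int)
add_option(parser, "--mem_poll", "mem_poll", action="store_true")

args = parser.parse_args(["--num_runs", "5", "--mem_poll"])
for notify in notifiers:  # the analogue of po::notify(vm)
    notify(args)

# Post-notify fix-up, mirroring the patch: memory polling needs a warmup run.
if settings["mem_poll"] and settings["warmup_runs"] == 0:
    settings["warmup_runs"] = 1
print(settings)  # {'num_runs': 5, 'warmup_runs': 1, 'mem_poll': True}
```

The payoff is the same in both languages: registration and handling of each option live in one place, and `Parse` shrinks to parse, notify, and post-notify fix-ups.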
diff --git a/tests/tools/nnpackage_run/src/h5formatter.cc b/tests/tools/nnpackage_run/src/h5formatter.cc
index 34c075c1a..09ace4798 100644
--- a/tests/tools/nnpackage_run/src/h5formatter.cc
+++ b/tests/tools/nnpackage_run/src/h5formatter.cc
@@ -145,6 +145,7 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector<Allocatio
data_set.write(outputs[i].data(), H5::PredType::NATIVE_INT64);
break;
}
+ case NNFW_TYPE_TENSOR_UINT8:
case NNFW_TYPE_TENSOR_QUANT8_ASYMM:
{
H5::DataSet data_set =
@@ -159,13 +160,6 @@ void H5Formatter::dumpOutputs(const std::string &filename, std::vector<Allocatio
data_set.write(outputs[i].data(), H5::PredType::NATIVE_INT8);
break;
}
- case NNFW_TYPE_TENSOR_UINT8:
- {
- H5::DataSet data_set =
- value_group.createDataSet(std::to_string(i), H5::PredType::STD_U8BE, data_space);
- data_set.write(outputs[i].data(), H5::PredType::NATIVE_UINT8);
- break;
- }
default:
throw std::runtime_error("nnpkg_run can dump f32, i32, qasymm8, bool and uint8.");
}
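
In the h5formatter hunk above, `NNFW_TYPE_TENSOR_UINT8` is folded into the `NNFW_TYPE_TENSOR_QUANT8_ASYMM` case, so both tensor types now share the quantized-uint8 dump path instead of the removed branch that wrote big-endian (`STD_U8BE`) data. A small h5py sketch of the resulting on-disk shape (hedged; file, group, and buffer names are hypothetical):

```python
# Hedged sketch: plain-uint8 and asymmetric-quantized-uint8 outputs now take
# one path and land on disk as unsigned 8-bit datasets.
import h5py
import numpy as np

outputs = {0: np.array([1, 2, 3], dtype=np.uint8)}  # hypothetical output buffers
with h5py.File("dump.h5", "w") as f:
    value_group = f.create_group("value")
    for i, buf in outputs.items():
        value_group.create_dataset(str(i), data=buf)  # dtype stays uint8
```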
diff --git a/tests/tools/nnpackage_run/src/nnpackage_run.cc b/tests/tools/nnpackage_run/src/nnpackage_run.cc
index ccad10e51..88d3307af 100644
--- a/tests/tools/nnpackage_run/src/nnpackage_run.cc
+++ b/tests/tools/nnpackage_run/src/nnpackage_run.cc
@@ -22,7 +22,7 @@
#endif
#include "nnfw.h"
#include "nnfw_util.h"
-#include "nnfw_debug.h"
+#include "nnfw_internal.h"
#include "randomgen.h"
#ifdef RUY_PROFILER
#include "ruy/profiler/profiler.h"
diff --git a/tests/tools/tflite_loader/CMakeLists.txt b/tests/tools/tflite_loader/CMakeLists.txt
index 5a9e3a8ff..0fe1c69de 100644
--- a/tests/tools/tflite_loader/CMakeLists.txt
+++ b/tests/tools/tflite_loader/CMakeLists.txt
@@ -17,7 +17,7 @@ add_executable(tflite_loader_test_tool ${SOURCES})
target_include_directories(tflite_loader_test_tool PRIVATE ${Boost_INCLUDE_DIRS})
target_link_libraries(tflite_loader_test_tool onert_core onert tflite_loader)
-target_link_libraries(tflite_loader_test_tool nnfw_lib_tflite tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_misc)
+target_link_libraries(tflite_loader_test_tool nnfw_lib_tflite nnfw_lib_misc)
target_link_libraries(tflite_loader_test_tool ${Boost_PROGRAM_OPTIONS_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY})
install(TARGETS tflite_loader_test_tool DESTINATION bin)
diff --git a/tests/tools/tflite_run/CMakeLists.txt b/tests/tools/tflite_run/CMakeLists.txt
index 19e7126b0..3f30d3e32 100644
--- a/tests/tools/tflite_run/CMakeLists.txt
+++ b/tests/tools/tflite_run/CMakeLists.txt
@@ -13,7 +13,7 @@ add_executable(tflite_run ${TFLITE_RUN_SRCS})
target_include_directories(tflite_run PRIVATE src)
target_include_directories(tflite_run PRIVATE ${Boost_INCLUDE_DIRS})
-target_link_libraries(tflite_run tensorflow-lite ${LIB_PTHREAD} dl nnfw_lib_tflite)
+target_link_libraries(tflite_run nnfw_lib_tflite)
target_link_libraries(tflite_run ${Boost_PROGRAM_OPTIONS_LIBRARY})
target_link_libraries(tflite_run nnfw_lib_benchmark)
diff --git a/tests/tools/tflite_run/src/args.cc b/tests/tools/tflite_run/src/args.cc
index fac2a6e56..f8f581baf 100644
--- a/tests/tools/tflite_run/src/args.cc
+++ b/tests/tools/tflite_run/src/args.cc
@@ -37,6 +37,39 @@ Args::Args(const int argc, char **argv) noexcept
void Args::Initialize(void)
{
+ auto process_input = [&](const std::string &v) {
+ _input_filename = v;
+
+ if (!_input_filename.empty())
+ {
+ if (access(_input_filename.c_str(), F_OK) == -1)
+ {
+ std::cerr << "input image file not found: " << _input_filename << "\n";
+ }
+ }
+ };
+
+ auto process_tflite = [&](const std::string &v) {
+ _tflite_filename = v;
+
+ if (_tflite_filename.empty())
+ {
+ // TODO Print usage instead of the below message
+ std::cerr << "Please specify tflite file. Run with `--help` for usage."
+ << "\n";
+
+ exit(1);
+ }
+ else
+ {
+ if (access(_tflite_filename.c_str(), F_OK) == -1)
+ {
+ std::cerr << "tflite file not found: " << _tflite_filename << "\n";
+ exit(1);
+ }
+ }
+ };
+
try
{
// General options
@@ -45,19 +78,19 @@ void Args::Initialize(void)
// clang-format off
general.add_options()
("help,h", "Display available options")
- ("input,i", po::value<std::string>()->default_value(""), "Input filename")
- ("dump,d", po::value<std::string>()->default_value(""), "Output filename")
- ("ishapes", po::value<std::vector<int>>()->multitoken(), "Input shapes")
- ("compare,c", po::value<std::string>()->default_value(""), "filename to be compared with")
- ("tflite", po::value<std::string>()->required())
- ("num_runs,r", po::value<int>()->default_value(1), "The number of runs")
- ("warmup_runs,w", po::value<int>()->default_value(0), "The number of warmup runs")
- ("run_delay,t", po::value<int>()->default_value(-1), "Delay time(ms) between runs (as default no delay")
- ("gpumem_poll,g", po::value<bool>()->default_value(false), "Check gpu memory polling separately")
+ ("input,i", po::value<std::string>()->default_value("")->notifier(process_input), "Input filename")
+ ("dump,d", po::value<std::string>()->default_value("")->notifier([&](const auto &v) { _dump_filename = v; }), "Output filename")
+ ("ishapes", po::value<std::vector<int>>()->multitoken()->notifier([&](const auto &v) { _input_shapes = v; }), "Input shapes")
+ ("compare,c", po::value<std::string>()->default_value("")->notifier([&](const auto &v) { _compare_filename = v; }), "filename to be compared with")
+ ("tflite", po::value<std::string>()->required()->notifier(process_tflite))
+ ("num_runs,r", po::value<int>()->default_value(1)->notifier([&](const auto &v) { _num_runs = v; }), "The number of runs")
+ ("warmup_runs,w", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _warmup_runs = v; }), "The number of warmup runs")
+ ("run_delay,t", po::value<int>()->default_value(-1)->notifier([&](const auto &v) { _run_delay = v; }), "Delay time(ms) between runs (as default no delay)")
+ ("gpumem_poll,g", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _gpumem_poll = v; }), "Check gpu memory polling separately")
("mem_poll,m", po::value<bool>()->default_value(false), "Check memory polling")
- ("write_report,p", po::value<bool>()->default_value(false), "Write report")
- ("validate", po::value<bool>()->default_value(true), "Validate tflite model")
- ("verbose_level,v", po::value<int>()->default_value(0), "Verbose level\n"
+ ("write_report,p", po::value<bool>()->default_value(false)->notifier([&](const auto &v) { _write_report = v; }), "Write report")
+ ("validate", po::value<bool>()->default_value(true)->notifier([&](const auto &v) { _tflite_validate = v; }), "Validate tflite model")
+ ("verbose_level,v", po::value<int>()->default_value(0)->notifier([&](const auto &v) { _verbose_level = v; }), "Verbose level\n"
"0: prints the only result. Messages btw run don't print\n"
"1: prints result and message btw run\n"
"2: prints all of messages to print\n")
@@ -105,80 +138,7 @@ void Args::Parse(const int argc, char **argv)
po::notify(vm);
- if (vm.count("dump"))
- {
- _dump_filename = vm["dump"].as<std::string>();
- }
-
- if (vm.count("compare"))
- {
- _compare_filename = vm["compare"].as<std::string>();
- }
-
- if (vm.count("input"))
- {
- _input_filename = vm["input"].as<std::string>();
-
- if (!_input_filename.empty())
- {
- if (access(_input_filename.c_str(), F_OK) == -1)
- {
- std::cerr << "input image file not found: " << _input_filename << "\n";
- }
- }
- }
-
- if (vm.count("ishapes"))
- {
- _input_shapes.resize(vm["ishapes"].as<std::vector<int>>().size());
- for (auto i = 0; i < _input_shapes.size(); i++)
- {
- _input_shapes[i] = vm["ishapes"].as<std::vector<int>>()[i];
- }
- }
-
- if (vm.count("tflite"))
- {
- _tflite_filename = vm["tflite"].as<std::string>();
-
- if (_tflite_filename.empty())
- {
- // TODO Print usage instead of the below message
- std::cerr << "Please specify tflite file. Run with `--help` for usage."
- << "\n";
-
- exit(1);
- }
- else
- {
- if (access(_tflite_filename.c_str(), F_OK) == -1)
- {
- std::cerr << "tflite file not found: " << _tflite_filename << "\n";
- exit(1);
- }
- }
- }
-
- if (vm.count("num_runs"))
- {
- _num_runs = vm["num_runs"].as<int>();
- }
-
- if (vm.count("warmup_runs"))
- {
- _warmup_runs = vm["warmup_runs"].as<int>();
- }
-
- if (vm.count("run_delay"))
- {
- _run_delay = vm["run_delay"].as<int>();
- }
-
- if (vm.count("gpumem_poll"))
- {
- _gpumem_poll = vm["gpumem_poll"].as<bool>();
- }
-
+  // This must run after `notify` because `_warmup_runs` must already be set.
if (vm.count("mem_poll"))
{
_mem_poll = vm["mem_poll"].as<bool>();
@@ -188,21 +148,6 @@ void Args::Parse(const int argc, char **argv)
_warmup_runs = 1;
}
}
-
- if (vm.count("write_report"))
- {
- _write_report = vm["write_report"].as<bool>();
- }
-
- if (vm.count("validate"))
- {
- _tflite_validate = vm["validate"].as<bool>();
- }
-
- if (vm.count("verbose_level"))
- {
- _verbose_level = vm["verbose_level"].as<int>();
- }
}
} // end of namespace TFLiteRun
diff --git a/tests/tools/tflite_run_2_2_0/CMakeLists.txt b/tests/tools/tflite_run_2_2_0/CMakeLists.txt
deleted file mode 100644
index a2c85c5d7..000000000
--- a/tests/tools/tflite_run_2_2_0/CMakeLists.txt
+++ /dev/null
@@ -1,23 +0,0 @@
-if(NOT BUILD_TFLITE_RUN_2_2_0)
- return()
-endif()
-
-if(NOT BUILD_TENSORFLOW_LITE_2_2_0)
- set(BUILD_TENSORFLOW_LITE_2_2_0 ON)
-endif()
-
-nnfw_find_package(TensorFlowLite-2.2.0 REQUIRED)
-nnfw_find_package(Boost REQUIRED)
-
-list(APPEND TFLITE_RUN_SRCS "src/tflite_run_2_2_0.cc")
-list(APPEND TFLITE_RUN_SRCS "src/args.cc")
-
-add_executable(tflite_run_2_2_0 ${TFLITE_RUN_SRCS})
-target_include_directories(tflite_run_2_2_0 PRIVATE src)
-target_include_directories(tflite_run_2_2_0 PRIVATE ${Boost_INCLUDE_DIRS})
-
-target_link_libraries(tflite_run_2_2_0 tensorflow-lite-2.2.0 ${LIB_PTHREAD} dl)
-target_link_libraries(tflite_run_2_2_0 ${Boost_PROGRAM_OPTIONS_LIBRARY})
-target_link_libraries(tflite_run_2_2_0 nnfw_lib_benchmark nnfw_lib_misc)
-
-install(TARGETS tflite_run_2_2_0 DESTINATION bin)
diff --git a/tests/tools/tflite_vanilla_run/CMakeLists.txt b/tests/tools/tflite_vanilla_run/CMakeLists.txt
new file mode 100644
index 000000000..19e21e923
--- /dev/null
+++ b/tests/tools/tflite_vanilla_run/CMakeLists.txt
@@ -0,0 +1,23 @@
+if(NOT BUILD_TFLITE_VANILLA_RUN)
+ return()
+endif()
+
+if(NOT BUILD_TENSORFLOW_LITE_2_3_0)
+ set(BUILD_TENSORFLOW_LITE_2_3_0 ON)
+endif()
+
+nnfw_find_package(TensorFlowLite-2.3.0 REQUIRED)
+nnfw_find_package(Boost REQUIRED)
+
+list(APPEND TFLITE_RUN_SRCS "src/tflite_vanilla_run.cc")
+list(APPEND TFLITE_RUN_SRCS "src/args.cc")
+
+add_executable(tflite_vanilla_run ${TFLITE_RUN_SRCS})
+target_include_directories(tflite_vanilla_run PRIVATE src)
+target_include_directories(tflite_vanilla_run PRIVATE ${Boost_INCLUDE_DIRS})
+
+target_link_libraries(tflite_vanilla_run tensorflow-lite-2.3.0 ${LIB_PTHREAD} dl)
+target_link_libraries(tflite_vanilla_run ${Boost_PROGRAM_OPTIONS_LIBRARY})
+target_link_libraries(tflite_vanilla_run nnfw_lib_benchmark nnfw_lib_misc)
+
+install(TARGETS tflite_vanilla_run DESTINATION bin)
diff --git a/tests/tools/tflite_run_2_2_0/src/args.cc b/tests/tools/tflite_vanilla_run/src/args.cc
index 1f8969201..dc9f250e4 100644
--- a/tests/tools/tflite_run_2_2_0/src/args.cc
+++ b/tests/tools/tflite_vanilla_run/src/args.cc
@@ -18,7 +18,7 @@
#include <iostream>
-namespace TFLiteRun220
+namespace TFLiteVanillaRun
{
Args::Args(const int argc, char **argv) noexcept
@@ -205,4 +205,4 @@ void Args::Parse(const int argc, char **argv)
}
}
-} // end of namespace TFLiteRun220
+} // end of namespace TFLiteVanillaRun
diff --git a/tests/tools/tflite_run_2_2_0/src/args.h b/tests/tools/tflite_vanilla_run/src/args.h
index 630d50b1a..3605b651c 100644
--- a/tests/tools/tflite_run_2_2_0/src/args.h
+++ b/tests/tools/tflite_vanilla_run/src/args.h
@@ -14,15 +14,15 @@
* limitations under the License.
*/
-#ifndef __TFLITE_RUN_2_2_0_ARGS_H__
-#define __TFLITE_RUN_2_2_0_ARGS_H__
+#ifndef __TFLITE_VANILLA_RUN_ARGS_H__
+#define __TFLITE_VANILLA_RUN_ARGS_H__
#include <string>
#include <boost/program_options.hpp>
namespace po = boost::program_options;
-namespace TFLiteRun220
+namespace TFLiteVanillaRun
{
class Args
@@ -68,6 +68,6 @@ private:
int _verbose_level;
};
-} // end of namespace TFLiteRun220
+} // end of namespace TFLiteVanillaRun
-#endif // __TFLITE_RUN_2_2_0_ARGS_H__
+#endif // __TFLITE_VANILLA_RUN_ARGS_H__
diff --git a/tests/tools/tflite_run_2_2_0/src/tensor_view.h b/tests/tools/tflite_vanilla_run/src/tensor_view.h
index ef9dfbc42..ca04a051e 100644
--- a/tests/tools/tflite_run_2_2_0/src/tensor_view.h
+++ b/tests/tools/tflite_vanilla_run/src/tensor_view.h
@@ -20,8 +20,8 @@
* @ingroup COM_AI_RUNTIME
*/
-#ifndef __TFLITE_RUN_2_2_0_TENSOR_VIEW_H__
-#define __TFLITE_RUN_2_2_0_TENSOR_VIEW_H__
+#ifndef __TFLITE_VANILLA_RUN_TENSOR_VIEW_H__
+#define __TFLITE_VANILLA_RUN_TENSOR_VIEW_H__
#include "tensorflow/lite/interpreter.h"
@@ -30,7 +30,7 @@
#include "misc/tensor/Reader.h"
#include "misc/tensor/NonIncreasingStride.h"
-namespace TFLiteRun220
+namespace TFLiteVanillaRun
{
/**
@@ -112,6 +112,6 @@ public:
}
};
-} // namespace TFLiteRun220
+} // namespace TFLiteVanillaRun
-#endif // __TFLITE_RUN_2_2_0_TENSOR_VIEW_H__
+#endif // __TFLITE_VANILLA_RUN_TENSOR_VIEW_H__
diff --git a/tests/tools/tflite_run_2_2_0/src/tflite_run_2_2_0.cc b/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc
index ca80e1c2a..d44ea60cf 100644
--- a/tests/tools/tflite_run_2_2_0/src/tflite_run_2_2_0.cc
+++ b/tests/tools/tflite_vanilla_run/src/tflite_vanilla_run.cc
@@ -79,7 +79,7 @@ int main(const int argc, char **argv)
{
tflite::StderrReporter error_reporter;
- TFLiteRun220::Args args(argc, argv);
+ TFLiteVanillaRun::Args args(argc, argv);
std::chrono::milliseconds t_model_load(0), t_prepare(0);
@@ -148,7 +148,7 @@ int main(const int argc, char **argv)
if (tensor->type == kTfLiteInt32)
{
      // Generate signed 32-bit integer (s32) input
- auto tensor_view = TFLiteRun220::TensorView<int32_t>::make(*interpreter, o);
+ auto tensor_view = TFLiteVanillaRun::TensorView<int32_t>::make(*interpreter, o);
int32_t value = 0;
@@ -163,7 +163,7 @@ int main(const int argc, char **argv)
else if (tensor->type == kTfLiteUInt8)
{
// Generate unsigned 8-bit integer input
- auto tensor_view = TFLiteRun220::TensorView<uint8_t>::make(*interpreter, o);
+ auto tensor_view = TFLiteVanillaRun::TensorView<uint8_t>::make(*interpreter, o);
uint8_t value = 0;
@@ -177,7 +177,7 @@ int main(const int argc, char **argv)
else if (tensor->type == kTfLiteBool)
{
// Generate bool input
- auto tensor_view = TFLiteRun220::TensorView<bool>::make(*interpreter, o);
+ auto tensor_view = TFLiteVanillaRun::TensorView<bool>::make(*interpreter, o);
auto fp = static_cast<bool (nnfw::misc::RandomGenerator::*)(
const ::nnfw::misc::tensor::Shape &, const ::nnfw::misc::tensor::Index &)>(
diff --git a/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh b/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
index cf3e54406..bbc5b3e6c 100755
--- a/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
+++ b/tools/nnpackage_tool/nncc-tc-to-nnpkg-tc/nncc-tc-to-nnpkg-tc.sh
@@ -62,6 +62,7 @@ tflite
"
model_type=""
+tf_intf_version=""
for ext in $supported_model_types; do
[ -e "$indir/$tcname"."$ext" ] && model_type=$ext
@@ -73,7 +74,9 @@ if [[ "$model_type" == "" ]]; then
fi
if [[ "$model_type" == "pb" ]]; then
- $tf2nnpkg --info "$indir/$tcname".info --graphdef "$indir/$tcname"."$model_type" -o "$outdir"
+ [ -f "$indir/$tcname"."v2" ] && tf_intf_version="--v2"
+ $tf2nnpkg --info "$indir/$tcname".info --graphdef "$indir/$tcname"."$model_type" \
+    $tf_intf_version -o "$outdir"
else
$model2nnpkg -o "$outdir" "$indir/$tcname"."$model_type"
fi
diff --git a/tools/nnpackage_tool/sth2nnpkgtc/pb2nnpkgtc.md b/tools/nnpackage_tool/sth2nnpkgtc/pb2nnpkgtc.md
index df90d0aa1..faf66fbde 100644
--- a/tools/nnpackage_tool/sth2nnpkgtc/pb2nnpkgtc.md
+++ b/tools/nnpackage_tool/sth2nnpkgtc/pb2nnpkgtc.md
@@ -55,7 +55,7 @@ test_model.conv2d_transpose
# @ target
$ OP_BACKEND_ALLOPS=cpu \
-tests/scripts/nnpkg_test.sh test_model.conv2d_transpose
+onert/test/onert-test nnpkg-test test_model.conv2d_transpose
[ Run ] ./test_model.out Pass
[Compare] ./test_model.out Pass
```
diff --git a/tools/nnpackage_tool/sth2nnpkgtc/tflite2nnpkgtc.md b/tools/nnpackage_tool/sth2nnpkgtc/tflite2nnpkgtc.md
index dab6ba4d7..9f28ebacb 100644
--- a/tools/nnpackage_tool/sth2nnpkgtc/tflite2nnpkgtc.md
+++ b/tools/nnpackage_tool/sth2nnpkgtc/tflite2nnpkgtc.md
@@ -42,5 +42,5 @@ nnpkg-tcs/cast
# @ target
# run nnpkg with nnpackage_run and compare with h5diff
-$ tests/scripts/nnpkg_test.sh -i nnpkg-tcs cast
+$ onert/test/onert-test nnpkg-test -i nnpkg-tcs cast
```
diff --git a/tools/tflitefile_tool/select_operator.py b/tools/tflitefile_tool/select_operator.py
index 1ad44a389..333ca32f6 100755
--- a/tools/tflitefile_tool/select_operator.py
+++ b/tools/tflitefile_tool/select_operator.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
# Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
#
@@ -1180,23 +1180,6 @@ def GenerateModel(args, new_builder, sample_model, operator_list, new_input_tens
return tflite.Model.ModelEnd(new_builder)
-def Finish(new_builder, new_model):
- # Cusrom implementation: identifier
- # Python API don't support identifier input yet
- # Reference: Finish(self, rootTable)) in builder.py, Finish(uoffset_t root, const char *file_identifier, bool size_prefix) in flatbuffers.h
- new_builder.Prep(new_builder.minalign,
- flatbuffers.number_types.UOffsetTFlags.bytewidth)
-
- new_builder.PrependByte(0x33)
- new_builder.PrependByte(0x4c)
- new_builder.PrependByte(0x46)
- new_builder.PrependByte(0x54)
-
- new_builder.PrependUOffsetTRelative(new_model)
- new_builder.finished = True
- return new_builder.Head()
-
-
def main(args):
input_model_file = args.input_model
oplist_file = args.opcode_list
@@ -1343,7 +1326,7 @@ def main(args):
new_input_tensors, new_output_tensors, used_tensors_dic,
used_buffers_dic, used_opcodes_dic, used_subgraphs_dic)
- Finish(new_builder, new_model)
+ new_builder.Finish(new_model, file_identifier=b'TFL3')
new_buf = new_builder.Output()
output_model_file.write(new_buf)
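The deleted `Finish()` existed only because older flatbuffers Python releases could not emit a file identifier; it prepended the bytes 0x33 0x4c 0x46 0x54 one at a time, which reads "TFL3" in the finished buffer since prepends stack in reverse. Newer releases take the identifier directly, which is what the patch switches to. A minimal sketch, assuming flatbuffers >= 1.12 and a trivial stand-in table:

```python
import flatbuffers

builder = flatbuffers.Builder(64)
builder.StartObject(0)        # stand-in for the real tflite Model table
root = builder.EndObject()

# One call replaces the manual byte-prepending removed above.
builder.Finish(root, file_identifier=b"TFL3")

buf = builder.Output()
assert buf[4:8] == b"TFL3"    # identifier sits right after the root offset
```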
diff --git a/tools/tflkit/README.md b/tools/tflkit/README.md
index a0c40c6fa..9e1883436 100644
--- a/tools/tflkit/README.md
+++ b/tools/tflkit/README.md
@@ -1,4 +1,4 @@
-# tflkit
+# tflkit
## Purpose
@@ -114,11 +114,11 @@ Number of all operators : 126 (total instrs: 11,484,469
### TensorFlow
-TensorFlow provides some kinds of converting guideline. In Python, the [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/contrib/lite/TFLiteConverter) class will help you to convert a TensorFlow GraphDef or SavedModel into `output_format` using TOCO. The `output_format` can be `TFLITE` or `GRAPHVIZ_DOT` format. The default `output_format` is `TFLITE`. And there is a Python command line interface for running TOCO, and its name is [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py). This converts a TensorFlow GraphDef or SavedModel into `TFLITE` or `GRAPHVIZ_DOT` format like [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/contrib/lite/TFLiteConverter). These two way also supports to convert a TensorFlow Keras model into `output_format`. Both functions are implemented using a tool called TOCO.
+TensorFlow provides several conversion guides. In Python, the [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/lite/TFLiteConverter) class converts a TensorFlow GraphDef or SavedModel into `output_format` using TOCO. The `output_format` can be either `TFLITE` or `GRAPHVIZ_DOT`; the default is `TFLITE`. There is also a Python command line interface for running TOCO, named [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py), which converts a TensorFlow GraphDef or SavedModel into `TFLITE` or `GRAPHVIZ_DOT` format just like [TFLiteConverter](https://www.tensorflow.org/api_docs/python/tf/lite/TFLiteConverter). Both ways also support converting a TensorFlow Keras model into `output_format`, and both are implemented on top of a tool called TOCO.
### with tflkit
-The tflkit uses the [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py) python command line interface to convert a TensorFlow model into TfLite model. It only supports to convert a TensorFlow GraphDef file into `TFLITE` format file. This tool supports the creation of individual `TFLITE` files for different input shapes. When converting to multiple `TFLITE` files, it needs to put a string called `NAME` in `TFLITE_PATH`. The string `NAME` will be replaced by what is listed in teh `NAME` environment. This tool requires an information file as a parameter. There is an [example file](info/convert.template) for a convert information. The `--tensorflow_path` and `--tensorflow_version` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
+The tflkit uses the [tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/python/tflite_convert.py) Python command line interface to convert a TensorFlow model into a TFLite model. It only supports converting a TensorFlow GraphDef file into a `TFLITE` format file. The tool can create individual `TFLITE` files for different input shapes; when converting to multiple `TFLITE` files, put the string `NAME` in `TFLITE_PATH`, and it will be replaced by what is listed in the `NAME` environment variable. This tool requires an information file as a parameter. There is an [example file](convert.template) for the convert information. The `--tensorflow_path` and `--tensorflow_version` options can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
Convert information:
* GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
@@ -176,7 +176,7 @@ The input and output file of this tool is a TensorFlow GraphDef file.
### with tflkit
-The [optimize_for_inference.sh](optimize_for_inference.sh) file invokes the TensorFlow [optimize tool](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference.py). This tool requires a optimize information file as a parameter. Here is an [example file](info/optimize.template) for this tool. The information file needs `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you to define the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
+The [optimize_for_inference.sh](optimize_for_inference.sh) file invokes the TensorFlow [optimize tool](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/optimize_for_inference.py). This tool requires an optimize information file as a parameter; here is an [example file](optimize.template) for this tool. The information file needs the `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you determine the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
Optimize information:
* GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
@@ -207,7 +207,7 @@ The trained TensorFlow model can be transformed by some variants to deploy it in
### with tflkit
-The [transform_graph.sh](transform_graph.sh) file supports to transform a TensorFlow GraphDef using various transform options. This tool requires a transform information file as a parameter and the transform options are described in the information file. There is an [example file](info/transform.template) for this tool. The information file needs `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you to define the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
+The [transform_graph.sh](transform_graph.sh) file supports transforming a TensorFlow GraphDef using various transform options. This tool requires a transform information file as a parameter, and the transform options are described in that file. There is an [example file](transform.template) for this tool. The information file needs the `INPUT` and `OUTPUT` array names. The [summarize_pb.sh](summarize_pb.sh) file will help you determine the `INPUT` and `OUTPUT` array names. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
Transform information:
* GRAPHDEF_PATH : Full filepath of file containing frozen TensorFlow GraphDef.
@@ -270,7 +270,7 @@ The [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorfl
### with tflkit
-The tflkit provides the simple way to create a frozen graph using [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py) tool. This tool requires an information file as a parameter. There is an [example file](info/freeze.info) for a freeze tool. Either `SAVED_MODEL` or `META_GRAPH` must be declared. And `META_GRAPH` is always used with `CKPT_PATH`. The `--tensorflow_path` can change the TensorFlow location. By default, it uses `externals/tensorflow` directory.
+The tflkit provides a simple way to create a frozen graph using the [freeze_graph](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py) tool. This tool requires an information file as a parameter. There is an [example file](freeze.template) for the freeze tool. Either `SAVED_MODEL` or `META_GRAPH` must be declared, and `META_GRAPH` is always used together with `CKPT_PATH`. The `--tensorflow_path` option can change the TensorFlow location. By default, it uses the `externals/tensorflow` directory.
Freeze information:
* SAVED_MODEL : Full directory path with TensorFlow `SavedModel` file and variables.
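For a concrete picture of the convert flow described above, here is a hedged TF 1.x-style Python equivalent of what the `tflite_convert` wrapper drives (file names, array names, and shapes are placeholders; on TensorFlow 2.x the class lives at `tf.compat.v1.lite.TFLiteConverter`):

```python
import tensorflow as tf

converter = tf.lite.TFLiteConverter.from_frozen_graph(
    graph_def_file="graph.pb",                 # GRAPHDEF_PATH
    input_arrays=["input"],                    # INPUT
    output_arrays=["output"],                  # OUTPUT
    input_shapes={"input": [1, 224, 224, 3]},  # INPUT_SHAPES
)
with open("graph.tflite", "wb") as f:          # TFLITE_PATH
    f.write(converter.convert())
```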
diff --git a/tools/update_version/update-version b/tools/update_version/update-version
index 41693278b..1b77c10cd 100644
--- a/tools/update_version/update-version
+++ b/tools/update_version/update-version
@@ -40,11 +40,12 @@ fi
version=$1
-sed -i "s/^release = .*/release = \'$version\'/" ${nnfw_root}/docs/conf.py
-sed -i "s/^Version: .*/Version: $version/" ${nnfw_root}/packaging/nnfw.spec
+perl -pi -e "s/^release = .*/release = \'$version\'/" ${nnfw_root}/docs/conf.py
-IFS=. read M m p <<< $version
+perl -pi -e "s/^Version: .*/Version: $version/" ${nnfw_root}/packaging/nnfw.spec
+
+IFS=. read M m p <<< "$version"
hex=$(printf '0x%08x' $(( (($M << 24)) | (($m << 8)) | $p )))
-sed -i "s/^#define NNFW_VERSION.*/#define NNFW_VERSION $hex/" ${nnfw_root}/runtime/onert/api/include/nnfw_version.h
+perl -pi -e "s/^#define NNFW_VERSION.*/#define NNFW_VERSION $hex/" ${nnfw_root}/runtime/onert/api/include/nnfw_version.h
-sed -i "s/versionName .*$/versionName \"$version\"/" ${nnfw_root}/runtime/contrib/android/api/build.gradle
+perl -pi -e "s/versionName .*$/versionName \"$version\"/" ${nnfw_root}/runtime/contrib/android/api/build.gradle